< prev index next >
jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java
Print this page
@@ -1,7 +1,7 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
@@ -20,19 +20,14 @@
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
-/*
+/**
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
@@ -55,25 +50,25 @@
* doSomethingWith(ch);
* }
*
* // iteration forwards: Changes for UTF-32
* int ch;
- * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
- * ch = UTF16.charAt(s,i);
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
- * for (int i = s.length() -1; i >= 0; --i) {
+ * for (int i = s.length() - 1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
- * for (int i = s.length() -1; i > 0; i-=UTF16.getCharCount(ch)) {
- * ch = UTF16.charAt(s,i);
+ * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
* }</pre>
* <strong>Notes:</strong>
* <ul>
@@ -159,11 +154,45 @@
/**
* Surrogate minimum value
* @stable ICU 2.1
*/
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
+ /**
+ * Lead surrogate bitmask: clears the low 10 bits of a value so the rest can be compared with LEAD_SURROGATE_BITS
+ */
+ private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
+ /**
+ * Trail surrogate bitmask: clears the low 10 bits of a value so the rest can be compared with TRAIL_SURROGATE_BITS
+ */
+ private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
+ /**
+ * Surrogate bitmask: clears the low 11 bits of a value so the rest can be compared with SURROGATE_BITS
+ */
+ private static final int SURROGATE_BITMASK = 0xFFFFF800;
+ /**
+ * Lead surrogate bits: result of applying LEAD_SURROGATE_BITMASK to any char in 0xD800..0xDBFF
+ */
+ private static final int LEAD_SURROGATE_BITS = 0xD800;
+ /**
+ * Trail surrogate bits: result of applying TRAIL_SURROGATE_BITMASK to any char in 0xDC00..0xDFFF
+ */
+ private static final int TRAIL_SURROGATE_BITS = 0xDC00;
+ /**
+ * Surrogate bits: result of applying SURROGATE_BITMASK to any char in 0xD800..0xDFFF
+ */
+ private static final int SURROGATE_BITS = 0xD800;
+
+ // constructor --------------------------------------------------------
+ // /CLOVER:OFF
+ /**
+ * Private, empty constructor: prevents other classes from creating an instance.
+ */
+ private UTF16() {
+ }
+
+ // /CLOVER:ON
// public method ------------------------------------------------------
/**
* Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
@@ -220,33 +249,86 @@
}
return single; // return unmatched surrogate
}
/**
- * Extract a single UTF-32 value from a substring.
+ * Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>), as well as random access. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
* </a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found, the incomplete character will be returned.
* @param source array of UTF-16 chars
- * @param start offset to substring in the source array for analyzing
- * @param limit offset to substring in the source array for analyzing
- * @param offset16 UTF-16 offset relative to start
+ * @param offset16 UTF-16 offset to the start of the character.
* @return the UTF-32 value of the code point that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
- * @exception IndexOutOfBoundsException thrown if offset16 is not within
- * the range of start and limit.
+ * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
* @stable ICU 2.1
*/
- public static int charAt(char source[], int start, int limit,
- int offset16)
- {
+ public static int charAt(CharSequence source, int offset16) {
+ char single = source.charAt(offset16);
+ if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { // fast path: below the surrogate range, nothing to pair
+ return single;
+ }
+ return _charAt(source, offset16, single); // surrogate range or above: _charAt decides whether a pair exists
+ }
+
+ private static int _charAt(CharSequence source, int offset16, char single) { // slow path of charAt(CharSequence, int)
+ if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { // above the surrogate range: returned unchanged
+ return single;
+ }
+
+ // Convert the UTF-16 surrogate pair if necessary.
+ // For simplicity in usage, and because the frequency of pairs is
+ // low, look both directions.
+
+ if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { // treated as a lead surrogate (charAt already excluded smaller values)
+ ++offset16;
+ if (source.length() != offset16) { // there is a following char that may be the trail
+ char trail = source.charAt(offset16);
+ if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
+ && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(single, trail);
+ }
+ }
+ } else {
+ --offset16;
+ if (offset16 >= 0) {
+ // single is a trail surrogate, so check whether the preceding char is its lead
+ char lead = source.charAt(offset16);
+ if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
+ && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(lead, single);
+ }
+ }
+ }
+ return single; // return unmatched surrogate
+ }
+
+ /**
+ * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
+ * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
+ * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
+ * </a></code>
+ * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
+ * character will be returned. If a complete supplementary character is not found, the incomplete
+ * character will be returned.
+ *
+ * @param source Array of UTF-16 chars
+ * @param start Offset to substring in the source array for analyzing
+ * @param limit Offset to substring in the source array for analyzing
+ * @param offset16 UTF-16 offset relative to start
+ * @return the UTF-32 value of the code point that contains the char at offset16. The boundaries
+ * of that codepoint are the same as in <code>bounds32()</code>.
+ * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
+ * @stable ICU 2.1
+ */
+ public static int charAt(char source[], int start, int limit, int offset16) {
offset16 += start;
if (offset16 < start || offset16 >= limit) {
throw new ArrayIndexOutOfBoundsException(offset16);
}
@@ -257,11 +339,11 @@
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
- offset16 ++;
+ offset16++;
if (offset16 >= limit) {
return single;
}
char trail = source[offset16];
if (isTrailSurrogate(trail)) {
@@ -270,11 +352,11 @@
}
else { // isTrailSurrogate(single), so
if (offset16 == start) {
return single;
}
- offset16 --;
+ offset16--;
char lead = source[offset16];
if (isLeadSurrogate(lead))
return UCharacterProperty.getRawSupplementary(lead, single);
}
return single; // return unmatched surrogate
@@ -298,41 +380,38 @@
}
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
- * @return true iff the input character is a surrogate.
+ * @return true if the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE;
+ return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; // single mask-and-compare: true exactly for 0xD800..0xDFFF
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
- * @return true iff the input character is a trail surrogate.
+ * @return true if the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16)
{
- return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE);
+ return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; // single mask-and-compare: true exactly for 0xDC00..0xDFFF
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
- * @return true iff the input character is a lead surrogate
+ * @return true if the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= LEAD_SURROGATE_MAX_VALUE;
+ return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; // single mask-and-compare: true exactly for 0xD800..0xDBFF
}
/**
* Returns the lead surrogate.
* If a validity check is required, use
@@ -357,22 +436,22 @@
* Returns the trail surrogate.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
- * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
+ * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
* the character itself
* @stable ICU 2.1
*/
public static char getTrailSurrogate(int char32)
{
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
return (char)(TRAIL_SURROGATE_MIN_VALUE +
(char32 & TRAIL_SURROGATE_MASK_));
}
- return (char)char32;
+ return (char) char32;
}
/**
* Convenience method corresponding to String.valueOf(char). Returns a one
* or two char string containing the UTF-32 value in UTF16 format. If a
@@ -417,16 +496,15 @@
{
target.append(getLeadSurrogate(char32));
target.append(getTrailSurrogate(char32));
}
else {
- target.append((char)char32);
+ target.append((char) char32);
}
return target;
}
- //// for StringPrep
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
* @param start position of the subarray to be performed on
* @param limit position of the subarray to be performed on
@@ -443,45 +521,45 @@
{
int size = source.length;
int count;
char ch;
int result = offset16 + start;
- if (start<0 || limit<start) {
+ if (start < 0 || limit < start) {
throw new StringIndexOutOfBoundsException(start);
}
- if (limit>size) {
+ if (limit > size) {
throw new StringIndexOutOfBoundsException(limit);
}
- if (offset16<0 || result>limit) {
+ if (offset16 < 0 || result > limit) {
throw new StringIndexOutOfBoundsException(offset16);
}
- if (shift32 > 0 ) {
+ if (shift32 > 0) {
if (shift32 + result > size) {
throw new StringIndexOutOfBoundsException(result);
}
count = shift32;
while (result < limit && count > 0)
{
ch = source[result];
- if (isLeadSurrogate(ch) && (result+1 < limit) &&
- isTrailSurrogate(source[result+1])) {
- result ++;
+ if (isLeadSurrogate(ch) && (result + 1 < limit) &&
+ isTrailSurrogate(source[result + 1])) {
+ result++;
}
- count --;
- result ++;
+ count--;
+ result++;
}
} else {
if (result + shift32 < start) {
throw new StringIndexOutOfBoundsException(result);
}
- for (count=-shift32; count>0; count--) {
+ for (count = -shift32; count > 0; count--) {
result--;
- if (result<start) {
+ if (result < start) {
break;
}
ch = source[result];
- if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
+ if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
result--;
}
}
}
if (count != 0) {
@@ -525,11 +603,11 @@
* @return string representation of the code point
*/
private static String toString(int ch)
{
if (ch < SUPPLEMENTARY_MIN_VALUE) {
- return String.valueOf((char)ch);
+ return String.valueOf((char) ch);
}
StringBuilder result = new StringBuilder();
result.append(getLeadSurrogate(ch));
result.append(getTrailSurrogate(ch));
< prev index next >