21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 /** 40 * <p>Standalone utility class providing UTF16 character conversions and 41 * indexing conversions.</p> 42 * <p>Code that uses strings alone rarely needs modification. 43 * By design, UTF-16 does not allow overlap, so searching for strings is a safe 44 * operation. Similarly, concatenation is always safe. Substringing is safe if 45 * the start and end are both on UTF-32 boundaries. In normal code, the values 46 * for start and end are on those boundaries, since they arose from operations 47 * like searching. If not, the nearest UTF-32 boundaries can be determined 48 * using <code>bounds()</code>.</p> 49 * <strong>Examples:</strong> 50 * <p>The following examples illustrate use of some of these methods. 
51 * <pre> 52 * // iteration forwards: Original 53 * for (int i = 0; i < s.length(); ++i) { 54 * char ch = s.charAt(i); 55 * doSomethingWith(ch); 56 * } 57 * 58 * // iteration forwards: Changes for UTF-32 59 * int ch; 60 * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) { 61 * ch = UTF16.charAt(s,i); 62 * doSomethingWith(ch); 63 * } 64 * 65 * // iteration backwards: Original 66 * for (int i = s.length() -1; i >= 0; --i) { 67 * char ch = s.charAt(i); 68 * doSomethingWith(ch); 69 * } 70 * 71 * // iteration backwards: Changes for UTF-32 72 * int ch; 73 * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) { 74 * ch = UTF16.charAt(s,i); 75 * doSomethingWith(ch); 76 * } 77 * </pre> 78 * <strong>Notes:</strong> 79 * <ul> 80 * <li> 81 * <strong>Naming:</strong> For clarity, High and Low surrogates are called 82 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better 83 * sense of their ordering in a string. <code>offset16</code> and 84 * <code>offset32</code> are used to distinguish offsets to UTF-16 85 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is 86 * used to contain UTF-32 characters, as opposed to <code>char16</code>, 87 * which is a UTF-16 code unit. 88 * </li> 89 * <li> 90 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a 91 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in 92 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and 93 * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 94 * </li> 95 * <li> 96 * <strong>Exceptions:</strong> The error checking will throw an exception 97 * if indices are out of bounds. Other than that, all methods will 498 */ 499 private static final int LEAD_SURROGATE_SHIFT_ = 10; 500 501 /** 502 * Mask to retrieve the significant value from a trail surrogate. 
503 */ 504 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 505 506 /** 507 * Value that all lead surrogates start with 508 */ 509 private static final int LEAD_SURROGATE_OFFSET_ = 510 LEAD_SURROGATE_MIN_VALUE - 511 (SUPPLEMENTARY_MIN_VALUE 512 >> LEAD_SURROGATE_SHIFT_); 513 514 // private methods ------------------------------------------------------ 515 516 /** 517 * <p>Converts argument code point and returns a String object representing 518 * the code point's value in UTF16 format.</p> 519 * <p>This method does not check for the validity of the codepoint; the 520 * results are not guaranteed if an invalid codepoint is passed as 521 * argument.</p> 522 * <p>The result is a string whose length is 1 for non-supplementary code 523 * points, 2 otherwise (the values of <code>getLeadSurrogate</code> and <code>getTrailSurrogate</code>, in that order).</p> 524 * @param ch code point 525 * @return string representation of the code point 526 */ 527 private static String toString(int ch) 528 { 529 if (ch < SUPPLEMENTARY_MIN_VALUE) { 530 return String.valueOf((char)ch); 531 } 532 533 StringBuilder result = new StringBuilder(); 534 result.append(getLeadSurrogate(ch)); 535 result.append(getTrailSurrogate(ch)); 536 return result.toString(); 537 } 538 } | 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. 
* 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 /** 40 * <p>Standalone utility class providing UTF16 character conversions and 41 * indexing conversions. 42 * <p>Code that uses strings alone rarely needs modification. 43 * By design, UTF-16 does not allow overlap, so searching for strings is a safe 44 * operation. Similarly, concatenation is always safe. Substringing is safe if 45 * the start and end are both on UTF-32 boundaries. In normal code, the values 46 * for start and end are on those boundaries, since they arose from operations 47 * like searching. If not, the nearest UTF-32 boundaries can be determined 48 * using <code>bounds()</code>. 49 * <strong>Examples:</strong> 50 * <p>The following examples illustrate use of some of these methods. 51 * <pre>{@code 52 * // iteration forwards: Original 53 * for (int i = 0; i < s.length(); ++i) { 54 * char ch = s.charAt(i); 55 * doSomethingWith(ch); 56 * } 57 * 58 * // iteration forwards: Changes for UTF-32 59 * int ch; 60 * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) { 61 * ch = UTF16.charAt(s,i); 62 * doSomethingWith(ch); 63 * } 64 * 65 * // iteration backwards: Original 66 * for (int i = s.length() -1; i >= 0; --i) { 67 * char ch = s.charAt(i); 68 * doSomethingWith(ch); 69 * } 70 * 71 * // iteration backwards: Changes for UTF-32 72 * int ch; 73 * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) { 74 * ch = UTF16.charAt(s,i); 75 * doSomethingWith(ch); 76 * } 77 * }</pre> 78 * <strong>Notes:</strong> 79 * <ul> 80 * <li> 81 * <strong>Naming:</strong> For clarity, High and Low surrogates are called 82 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better 83 * sense of their ordering in a string. <code>offset16</code> and 84 * <code>offset32</code> are used to distinguish offsets to UTF-16 85 * boundaries vs offsets to UTF-32 boundaries. 
<code>int char32</code> is 86 * used to contain UTF-32 characters, as opposed to <code>char16</code>, 87 * which is a UTF-16 code unit. 88 * </li> 89 * <li> 90 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a 91 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in 92 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and 93 * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 94 * </li> 95 * <li> 96 * <strong>Exceptions:</strong> The error checking will throw an exception 97 * if indices are out of bounds. Other than that, all methods will 498 */ 499 private static final int LEAD_SURROGATE_SHIFT_ = 10; 500 501 /** 502 * Mask to retrieve the significant value from a trail surrogate. 503 */ 504 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 505 506 /** 507 * Value that all lead surrogates start with 508 */ 509 private static final int LEAD_SURROGATE_OFFSET_ = 510 LEAD_SURROGATE_MIN_VALUE - 511 (SUPPLEMENTARY_MIN_VALUE 512 >> LEAD_SURROGATE_SHIFT_); 513 514 // private methods ------------------------------------------------------ 515 516 /** 517 * <p>Converts argument code point and returns a String object representing 518 * the code point's value in UTF16 format. 519 * <p>This method does not check for the validity of the codepoint; the 520 * results are not guaranteed if an invalid codepoint is passed as 521 * argument. 522 * <p>The result is a string whose length is 1 for non-supplementary code 523 * points, 2 otherwise (the values of <code>getLeadSurrogate</code> and <code>getTrailSurrogate</code>, in that order). 524 * @param ch code point 525 * @return string representation of the code point 526 */ 527 private static String toString(int ch) 528 { 529 if (ch < SUPPLEMENTARY_MIN_VALUE) { 530 return String.valueOf((char)ch); 531 } 532 533 StringBuilder result = new StringBuilder(); 534 result.append(getLeadSurrogate(ch)); 535 result.append(getTrailSurrogate(ch)); 536 return result.toString(); 537 } 538 } |