1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 /** 40 * <p>Standalone utility class providing UTF16 character conversions and 41 * indexing conversions.</p> 42 * <p>Code that uses strings alone rarely need modification. 43 * By design, UTF-16 does not allow overlap, so searching for strings is a safe 44 * operation. Similarly, concatenation is always safe. Substringing is safe if 45 * the start and end are both on UTF-32 boundaries. In normal code, the values 46 * for start and end are on those boundaries, since they arose from operations 47 * like searching. If not, the nearest UTF-32 boundaries can be determined 48 * using <code>bounds()</code>.</p> 49 * <strong>Examples:</strong> 50 * <p>The following examples illustrate use of some of these methods. 51 * <pre> 52 * // iteration forwards: Original 53 * for (int i = 0; i < s.length(); ++i) { 54 * char ch = s.charAt(i); 55 * doSomethingWith(ch); 56 * } 57 * 58 * // iteration forwards: Changes for UTF-32 59 * int ch; 60 * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) { 61 * ch = UTF16.charAt(s,i); 62 * doSomethingWith(ch); 63 * } 64 * 65 * // iteration backwards: Original 66 * for (int i = s.length() -1; i >= 0; --i) { 67 * char ch = s.charAt(i); 68 * doSomethingWith(ch); 69 * } 70 * 71 * // iteration backwards: Changes for UTF-32 72 * int ch; 73 * for (int i = s.length() -1; i > 0; i-=UTF16.getCharCount(ch)) { 74 * ch = UTF16.charAt(s,i); 75 * doSomethingWith(ch); 76 * } 77 * </pre> 78 * <strong>Notes:</strong> 79 * <ul> 80 * <li> 81 * <strong>Naming:</strong> For clarity, High and Low surrogates are called 82 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better 83 * sense of their ordering in a string. <code>offset16</code> and 84 * <code>offset32</code> are used to distinguish offsets to UTF-16 85 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is 86 * used to contain UTF-32 characters, as opposed to <code>char16</code>, 87 * which is a UTF-16 code unit. 88 * </li> 89 * <li> 90 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a 91 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in 92 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and 93 * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 94 * </li> 95 * <li> 96 * <strong>Exceptions:</strong> The error checking will throw an exception 97 * if indices are out of bounds. Other than that, all methods will 98 * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 99 * values are present. <code>UCharacter.isLegal()</code> can be used to check 100 * for validity if desired. 101 * </li> 102 * <li> 103 * <strong>Unmatched Surrogates:</strong> If the string contains unmatched 104 * surrogates, then these are counted as one UTF-32 value. This matches 105 * their iteration behavior, which is vital. It also matches common display 106 * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). 107 * </li> 108 * <li> 109 * <strong>Optimization:</strong> The method implementations may need 110 * optimization if the compiler doesn't fold static final methods. Since 111 * surrogate pairs will form an exceeding small percentage of all the text 112 * in the world, the singleton case should always be optimized for. 113 * </li> 114 * </ul> 115 * @author Mark Davis, with help from Markus Scherer 116 * @stable ICU 2.1 117 */ 118 119 public final class UTF16 120 { 121 // public variables --------------------------------------------------- 122 123 /** 124 * The lowest Unicode code point value. 125 * @stable ICU 2.1 126 */ 127 public static final int CODEPOINT_MIN_VALUE = 0; 128 /** 129 * The highest Unicode code point value (scalar value) according to the 130 * Unicode Standard. 131 * @stable ICU 2.1 132 */ 133 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 134 /** 135 * The minimum value for Supplementary code points 136 * @stable ICU 2.1 137 */ 138 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 139 /** 140 * Lead surrogate minimum value 141 * @stable ICU 2.1 142 */ 143 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 144 /** 145 * Trail surrogate minimum value 146 * @stable ICU 2.1 147 */ 148 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 149 /** 150 * Lead surrogate maximum value 151 * @stable ICU 2.1 152 */ 153 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 154 /** 155 * Trail surrogate maximum value 156 * @stable ICU 2.1 157 */ 158 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 159 /** 160 * Surrogate minimum value 161 * @stable ICU 2.1 162 */ 163 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 164 165 // public method ------------------------------------------------------ 166 167 /** 168 * Extract a single UTF-32 value from a string. 169 * Used when iterating forwards or backwards (with 170 * <code>UTF16.getCharCount()</code>, as well as random access. If a 171 * validity check is required, use 172 * <code><a href="../lang/UCharacter.html#isLegal(char)"> 173 * UCharacter.isLegal()</a></code> on the return value. 174 * If the char retrieved is part of a surrogate pair, its supplementary 175 * character will be returned. If a complete supplementary character is 176 * not found the incomplete character will be returned 177 * @param source array of UTF-16 chars 178 * @param offset16 UTF-16 offset to the start of the character. 179 * @return UTF-32 value for the UTF-32 value that contains the char at 180 * offset16. The boundaries of that codepoint are the same as in 181 * <code>bounds32()</code>. 182 * @exception IndexOutOfBoundsException thrown if offset16 is out of 183 * bounds. 184 * @stable ICU 2.1 185 */ 186 public static int charAt(String source, int offset16) { 187 char single = source.charAt(offset16); 188 if (single < LEAD_SURROGATE_MIN_VALUE) { 189 return single; 190 } 191 return _charAt(source, offset16, single); 192 } 193 194 private static int _charAt(String source, int offset16, char single) { 195 if (single > TRAIL_SURROGATE_MAX_VALUE) { 196 return single; 197 } 198 199 // Convert the UTF-16 surrogate pair if necessary. 200 // For simplicity in usage, and because the frequency of pairs is 201 // low, look both directions. 202 203 if (single <= LEAD_SURROGATE_MAX_VALUE) { 204 ++offset16; 205 if (source.length() != offset16) { 206 char trail = source.charAt(offset16); 207 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 208 return UCharacterProperty.getRawSupplementary(single, trail); 209 } 210 } 211 } else { 212 --offset16; 213 if (offset16 >= 0) { 214 // single is a trail surrogate so 215 char lead = source.charAt(offset16); 216 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 217 return UCharacterProperty.getRawSupplementary(lead, single); 218 } 219 } 220 } 221 return single; // return unmatched surrogate 222 } 223 224 /** 225 * Extract a single UTF-32 value from a substring. 226 * Used when iterating forwards or backwards (with 227 * <code>UTF16.getCharCount()</code>, as well as random access. If a 228 * validity check is required, use 229 * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 230 * </a></code> on the return value. 231 * If the char retrieved is part of a surrogate pair, its supplementary 232 * character will be returned. If a complete supplementary character is 233 * not found the incomplete character will be returned 234 * @param source array of UTF-16 chars 235 * @param start offset to substring in the source array for analyzing 236 * @param limit offset to substring in the source array for analyzing 237 * @param offset16 UTF-16 offset relative to start 238 * @return UTF-32 value for the UTF-32 value that contains the char at 239 * offset16. The boundaries of that codepoint are the same as in 240 * <code>bounds32()</code>. 241 * @exception IndexOutOfBoundsException thrown if offset16 is not within 242 * the range of start and limit. 243 * @stable ICU 2.1 244 */ 245 public static int charAt(char source[], int start, int limit, 246 int offset16) 247 { 248 offset16 += start; 249 if (offset16 < start || offset16 >= limit) { 250 throw new ArrayIndexOutOfBoundsException(offset16); 251 } 252 253 char single = source[offset16]; 254 if (!isSurrogate(single)) { 255 return single; 256 } 257 258 // Convert the UTF-16 surrogate pair if necessary. 259 // For simplicity in usage, and because the frequency of pairs is 260 // low, look both directions. 261 if (single <= LEAD_SURROGATE_MAX_VALUE) { 262 offset16 ++; 263 if (offset16 >= limit) { 264 return single; 265 } 266 char trail = source[offset16]; 267 if (isTrailSurrogate(trail)) { 268 return UCharacterProperty.getRawSupplementary(single, trail); 269 } 270 } 271 else { // isTrailSurrogate(single), so 272 if (offset16 == start) { 273 return single; 274 } 275 offset16 --; 276 char lead = source[offset16]; 277 if (isLeadSurrogate(lead)) 278 return UCharacterProperty.getRawSupplementary(lead, single); 279 } 280 return single; // return unmatched surrogate 281 } 282 283 /** 284 * Determines how many chars this char32 requires. 285 * If a validity check is required, use <code> 286 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on 287 * char32 before calling. 288 * @param char32 the input codepoint. 289 * @return 2 if is in supplementary space, otherwise 1. 290 * @stable ICU 2.1 291 */ 292 public static int getCharCount(int char32) 293 { 294 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 295 return 1; 296 } 297 return 2; 298 } 299 300 /** 301 * Determines whether the code value is a surrogate. 302 * @param char16 the input character. 303 * @return true iff the input character is a surrogate. 304 * @stable ICU 2.1 305 */ 306 public static boolean isSurrogate(char char16) 307 { 308 return LEAD_SURROGATE_MIN_VALUE <= char16 && 309 char16 <= TRAIL_SURROGATE_MAX_VALUE; 310 } 311 312 /** 313 * Determines whether the character is a trail surrogate. 314 * @param char16 the input character. 315 * @return true iff the input character is a trail surrogate. 316 * @stable ICU 2.1 317 */ 318 public static boolean isTrailSurrogate(char char16) 319 { 320 return (TRAIL_SURROGATE_MIN_VALUE <= char16 && 321 char16 <= TRAIL_SURROGATE_MAX_VALUE); 322 } 323 324 /** 325 * Determines whether the character is a lead surrogate. 326 * @param char16 the input character. 327 * @return true iff the input character is a lead surrogate 328 * @stable ICU 2.1 329 */ 330 public static boolean isLeadSurrogate(char char16) 331 { 332 return LEAD_SURROGATE_MIN_VALUE <= char16 && 333 char16 <= LEAD_SURROGATE_MAX_VALUE; 334 } 335 336 /** 337 * Returns the lead surrogate. 338 * If a validity check is required, use 339 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 340 * on char32 before calling. 341 * @param char32 the input character. 342 * @return lead surrogate if the getCharCount(ch) is 2; <br> 343 * and 0 otherwise (note: 0 is not a valid lead surrogate). 344 * @stable ICU 2.1 345 */ 346 public static char getLeadSurrogate(int char32) 347 { 348 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 349 return (char)(LEAD_SURROGATE_OFFSET_ + 350 (char32 >> LEAD_SURROGATE_SHIFT_)); 351 } 352 353 return 0; 354 } 355 356 /** 357 * Returns the trail surrogate. 358 * If a validity check is required, use 359 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 360 * on char32 before calling. 361 * @param char32 the input character. 362 * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise 363 * the character itself 364 * @stable ICU 2.1 365 */ 366 public static char getTrailSurrogate(int char32) 367 { 368 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 369 return (char)(TRAIL_SURROGATE_MIN_VALUE + 370 (char32 & TRAIL_SURROGATE_MASK_)); 371 } 372 373 return (char)char32; 374 } 375 376 /** 377 * Convenience method corresponding to String.valueOf(char). Returns a one 378 * or two char string containing the UTF-32 value in UTF16 format. If a 379 * validity check is required, use 380 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 381 * on char32 before calling. 382 * @param char32 the input character. 383 * @return string value of char32 in UTF16 format 384 * @exception IllegalArgumentException thrown if char32 is a invalid 385 * codepoint. 386 * @stable ICU 2.1 387 */ 388 public static String valueOf(int char32) 389 { 390 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 391 throw new IllegalArgumentException("Illegal codepoint"); 392 } 393 return toString(char32); 394 } 395 396 /** 397 * Append a single UTF-32 value to the end of a StringBuffer. 398 * If a validity check is required, use 399 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 400 * on char32 before calling. 401 * @param target the buffer to append to 402 * @param char32 value to append. 403 * @return the updated StringBuffer 404 * @exception IllegalArgumentException thrown when char32 does not lie 405 * within the range of the Unicode codepoints 406 * @stable ICU 2.1 407 */ 408 public static StringBuffer append(StringBuffer target, int char32) 409 { 410 // Check for irregular values 411 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 412 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 413 } 414 415 // Write the UTF-16 values 416 if (char32 >= SUPPLEMENTARY_MIN_VALUE) 417 { 418 target.append(getLeadSurrogate(char32)); 419 target.append(getTrailSurrogate(char32)); 420 } 421 else { 422 target.append((char)char32); 423 } 424 return target; 425 } 426 427 //// for StringPrep 428 /** 429 * Shifts offset16 by the argument number of codepoints within a subarray. 430 * @param source char array 431 * @param start position of the subarray to be performed on 432 * @param limit position of the subarray to be performed on 433 * @param offset16 UTF16 position to shift relative to start 434 * @param shift32 number of codepoints to shift 435 * @return new shifted offset16 relative to start 436 * @exception IndexOutOfBoundsException if the new offset16 is out of 437 * bounds with respect to the subarray or the subarray bounds 438 * are out of range. 439 * @stable ICU 2.1 440 */ 441 public static int moveCodePointOffset(char source[], int start, int limit, 442 int offset16, int shift32) 443 { 444 int size = source.length; 445 int count; 446 char ch; 447 int result = offset16 + start; 448 if (start<0 || limit<start) { 449 throw new StringIndexOutOfBoundsException(start); 450 } 451 if (limit>size) { 452 throw new StringIndexOutOfBoundsException(limit); 453 } 454 if (offset16<0 || result>limit) { 455 throw new StringIndexOutOfBoundsException(offset16); 456 } 457 if (shift32 > 0 ) { 458 if (shift32 + result > size) { 459 throw new StringIndexOutOfBoundsException(result); 460 } 461 count = shift32; 462 while (result < limit && count > 0) 463 { 464 ch = source[result]; 465 if (isLeadSurrogate(ch) && (result+1 < limit) && 466 isTrailSurrogate(source[result+1])) { 467 result ++; 468 } 469 count --; 470 result ++; 471 } 472 } else { 473 if (result + shift32 < start) { 474 throw new StringIndexOutOfBoundsException(result); 475 } 476 for (count=-shift32; count>0; count--) { 477 result--; 478 if (result<start) { 479 break; 480 } 481 ch = source[result]; 482 if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) { 483 result--; 484 } 485 } 486 } 487 if (count != 0) { 488 throw new StringIndexOutOfBoundsException(shift32); 489 } 490 result -= start; 491 return result; 492 } 493 494 // private data members ------------------------------------------------- 495 496 /** 497 * Shift value for lead surrogate to form a supplementary character. 498 */ 499 private static final int LEAD_SURROGATE_SHIFT_ = 10; 500 501 /** 502 * Mask to retrieve the significant value from a trail surrogate. 503 */ 504 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 505 506 /** 507 * Value that all lead surrogate starts with 508 */ 509 private static final int LEAD_SURROGATE_OFFSET_ = 510 LEAD_SURROGATE_MIN_VALUE - 511 (SUPPLEMENTARY_MIN_VALUE 512 >> LEAD_SURROGATE_SHIFT_); 513 514 // private methods ------------------------------------------------------ 515 516 /** 517 * <p>Converts argument code point and returns a String object representing 518 * the code point's value in UTF16 format.</p> 519 * <p>This method does not check for the validity of the codepoint, the 520 * results are not guaranteed if a invalid codepoint is passed as 521 * argument.</p> 522 * <p>The result is a string whose length is 1 for non-supplementary code 523 * points, 2 otherwise.</p> 524 * @param ch code point 525 * @return string representation of the code point 526 */ 527 private static String toString(int ch) 528 { 529 if (ch < SUPPLEMENTARY_MIN_VALUE) { 530 return String.valueOf((char)ch); 531 } 532 533 StringBuilder result = new StringBuilder(); 534 result.append(getLeadSurrogate(ch)); 535 result.append(getTrailSurrogate(ch)); 536 return result.toString(); 537 } 538 }