1 /* 2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /** 26 ******************************************************************************* 27 * Copyright (C) 1996-2014, International Business Machines Corporation and 28 * others. All Rights Reserved. 29 ******************************************************************************* 30 */ 31 32 package sun.text.normalizer; 33 34 /** 35 * <p>Standalone utility class providing UTF16 character conversions and 36 * indexing conversions. 37 * <p>Code that uses strings alone rarely need modification. 38 * By design, UTF-16 does not allow overlap, so searching for strings is a safe 39 * operation. Similarly, concatenation is always safe. Substringing is safe if 40 * the start and end are both on UTF-32 boundaries. In normal code, the values 41 * for start and end are on those boundaries, since they arose from operations 42 * like searching. If not, the nearest UTF-32 boundaries can be determined 43 * using <code>bounds()</code>. 44 * <strong>Examples:</strong> 45 * <p>The following examples illustrate use of some of these methods. 46 * <pre>{@code 47 * // iteration forwards: Original 48 * for (int i = 0; i < s.length(); ++i) { 49 * char ch = s.charAt(i); 50 * doSomethingWith(ch); 51 * } 52 * 53 * // iteration forwards: Changes for UTF-32 54 * int ch; 55 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { 56 * ch = UTF16.charAt(s, i); 57 * doSomethingWith(ch); 58 * } 59 * 60 * // iteration backwards: Original 61 * for (int i = s.length() - 1; i >= 0; --i) { 62 * char ch = s.charAt(i); 63 * doSomethingWith(ch); 64 * } 65 * 66 * // iteration backwards: Changes for UTF-32 67 * int ch; 68 * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) { 69 * ch = UTF16.charAt(s, i); 70 * doSomethingWith(ch); 71 * } 72 * }</pre> 73 * <strong>Notes:</strong> 74 * <ul> 75 * <li> 76 * <strong>Naming:</strong> For clarity, High and Low surrogates are called 77 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better 78 * sense of their ordering in a string. <code>offset16</code> and 79 * <code>offset32</code> are used to distinguish offsets to UTF-16 80 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is 81 * used to contain UTF-32 characters, as opposed to <code>char16</code>, 82 * which is a UTF-16 code unit. 83 * </li> 84 * <li> 85 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a 86 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in 87 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and 88 * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 89 * </li> 90 * <li> 91 * <strong>Exceptions:</strong> The error checking will throw an exception 92 * if indices are out of bounds. Other than that, all methods will 93 * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 94 * values are present. <code>UCharacter.isLegal()</code> can be used to check 95 * for validity if desired. 96 * </li> 97 * <li> 98 * <strong>Unmatched Surrogates:</strong> If the string contains unmatched 99 * surrogates, then these are counted as one UTF-32 value. This matches 100 * their iteration behavior, which is vital. It also matches common display 101 * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). 102 * </li> 103 * <li> 104 * <strong>Optimization:</strong> The method implementations may need 105 * optimization if the compiler doesn't fold static final methods. Since 106 * surrogate pairs will form an exceeding small percentage of all the text 107 * in the world, the singleton case should always be optimized for. 108 * </li> 109 * </ul> 110 * @author Mark Davis, with help from Markus Scherer 111 * @stable ICU 2.1 112 */ 113 114 public final class UTF16 115 { 116 // public variables --------------------------------------------------- 117 118 /** 119 * The lowest Unicode code point value. 120 * @stable ICU 2.1 121 */ 122 public static final int CODEPOINT_MIN_VALUE = 0; 123 /** 124 * The highest Unicode code point value (scalar value) according to the 125 * Unicode Standard. 126 * @stable ICU 2.1 127 */ 128 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 129 /** 130 * The minimum value for Supplementary code points 131 * @stable ICU 2.1 132 */ 133 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 134 /** 135 * Lead surrogate minimum value 136 * @stable ICU 2.1 137 */ 138 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 139 /** 140 * Trail surrogate minimum value 141 * @stable ICU 2.1 142 */ 143 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 144 /** 145 * Lead surrogate maximum value 146 * @stable ICU 2.1 147 */ 148 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 149 /** 150 * Trail surrogate maximum value 151 * @stable ICU 2.1 152 */ 153 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 154 /** 155 * Surrogate minimum value 156 * @stable ICU 2.1 157 */ 158 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 159 /** 160 * Lead surrogate bitmask 161 */ 162 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 163 /** 164 * Trail surrogate bitmask 165 */ 166 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 167 /** 168 * Surrogate bitmask 169 */ 170 private static final int SURROGATE_BITMASK = 0xFFFFF800; 171 /** 172 * Lead surrogate bits 173 */ 174 private static final int LEAD_SURROGATE_BITS = 0xD800; 175 /** 176 * Trail surrogate bits 177 */ 178 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 179 /** 180 * Surrogate bits 181 */ 182 private static final int SURROGATE_BITS = 0xD800; 183 184 // constructor -------------------------------------------------------- 185 186 // /CLOVER:OFF 187 /** 188 * Prevent instance from being created. 189 */ 190 private UTF16() { 191 } 192 193 // /CLOVER:ON 194 // public method ------------------------------------------------------ 195 196 /** 197 * Extract a single UTF-32 value from a string. 198 * Used when iterating forwards or backwards (with 199 * <code>UTF16.getCharCount()</code>, as well as random access. If a 200 * validity check is required, use 201 * <code><a href="../lang/UCharacter.html#isLegal(char)"> 202 * UCharacter.isLegal()</a></code> on the return value. 203 * If the char retrieved is part of a surrogate pair, its supplementary 204 * character will be returned. If a complete supplementary character is 205 * not found the incomplete character will be returned 206 * @param source array of UTF-16 chars 207 * @param offset16 UTF-16 offset to the start of the character. 208 * @return UTF-32 value for the UTF-32 value that contains the char at 209 * offset16. The boundaries of that codepoint are the same as in 210 * <code>bounds32()</code>. 211 * @exception IndexOutOfBoundsException thrown if offset16 is out of 212 * bounds. 213 * @stable ICU 2.1 214 */ 215 public static int charAt(String source, int offset16) { 216 char single = source.charAt(offset16); 217 if (single < LEAD_SURROGATE_MIN_VALUE) { 218 return single; 219 } 220 return _charAt(source, offset16, single); 221 } 222 223 private static int _charAt(String source, int offset16, char single) { 224 if (single > TRAIL_SURROGATE_MAX_VALUE) { 225 return single; 226 } 227 228 // Convert the UTF-16 surrogate pair if necessary. 229 // For simplicity in usage, and because the frequency of pairs is 230 // low, look both directions. 231 232 if (single <= LEAD_SURROGATE_MAX_VALUE) { 233 ++offset16; 234 if (source.length() != offset16) { 235 char trail = source.charAt(offset16); 236 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 237 return UCharacterProperty.getRawSupplementary(single, trail); 238 } 239 } 240 } else { 241 --offset16; 242 if (offset16 >= 0) { 243 // single is a trail surrogate so 244 char lead = source.charAt(offset16); 245 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 246 return UCharacterProperty.getRawSupplementary(lead, single); 247 } 248 } 249 } 250 return single; // return unmatched surrogate 251 } 252 253 /** 254 * Extract a single UTF-32 value from a string. 255 * Used when iterating forwards or backwards (with 256 * <code>UTF16.getCharCount()</code>, as well as random access. If a 257 * validity check is required, use 258 * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 259 * </a></code> on the return value. 260 * If the char retrieved is part of a surrogate pair, its supplementary 261 * character will be returned. If a complete supplementary character is 262 * not found the incomplete character will be returned 263 * @param source array of UTF-16 chars 264 * @param offset16 UTF-16 offset to the start of the character. 265 * @return UTF-32 value for the UTF-32 value that contains the char at 266 * offset16. The boundaries of that codepoint are the same as in 267 * <code>bounds32()</code>. 268 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds. 269 * @stable ICU 2.1 270 */ 271 public static int charAt(CharSequence source, int offset16) { 272 char single = source.charAt(offset16); 273 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 274 return single; 275 } 276 return _charAt(source, offset16, single); 277 } 278 279 private static int _charAt(CharSequence source, int offset16, char single) { 280 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 281 return single; 282 } 283 284 // Convert the UTF-16 surrogate pair if necessary. 285 // For simplicity in usage, and because the frequency of pairs is 286 // low, look both directions. 287 288 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 289 ++offset16; 290 if (source.length() != offset16) { 291 char trail = source.charAt(offset16); 292 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 293 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 294 return UCharacterProperty.getRawSupplementary(single, trail); 295 } 296 } 297 } else { 298 --offset16; 299 if (offset16 >= 0) { 300 // single is a trail surrogate so 301 char lead = source.charAt(offset16); 302 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 303 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 304 return UCharacterProperty.getRawSupplementary(lead, single); 305 } 306 } 307 } 308 return single; // return unmatched surrogate 309 } 310 311 /** 312 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 313 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 314 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 315 * </a></code> 316 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 317 * character will be returned. If a complete supplementary character is not found the incomplete 318 * character will be returned 319 * 320 * @param source Array of UTF-16 chars 321 * @param start Offset to substring in the source array for analyzing 322 * @param limit Offset to substring in the source array for analyzing 323 * @param offset16 UTF-16 offset relative to start 324 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 325 * of that codepoint are the same as in <code>bounds32()</code>. 326 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 327 * @stable ICU 2.1 328 */ 329 public static int charAt(char source[], int start, int limit, int offset16) { 330 offset16 += start; 331 if (offset16 < start || offset16 >= limit) { 332 throw new ArrayIndexOutOfBoundsException(offset16); 333 } 334 335 char single = source[offset16]; 336 if (!isSurrogate(single)) { 337 return single; 338 } 339 340 // Convert the UTF-16 surrogate pair if necessary. 341 // For simplicity in usage, and because the frequency of pairs is 342 // low, look both directions. 343 if (single <= LEAD_SURROGATE_MAX_VALUE) { 344 offset16++; 345 if (offset16 >= limit) { 346 return single; 347 } 348 char trail = source[offset16]; 349 if (isTrailSurrogate(trail)) { 350 return UCharacterProperty.getRawSupplementary(single, trail); 351 } 352 } 353 else { // isTrailSurrogate(single), so 354 if (offset16 == start) { 355 return single; 356 } 357 offset16--; 358 char lead = source[offset16]; 359 if (isLeadSurrogate(lead)) 360 return UCharacterProperty.getRawSupplementary(lead, single); 361 } 362 return single; // return unmatched surrogate 363 } 364 365 /** 366 * Determines how many chars this char32 requires. 367 * If a validity check is required, use <code> 368 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on 369 * char32 before calling. 370 * @param char32 the input codepoint. 371 * @return 2 if is in supplementary space, otherwise 1. 372 * @stable ICU 2.1 373 */ 374 public static int getCharCount(int char32) 375 { 376 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 377 return 1; 378 } 379 return 2; 380 } 381 382 /** 383 * Determines whether the code value is a surrogate. 384 * @param char16 the input character. 385 * @return true if the input character is a surrogate. 386 * @stable ICU 2.1 387 */ 388 public static boolean isSurrogate(char char16) 389 { 390 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 391 } 392 393 /** 394 * Determines whether the character is a trail surrogate. 395 * @param char16 the input character. 396 * @return true if the input character is a trail surrogate. 397 * @stable ICU 2.1 398 */ 399 public static boolean isTrailSurrogate(char char16) 400 { 401 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 402 } 403 404 /** 405 * Determines whether the character is a lead surrogate. 406 * @param char16 the input character. 407 * @return true if the input character is a lead surrogate 408 * @stable ICU 2.1 409 */ 410 public static boolean isLeadSurrogate(char char16) 411 { 412 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 413 } 414 415 /** 416 * Returns the lead surrogate. 417 * If a validity check is required, use 418 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 419 * on char32 before calling. 420 * @param char32 the input character. 421 * @return lead surrogate if the getCharCount(ch) is 2; <br> 422 * and 0 otherwise (note: 0 is not a valid lead surrogate). 423 * @stable ICU 2.1 424 */ 425 public static char getLeadSurrogate(int char32) 426 { 427 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 428 return (char)(LEAD_SURROGATE_OFFSET_ + 429 (char32 >> LEAD_SURROGATE_SHIFT_)); 430 } 431 432 return 0; 433 } 434 435 /** 436 * Returns the trail surrogate. 437 * If a validity check is required, use 438 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 439 * on char32 before calling. 440 * @param char32 the input character. 441 * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise 442 * the character itself 443 * @stable ICU 2.1 444 */ 445 public static char getTrailSurrogate(int char32) 446 { 447 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 448 return (char)(TRAIL_SURROGATE_MIN_VALUE + 449 (char32 & TRAIL_SURROGATE_MASK_)); 450 } 451 452 return (char) char32; 453 } 454 455 /** 456 * Convenience method corresponding to String.valueOf(char). Returns a one 457 * or two char string containing the UTF-32 value in UTF16 format. If a 458 * validity check is required, use 459 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 460 * on char32 before calling. 461 * @param char32 the input character. 462 * @return string value of char32 in UTF16 format 463 * @exception IllegalArgumentException thrown if char32 is a invalid 464 * codepoint. 465 * @stable ICU 2.1 466 */ 467 public static String valueOf(int char32) 468 { 469 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 470 throw new IllegalArgumentException("Illegal codepoint"); 471 } 472 return toString(char32); 473 } 474 475 /** 476 * Append a single UTF-32 value to the end of a StringBuffer. 477 * If a validity check is required, use 478 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 479 * on char32 before calling. 480 * @param target the buffer to append to 481 * @param char32 value to append. 482 * @return the updated StringBuffer 483 * @exception IllegalArgumentException thrown when char32 does not lie 484 * within the range of the Unicode codepoints 485 * @stable ICU 2.1 486 */ 487 public static StringBuffer append(StringBuffer target, int char32) 488 { 489 // Check for irregular values 490 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 491 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 492 } 493 494 // Write the UTF-16 values 495 if (char32 >= SUPPLEMENTARY_MIN_VALUE) 496 { 497 target.append(getLeadSurrogate(char32)); 498 target.append(getTrailSurrogate(char32)); 499 } 500 else { 501 target.append((char) char32); 502 } 503 return target; 504 } 505 506 /** 507 * Shifts offset16 by the argument number of codepoints within a subarray. 508 * @param source char array 509 * @param start position of the subarray to be performed on 510 * @param limit position of the subarray to be performed on 511 * @param offset16 UTF16 position to shift relative to start 512 * @param shift32 number of codepoints to shift 513 * @return new shifted offset16 relative to start 514 * @exception IndexOutOfBoundsException if the new offset16 is out of 515 * bounds with respect to the subarray or the subarray bounds 516 * are out of range. 517 * @stable ICU 2.1 518 */ 519 public static int moveCodePointOffset(char source[], int start, int limit, 520 int offset16, int shift32) 521 { 522 int size = source.length; 523 int count; 524 char ch; 525 int result = offset16 + start; 526 if (start < 0 || limit < start) { 527 throw new StringIndexOutOfBoundsException(start); 528 } 529 if (limit > size) { 530 throw new StringIndexOutOfBoundsException(limit); 531 } 532 if (offset16 < 0 || result > limit) { 533 throw new StringIndexOutOfBoundsException(offset16); 534 } 535 if (shift32 > 0) { 536 if (shift32 + result > size) { 537 throw new StringIndexOutOfBoundsException(result); 538 } 539 count = shift32; 540 while (result < limit && count > 0) 541 { 542 ch = source[result]; 543 if (isLeadSurrogate(ch) && (result + 1 < limit) && 544 isTrailSurrogate(source[result + 1])) { 545 result++; 546 } 547 count--; 548 result++; 549 } 550 } else { 551 if (result + shift32 < start) { 552 throw new StringIndexOutOfBoundsException(result); 553 } 554 for (count = -shift32; count > 0; count--) { 555 result--; 556 if (result < start) { 557 break; 558 } 559 ch = source[result]; 560 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 561 result--; 562 } 563 } 564 } 565 if (count != 0) { 566 throw new StringIndexOutOfBoundsException(shift32); 567 } 568 result -= start; 569 return result; 570 } 571 572 // private data members ------------------------------------------------- 573 574 /** 575 * Shift value for lead surrogate to form a supplementary character. 576 */ 577 private static final int LEAD_SURROGATE_SHIFT_ = 10; 578 579 /** 580 * Mask to retrieve the significant value from a trail surrogate. 581 */ 582 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 583 584 /** 585 * Value that all lead surrogate starts with 586 */ 587 private static final int LEAD_SURROGATE_OFFSET_ = 588 LEAD_SURROGATE_MIN_VALUE - 589 (SUPPLEMENTARY_MIN_VALUE 590 >> LEAD_SURROGATE_SHIFT_); 591 592 // private methods ------------------------------------------------------ 593 594 /** 595 * <p>Converts argument code point and returns a String object representing 596 * the code point's value in UTF16 format. 597 * <p>This method does not check for the validity of the codepoint, the 598 * results are not guaranteed if a invalid codepoint is passed as 599 * argument. 600 * <p>The result is a string whose length is 1 for non-supplementary code 601 * points, 2 otherwise. 602 * @param ch code point 603 * @return string representation of the code point 604 */ 605 private static String toString(int ch) 606 { 607 if (ch < SUPPLEMENTARY_MIN_VALUE) { 608 return String.valueOf((char) ch); 609 } 610 611 StringBuilder result = new StringBuilder(); 612 result.append(getLeadSurrogate(ch)); 613 result.append(getTrailSurrogate(ch)); 614 return result.toString(); 615 } 616 }