1 /* 2 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /** 26 ******************************************************************************* 27 * Copyright (C) 1996-2014, International Business Machines Corporation and 28 * others. All Rights Reserved. 29 ******************************************************************************* 30 */ 31 32 package jdk.internal.icu.text; 33 34 import jdk.internal.icu.impl.UCharacterProperty; 35 36 /** 37 * <p>Standalone utility class providing UTF16 character conversions and 38 * indexing conversions. 39 * <p>Code that uses strings alone rarely need modification. 40 * By design, UTF-16 does not allow overlap, so searching for strings is a safe 41 * operation. Similarly, concatenation is always safe. Substringing is safe if 42 * the start and end are both on UTF-32 boundaries. In normal code, the values 43 * for start and end are on those boundaries, since they arose from operations 44 * like searching. If not, the nearest UTF-32 boundaries can be determined 45 * using <code>bounds()</code>. 46 * <strong>Examples:</strong> 47 * <p>The following examples illustrate use of some of these methods. 48 * <pre>{@code 49 * // iteration forwards: Original 50 * for (int i = 0; i < s.length(); ++i) { 51 * char ch = s.charAt(i); 52 * doSomethingWith(ch); 53 * } 54 * 55 * // iteration forwards: Changes for UTF-32 56 * int ch; 57 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { 58 * ch = UTF16.charAt(s, i); 59 * doSomethingWith(ch); 60 * } 61 * 62 * // iteration backwards: Original 63 * for (int i = s.length() - 1; i >= 0; --i) { 64 * char ch = s.charAt(i); 65 * doSomethingWith(ch); 66 * } 67 * 68 * // iteration backwards: Changes for UTF-32 69 * int ch; 70 * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) { 71 * ch = UTF16.charAt(s, i); 72 * doSomethingWith(ch); 73 * } 74 * }</pre> 75 * <strong>Notes:</strong> 76 * <ul> 77 * <li> 78 * <strong>Naming:</strong> For clarity, High and Low surrogates are called 79 * <code>Lead</code> and <code>Trail</code> in the API, which gives a better 80 * sense of their ordering in a string. <code>offset16</code> and 81 * <code>offset32</code> are used to distinguish offsets to UTF-16 82 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is 83 * used to contain UTF-32 characters, as opposed to <code>char16</code>, 84 * which is a UTF-16 code unit. 85 * </li> 86 * <li> 87 * <strong>Roundtripping Offsets:</strong> You can always roundtrip from a 88 * UTF-32 offset to a UTF-16 offset and back. Because of the difference in 89 * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and 90 * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 91 * </li> 92 * <li> 93 * <strong>Exceptions:</strong> The error checking will throw an exception 94 * if indices are out of bounds. Other than that, all methods will 95 * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 96 * values are present. <code>UCharacter.isLegal()</code> can be used to check 97 * for validity if desired. 98 * </li> 99 * <li> 100 * <strong>Unmatched Surrogates:</strong> If the string contains unmatched 101 * surrogates, then these are counted as one UTF-32 value. This matches 102 * their iteration behavior, which is vital. It also matches common display 103 * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). 104 * </li> 105 * <li> 106 * <strong>Optimization:</strong> The method implementations may need 107 * optimization if the compiler doesn't fold static final methods. Since 108 * surrogate pairs will form an exceeding small percentage of all the text 109 * in the world, the singleton case should always be optimized for. 110 * </li> 111 * </ul> 112 * @author Mark Davis, with help from Markus Scherer 113 * @stable ICU 2.1 114 */ 115 116 public final class UTF16 117 { 118 // public variables --------------------------------------------------- 119 120 /** 121 * The lowest Unicode code point value. 122 * @stable ICU 2.1 123 */ 124 public static final int CODEPOINT_MIN_VALUE = 0; 125 /** 126 * The highest Unicode code point value (scalar value) according to the 127 * Unicode Standard. 128 * @stable ICU 2.1 129 */ 130 public static final int CODEPOINT_MAX_VALUE = 0x10ffff; 131 /** 132 * The minimum value for Supplementary code points 133 * @stable ICU 2.1 134 */ 135 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; 136 /** 137 * Lead surrogate minimum value 138 * @stable ICU 2.1 139 */ 140 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; 141 /** 142 * Trail surrogate minimum value 143 * @stable ICU 2.1 144 */ 145 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; 146 /** 147 * Lead surrogate maximum value 148 * @stable ICU 2.1 149 */ 150 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; 151 /** 152 * Trail surrogate maximum value 153 * @stable ICU 2.1 154 */ 155 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; 156 /** 157 * Surrogate minimum value 158 * @stable ICU 2.1 159 */ 160 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; 161 /** 162 * Lead surrogate bitmask 163 */ 164 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; 165 /** 166 * Trail surrogate bitmask 167 */ 168 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; 169 /** 170 * Surrogate bitmask 171 */ 172 private static final int SURROGATE_BITMASK = 0xFFFFF800; 173 /** 174 * Lead surrogate bits 175 */ 176 private static final int LEAD_SURROGATE_BITS = 0xD800; 177 /** 178 * Trail surrogate bits 179 */ 180 private static final int TRAIL_SURROGATE_BITS = 0xDC00; 181 /** 182 * Surrogate bits 183 */ 184 private static final int SURROGATE_BITS = 0xD800; 185 186 // constructor -------------------------------------------------------- 187 188 // /CLOVER:OFF 189 /** 190 * Prevent instance from being created. 191 */ 192 private UTF16() { 193 } 194 195 // /CLOVER:ON 196 // public method ------------------------------------------------------ 197 198 /** 199 * Extract a single UTF-32 value from a string. 200 * Used when iterating forwards or backwards (with 201 * <code>UTF16.getCharCount()</code>, as well as random access. If a 202 * validity check is required, use 203 * <code><a href="../lang/UCharacter.html#isLegal(char)"> 204 * UCharacter.isLegal()</a></code> on the return value. 205 * If the char retrieved is part of a surrogate pair, its supplementary 206 * character will be returned. If a complete supplementary character is 207 * not found the incomplete character will be returned 208 * @param source array of UTF-16 chars 209 * @param offset16 UTF-16 offset to the start of the character. 210 * @return UTF-32 value for the UTF-32 value that contains the char at 211 * offset16. The boundaries of that codepoint are the same as in 212 * <code>bounds32()</code>. 213 * @exception IndexOutOfBoundsException thrown if offset16 is out of 214 * bounds. 215 * @stable ICU 2.1 216 */ 217 public static int charAt(String source, int offset16) { 218 char single = source.charAt(offset16); 219 if (single < LEAD_SURROGATE_MIN_VALUE) { 220 return single; 221 } 222 return _charAt(source, offset16, single); 223 } 224 225 private static int _charAt(String source, int offset16, char single) { 226 if (single > TRAIL_SURROGATE_MAX_VALUE) { 227 return single; 228 } 229 230 // Convert the UTF-16 surrogate pair if necessary. 231 // For simplicity in usage, and because the frequency of pairs is 232 // low, look both directions. 233 234 if (single <= LEAD_SURROGATE_MAX_VALUE) { 235 ++offset16; 236 if (source.length() != offset16) { 237 char trail = source.charAt(offset16); 238 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { 239 return UCharacterProperty.getRawSupplementary(single, trail); 240 } 241 } 242 } else { 243 --offset16; 244 if (offset16 >= 0) { 245 // single is a trail surrogate so 246 char lead = source.charAt(offset16); 247 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { 248 return UCharacterProperty.getRawSupplementary(lead, single); 249 } 250 } 251 } 252 return single; // return unmatched surrogate 253 } 254 255 /** 256 * Extract a single UTF-32 value from a string. 257 * Used when iterating forwards or backwards (with 258 * <code>UTF16.getCharCount()</code>, as well as random access. If a 259 * validity check is required, use 260 * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 261 * </a></code> on the return value. 262 * If the char retrieved is part of a surrogate pair, its supplementary 263 * character will be returned. If a complete supplementary character is 264 * not found the incomplete character will be returned 265 * @param source array of UTF-16 chars 266 * @param offset16 UTF-16 offset to the start of the character. 267 * @return UTF-32 value for the UTF-32 value that contains the char at 268 * offset16. The boundaries of that codepoint are the same as in 269 * <code>bounds32()</code>. 270 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds. 271 * @stable ICU 2.1 272 */ 273 public static int charAt(CharSequence source, int offset16) { 274 char single = source.charAt(offset16); 275 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { 276 return single; 277 } 278 return _charAt(source, offset16, single); 279 } 280 281 private static int _charAt(CharSequence source, int offset16, char single) { 282 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { 283 return single; 284 } 285 286 // Convert the UTF-16 surrogate pair if necessary. 287 // For simplicity in usage, and because the frequency of pairs is 288 // low, look both directions. 289 290 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 291 ++offset16; 292 if (source.length() != offset16) { 293 char trail = source.charAt(offset16); 294 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE 295 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { 296 return UCharacterProperty.getRawSupplementary(single, trail); 297 } 298 } 299 } else { 300 --offset16; 301 if (offset16 >= 0) { 302 // single is a trail surrogate so 303 char lead = source.charAt(offset16); 304 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE 305 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 306 return UCharacterProperty.getRawSupplementary(lead, single); 307 } 308 } 309 } 310 return single; // return unmatched surrogate 311 } 312 313 /** 314 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards 315 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is 316 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() 317 * </a></code> 318 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary 319 * character will be returned. If a complete supplementary character is not found the incomplete 320 * character will be returned 321 * 322 * @param source Array of UTF-16 chars 323 * @param start Offset to substring in the source array for analyzing 324 * @param limit Offset to substring in the source array for analyzing 325 * @param offset16 UTF-16 offset relative to start 326 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries 327 * of that codepoint are the same as in <code>bounds32()</code>. 328 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. 329 * @stable ICU 2.1 330 */ 331 public static int charAt(char source[], int start, int limit, int offset16) { 332 offset16 += start; 333 if (offset16 < start || offset16 >= limit) { 334 throw new ArrayIndexOutOfBoundsException(offset16); 335 } 336 337 char single = source[offset16]; 338 if (!isSurrogate(single)) { 339 return single; 340 } 341 342 // Convert the UTF-16 surrogate pair if necessary. 343 // For simplicity in usage, and because the frequency of pairs is 344 // low, look both directions. 345 if (single <= LEAD_SURROGATE_MAX_VALUE) { 346 offset16++; 347 if (offset16 >= limit) { 348 return single; 349 } 350 char trail = source[offset16]; 351 if (isTrailSurrogate(trail)) { 352 return UCharacterProperty.getRawSupplementary(single, trail); 353 } 354 } 355 else { // isTrailSurrogate(single), so 356 if (offset16 == start) { 357 return single; 358 } 359 offset16--; 360 char lead = source[offset16]; 361 if (isLeadSurrogate(lead)) 362 return UCharacterProperty.getRawSupplementary(lead, single); 363 } 364 return single; // return unmatched surrogate 365 } 366 367 /** 368 * Determines how many chars this char32 requires. 369 * If a validity check is required, use <code> 370 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on 371 * char32 before calling. 372 * @param char32 the input codepoint. 373 * @return 2 if is in supplementary space, otherwise 1. 374 * @stable ICU 2.1 375 */ 376 public static int getCharCount(int char32) 377 { 378 if (char32 < SUPPLEMENTARY_MIN_VALUE) { 379 return 1; 380 } 381 return 2; 382 } 383 384 /** 385 * Determines whether the code value is a surrogate. 386 * @param char16 the input character. 387 * @return true if the input character is a surrogate. 388 * @stable ICU 2.1 389 */ 390 public static boolean isSurrogate(char char16) 391 { 392 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; 393 } 394 395 /** 396 * Determines whether the character is a trail surrogate. 397 * @param char16 the input character. 398 * @return true if the input character is a trail surrogate. 399 * @stable ICU 2.1 400 */ 401 public static boolean isTrailSurrogate(char char16) 402 { 403 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; 404 } 405 406 /** 407 * Determines whether the character is a lead surrogate. 408 * @param char16 the input character. 409 * @return true if the input character is a lead surrogate 410 * @stable ICU 2.1 411 */ 412 public static boolean isLeadSurrogate(char char16) 413 { 414 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; 415 } 416 417 /** 418 * Returns the lead surrogate. 419 * If a validity check is required, use 420 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 421 * on char32 before calling. 422 * @param char32 the input character. 423 * @return lead surrogate if the getCharCount(ch) is 2; <br> 424 * and 0 otherwise (note: 0 is not a valid lead surrogate). 425 * @stable ICU 2.1 426 */ 427 public static char getLeadSurrogate(int char32) 428 { 429 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 430 return (char)(LEAD_SURROGATE_OFFSET_ + 431 (char32 >> LEAD_SURROGATE_SHIFT_)); 432 } 433 434 return 0; 435 } 436 437 /** 438 * Returns the trail surrogate. 439 * If a validity check is required, use 440 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 441 * on char32 before calling. 442 * @param char32 the input character. 443 * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise 444 * the character itself 445 * @stable ICU 2.1 446 */ 447 public static char getTrailSurrogate(int char32) 448 { 449 if (char32 >= SUPPLEMENTARY_MIN_VALUE) { 450 return (char)(TRAIL_SURROGATE_MIN_VALUE + 451 (char32 & TRAIL_SURROGATE_MASK_)); 452 } 453 454 return (char) char32; 455 } 456 457 /** 458 * Convenience method corresponding to String.valueOf(char). Returns a one 459 * or two char string containing the UTF-32 value in UTF16 format. If a 460 * validity check is required, use 461 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 462 * on char32 before calling. 463 * @param char32 the input character. 464 * @return string value of char32 in UTF16 format 465 * @exception IllegalArgumentException thrown if char32 is a invalid 466 * codepoint. 467 * @stable ICU 2.1 468 */ 469 public static String valueOf(int char32) 470 { 471 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 472 throw new IllegalArgumentException("Illegal codepoint"); 473 } 474 return toString(char32); 475 } 476 477 /** 478 * Append a single UTF-32 value to the end of a StringBuffer. 479 * If a validity check is required, use 480 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> 481 * on char32 before calling. 482 * @param target the buffer to append to 483 * @param char32 value to append. 484 * @return the updated StringBuffer 485 * @exception IllegalArgumentException thrown when char32 does not lie 486 * within the range of the Unicode codepoints 487 * @stable ICU 2.1 488 */ 489 public static StringBuffer append(StringBuffer target, int char32) 490 { 491 // Check for irregular values 492 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { 493 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); 494 } 495 496 // Write the UTF-16 values 497 if (char32 >= SUPPLEMENTARY_MIN_VALUE) 498 { 499 target.append(getLeadSurrogate(char32)); 500 target.append(getTrailSurrogate(char32)); 501 } 502 else { 503 target.append((char) char32); 504 } 505 return target; 506 } 507 508 /** 509 * Shifts offset16 by the argument number of codepoints within a subarray. 510 * @param source char array 511 * @param start position of the subarray to be performed on 512 * @param limit position of the subarray to be performed on 513 * @param offset16 UTF16 position to shift relative to start 514 * @param shift32 number of codepoints to shift 515 * @return new shifted offset16 relative to start 516 * @exception IndexOutOfBoundsException if the new offset16 is out of 517 * bounds with respect to the subarray or the subarray bounds 518 * are out of range. 519 * @stable ICU 2.1 520 */ 521 public static int moveCodePointOffset(char source[], int start, int limit, 522 int offset16, int shift32) 523 { 524 int size = source.length; 525 int count; 526 char ch; 527 int result = offset16 + start; 528 if (start < 0 || limit < start) { 529 throw new StringIndexOutOfBoundsException(start); 530 } 531 if (limit > size) { 532 throw new StringIndexOutOfBoundsException(limit); 533 } 534 if (offset16 < 0 || result > limit) { 535 throw new StringIndexOutOfBoundsException(offset16); 536 } 537 if (shift32 > 0) { 538 if (shift32 + result > size) { 539 throw new StringIndexOutOfBoundsException(result); 540 } 541 count = shift32; 542 while (result < limit && count > 0) 543 { 544 ch = source[result]; 545 if (isLeadSurrogate(ch) && (result + 1 < limit) && 546 isTrailSurrogate(source[result + 1])) { 547 result++; 548 } 549 count--; 550 result++; 551 } 552 } else { 553 if (result + shift32 < start) { 554 throw new StringIndexOutOfBoundsException(result); 555 } 556 for (count = -shift32; count > 0; count--) { 557 result--; 558 if (result < start) { 559 break; 560 } 561 ch = source[result]; 562 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { 563 result--; 564 } 565 } 566 } 567 if (count != 0) { 568 throw new StringIndexOutOfBoundsException(shift32); 569 } 570 result -= start; 571 return result; 572 } 573 574 // private data members ------------------------------------------------- 575 576 /** 577 * Shift value for lead surrogate to form a supplementary character. 578 */ 579 private static final int LEAD_SURROGATE_SHIFT_ = 10; 580 581 /** 582 * Mask to retrieve the significant value from a trail surrogate. 583 */ 584 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; 585 586 /** 587 * Value that all lead surrogate starts with 588 */ 589 private static final int LEAD_SURROGATE_OFFSET_ = 590 LEAD_SURROGATE_MIN_VALUE - 591 (SUPPLEMENTARY_MIN_VALUE 592 >> LEAD_SURROGATE_SHIFT_); 593 594 // private methods ------------------------------------------------------ 595 596 /** 597 * <p>Converts argument code point and returns a String object representing 598 * the code point's value in UTF16 format. 599 * <p>This method does not check for the validity of the codepoint, the 600 * results are not guaranteed if a invalid codepoint is passed as 601 * argument. 602 * <p>The result is a string whose length is 1 for non-supplementary code 603 * points, 2 otherwise. 604 * @param ch code point 605 * @return string representation of the code point 606 */ 607 private static String toString(int ch) 608 { 609 if (ch < SUPPLEMENTARY_MIN_VALUE) { 610 return String.valueOf((char) ch); 611 } 612 613 StringBuilder result = new StringBuilder(); 614 result.append(getLeadSurrogate(ch)); 615 result.append(getTrailSurrogate(ch)); 616 return result.toString(); 617 } 618 }