1 /*
   2  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /**
  26  *******************************************************************************
  27  * Copyright (C) 1996-2014, International Business Machines Corporation and
  28  * others. All Rights Reserved.
  29  *******************************************************************************
  30  */
  31 
  32 package jdk.internal.icu.text;
  33 
  34 import jdk.internal.icu.impl.UCharacterProperty;
  35 
  36 /**
  37  * <p>Standalone utility class providing UTF16 character conversions and
  38  * indexing conversions.
 * <p>Code that uses strings alone rarely needs modification.
  40  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  41  * operation. Similarly, concatenation is always safe. Substringing is safe if
  42  * the start and end are both on UTF-32 boundaries. In normal code, the values
  43  * for start and end are on those boundaries, since they arose from operations
  44  * like searching. If not, the nearest UTF-32 boundaries can be determined
  45  * using <code>bounds()</code>.
  46  * <strong>Examples:</strong>
  47  * <p>The following examples illustrate use of some of these methods.
  48  * <pre>{@code
  49  * // iteration forwards: Original
  50  * for (int i = 0; i < s.length(); ++i) {
  51  *     char ch = s.charAt(i);
  52  *     doSomethingWith(ch);
  53  * }
  54  *
  55  * // iteration forwards: Changes for UTF-32
  56  * int ch;
  57  * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
  58  *     ch = UTF16.charAt(s, i);
  59  *     doSomethingWith(ch);
  60  * }
  61  *
  62  * // iteration backwards: Original
  63  * for (int i = s.length() - 1; i >= 0; --i) {
  64  *     char ch = s.charAt(i);
  65  *     doSomethingWith(ch);
  66  * }
  67  *
  68  * // iteration backwards: Changes for UTF-32
  69  * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
  71  *     ch = UTF16.charAt(s, i);
  72  *     doSomethingWith(ch);
  73  * }
  74  * }</pre>
  75  * <strong>Notes:</strong>
  76  * <ul>
  77  *   <li>
  78  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  79  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  80  *   sense of their ordering in a string. <code>offset16</code> and
  81  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  82  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  83  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  84  *   which is a UTF-16 code unit.
  85  *   </li>
  86  *   <li>
  87  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  88  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  89  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  90  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  91  *   </li>
  92  *   <li>
  93  *   <strong>Exceptions:</strong> The error checking will throw an exception
  94  *   if indices are out of bounds. Other than that, all methods will
  95  *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
  96  *   values are present. <code>UCharacter.isLegal()</code> can be used to check
  97  *   for validity if desired.
  98  *   </li>
  99  *   <li>
 100  *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 101  *   surrogates, then these are counted as one UTF-32 value. This matches
 102  *   their iteration behavior, which is vital. It also matches common display
 103  *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 104  *   </li>
 105  *   <li>
 106  *   <strong>Optimization:</strong> The method implementations may need
 107  *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the text
 109  *   in the world, the singleton case should always be optimized for.
 110  *   </li>
 111  * </ul>
 112  * @author Mark Davis, with help from Markus Scherer
 113  * @stable ICU 2.1
 114  */
 115 
 116 public final class UTF16
 117 {
 118     // public variables ---------------------------------------------------
 119 
 120     /**
 121      * The lowest Unicode code point value.
 122      * @stable ICU 2.1
 123      */
 124     public static final int CODEPOINT_MIN_VALUE = 0;
 125     /**
 126      * The highest Unicode code point value (scalar value) according to the
 127      * Unicode Standard.
 128      * @stable ICU 2.1
 129      */
 130     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
 131     /**
 132      * The minimum value for Supplementary code points
 133      * @stable ICU 2.1
 134      */
 135     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
 136     /**
 137      * Lead surrogate minimum value
 138      * @stable ICU 2.1
 139      */
 140     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
 141     /**
 142      * Trail surrogate minimum value
 143      * @stable ICU 2.1
 144      */
 145     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 146     /**
 147      * Lead surrogate maximum value
 148      * @stable ICU 2.1
 149      */
 150     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 151     /**
 152      * Trail surrogate maximum value
 153      * @stable ICU 2.1
 154      */
 155     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 156     /**
 157      * Surrogate minimum value
 158      * @stable ICU 2.1
 159      */
 160     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
 161     /**
 162      * Lead surrogate bitmask
 163      */
 164     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
 165     /**
 166      * Trail surrogate bitmask
 167      */
 168     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
 169     /**
 170      * Surrogate bitmask
 171      */
 172     private static final int SURROGATE_BITMASK = 0xFFFFF800;
 173     /**
 174      * Lead surrogate bits
 175      */
 176     private static final int LEAD_SURROGATE_BITS = 0xD800;
 177     /**
 178      * Trail surrogate bits
 179      */
 180     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
 181     /**
 182      * Surrogate bits
 183      */
 184     private static final int SURROGATE_BITS = 0xD800;
 185 
 186     // constructor --------------------------------------------------------
 187 
 188     // /CLOVER:OFF
 189     /**
 190      * Prevent instance from being created.
 191      */
 192     private UTF16() {
 193     }
 194 
 195     // /CLOVER:ON
 196     // public method ------------------------------------------------------
 197 
 198     /**
 199      * Extract a single UTF-32 value from a string.
 200      * Used when iterating forwards or backwards (with
 201      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 202      * validity check is required, use
 203      * <code><a href="../lang/UCharacter.html#isLegal(char)">
 204      * UCharacter.isLegal()</a></code> on the return value.
 205      * If the char retrieved is part of a surrogate pair, its supplementary
 206      * character will be returned. If a complete supplementary character is
 207      * not found the incomplete character will be returned
 208      * @param source array of UTF-16 chars
 209      * @param offset16 UTF-16 offset to the start of the character.
 210      * @return UTF-32 value for the UTF-32 value that contains the char at
 211      *         offset16. The boundaries of that codepoint are the same as in
 212      *         <code>bounds32()</code>.
 213      * @exception IndexOutOfBoundsException thrown if offset16 is out of
 214      *            bounds.
 215      * @stable ICU 2.1
 216      */
 217     public static int charAt(String source, int offset16) {
 218         char single = source.charAt(offset16);
 219         if (single < LEAD_SURROGATE_MIN_VALUE) {
 220             return single;
 221         }
 222         return _charAt(source, offset16, single);
 223     }
 224 
 225     private static int _charAt(String source, int offset16, char single) {
 226         if (single > TRAIL_SURROGATE_MAX_VALUE) {
 227             return single;
 228         }
 229 
 230         // Convert the UTF-16 surrogate pair if necessary.
 231         // For simplicity in usage, and because the frequency of pairs is
 232         // low, look both directions.
 233 
 234         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 235             ++offset16;
 236             if (source.length() != offset16) {
 237                 char trail = source.charAt(offset16);
 238                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 239                     return UCharacterProperty.getRawSupplementary(single, trail);
 240                 }
 241             }
 242         } else {
 243             --offset16;
 244             if (offset16 >= 0) {
 245                 // single is a trail surrogate so
 246                 char lead = source.charAt(offset16);
 247                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 248                     return UCharacterProperty.getRawSupplementary(lead, single);
 249                 }
 250             }
 251         }
 252         return single; // return unmatched surrogate
 253     }
 254 
 255     /**
 256      * Extract a single UTF-32 value from a string.
 257      * Used when iterating forwards or backwards (with
 258      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 259      * validity check is required, use
 260      * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 261      * </a></code> on the return value.
 262      * If the char retrieved is part of a surrogate pair, its supplementary
 263      * character will be returned. If a complete supplementary character is
 264      * not found the incomplete character will be returned
 265      * @param source array of UTF-16 chars
 266      * @param offset16 UTF-16 offset to the start of the character.
 267      * @return UTF-32 value for the UTF-32 value that contains the char at
 268      *         offset16. The boundaries of that codepoint are the same as in
 269      *         <code>bounds32()</code>.
 270      * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
 271      * @stable ICU 2.1
 272      */
 273     public static int charAt(CharSequence source, int offset16) {
 274         char single = source.charAt(offset16);
 275         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 276             return single;
 277         }
 278         return _charAt(source, offset16, single);
 279     }
 280 
 281     private static int _charAt(CharSequence source, int offset16, char single) {
 282         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 283             return single;
 284         }
 285 
 286         // Convert the UTF-16 surrogate pair if necessary.
 287         // For simplicity in usage, and because the frequency of pairs is
 288         // low, look both directions.
 289 
 290         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 291             ++offset16;
 292             if (source.length() != offset16) {
 293                 char trail = source.charAt(offset16);
 294                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
 295                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 296                     return UCharacterProperty.getRawSupplementary(single, trail);
 297                 }
 298             }
 299         } else {
 300             --offset16;
 301             if (offset16 >= 0) {
 302                 // single is a trail surrogate so
 303                 char lead = source.charAt(offset16);
 304                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
 305                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 306                     return UCharacterProperty.getRawSupplementary(lead, single);
 307                 }
 308             }
 309         }
 310         return single; // return unmatched surrogate
 311     }
 312 
 313     /**
 314      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
 315      * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 316      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 317      * </a></code>
 318      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 319      * character will be returned. If a complete supplementary character is not found the incomplete
 320      * character will be returned
 321      *
 322      * @param source Array of UTF-16 chars
 323      * @param start Offset to substring in the source array for analyzing
 324      * @param limit Offset to substring in the source array for analyzing
 325      * @param offset16 UTF-16 offset relative to start
 326      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 327      *         of that codepoint are the same as in <code>bounds32()</code>.
 328      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
 329      * @stable ICU 2.1
 330      */
 331     public static int charAt(char source[], int start, int limit, int offset16) {
 332         offset16 += start;
 333         if (offset16 < start || offset16 >= limit) {
 334             throw new ArrayIndexOutOfBoundsException(offset16);
 335         }
 336 
 337         char single = source[offset16];
 338         if (!isSurrogate(single)) {
 339             return single;
 340         }
 341 
 342         // Convert the UTF-16 surrogate pair if necessary.
 343         // For simplicity in usage, and because the frequency of pairs is
 344         // low, look both directions.
 345         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 346             offset16++;
 347             if (offset16 >= limit) {
 348                 return single;
 349             }
 350             char trail = source[offset16];
 351             if (isTrailSurrogate(trail)) {
 352                 return UCharacterProperty.getRawSupplementary(single, trail);
 353             }
 354         }
 355         else { // isTrailSurrogate(single), so
 356             if (offset16 == start) {
 357                 return single;
 358             }
 359             offset16--;
 360             char lead = source[offset16];
 361             if (isLeadSurrogate(lead))
 362                 return UCharacterProperty.getRawSupplementary(lead, single);
 363         }
 364         return single; // return unmatched surrogate
 365     }
 366 
 367     /**
 368      * Determines how many chars this char32 requires.
 369      * If a validity check is required, use <code>
 370      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
 371      * char32 before calling.
 372      * @param char32 the input codepoint.
 373      * @return 2 if is in supplementary space, otherwise 1.
 374      * @stable ICU 2.1
 375      */
 376     public static int getCharCount(int char32)
 377     {
 378         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 379             return 1;
 380         }
 381         return 2;
 382     }
 383 
 384     /**
 385      * Determines whether the code value is a surrogate.
 386      * @param char16 the input character.
 387      * @return true if the input character is a surrogate.
 388      * @stable ICU 2.1
 389      */
 390     public static boolean isSurrogate(char char16)
 391     {
 392         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
 393     }
 394 
 395     /**
 396      * Determines whether the character is a trail surrogate.
 397      * @param char16 the input character.
 398      * @return true if the input character is a trail surrogate.
 399      * @stable ICU 2.1
 400      */
 401     public static boolean isTrailSurrogate(char char16)
 402     {
 403         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
 404     }
 405 
 406     /**
 407      * Determines whether the character is a lead surrogate.
 408      * @param char16 the input character.
 409      * @return true if the input character is a lead surrogate
 410      * @stable ICU 2.1
 411      */
 412     public static boolean isLeadSurrogate(char char16)
 413     {
 414         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
 415     }
 416 
 417     /**
 418      * Returns the lead surrogate.
 419      * If a validity check is required, use
 420      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 421      * on char32 before calling.
 422      * @param char32 the input character.
 423      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 424      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 425      * @stable ICU 2.1
 426      */
 427     public static char getLeadSurrogate(int char32)
 428     {
 429         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 430             return (char)(LEAD_SURROGATE_OFFSET_ +
 431                           (char32 >> LEAD_SURROGATE_SHIFT_));
 432         }
 433 
 434         return 0;
 435     }
 436 
 437     /**
 438      * Returns the trail surrogate.
 439      * If a validity check is required, use
 440      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 441      * on char32 before calling.
 442      * @param char32 the input character.
 443      * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
 444      *         the character itself
 445      * @stable ICU 2.1
 446      */
 447     public static char getTrailSurrogate(int char32)
 448     {
 449         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 450             return (char)(TRAIL_SURROGATE_MIN_VALUE +
 451                           (char32 & TRAIL_SURROGATE_MASK_));
 452         }
 453 
 454         return (char) char32;
 455     }
 456 
 457     /**
 458      * Convenience method corresponding to String.valueOf(char). Returns a one
 459      * or two char string containing the UTF-32 value in UTF16 format. If a
 460      * validity check is required, use
 461      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 462      * on char32 before calling.
 463      * @param char32 the input character.
 464      * @return string value of char32 in UTF16 format
 465      * @exception IllegalArgumentException thrown if char32 is a invalid
 466      *            codepoint.
 467      * @stable ICU 2.1
 468      */
 469     public static String valueOf(int char32)
 470     {
 471         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 472             throw new IllegalArgumentException("Illegal codepoint");
 473         }
 474         return toString(char32);
 475     }
 476 
 477     /**
 478      * Append a single UTF-32 value to the end of a StringBuffer.
 479      * If a validity check is required, use
 480      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 481      * on char32 before calling.
 482      * @param target the buffer to append to
 483      * @param char32 value to append.
 484      * @return the updated StringBuffer
 485      * @exception IllegalArgumentException thrown when char32 does not lie
 486      *            within the range of the Unicode codepoints
 487      * @stable ICU 2.1
 488      */
 489     public static StringBuffer append(StringBuffer target, int char32)
 490     {
 491         // Check for irregular values
 492         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 493             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
 494         }
 495 
 496         // Write the UTF-16 values
 497         if (char32 >= SUPPLEMENTARY_MIN_VALUE)
 498             {
 499             target.append(getLeadSurrogate(char32));
 500             target.append(getTrailSurrogate(char32));
 501         }
 502         else {
 503             target.append((char) char32);
 504         }
 505         return target;
 506     }
 507 
 508     /**
 509      * Shifts offset16 by the argument number of codepoints within a subarray.
 510      * @param source char array
 511      * @param start position of the subarray to be performed on
 512      * @param limit position of the subarray to be performed on
 513      * @param offset16 UTF16 position to shift relative to start
 514      * @param shift32 number of codepoints to shift
 515      * @return new shifted offset16 relative to start
 516      * @exception IndexOutOfBoundsException if the new offset16 is out of
 517      *            bounds with respect to the subarray or the subarray bounds
 518      *            are out of range.
 519      * @stable ICU 2.1
 520      */
 521     public static int moveCodePointOffset(char source[], int start, int limit,
 522                                           int offset16, int shift32)
 523     {
 524         int size = source.length;
 525         int count;
 526         char ch;
 527         int result = offset16 + start;
 528         if (start < 0 || limit < start) {
 529             throw new StringIndexOutOfBoundsException(start);
 530         }
 531         if (limit > size) {
 532             throw new StringIndexOutOfBoundsException(limit);
 533         }
 534         if (offset16 < 0 || result > limit) {
 535             throw new StringIndexOutOfBoundsException(offset16);
 536         }
 537         if (shift32 > 0) {
 538             if (shift32 + result > size) {
 539                 throw new StringIndexOutOfBoundsException(result);
 540             }
 541             count = shift32;
 542             while (result < limit && count > 0)
 543             {
 544                 ch = source[result];
 545                 if (isLeadSurrogate(ch) && (result + 1 < limit) &&
 546                     isTrailSurrogate(source[result + 1])) {
 547                     result++;
 548                 }
 549                 count--;
 550                 result++;
 551             }
 552         } else {
 553             if (result + shift32 < start) {
 554                 throw new StringIndexOutOfBoundsException(result);
 555             }
 556             for (count = -shift32; count > 0; count--) {
 557                 result--;
 558                 if (result < start) {
 559                     break;
 560                 }
 561                 ch = source[result];
 562                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
 563                     result--;
 564                 }
 565             }
 566         }
 567         if (count != 0) {
 568             throw new StringIndexOutOfBoundsException(shift32);
 569         }
 570         result -= start;
 571         return result;
 572     }
 573 
 574     // private data members -------------------------------------------------
 575 
 576     /**
 577      * Shift value for lead surrogate to form a supplementary character.
 578      */
 579     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 580 
 581     /**
 582      * Mask to retrieve the significant value from a trail surrogate.
 583      */
 584     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
 585 
 586     /**
 587      * Value that all lead surrogate starts with
 588      */
 589     private static final int LEAD_SURROGATE_OFFSET_ =
 590         LEAD_SURROGATE_MIN_VALUE -
 591         (SUPPLEMENTARY_MIN_VALUE
 592         >> LEAD_SURROGATE_SHIFT_);
 593 
 594     // private methods ------------------------------------------------------
 595 
 596     /**
 597      * <p>Converts argument code point and returns a String object representing
 598      * the code point's value in UTF16 format.
 599      * <p>This method does not check for the validity of the codepoint, the
 600      * results are not guaranteed if a invalid codepoint is passed as
 601      * argument.
 602      * <p>The result is a string whose length is 1 for non-supplementary code
 603      * points, 2 otherwise.
 604      * @param ch code point
 605      * @return string representation of the code point
 606      */
 607     private static String toString(int ch)
 608     {
 609         if (ch < SUPPLEMENTARY_MIN_VALUE) {
 610             return String.valueOf((char) ch);
 611         }
 612 
 613         StringBuilder result = new StringBuilder();
 614         result.append(getLeadSurrogate(ch));
 615         result.append(getTrailSurrogate(ch));
 616         return result.toString();
 617     }
 618 }