1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /**
  26  *******************************************************************************
  27  * Copyright (C) 1996-2014, International Business Machines Corporation and
  28  * others. All Rights Reserved.
  29  *******************************************************************************
  30  */
  31 
  32 package sun.text.normalizer;
  33 
  34 /**
  35  * <p>Standalone utility class providing UTF16 character conversions and
  36  * indexing conversions.
 * <p>Code that uses strings alone rarely needs modification.
  38  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  39  * operation. Similarly, concatenation is always safe. Substringing is safe if
  40  * the start and end are both on UTF-32 boundaries. In normal code, the values
  41  * for start and end are on those boundaries, since they arose from operations
  42  * like searching. If not, the nearest UTF-32 boundaries can be determined
  43  * using <code>bounds()</code>.
  44  * <strong>Examples:</strong>
  45  * <p>The following examples illustrate use of some of these methods.
  46  * <pre>{@code
  47  * // iteration forwards: Original
  48  * for (int i = 0; i < s.length(); ++i) {
  49  *     char ch = s.charAt(i);
  50  *     doSomethingWith(ch);
  51  * }
  52  *
  53  * // iteration forwards: Changes for UTF-32
  54  * int ch;
  55  * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
  56  *     ch = UTF16.charAt(s, i);
  57  *     doSomethingWith(ch);
  58  * }
  59  *
  60  * // iteration backwards: Original
  61  * for (int i = s.length() - 1; i >= 0; --i) {
  62  *     char ch = s.charAt(i);
  63  *     doSomethingWith(ch);
  64  * }
  65  *
  66  * // iteration backwards: Changes for UTF-32
  67  * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
  69  *     ch = UTF16.charAt(s, i);
  70  *     doSomethingWith(ch);
  71  * }
  72  * }</pre>
  73  * <strong>Notes:</strong>
  74  * <ul>
  75  *   <li>
  76  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  77  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  78  *   sense of their ordering in a string. <code>offset16</code> and
  79  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  80  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  81  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  82  *   which is a UTF-16 code unit.
  83  *   </li>
  84  *   <li>
  85  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  86  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  87  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  88  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  89  *   </li>
  90  *   <li>
  91  *   <strong>Exceptions:</strong> The error checking will throw an exception
  92  *   if indices are out of bounds. Other than that, all methods will
  93  *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
  94  *   values are present. <code>UCharacter.isLegal()</code> can be used to check
  95  *   for validity if desired.
  96  *   </li>
  97  *   <li>
  98  *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
  99  *   surrogates, then these are counted as one UTF-32 value. This matches
 100  *   their iteration behavior, which is vital. It also matches common display
 101  *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 102  *   </li>
 103  *   <li>
 104  *   <strong>Optimization:</strong> The method implementations may need
 105  *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the text
 107  *   in the world, the singleton case should always be optimized for.
 108  *   </li>
 109  * </ul>
 110  * @author Mark Davis, with help from Markus Scherer
 111  * @stable ICU 2.1
 112  */
 113 
 114 public final class UTF16
 115 {
 116     // public variables ---------------------------------------------------
 117 
 118     /**
 119      * The lowest Unicode code point value.
 120      * @stable ICU 2.1
 121      */
 122     public static final int CODEPOINT_MIN_VALUE = 0;
 123     /**
 124      * The highest Unicode code point value (scalar value) according to the
 125      * Unicode Standard.
 126      * @stable ICU 2.1
 127      */
 128     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
 129     /**
 130      * The minimum value for Supplementary code points
 131      * @stable ICU 2.1
 132      */
 133     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
 134     /**
 135      * Lead surrogate minimum value
 136      * @stable ICU 2.1
 137      */
 138     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
 139     /**
 140      * Trail surrogate minimum value
 141      * @stable ICU 2.1
 142      */
 143     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 144     /**
 145      * Lead surrogate maximum value
 146      * @stable ICU 2.1
 147      */
 148     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 149     /**
 150      * Trail surrogate maximum value
 151      * @stable ICU 2.1
 152      */
 153     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 154     /**
 155      * Surrogate minimum value
 156      * @stable ICU 2.1
 157      */
 158     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
 159     /**
 160      * Lead surrogate bitmask
 161      */
 162     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
 163     /**
 164      * Trail surrogate bitmask
 165      */
 166     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
 167     /**
 168      * Surrogate bitmask
 169      */
 170     private static final int SURROGATE_BITMASK = 0xFFFFF800;
 171     /**
 172      * Lead surrogate bits
 173      */
 174     private static final int LEAD_SURROGATE_BITS = 0xD800;
 175     /**
 176      * Trail surrogate bits
 177      */
 178     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
 179     /**
 180      * Surrogate bits
 181      */
 182     private static final int SURROGATE_BITS = 0xD800;
 183 
 184     // constructor --------------------------------------------------------
 185 
 186     // /CLOVER:OFF
 187     /**
 188      * Prevent instance from being created.
 189      */
 190     private UTF16() {
 191     }
 192 
 193     // /CLOVER:ON
 194     // public method ------------------------------------------------------
 195 
 196     /**
 197      * Extract a single UTF-32 value from a string.
 198      * Used when iterating forwards or backwards (with
 199      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 200      * validity check is required, use
 201      * <code><a href="../lang/UCharacter.html#isLegal(char)">
 202      * UCharacter.isLegal()</a></code> on the return value.
 203      * If the char retrieved is part of a surrogate pair, its supplementary
 204      * character will be returned. If a complete supplementary character is
 205      * not found the incomplete character will be returned
 206      * @param source array of UTF-16 chars
 207      * @param offset16 UTF-16 offset to the start of the character.
 208      * @return UTF-32 value for the UTF-32 value that contains the char at
 209      *         offset16. The boundaries of that codepoint are the same as in
 210      *         <code>bounds32()</code>.
 211      * @exception IndexOutOfBoundsException thrown if offset16 is out of
 212      *            bounds.
 213      * @stable ICU 2.1
 214      */
 215     public static int charAt(String source, int offset16) {
 216         char single = source.charAt(offset16);
 217         if (single < LEAD_SURROGATE_MIN_VALUE) {
 218             return single;
 219         }
 220         return _charAt(source, offset16, single);
 221     }
 222 
 223     private static int _charAt(String source, int offset16, char single) {
 224         if (single > TRAIL_SURROGATE_MAX_VALUE) {
 225             return single;
 226         }
 227 
 228         // Convert the UTF-16 surrogate pair if necessary.
 229         // For simplicity in usage, and because the frequency of pairs is
 230         // low, look both directions.
 231 
 232         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 233             ++offset16;
 234             if (source.length() != offset16) {
 235                 char trail = source.charAt(offset16);
 236                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 237                     return UCharacterProperty.getRawSupplementary(single, trail);
 238                 }
 239             }
 240         } else {
 241             --offset16;
 242             if (offset16 >= 0) {
 243                 // single is a trail surrogate so
 244                 char lead = source.charAt(offset16);
 245                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 246                     return UCharacterProperty.getRawSupplementary(lead, single);
 247                 }
 248             }
 249         }
 250         return single; // return unmatched surrogate
 251     }
 252 
 253     /**
 254      * Extract a single UTF-32 value from a string.
 255      * Used when iterating forwards or backwards (with
 256      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 257      * validity check is required, use
 258      * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 259      * </a></code> on the return value.
 260      * If the char retrieved is part of a surrogate pair, its supplementary
 261      * character will be returned. If a complete supplementary character is
 262      * not found the incomplete character will be returned
 263      * @param source array of UTF-16 chars
 264      * @param offset16 UTF-16 offset to the start of the character.
 265      * @return UTF-32 value for the UTF-32 value that contains the char at
 266      *         offset16. The boundaries of that codepoint are the same as in
 267      *         <code>bounds32()</code>.
 268      * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
 269      * @stable ICU 2.1
 270      */
 271     public static int charAt(CharSequence source, int offset16) {
 272         char single = source.charAt(offset16);
 273         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 274             return single;
 275         }
 276         return _charAt(source, offset16, single);
 277     }
 278 
 279     private static int _charAt(CharSequence source, int offset16, char single) {
 280         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 281             return single;
 282         }
 283 
 284         // Convert the UTF-16 surrogate pair if necessary.
 285         // For simplicity in usage, and because the frequency of pairs is
 286         // low, look both directions.
 287 
 288         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 289             ++offset16;
 290             if (source.length() != offset16) {
 291                 char trail = source.charAt(offset16);
 292                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
 293                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 294                     return UCharacterProperty.getRawSupplementary(single, trail);
 295                 }
 296             }
 297         } else {
 298             --offset16;
 299             if (offset16 >= 0) {
 300                 // single is a trail surrogate so
 301                 char lead = source.charAt(offset16);
 302                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
 303                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 304                     return UCharacterProperty.getRawSupplementary(lead, single);
 305                 }
 306             }
 307         }
 308         return single; // return unmatched surrogate
 309     }
 310 
 311     /**
 312      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
 313      * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 314      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 315      * </a></code>
 316      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 317      * character will be returned. If a complete supplementary character is not found the incomplete
 318      * character will be returned
 319      *
 320      * @param source Array of UTF-16 chars
 321      * @param start Offset to substring in the source array for analyzing
 322      * @param limit Offset to substring in the source array for analyzing
 323      * @param offset16 UTF-16 offset relative to start
 324      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 325      *         of that codepoint are the same as in <code>bounds32()</code>.
 326      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
 327      * @stable ICU 2.1
 328      */
 329     public static int charAt(char source[], int start, int limit, int offset16) {
 330         offset16 += start;
 331         if (offset16 < start || offset16 >= limit) {
 332             throw new ArrayIndexOutOfBoundsException(offset16);
 333         }
 334 
 335         char single = source[offset16];
 336         if (!isSurrogate(single)) {
 337             return single;
 338         }
 339 
 340         // Convert the UTF-16 surrogate pair if necessary.
 341         // For simplicity in usage, and because the frequency of pairs is
 342         // low, look both directions.
 343         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 344             offset16++;
 345             if (offset16 >= limit) {
 346                 return single;
 347             }
 348             char trail = source[offset16];
 349             if (isTrailSurrogate(trail)) {
 350                 return UCharacterProperty.getRawSupplementary(single, trail);
 351             }
 352         }
 353         else { // isTrailSurrogate(single), so
 354             if (offset16 == start) {
 355                 return single;
 356             }
 357             offset16--;
 358             char lead = source[offset16];
 359             if (isLeadSurrogate(lead))
 360                 return UCharacterProperty.getRawSupplementary(lead, single);
 361         }
 362         return single; // return unmatched surrogate
 363     }
 364 
 365     /**
 366      * Determines how many chars this char32 requires.
 367      * If a validity check is required, use <code>
 368      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
 369      * char32 before calling.
 370      * @param char32 the input codepoint.
 371      * @return 2 if is in supplementary space, otherwise 1.
 372      * @stable ICU 2.1
 373      */
 374     public static int getCharCount(int char32)
 375     {
 376         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 377             return 1;
 378         }
 379         return 2;
 380     }
 381 
 382     /**
 383      * Determines whether the code value is a surrogate.
 384      * @param char16 the input character.
 385      * @return true if the input character is a surrogate.
 386      * @stable ICU 2.1
 387      */
 388     public static boolean isSurrogate(char char16)
 389     {
 390         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
 391     }
 392 
 393     /**
 394      * Determines whether the character is a trail surrogate.
 395      * @param char16 the input character.
 396      * @return true if the input character is a trail surrogate.
 397      * @stable ICU 2.1
 398      */
 399     public static boolean isTrailSurrogate(char char16)
 400     {
 401         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
 402     }
 403 
 404     /**
 405      * Determines whether the character is a lead surrogate.
 406      * @param char16 the input character.
 407      * @return true if the input character is a lead surrogate
 408      * @stable ICU 2.1
 409      */
 410     public static boolean isLeadSurrogate(char char16)
 411     {
 412         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
 413     }
 414 
 415     /**
 416      * Returns the lead surrogate.
 417      * If a validity check is required, use
 418      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 419      * on char32 before calling.
 420      * @param char32 the input character.
 421      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 422      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 423      * @stable ICU 2.1
 424      */
 425     public static char getLeadSurrogate(int char32)
 426     {
 427         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 428             return (char)(LEAD_SURROGATE_OFFSET_ +
 429                           (char32 >> LEAD_SURROGATE_SHIFT_));
 430         }
 431 
 432         return 0;
 433     }
 434 
 435     /**
 436      * Returns the trail surrogate.
 437      * If a validity check is required, use
 438      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 439      * on char32 before calling.
 440      * @param char32 the input character.
 441      * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
 442      *         the character itself
 443      * @stable ICU 2.1
 444      */
 445     public static char getTrailSurrogate(int char32)
 446     {
 447         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 448             return (char)(TRAIL_SURROGATE_MIN_VALUE +
 449                           (char32 & TRAIL_SURROGATE_MASK_));
 450         }
 451 
 452         return (char) char32;
 453     }
 454 
 455     /**
 456      * Convenience method corresponding to String.valueOf(char). Returns a one
 457      * or two char string containing the UTF-32 value in UTF16 format. If a
 458      * validity check is required, use
 459      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 460      * on char32 before calling.
 461      * @param char32 the input character.
 462      * @return string value of char32 in UTF16 format
 463      * @exception IllegalArgumentException thrown if char32 is a invalid
 464      *            codepoint.
 465      * @stable ICU 2.1
 466      */
 467     public static String valueOf(int char32)
 468     {
 469         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 470             throw new IllegalArgumentException("Illegal codepoint");
 471         }
 472         return toString(char32);
 473     }
 474 
 475     /**
 476      * Append a single UTF-32 value to the end of a StringBuffer.
 477      * If a validity check is required, use
 478      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 479      * on char32 before calling.
 480      * @param target the buffer to append to
 481      * @param char32 value to append.
 482      * @return the updated StringBuffer
 483      * @exception IllegalArgumentException thrown when char32 does not lie
 484      *            within the range of the Unicode codepoints
 485      * @stable ICU 2.1
 486      */
 487     public static StringBuffer append(StringBuffer target, int char32)
 488     {
 489         // Check for irregular values
 490         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 491             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
 492         }
 493 
 494         // Write the UTF-16 values
 495         if (char32 >= SUPPLEMENTARY_MIN_VALUE)
 496             {
 497             target.append(getLeadSurrogate(char32));
 498             target.append(getTrailSurrogate(char32));
 499         }
 500         else {
 501             target.append((char) char32);
 502         }
 503         return target;
 504     }
 505 
 506     /**
 507      * Shifts offset16 by the argument number of codepoints within a subarray.
 508      * @param source char array
 509      * @param start position of the subarray to be performed on
 510      * @param limit position of the subarray to be performed on
 511      * @param offset16 UTF16 position to shift relative to start
 512      * @param shift32 number of codepoints to shift
 513      * @return new shifted offset16 relative to start
 514      * @exception IndexOutOfBoundsException if the new offset16 is out of
 515      *            bounds with respect to the subarray or the subarray bounds
 516      *            are out of range.
 517      * @stable ICU 2.1
 518      */
 519     public static int moveCodePointOffset(char source[], int start, int limit,
 520                                           int offset16, int shift32)
 521     {
 522         int size = source.length;
 523         int count;
 524         char ch;
 525         int result = offset16 + start;
 526         if (start < 0 || limit < start) {
 527             throw new StringIndexOutOfBoundsException(start);
 528         }
 529         if (limit > size) {
 530             throw new StringIndexOutOfBoundsException(limit);
 531         }
 532         if (offset16 < 0 || result > limit) {
 533             throw new StringIndexOutOfBoundsException(offset16);
 534         }
 535         if (shift32 > 0) {
 536             if (shift32 + result > size) {
 537                 throw new StringIndexOutOfBoundsException(result);
 538             }
 539             count = shift32;
 540             while (result < limit && count > 0)
 541             {
 542                 ch = source[result];
 543                 if (isLeadSurrogate(ch) && (result + 1 < limit) &&
 544                     isTrailSurrogate(source[result + 1])) {
 545                     result++;
 546                 }
 547                 count--;
 548                 result++;
 549             }
 550         } else {
 551             if (result + shift32 < start) {
 552                 throw new StringIndexOutOfBoundsException(result);
 553             }
 554             for (count = -shift32; count > 0; count--) {
 555                 result--;
 556                 if (result < start) {
 557                     break;
 558                 }
 559                 ch = source[result];
 560                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
 561                     result--;
 562                 }
 563             }
 564         }
 565         if (count != 0) {
 566             throw new StringIndexOutOfBoundsException(shift32);
 567         }
 568         result -= start;
 569         return result;
 570     }
 571 
 572     // private data members -------------------------------------------------
 573 
 574     /**
 575      * Shift value for lead surrogate to form a supplementary character.
 576      */
 577     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 578 
 579     /**
 580      * Mask to retrieve the significant value from a trail surrogate.
 581      */
 582     private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
 583 
 584     /**
 585      * Value that all lead surrogate starts with
 586      */
 587     private static final int LEAD_SURROGATE_OFFSET_ =
 588         LEAD_SURROGATE_MIN_VALUE -
 589         (SUPPLEMENTARY_MIN_VALUE
 590         >> LEAD_SURROGATE_SHIFT_);
 591 
 592     // private methods ------------------------------------------------------
 593 
 594     /**
 595      * <p>Converts argument code point and returns a String object representing
 596      * the code point's value in UTF16 format.
 597      * <p>This method does not check for the validity of the codepoint, the
 598      * results are not guaranteed if a invalid codepoint is passed as
 599      * argument.
 600      * <p>The result is a string whose length is 1 for non-supplementary code
 601      * points, 2 otherwise.
 602      * @param ch code point
 603      * @return string representation of the code point
 604      */
 605     private static String toString(int ch)
 606     {
 607         if (ch < SUPPLEMENTARY_MIN_VALUE) {
 608             return String.valueOf((char) ch);
 609         }
 610 
 611         StringBuilder result = new StringBuilder();
 612         result.append(getLeadSurrogate(ch));
 613         result.append(getTrailSurrogate(ch));
 614         return result.toString();
 615     }
 616 }