1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * <p>Standalone utility class providing UTF16 character conversions and
  41  * indexing conversions.</p>
 * <p>Code that uses strings alone rarely needs modification.
  43  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  44  * operation. Similarly, concatenation is always safe. Substringing is safe if
  45  * the start and end are both on UTF-32 boundaries. In normal code, the values
  46  * for start and end are on those boundaries, since they arose from operations
  47  * like searching. If not, the nearest UTF-32 boundaries can be determined
  48  * using <code>bounds()</code>.</p>
  49  * <strong>Examples:</strong>
  50  * <p>The following examples illustrate use of some of these methods.
  51  * <pre>
  52  * // iteration forwards: Original
  53  * for (int i = 0; i &lt; s.length(); ++i) {
  54  *     char ch = s.charAt(i);
  55  *     doSomethingWith(ch);
  56  * }
  57  *
  58  * // iteration forwards: Changes for UTF-32
  59  * int ch;
  60  * for (int i = 0; i &lt; s.length(); i+=UTF16.getCharCount(ch)) {
  61  *     ch = UTF16.charAt(s,i);
  62  *     doSomethingWith(ch);
  63  * }
  64  *
  65  * // iteration backwards: Original
  66  * for (int i = s.length() -1; i >= 0; --i) {
  67  *     char ch = s.charAt(i);
  68  *     doSomethingWith(ch);
  69  * }
  70  *
  71  * // iteration backwards: Changes for UTF-32
  72  * int ch;
 * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
  74  *     ch = UTF16.charAt(s,i);
  75  *     doSomethingWith(ch);
  76  * }
  77  * </pre>
  78  * <strong>Notes:</strong>
  79  * <ul>
  80  *   <li>
  81  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  82  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  83  *   sense of their ordering in a string. <code>offset16</code> and
  84  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  85  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  86  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  87  *   which is a UTF-16 code unit.
  88  *   </li>
  89  *   <li>
  90  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  91  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  92  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  93  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  94  *   </li>
  95  *   <li>
  96  *    <strong>Exceptions:</strong> The error checking will throw an exception
  97  *   if indices are out of bounds. Other than that, all methods will
  98  *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
  99  *   values are present. <code>UCharacter.isLegal()</code> can be used to check
 100  *   for validity if desired.
 101  *   </li>
 102  *   <li>
 103  *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 104  *   surrogates, then these are counted as one UTF-32 value. This matches
 105  *   their iteration behavior, which is vital. It also matches common display
 106  *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 107  *   </li>
 108  *   <li>
 109  *     <strong>Optimization:</strong> The method implementations may need
 110  *     optimization if the compiler doesn't fold static final methods. Since
 *     surrogate pairs will form an exceedingly small percentage of all the text
 112  *     in the world, the singleton case should always be optimized for.
 113  *   </li>
 114  * </ul>
 115  * @author Mark Davis, with help from Markus Scherer
 116  * @stable ICU 2.1
 117  */
 118 
 119 public final class UTF16
 120 {
 121     // public variables ---------------------------------------------------
 122 
 123     /**
 124      * The lowest Unicode code point value.
 125      * @stable ICU 2.1
 126      */
 127     public static final int CODEPOINT_MIN_VALUE = 0;
 128     /**
 129      * The highest Unicode code point value (scalar value) according to the
 130      * Unicode Standard.
 131      * @stable ICU 2.1
 132      */
 133     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
 134     /**
 135      * The minimum value for Supplementary code points
 136      * @stable ICU 2.1
 137      */
 138     public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;
 139     /**
 140      * Lead surrogate minimum value
 141      * @stable ICU 2.1
 142      */
 143     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
 144     /**
 145      * Trail surrogate minimum value
 146      * @stable ICU 2.1
 147      */
 148     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 149     /**
 150      * Lead surrogate maximum value
 151      * @stable ICU 2.1
 152      */
 153     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 154     /**
 155      * Trail surrogate maximum value
 156      * @stable ICU 2.1
 157      */
 158     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 159     /**
 160      * Surrogate minimum value
 161      * @stable ICU 2.1
 162      */
 163     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
 164 
 165     // public method ------------------------------------------------------
 166 
 167     /**
 168      * Extract a single UTF-32 value from a string.
 169      * Used when iterating forwards or backwards (with
 170      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 171      * validity check is required, use
 172      * <code><a href="../lang/UCharacter.html#isLegal(char)">
 173      * UCharacter.isLegal()</a></code> on the return value.
 174      * If the char retrieved is part of a surrogate pair, its supplementary
 175      * character will be returned. If a complete supplementary character is
 176      * not found the incomplete character will be returned
 177      * @param source array of UTF-16 chars
 178      * @param offset16 UTF-16 offset to the start of the character.
 179      * @return UTF-32 value for the UTF-32 value that contains the char at
 180      *         offset16. The boundaries of that codepoint are the same as in
 181      *         <code>bounds32()</code>.
 182      * @exception IndexOutOfBoundsException thrown if offset16 is out of
 183      *            bounds.
 184      * @stable ICU 2.1
 185      */
 186     public static int charAt(String source, int offset16) {
 187         char single = source.charAt(offset16);
 188         if (single < LEAD_SURROGATE_MIN_VALUE) {
 189             return single;
 190         }
 191         return _charAt(source, offset16, single);
 192     }
 193 
 194     private static int _charAt(String source, int offset16, char single) {
 195         if (single > TRAIL_SURROGATE_MAX_VALUE) {
 196             return single;
 197         }
 198 
 199         // Convert the UTF-16 surrogate pair if necessary.
 200         // For simplicity in usage, and because the frequency of pairs is
 201         // low, look both directions.
 202 
 203         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 204             ++offset16;
 205             if (source.length() != offset16) {
 206                 char trail = source.charAt(offset16);
 207                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 208                     return UCharacterProperty.getRawSupplementary(single, trail);
 209                 }
 210             }
 211         } else {
 212             --offset16;
 213             if (offset16 >= 0) {
 214                 // single is a trail surrogate so
 215                 char lead = source.charAt(offset16);
 216                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 217                     return UCharacterProperty.getRawSupplementary(lead, single);
 218                 }
 219             }
 220         }
 221         return single; // return unmatched surrogate
 222     }
 223 
 224     /**
 225      * Extract a single UTF-32 value from a substring.
 226      * Used when iterating forwards or backwards (with
 227      * <code>UTF16.getCharCount()</code>, as well as random access. If a
 228      * validity check is required, use
 229      * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 230      * </a></code> on the return value.
 231      * If the char retrieved is part of a surrogate pair, its supplementary
 232      * character will be returned. If a complete supplementary character is
 233      * not found the incomplete character will be returned
 234      * @param source array of UTF-16 chars
 235      * @param start offset to substring in the source array for analyzing
 236      * @param limit offset to substring in the source array for analyzing
 237      * @param offset16 UTF-16 offset relative to start
 238      * @return UTF-32 value for the UTF-32 value that contains the char at
 239      *         offset16. The boundaries of that codepoint are the same as in
 240      *         <code>bounds32()</code>.
 241      * @exception IndexOutOfBoundsException thrown if offset16 is not within
 242      *            the range of start and limit.
 243      * @stable ICU 2.1
 244      */
 245     public static int charAt(char source[], int start, int limit,
 246                              int offset16)
 247     {
 248         offset16 += start;
 249         if (offset16 < start || offset16 >= limit) {
 250             throw new ArrayIndexOutOfBoundsException(offset16);
 251         }
 252 
 253         char single = source[offset16];
 254         if (!isSurrogate(single)) {
 255             return single;
 256         }
 257 
 258         // Convert the UTF-16 surrogate pair if necessary.
 259         // For simplicity in usage, and because the frequency of pairs is
 260         // low, look both directions.
 261         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 262             offset16 ++;
 263             if (offset16 >= limit) {
 264                 return single;
 265             }
 266             char trail = source[offset16];
 267             if (isTrailSurrogate(trail)) {
 268                 return UCharacterProperty.getRawSupplementary(single, trail);
 269             }
 270         }
 271         else { // isTrailSurrogate(single), so
 272             if (offset16 == start) {
 273                 return single;
 274             }
 275             offset16 --;
 276             char lead = source[offset16];
 277             if (isLeadSurrogate(lead))
 278                 return UCharacterProperty.getRawSupplementary(lead, single);
 279         }
 280         return single; // return unmatched surrogate
 281     }
 282 
 283     /**
 284      * Determines how many chars this char32 requires.
 285      * If a validity check is required, use <code>
 286      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
 287      * char32 before calling.
 288      * @param char32 the input codepoint.
 289      * @return 2 if is in supplementary space, otherwise 1.
 290      * @stable ICU 2.1
 291      */
 292     public static int getCharCount(int char32)
 293     {
 294         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 295             return 1;
 296         }
 297         return 2;
 298     }
 299 
 300     /**
 301      * Determines whether the code value is a surrogate.
 302      * @param char16 the input character.
 303      * @return true iff the input character is a surrogate.
 304      * @stable ICU 2.1
 305      */
 306     public static boolean isSurrogate(char char16)
 307     {
 308         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
 309             char16 <= TRAIL_SURROGATE_MAX_VALUE;
 310     }
 311 
 312     /**
 313      * Determines whether the character is a trail surrogate.
 314      * @param char16 the input character.
 315      * @return true iff the input character is a trail surrogate.
 316      * @stable ICU 2.1
 317      */
 318     public static boolean isTrailSurrogate(char char16)
 319     {
 320         return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
 321                 char16 <= TRAIL_SURROGATE_MAX_VALUE);
 322     }
 323 
 324     /**
 325      * Determines whether the character is a lead surrogate.
 326      * @param char16 the input character.
 327      * @return true iff the input character is a lead surrogate
 328      * @stable ICU 2.1
 329      */
 330     public static boolean isLeadSurrogate(char char16)
 331     {
 332         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
 333             char16 <= LEAD_SURROGATE_MAX_VALUE;
 334     }
 335 
 336     /**
 337      * Returns the lead surrogate.
 338      * If a validity check is required, use
 339      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 340      * on char32 before calling.
 341      * @param char32 the input character.
 342      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 343      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 344      * @stable ICU 2.1
 345      */
 346     public static char getLeadSurrogate(int char32)
 347     {
 348         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 349             return (char)(LEAD_SURROGATE_OFFSET_ +
 350                           (char32 >> LEAD_SURROGATE_SHIFT_));
 351         }
 352 
 353         return 0;
 354     }
 355 
 356     /**
 357      * Returns the trail surrogate.
 358      * If a validity check is required, use
 359      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 360      * on char32 before calling.
 361      * @param char32 the input character.
 362      * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
 363      *         the character itself
 364      * @stable ICU 2.1
 365      */
 366     public static char getTrailSurrogate(int char32)
 367     {
 368         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 369             return (char)(TRAIL_SURROGATE_MIN_VALUE +
 370                           (char32 & TRAIL_SURROGATE_MASK_));
 371         }
 372 
 373         return (char)char32;
 374     }
 375 
 376     /**
 377      * Convenience method corresponding to String.valueOf(char). Returns a one
 378      * or two char string containing the UTF-32 value in UTF16 format. If a
 379      * validity check is required, use
 380      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 381      * on char32 before calling.
 382      * @param char32 the input character.
 383      * @return string value of char32 in UTF16 format
 384      * @exception IllegalArgumentException thrown if char32 is a invalid
 385      *            codepoint.
 386      * @stable ICU 2.1
 387      */
 388     public static String valueOf(int char32)
 389     {
 390         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 391             throw new IllegalArgumentException("Illegal codepoint");
 392         }
 393         return toString(char32);
 394     }
 395 
 396     /**
 397      * Append a single UTF-32 value to the end of a StringBuffer.
 398      * If a validity check is required, use
 399      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 400      * on char32 before calling.
 401      * @param target the buffer to append to
 402      * @param char32 value to append.
 403      * @return the updated StringBuffer
 404      * @exception IllegalArgumentException thrown when char32 does not lie
 405      *            within the range of the Unicode codepoints
 406      * @stable ICU 2.1
 407      */
 408     public static StringBuffer append(StringBuffer target, int char32)
 409     {
 410         // Check for irregular values
 411         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 412             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
 413         }
 414 
 415         // Write the UTF-16 values
 416         if (char32 >= SUPPLEMENTARY_MIN_VALUE)
 417             {
 418                 target.append(getLeadSurrogate(char32));
 419                 target.append(getTrailSurrogate(char32));
 420             }
 421         else {
 422             target.append((char)char32);
 423         }
 424         return target;
 425     }
 426 
 427     //// for StringPrep
 428     /**
 429      * Shifts offset16 by the argument number of codepoints within a subarray.
 430      * @param source char array
 431      * @param start position of the subarray to be performed on
 432      * @param limit position of the subarray to be performed on
 433      * @param offset16 UTF16 position to shift relative to start
 434      * @param shift32 number of codepoints to shift
 435      * @return new shifted offset16 relative to start
 436      * @exception IndexOutOfBoundsException if the new offset16 is out of
 437      *            bounds with respect to the subarray or the subarray bounds
 438      *            are out of range.
 439      * @stable ICU 2.1
 440      */
 441     public static int moveCodePointOffset(char source[], int start, int limit,
 442                                           int offset16, int shift32)
 443     {
 444         int         size = source.length;
 445         int         count;
 446         char        ch;
 447         int         result = offset16 + start;
 448         if (start<0 || limit<start) {
 449             throw new StringIndexOutOfBoundsException(start);
 450         }
 451         if (limit>size) {
 452             throw new StringIndexOutOfBoundsException(limit);
 453         }
 454         if (offset16<0 || result>limit) {
 455             throw new StringIndexOutOfBoundsException(offset16);
 456         }
 457         if (shift32 > 0 ) {
 458             if (shift32 + result > size) {
 459                 throw new StringIndexOutOfBoundsException(result);
 460             }
 461             count = shift32;
 462             while (result < limit && count > 0)
 463             {
 464                 ch = source[result];
 465                 if (isLeadSurrogate(ch) && (result+1 < limit) &&
 466                         isTrailSurrogate(source[result+1])) {
 467                     result ++;
 468                 }
 469                 count --;
 470                 result ++;
 471             }
 472         } else {
 473             if (result + shift32 < start) {
 474                 throw new StringIndexOutOfBoundsException(result);
 475             }
 476             for (count=-shift32; count>0; count--) {
 477                 result--;
 478                 if (result<start) {
 479                     break;
 480                 }
 481                 ch = source[result];
 482                 if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
 483                     result--;
 484                 }
 485             }
 486         }
 487         if (count != 0)  {
 488             throw new StringIndexOutOfBoundsException(shift32);
 489         }
 490         result -= start;
 491         return result;
 492     }
 493 
 494     // private data members -------------------------------------------------
 495 
 496     /**
 497      * Shift value for lead surrogate to form a supplementary character.
 498      */
 499     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 500 
 501     /**
 502      * Mask to retrieve the significant value from a trail surrogate.
 503      */
 504     private static final int TRAIL_SURROGATE_MASK_     = 0x3FF;
 505 
 506     /**
 507      * Value that all lead surrogate starts with
 508      */
 509     private static final int LEAD_SURROGATE_OFFSET_ =
 510         LEAD_SURROGATE_MIN_VALUE -
 511         (SUPPLEMENTARY_MIN_VALUE
 512          >> LEAD_SURROGATE_SHIFT_);
 513 
 514     // private methods ------------------------------------------------------
 515 
 516     /**
 517      * <p>Converts argument code point and returns a String object representing
 518      * the code point's value in UTF16 format.</p>
 519      * <p>This method does not check for the validity of the codepoint, the
 520      * results are not guaranteed if a invalid codepoint is passed as
 521      * argument.</p>
 522      * <p>The result is a string whose length is 1 for non-supplementary code
 523      * points, 2 otherwise.</p>
 524      * @param ch code point
 525      * @return string representation of the code point
 526      */
 527     private static String toString(int ch)
 528     {
 529         if (ch < SUPPLEMENTARY_MIN_VALUE) {
 530             return String.valueOf((char)ch);
 531         }
 532 
 533         StringBuilder result = new StringBuilder();
 534         result.append(getLeadSurrogate(ch));
 535         result.append(getTrailSurrogate(ch));
 536         return result.toString();
 537     }
 538 }