< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java

Print this page


   1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * <p>Standalone utility class providing UTF16 character conversions and
  41  * indexing conversions.
  42  * <p>Code that uses strings alone rarely needs modification.
  43  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  44  * operation. Similarly, concatenation is always safe. Substringing is safe if
  45  * the start and end are both on UTF-32 boundaries. In normal code, the values
  46  * for start and end are on those boundaries, since they arose from operations
  47  * like searching. If not, the nearest UTF-32 boundaries can be determined
  48  * using <code>bounds()</code>.
  49  * <strong>Examples:</strong>
  50  * <p>The following examples illustrate use of some of these methods.
  51  * <pre>{@code
  52  * // iteration forwards: Original
  53  * for (int i = 0; i < s.length(); ++i) {
  54  *     char ch = s.charAt(i);
  55  *     doSomethingWith(ch);
  56  * }
  57  *
  58  * // iteration forwards: Changes for UTF-32
  59  * int ch;
  60  * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
  61  *     ch = UTF16.charAt(s,i);
  62  *     doSomethingWith(ch);
  63  * }
  64  *
  65  * // iteration backwards: Original
  66  * for (int i = s.length() -1; i >= 0; --i) {
  67  *     char ch = s.charAt(i);
  68  *     doSomethingWith(ch);
  69  * }
  70  *
  71  * // iteration backwards: Changes for UTF-32
  72  * int ch;
  73  * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
  74  *     ch = UTF16.charAt(s,i);
  75  *     doSomethingWith(ch);
  76  * }
  77  * }</pre>
  78  * <strong>Notes:</strong>
  79  * <ul>
  80  *   <li>
  81  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  82  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  83  *   sense of their ordering in a string. <code>offset16</code> and
  84  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  85  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  86  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  87  *   which is a UTF-16 code unit.
  88  *   </li>
  89  *   <li>
  90  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  91  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  92  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  93  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  94  *   </li>


 144     /**
 145      * Trail surrogate minimum value: the smallest UTF-16 code unit that
          * is treated as a trail surrogate.
 146      * @stable ICU 2.1
 147      */
 148     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 149     /**
 150      * Lead surrogate maximum value: the largest UTF-16 code unit that is
          * treated as a lead surrogate (one below TRAIL_SURROGATE_MIN_VALUE).
 151      * @stable ICU 2.1
 152      */
 153     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 154     /**
 155      * Trail surrogate maximum value: the largest UTF-16 code unit that is
          * treated as a trail surrogate.
 156      * @stable ICU 2.1
 157      */
 158     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 159     /**
 160      * Surrogate minimum value: the smallest surrogate code unit, defined
          * as an alias of LEAD_SURROGATE_MIN_VALUE.
 161      * @stable ICU 2.1
 162      */
 163     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;


























 164 








 165     // public method ------------------------------------------------------
 166 
 167     /**
 168      * Extract a single UTF-32 value from a string.
 169      * Used when iterating forwards or backwards (with
 170      * <code>UTF16.getCharCount()</code>), as well as random access. If a
 171      * validity check is required, use
 172      * <code><a href="../lang/UCharacter.html#isLegal(char)">
 173      * UCharacter.isLegal()</a></code> on the return value.
 174      * If the char retrieved is part of a surrogate pair, its supplementary
 175      * character will be returned. If a complete supplementary character is
 176      * not found the incomplete character will be returned
 177      * @param source array of UTF-16 chars
 178      * @param offset16 UTF-16 offset to the start of the character.
 179      * @return UTF-32 value for the UTF-32 value that contains the char at
 180      *         offset16. The boundaries of that codepoint are the same as in
 181      *         <code>bounds32()</code>.
 182      * @exception IndexOutOfBoundsException thrown if offset16 is out of
 183      *            bounds.
 184      * @stable ICU 2.1


 205             if (source.length() != offset16) {
 206                 char trail = source.charAt(offset16);
 207                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 208                     return UCharacterProperty.getRawSupplementary(single, trail);
 209                 }
 210             }
 211         } else {
 212             --offset16;
 213             if (offset16 >= 0) {
 214                 // single is a trail surrogate so
 215                 char lead = source.charAt(offset16);
 216                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 217                     return UCharacterProperty.getRawSupplementary(lead, single);
 218                 }
 219             }
 220         }
 221         return single; // return unmatched surrogate
 222     }
 223 
 224     /**
 225      * Extract a single UTF-32 value from a substring of a char array.
 226      * Used when iterating forwards or backwards (with
 227      * <code>UTF16.getCharCount()</code>), as well as random access. If a
 228      * validity check is required, use
 229      * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 230      * </a></code> on the return value.
 231      * If the char retrieved is part of a surrogate pair, its supplementary
 232      * character will be returned. If a complete supplementary character is
 233      * not found the incomplete character will be returned
          * (that is, the unpaired surrogate itself). The partner surrogate is
          * only looked for inside the range [start, limit); chars outside that
          * range are never read.
 234      * @param source array of UTF-16 chars
 235      * @param start index in source of the first char of the substring (inclusive)
 236      * @param limit index in source just past the substring (exclusive)
 237      * @param offset16 UTF-16 offset relative to start
 238      * @return UTF-32 value for the UTF-32 value that contains the char at
 239      *         offset16. The boundaries of that codepoint are the same as in
 240      *         <code>bounds32()</code>.
 241      * @exception IndexOutOfBoundsException thrown if offset16 is not within
 242      *            the range of start and limit. The implementation throws
          *            ArrayIndexOutOfBoundsException carrying the absolute
          *            index (start + offset16).
 243      * @stable ICU 2.1
 244      */
 245     public static int charAt(char source[], int start, int limit,
 246                              int offset16)
 247     {
























































 248         offset16 += start;
             // from here on offset16 is an absolute index into source
 249         if (offset16 < start || offset16 >= limit) {
 250             throw new ArrayIndexOutOfBoundsException(offset16);
 251         }
 252 
 253         char single = source[offset16];
 254         if (!isSurrogate(single)) {
 255             return single;
 256         }
 257 
 258         // Convert the UTF-16 surrogate pair if necessary.
 259         // For simplicity in usage, and because the frequency of pairs is
 260         // low, look both directions.
             // single is known to be a surrogate here, so the comparison below
             // separates lead surrogates from trail surrogates.
 261         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 262             offset16 ++;
                 // a lead surrogate that is the last char of the range is unpaired
 263             if (offset16 >= limit) {
 264                 return single;
 265             }
 266             char trail = source[offset16];
 267             if (isTrailSurrogate(trail)) {
                     // NOTE(review): getRawSupplementary is declared in
                     // UCharacterProperty; presumably it combines the pair into
                     // the supplementary code point - confirm there.
 268                 return UCharacterProperty.getRawSupplementary(single, trail);
 269             }
 270         }
 271         else { // isTrailSurrogate(single), so
                 // a trail surrogate that is the first char of the range is unpaired
 272             if (offset16 == start) {
 273                 return single;
 274             }
 275             offset16 --;
 276             char lead = source[offset16];
 277             if (isLeadSurrogate(lead))
 278                 return UCharacterProperty.getRawSupplementary(lead, single);
 279         }
 280         return single; // return unmatched surrogate
 281     }
 282 
 283     /**
 284      * Determines how many UTF-16 chars (code units) this char32 requires.
 285      * If a validity check is required, use <code>
 286      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
 287      * char32 before calling.
          * No range check is made here: every value below
          * SUPPLEMENTARY_MIN_VALUE, including a negative one, yields 1, and
          * every other value yields 2.
 288      * @param char32 the input codepoint.
 289      * @return 2 if char32 is at or above SUPPLEMENTARY_MIN_VALUE, otherwise 1.
 290      * @stable ICU 2.1
 291      */
 292     public static int getCharCount(int char32)
 293     {
 294         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 295             return 1;
 296         }
 297         return 2;
 298     }
 299 
 300     /**
 301      * Determines whether the code value is a surrogate.
          * Lead and trail surrogates both qualify: the test is the inclusive
          * range LEAD_SURROGATE_MIN_VALUE to TRAIL_SURROGATE_MAX_VALUE.
 302      * @param char16 the input character.
 303      * @return true iff the input character is a surrogate.
 304      * @stable ICU 2.1
 305      */
 306     public static boolean isSurrogate(char char16)
 307     {
 308         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
 309             char16 <= TRAIL_SURROGATE_MAX_VALUE;
 310     }
 311 
 312     /**
 313      * Determines whether the character is a trail surrogate.
          * The test is the inclusive range TRAIL_SURROGATE_MIN_VALUE to
          * TRAIL_SURROGATE_MAX_VALUE.
 314      * @param char16 the input character.
 315      * @return true iff the input character is a trail surrogate.
 316      * @stable ICU 2.1
 317      */
 318     public static boolean isTrailSurrogate(char char16)
 319     {
 320         return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
 321                 char16 <= TRAIL_SURROGATE_MAX_VALUE);
 322     }
 323 
 324     /**
 325      * Determines whether the character is a lead surrogate.
          * The test is the inclusive range LEAD_SURROGATE_MIN_VALUE to
          * LEAD_SURROGATE_MAX_VALUE.
 326      * @param char16 the input character.
 327      * @return true iff the input character is a lead surrogate
 328      * @stable ICU 2.1
 329      */
 330     public static boolean isLeadSurrogate(char char16)
 331     {
 332         return LEAD_SURROGATE_MIN_VALUE <= char16 &&
 333             char16 <= LEAD_SURROGATE_MAX_VALUE;
 334     }
 335 
 336     /**
 337      * Returns the lead surrogate.
 338      * If a validity check is required, use
 339      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 340      * on char32 before calling.
          * Only the lower bound is tested here; a value above the valid code
          * point range is not rejected and the computed sum is simply narrowed
          * to a char.
 341      * @param char32 the input character.
 342      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 343      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 344      * @stable ICU 2.1
 345      */
 346     public static char getLeadSurrogate(int char32)
 347     {
 348         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
                 // NOTE(review): LEAD_SURROGATE_OFFSET_ is declared further
                 // down in this class; presumably it is chosen so that
                 // SUPPLEMENTARY_MIN_VALUE maps to LEAD_SURROGATE_MIN_VALUE -
                 // confirm against its declaration.
 349             return (char)(LEAD_SURROGATE_OFFSET_ +
 350                           (char32 >> LEAD_SURROGATE_SHIFT_));
 351         }
 352 
             // not a supplementary code point: there is no lead surrogate
 353         return 0;
 354     }
 355 
 356     /**
 357      * Returns the trail surrogate.
 358      * If a validity check is required, use
 359      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 360      * on char32 before calling.
 361      * @param char32 the input character.
 362      * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
 363      *         the character itself
          *         (char32 narrowed to a char by a plain cast)
 364      * @stable ICU 2.1
 365      */
 366     public static char getTrailSurrogate(int char32)
 367     {
 368         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
                 // NOTE(review): TRAIL_SURROGATE_MASK_ is declared elsewhere in
                 // this class; presumably it keeps the low bits of char32 that
                 // the trail surrogate carries - confirm against its declaration.
 369             return (char)(TRAIL_SURROGATE_MIN_VALUE +
 370                           (char32 & TRAIL_SURROGATE_MASK_));
 371         }
 372 
 373         return (char)char32;
 374     }
 375 
 376     /**
 377      * Convenience method corresponding to String.valueOf(char). Returns a one
 378      * or two char string containing the UTF-32 value in UTF16 format. If a
 379      * validity check is required, use
 380      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 381      * on char32 before calling.
 382      * @param char32 the input character.
 383      * @return string value of char32 in UTF16 format
 384      * @exception IllegalArgumentException thrown if char32 is a invalid
 385      *            codepoint.
 386      * @stable ICU 2.1
 387      */
 388     public static String valueOf(int char32)
 389     {
 390         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 391             throw new IllegalArgumentException("Illegal codepoint");
 392         }
 393         return toString(char32);


 402      * @param char32 value to append.
 403      * @return the updated StringBuffer
 404      * @exception IllegalArgumentException thrown when char32 does not lie
 405      *            within the range of the Unicode codepoints
 406      * @stable ICU 2.1
 407      */
 408     public static StringBuffer append(StringBuffer target, int char32)
 409     {
 410         // Check for irregular values
             // (anything outside CODEPOINT_MIN_VALUE..CODEPOINT_MAX_VALUE); the
             // offending value is reported in hexadecimal.
 411         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 412             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
 413         }
 414 
 415         // Write the UTF-16 values
             // A supplementary code point is appended as two chars, lead
             // surrogate first; any other value is appended as a single char.
 416         if (char32 >= SUPPLEMENTARY_MIN_VALUE)
 417             {
 418                 target.append(getLeadSurrogate(char32));
 419                 target.append(getTrailSurrogate(char32));
 420             }
 421         else {
 422             target.append((char)char32);
 423         }
             // the same buffer instance is returned so that calls can be chained
 424         return target;
 425     }
 426 
 427     //// for StringPrep
 428     /**
 429      * Shifts offset16 by the argument number of codepoints within a subarray.
          * A lead surrogate immediately followed by a trail surrogate, both
          * inside the subarray, is counted as one code point; every other
          * char, including an unpaired surrogate, is counted as one code point
          * by itself. A shift32 of zero returns offset16 unchanged.
 430      * @param source char array
 431      * @param start position of the subarray to be performed on
          *        (inclusive index into source)
 432      * @param limit position of the subarray to be performed on
          *        (exclusive index into source)
 433      * @param offset16 UTF16 position to shift relative to start
 434      * @param shift32 number of codepoints to shift
          *        (positive moves forwards, negative moves backwards)
 435      * @return new shifted offset16 relative to start
 436      * @exception IndexOutOfBoundsException if the new offset16 is out of
 437      *            bounds with respect to the subarray or the subarray bounds
 438      *            are out of range.
          *            The implementation throws
          *            StringIndexOutOfBoundsException in every such case.
 439      * @stable ICU 2.1
 440      */
 441     public static int moveCodePointOffset(char source[], int start, int limit,
 442                                           int offset16, int shift32)
 443     {
 444         int         size = source.length;
 445         int         count;
 446         char        ch;
             // result is an absolute index into source; it is converted back to
             // a start-relative offset just before returning
 447         int         result = offset16 + start;
             // validate the subarray bounds first, then the starting offset
 448         if (start<0 || limit<start) {
 449             throw new StringIndexOutOfBoundsException(start);
 450         }
 451         if (limit>size) {
 452             throw new StringIndexOutOfBoundsException(limit);
 453         }
 454         if (offset16<0 || result>limit) {
 455             throw new StringIndexOutOfBoundsException(offset16);
 456         }
 457         if (shift32 > 0 ) {
                 // quick rejection: each code point takes at least one char, so
                 // the target cannot lie beyond the end of the array
 458             if (shift32 + result > size) {
 459                 throw new StringIndexOutOfBoundsException(result);
 460             }
 461             count = shift32;
                 // walk forwards one code point per iteration, stopping at limit
 462             while (result < limit && count > 0)
 463             {
 464                 ch = source[result];
                     // step over the trail half of a complete pair as well
 465                 if (isLeadSurrogate(ch) && (result+1 < limit) &&
 466                         isTrailSurrogate(source[result+1])) {
 467                     result ++;
 468                 }
 469                 count --;
 470                 result ++;
 471             }
 472         } else {
                 // quick rejection, mirror image of the forward case
 473             if (result + shift32 < start) {
 474                 throw new StringIndexOutOfBoundsException(result);
 475             }
                 // walk backwards one code point per iteration
 476             for (count=-shift32; count>0; count--) {
 477                 result--;
                     // ran off the front of the subarray: leave count non-zero
                     // so that the check after the loop throws
 478                 if (result<start) {
 479                     break;
 480                 }
 481                 ch = source[result];
                     // step back over the lead half of a complete pair as well
 482                 if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
 483                     result--;
 484                 }
 485             }
 486         }
             // a non-zero count means the subarray ended before shift32 code
             // points could be traversed
 487         if (count != 0)  {
 488             throw new StringIndexOutOfBoundsException(shift32);
 489         }
 490         result -= start;
 491         return result;
 492     }
 493 
 494     // private data members -------------------------------------------------
 495 
 496     /**
 497      * Shift value for lead surrogate to form a supplementary character.
          * getLeadSurrogate shifts a supplementary code point right by this
          * many bits before adding the lead surrogate offset.
 498      */
 499     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 500 
 501     /**
 502      * Mask to retrieve the significant value from a trail surrogate.


 510         LEAD_SURROGATE_MIN_VALUE -
 511         (SUPPLEMENTARY_MIN_VALUE
 512          >> LEAD_SURROGATE_SHIFT_);
 513 
 514     // private methods ------------------------------------------------------
 515 
 516     /**
 517      * <p>Converts argument code point and returns a String object representing
 518      * the code point's value in UTF16 format.
 519      * <p>This method does not check for the validity of the codepoint, the
 520      * results are not guaranteed if an invalid codepoint is passed as
 521      * argument.
 522      * <p>The result is a string whose length is 1 for non-supplementary code
 523      * points, 2 otherwise.
 524      * @param ch code point
 525      * @return string representation of the code point
 526      */
 527     private static String toString(int ch)
 528     {
             // below the supplementary range: a single char is enough
 529         if (ch < SUPPLEMENTARY_MIN_VALUE) {
 530             return String.valueOf((char)ch);
 531         }
 532 
             // supplementary code point: lead surrogate first, then trail
 533         StringBuilder result = new StringBuilder();
 534         result.append(getLeadSurrogate(ch));
 535         result.append(getTrailSurrogate(ch));
 536         return result.toString();
 537     }
 538 }
   1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /**
  26  *******************************************************************************
  27  * Copyright (C) 1996-2014, International Business Machines Corporation and
  28  * others. All Rights Reserved.





  29  *******************************************************************************
  30  */
  31 
  32 package sun.text.normalizer;
  33 
  34 /**
  35  * <p>Standalone utility class providing UTF16 character conversions and
  36  * indexing conversions.
  37  * <p>Code that uses strings alone rarely needs modification.
  38  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  39  * operation. Similarly, concatenation is always safe. Substringing is safe if
  40  * the start and end are both on UTF-32 boundaries. In normal code, the values
  41  * for start and end are on those boundaries, since they arose from operations
  42  * like searching. If not, the nearest UTF-32 boundaries can be determined
  43  * using <code>bounds()</code>.
  44  * <strong>Examples:</strong>
  45  * <p>The following examples illustrate use of some of these methods.
  46  * <pre>{@code
  47  * // iteration forwards: Original
  48  * for (int i = 0; i < s.length(); ++i) {
  49  *     char ch = s.charAt(i);
  50  *     doSomethingWith(ch);
  51  * }
  52  *
  53  * // iteration forwards: Changes for UTF-32
  54  * int ch;
  55  * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
  56  *     ch = UTF16.charAt(s, i);
  57  *     doSomethingWith(ch);
  58  * }
  59  *
  60  * // iteration backwards: Original
  61  * for (int i = s.length() - 1; i >= 0; --i) {
  62  *     char ch = s.charAt(i);
  63  *     doSomethingWith(ch);
  64  * }
  65  *
  66  * // iteration backwards: Changes for UTF-32
  67  * int ch;
  68  * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
  69  *     ch = UTF16.charAt(s, i);
  70  *     doSomethingWith(ch);
  71  * }
  72  * }</pre>
  73  * <strong>Notes:</strong>
  74  * <ul>
  75  *   <li>
  76  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  77  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  78  *   sense of their ordering in a string. <code>offset16</code> and
  79  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  80  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  81  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  82  *   which is a UTF-16 code unit.
  83  *   </li>
  84  *   <li>
  85  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  86  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  87  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  88  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  89  *   </li>


 139     /**
 140      * Trail surrogate minimum value: the smallest UTF-16 code unit that
          * is treated as a trail surrogate.
 141      * @stable ICU 2.1
 142      */
 143     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 144     /**
 145      * Lead surrogate maximum value: the largest UTF-16 code unit that is
          * treated as a lead surrogate (one below TRAIL_SURROGATE_MIN_VALUE).
 146      * @stable ICU 2.1
 147      */
 148     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 149     /**
 150      * Trail surrogate maximum value: the largest UTF-16 code unit that is
          * treated as a trail surrogate.
 151      * @stable ICU 2.1
 152      */
 153     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 154     /**
 155      * Surrogate minimum value: the smallest surrogate code unit, defined
          * as an alias of LEAD_SURROGATE_MIN_VALUE.
 156      * @stable ICU 2.1
 157      */
 158     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
 159     /**
 160      * Lead surrogate bitmask: keeps every bit of an int except the low 10.
          * NOTE(review): presumably a value is a lead surrogate when
          * (value &amp; LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; the
          * code using this mask is not visible here - confirm.
 161      */
 162     private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
 163     /**
 164      * Trail surrogate bitmask: keeps every bit of an int except the low 10
          * (the same value as LEAD_SURROGATE_BITMASK).
          * NOTE(review): presumably compared against TRAIL_SURROGATE_BITS -
          * confirm against the code that uses it.
 165      */
 166     private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
 167     /**
 168      * Surrogate bitmask: keeps every bit of an int except the low 11.
          * NOTE(review): presumably compared against SURROGATE_BITS to detect
          * any surrogate, lead or trail - confirm against the code that uses it.
 169      */
 170     private static final int SURROGATE_BITMASK = 0xFFFFF800;
 171     /**
 172      * Lead surrogate bits: the expected result of masking a lead surrogate
          * (NOTE(review): with LEAD_SURROGATE_BITMASK, presumably - confirm).
 173      */
 174     private static final int LEAD_SURROGATE_BITS = 0xD800;
 175     /**
 176      * Trail surrogate bits: the expected result of masking a trail surrogate
          * (NOTE(review): with TRAIL_SURROGATE_BITMASK, presumably - confirm).
          * The value equals TRAIL_SURROGATE_MIN_VALUE.
 177      */
 178     private static final int TRAIL_SURROGATE_BITS = 0xDC00;
 179     /**
 180      * Surrogate bits: the expected result of masking any surrogate
          * (NOTE(review): with SURROGATE_BITMASK, presumably - confirm).
          * The value equals LEAD_SURROGATE_BITS.
 181      */
 182     private static final int SURROGATE_BITS = 0xD800;
 183 
 184     // constructor --------------------------------------------------------
 185 
 186     // /CLOVER:OFF
 187     /**
 188      * Prevent instance from being created.
          * The constructor is private and has an empty body, so code outside
          * this class cannot instantiate it.
 189      */
 190     private UTF16() {
 191     }
 192 
 193     // /CLOVER:ON
 194     // public method ------------------------------------------------------
 195 
 196     /**
 197      * Extract a single UTF-32 value from a string.
 198      * Used when iterating forwards or backwards (with
 199      * <code>UTF16.getCharCount()</code>), as well as random access. If a
 200      * validity check is required, use
 201      * <code><a href="../lang/UCharacter.html#isLegal(char)">
 202      * UCharacter.isLegal()</a></code> on the return value.
 203      * If the char retrieved is part of a surrogate pair, its supplementary
 204      * character will be returned. If a complete supplementary character is
 205      * not found the incomplete character will be returned
 206      * @param source array of UTF-16 chars
 207      * @param offset16 UTF-16 offset to the start of the character.
 208      * @return UTF-32 value for the UTF-32 value that contains the char at
 209      *         offset16. The boundaries of that codepoint are the same as in
 210      *         <code>bounds32()</code>.
 211      * @exception IndexOutOfBoundsException thrown if offset16 is out of
 212      *            bounds.
 213      * @stable ICU 2.1


 234             if (source.length() != offset16) {
 235                 char trail = source.charAt(offset16);
 236                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 237                     return UCharacterProperty.getRawSupplementary(single, trail);
 238                 }
 239             }
 240         } else {
 241             --offset16;
 242             if (offset16 >= 0) {
 243                 // single is a trail surrogate so
 244                 char lead = source.charAt(offset16);
 245                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 246                     return UCharacterProperty.getRawSupplementary(lead, single);
 247                 }
 248             }
 249         }
 250         return single; // return unmatched surrogate
 251     }
 252 
 253     /**
 254      * Extract a single UTF-32 value from a CharSequence.
 255      * Used when iterating forwards or backwards (with
 256      * <code>UTF16.getCharCount()</code>), as well as random access. If a
 257      * validity check is required, use
 258      * <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 259      * </a></code> on the return value.
 260      * If the char retrieved is part of a surrogate pair, its supplementary
 261      * character will be returned. If a complete supplementary character is
 262      * not found the incomplete character will be returned
          * (that is, the unpaired surrogate itself).
 263      * @param source sequence of UTF-16 chars
 264      * @param offset16 UTF-16 offset to the start of the character.


 265      * @return UTF-32 value for the UTF-32 value that contains the char at
 266      *         offset16. The boundaries of that codepoint are the same as in
 267      *         <code>bounds32()</code>.
 268      * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.

 269      * @stable ICU 2.1
 270      */
 271     public static int charAt(CharSequence source, int offset16) {
 272         char single = source.charAt(offset16);
             // fast path: a char below the surrogate range is returned as is
 273         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 274             return single;
 275         }
             // everything else (surrogates and the chars above them) is left
             // to the helper
 276         return _charAt(source, offset16, single);
 277     }
 276         return _charAt(source, offset16, single);
 277     }
 278 
 279     private static int _charAt(CharSequence source, int offset16, char single) {
             // Slow path of charAt(CharSequence, int). The caller passes in
             // single, the char it already read at offset16, and only calls
             // this method when single is not below LEAD_SURROGATE_MIN_VALUE.
             // Returns the code point formed with a neighbouring surrogate when
             // there is one, otherwise single itself.

             // chars above the surrogate range are ordinary chars
 280         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 281             return single;
 282         }
 283 
 284         // Convert the UTF-16 surrogate pair if necessary.
 285         // For simplicity in usage, and because the frequency of pairs is
 286         // low, look both directions.
 287 
 288         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
                 // single is a lead surrogate: look at the following char,
                 // unless single is the last char of the sequence
 289             ++offset16;
 290             if (source.length() != offset16) {
 291                 char trail = source.charAt(offset16);
 292                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
 293                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                         // NOTE(review): getRawSupplementary is declared in
                         // UCharacterProperty; presumably it combines the pair
                         // into the supplementary code point - confirm there.
 294                     return UCharacterProperty.getRawSupplementary(single, trail);
 295                 }
 296             }
 297         } else {
                 // look at the preceding char, unless single is the first char
 298             --offset16;
 299             if (offset16 >= 0) {
 300                 // single is a trail surrogate so
 301                 char lead = source.charAt(offset16);
 302                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
 303                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 304                     return UCharacterProperty.getRawSupplementary(lead, single);
 305                 }
 306             }
 307         }
 308         return single; // return unmatched surrogate
 309     }
 310 
 311     /**
 312      * Extract a single UTF-32 value from a substring of a char array. Used when iterating forwards or backwards
 313      * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
 314      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 315      * </a></code>
 316      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 317      * character will be returned. If a complete supplementary character is not found the incomplete
 318      * character will be returned (that is, the unpaired surrogate itself). The partner surrogate is
          * only looked for inside the range [start, limit); chars outside that range are never read.
 319      *
 320      * @param source Array of UTF-16 chars
 321      * @param start Index in the source array of the first char of the substring (inclusive)
 322      * @param limit Index in the source array just past the substring (exclusive)
 323      * @param offset16 UTF-16 offset relative to start
 324      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 325      *         of that codepoint are the same as in <code>bounds32()</code>.
 326      * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
          *            The implementation throws ArrayIndexOutOfBoundsException carrying the absolute
          *            index (start + offset16).
 327      * @stable ICU 2.1
 328      */
 329     public static int charAt(char source[], int start, int limit, int offset16) {
 330         offset16 += start;
             // from here on offset16 is an absolute index into source
 331         if (offset16 < start || offset16 >= limit) {
 332             throw new ArrayIndexOutOfBoundsException(offset16);
 333         }
 334 
 335         char single = source[offset16];
 336         if (!isSurrogate(single)) {
 337             return single;
 338         }
 339 
 340         // Convert the UTF-16 surrogate pair if necessary.
 341         // For simplicity in usage, and because the frequency of pairs is
 342         // low, look both directions.
             // single is known to be a surrogate here, so the comparison below
             // separates lead surrogates from trail surrogates.
 343         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 344             offset16++;
                 // a lead surrogate that is the last char of the range is unpaired
 345             if (offset16 >= limit) {
 346                 return single;
 347             }
 348             char trail = source[offset16];
 349             if (isTrailSurrogate(trail)) {
                     // NOTE(review): getRawSupplementary is declared in
                     // UCharacterProperty; presumably it combines the pair into
                     // the supplementary code point - confirm there.
 350                 return UCharacterProperty.getRawSupplementary(single, trail);
 351             }
 352         }
 353         else { // isTrailSurrogate(single), so
                 // a trail surrogate that is the first char of the range is unpaired
 354             if (offset16 == start) {
 355                 return single;
 356             }
 357             offset16--;
 358             char lead = source[offset16];
 359             if (isLeadSurrogate(lead))
 360                 return UCharacterProperty.getRawSupplementary(lead, single);
 361         }
 362         return single; // return unmatched surrogate
 363     }
 364 
 365     /**
 366      * Determines how many chars this char32 requires.
 367      * If a validity check is required, use <code>
 368      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
 369      * char32 before calling.
 370      * @param char32 the input codepoint.
 371      * @return 2 if is in supplementary space, otherwise 1.
 372      * @stable ICU 2.1
 373      */
 374     public static int getCharCount(int char32)
 375     {
 376         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 377             return 1;
 378         }
 379         return 2;
 380     }
 381 
 382     /**
 383      * Determines whether the code value is a surrogate.
 384      * @param char16 the input character.
 385      * @return true if the input character is a surrogate.
 386      * @stable ICU 2.1
 387      */
 388     public static boolean isSurrogate(char char16)
 389     {
 390         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;

 391     }
 392 
 393     /**
 394      * Determines whether the character is a trail surrogate.
 395      * @param char16 the input character.
 396      * @return true if the input character is a trail surrogate.
 397      * @stable ICU 2.1
 398      */
 399     public static boolean isTrailSurrogate(char char16)
 400     {
 401         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;

 402     }
 403 
 404     /**
 405      * Determines whether the character is a lead surrogate.
 406      * @param char16 the input character.
 407      * @return true if the input character is a lead surrogate
 408      * @stable ICU 2.1
 409      */
 410     public static boolean isLeadSurrogate(char char16)
 411     {
 412         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;

 413     }
 414 
 415     /**
 416      * Returns the lead surrogate.
 417      * If a validity check is required, use
 418      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 419      * on char32 before calling.
 420      * @param char32 the input character.
 421      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 422      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 423      * @stable ICU 2.1
 424      */
 425     public static char getLeadSurrogate(int char32)
 426     {
 427         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 428             return (char)(LEAD_SURROGATE_OFFSET_ +
 429                           (char32 >> LEAD_SURROGATE_SHIFT_));
 430         }
 431 
 432         return 0;
 433     }
 434 
 435     /**
 436      * Returns the trail surrogate.
 437      * If a validity check is required, use
 438      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 439      * on char32 before calling.
 440      * @param char32 the input character.
 441      * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
 442      *         the character itself
 443      * @stable ICU 2.1
 444      */
 445     public static char getTrailSurrogate(int char32)
 446     {
 447         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 448             return (char)(TRAIL_SURROGATE_MIN_VALUE +
 449                           (char32 & TRAIL_SURROGATE_MASK_));
 450         }
 451 
 452         return (char) char32;
 453     }
 454 
 455     /**
 456      * Convenience method corresponding to String.valueOf(char). Returns a one
 457      * or two char string containing the UTF-32 value in UTF16 format. If a
 458      * validity check is required, use
 459      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 460      * on char32 before calling.
 461      * @param char32 the input character.
 462      * @return string value of char32 in UTF16 format
 463      * @exception IllegalArgumentException thrown if char32 is an invalid
 464      *            codepoint.
 465      * @stable ICU 2.1
 466      */
 467     public static String valueOf(int char32)
 468     {
 469         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 470             throw new IllegalArgumentException("Illegal codepoint");
 471         }
 472         return toString(char32);


 481      * @param char32 value to append.
 482      * @return the updated StringBuffer
 483      * @exception IllegalArgumentException thrown when char32 does not lie
 484      *            within the range of the Unicode codepoints
 485      * @stable ICU 2.1
 486      */
 487     public static StringBuffer append(StringBuffer target, int char32)
 488     {
 489         // Check for irregular values
 490         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 491             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
 492         }
 493 
 494         // Write the UTF-16 values
 495         if (char32 >= SUPPLEMENTARY_MIN_VALUE)
 496             {
 497             target.append(getLeadSurrogate(char32));
 498             target.append(getTrailSurrogate(char32));
 499         }
 500         else {
 501             target.append((char) char32);
 502         }
 503         return target;
 504     }
 505 

 506     /**
 507      * Shifts offset16 by the argument number of codepoints within a subarray.
 508      * @param source char array
 509      * @param start position of the subarray to be performed on
 510      * @param limit position of the subarray to be performed on
 511      * @param offset16 UTF16 position to shift relative to start
 512      * @param shift32 number of codepoints to shift
 513      * @return new shifted offset16 relative to start
 514      * @exception IndexOutOfBoundsException if the new offset16 is out of
 515      *            bounds with respect to the subarray or the subarray bounds
 516      *            are out of range.
 517      * @stable ICU 2.1
 518      */
 519     public static int moveCodePointOffset(char source[], int start, int limit,
 520                                           int offset16, int shift32)
 521     {
 522         int size = source.length;
 523         int count;
 524         char ch;
 525         int result = offset16 + start;
 526         if (start < 0 || limit < start) {
 527             throw new StringIndexOutOfBoundsException(start);
 528         }
 529         if (limit > size) {
 530             throw new StringIndexOutOfBoundsException(limit);
 531         }
 532         if (offset16 < 0 || result > limit) {
 533             throw new StringIndexOutOfBoundsException(offset16);
 534         }
 535         if (shift32 > 0) {
 536             if (shift32 + result > size) {
 537                 throw new StringIndexOutOfBoundsException(result);
 538             }
 539             count = shift32;
 540             while (result < limit && count > 0)
 541             {
 542                 ch = source[result];
 543                 if (isLeadSurrogate(ch) && (result + 1 < limit) &&
 544                     isTrailSurrogate(source[result + 1])) {
 545                     result++;
 546                 }
 547                 count--;
 548                 result++;
 549             }
 550         } else {
 551             if (result + shift32 < start) {
 552                 throw new StringIndexOutOfBoundsException(result);
 553             }
 554             for (count = -shift32; count > 0; count--) {
 555                 result--;
 556                 if (result < start) {
 557                     break;
 558                 }
 559                 ch = source[result];
 560                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
 561                     result--;
 562                 }
 563             }
 564         }
 565         if (count != 0) {
 566             throw new StringIndexOutOfBoundsException(shift32);
 567         }
 568         result -= start;
 569         return result;
 570     }
 571 
 572     // private data members -------------------------------------------------
 573 
 574     /**
 575      * Shift value for lead surrogate to form a supplementary character.
 576      */
 577     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 578 
 579     /**
 580      * Mask to retrieve the significant value from a trail surrogate.


 588         LEAD_SURROGATE_MIN_VALUE -
 589         (SUPPLEMENTARY_MIN_VALUE
 590         >> LEAD_SURROGATE_SHIFT_);
 591 
 592     // private methods ------------------------------------------------------
 593 
 594     /**
 595      * <p>Converts argument code point and returns a String object representing
 596      * the code point's value in UTF16 format.
 597      * <p>This method does not check for the validity of the codepoint, the
 598      * results are not guaranteed if a invalid codepoint is passed as
 599      * argument.
 600      * <p>The result is a string whose length is 1 for non-supplementary code
 601      * points, 2 otherwise.
 602      * @param ch code point
 603      * @return string representation of the code point
 604      */
 605     private static String toString(int ch)
 606     {
 607         if (ch < SUPPLEMENTARY_MIN_VALUE) {
 608             return String.valueOf((char) ch);
 609         }
 610 
 611         StringBuilder result = new StringBuilder();
 612         result.append(getLeadSurrogate(ch));
 613         result.append(getTrailSurrogate(ch));
 614         return result.toString();
 615     }
 616 }
< prev index next >