< prev index next >

src/java.base/share/classes/sun/text/normalizer/UTF16.java

Print this page




  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * <p>Standalone utility class providing UTF16 character conversions and
  41  * indexing conversions.</p>
  42  * <p>Code that uses strings alone rarely needs modification.
  43  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  44  * operation. Similarly, concatenation is always safe. Substringing is safe if
  45  * the start and end are both on UTF-32 boundaries. In normal code, the values
  46  * for start and end are on those boundaries, since they arose from operations
  47  * like searching. If not, the nearest UTF-32 boundaries can be determined
  48  * using <code>bounds()</code>.</p>
  49  * <strong>Examples:</strong>
  50  * <p>The following examples illustrate use of some of these methods.
  51  * <pre>
  52  * // iteration forwards: Original
  53  * for (int i = 0; i &lt; s.length(); ++i) {
  54  *     char ch = s.charAt(i);
  55  *     doSomethingWith(ch);
  56  * }
  57  *
  58  * // iteration forwards: Changes for UTF-32
  59  * int ch;
  60  * for (int i = 0; i &lt; s.length(); i+=UTF16.getCharCount(ch)) {
  61  *     ch = UTF16.charAt(s,i);
  62  *     doSomethingWith(ch);
  63  * }
  64  *
  65  * // iteration backwards: Original
  66  * for (int i = s.length() -1; i >= 0; --i) {
  67  *     char ch = s.charAt(i);
  68  *     doSomethingWith(ch);
  69  * }
  70  *
  71  * // iteration backwards: Changes for UTF-32
  72  * int ch;
  73  * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
  74  *     ch = UTF16.charAt(s,i);
  75  *     doSomethingWith(ch);
  76  * }
  77  * </pre>
  78  * <strong>Notes:</strong>
  79  * <ul>
  80  *   <li>
  81  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  82  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  83  *   sense of their ordering in a string. <code>offset16</code> and
  84  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  85  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  86  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  87  *   which is a UTF-16 code unit.
  88  *   </li>
  89  *   <li>
  90  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  91  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  92  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  93  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  94  *   </li>
  95  *   <li>
  96  *    <strong>Exceptions:</strong> The error checking will throw an exception
  97  *   if indices are out of bounds. Other than that, all methods will


 498      */
 499     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 500 
 501     /**
 502      * Mask (low 10 bits) for the significant value of a trail surrogate.
 503      */
 504     private static final int TRAIL_SURROGATE_MASK_     = 0x3FF;
 505 
 506     /**
 507      * Lead surrogate offset: lead minimum less shifted supplementary minimum.
 508      */
 509     private static final int LEAD_SURROGATE_OFFSET_ =
 510         LEAD_SURROGATE_MIN_VALUE -
 511         (SUPPLEMENTARY_MIN_VALUE
 512          >> LEAD_SURROGATE_SHIFT_);
 513 
 514     // private methods ------------------------------------------------------
 515 
 516     /**
 517      * <p>Returns the UTF16 form of the argument code point as a String.</p>
 518      * <p>Values below {@code SUPPLEMENTARY_MIN_VALUE} are narrowed to a single
 519      * {@code char}; any other value is written as its lead surrogate followed
 520      * by its trail surrogate.</p>
 521      * <p>The code point is not validated, so the result for an invalid code
 522      * point is not guaranteed.</p>
 523      * <p>The result has length 1 for non-supplementary code points, else 2.</p>
 524      * @param ch code point
 525      * @return string representation of the code point
 526      */
 527     private static String toString(int ch)
 528     {
 529         if (ch >= SUPPLEMENTARY_MIN_VALUE) {
 530             // supplementary code point: lead unit first, then trail unit
 531             return new StringBuilder()
 532                 .append(getLeadSurrogate(ch))
 533                 .append(getTrailSurrogate(ch))
 534                 .toString();
 535         }
 536         return String.valueOf((char)ch);
 537     }
 538 }


  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * <p>Standalone utility class providing UTF16 character conversions and
  41  * indexing conversions.
  42  * <p>Code that uses strings alone rarely needs modification.
  43  * By design, UTF-16 does not allow overlap, so searching for strings is a safe
  44  * operation. Similarly, concatenation is always safe. Substringing is safe if
  45  * the start and end are both on UTF-32 boundaries. In normal code, the values
  46  * for start and end are on those boundaries, since they arose from operations
  47  * like searching. If not, the nearest UTF-32 boundaries can be determined
  48  * using <code>bounds()</code>.
  49  * <strong>Examples:</strong>
  50  * <p>The following examples illustrate use of some of these methods.
  51  * <pre>{@code
  52  * // iteration forwards: Original
  53  * for (int i = 0; i < s.length(); ++i) {
  54  *     char ch = s.charAt(i);
  55  *     doSomethingWith(ch);
  56  * }
  57  *
  58  * // iteration forwards: Changes for UTF-32
  59  * int ch;
  60  * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
  61  *     ch = UTF16.charAt(s,i);
  62  *     doSomethingWith(ch);
  63  * }
  64  *
  65  * // iteration backwards: Original
  66  * for (int i = s.length() -1; i >= 0; --i) {
  67  *     char ch = s.charAt(i);
  68  *     doSomethingWith(ch);
  69  * }
  70  *
  71  * // iteration backwards: Changes for UTF-32
  72  * int ch;
  73  * for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
  74  *     ch = UTF16.charAt(s,i);
  75  *     doSomethingWith(ch);
  76  * }
  77  * }</pre>
  78  * <strong>Notes:</strong>
  79  * <ul>
  80  *   <li>
  81  *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
  82  *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
  83  *   sense of their ordering in a string. <code>offset16</code> and
  84  *   <code>offset32</code> are used to distinguish offsets to UTF-16
  85  *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
  86  *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
  87  *   which is a UTF-16 code unit.
  88  *   </li>
  89  *   <li>
  90  *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
  91  *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
  92  *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
  93  *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  94  *   </li>
  95  *   <li>
  96  *    <strong>Exceptions:</strong> The error checking will throw an exception
  97  *   if indices are out of bounds. Other than that, all methods will


 498      */
 499     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 500 
 501     /**
 502      * Mask (low 10 bits) for the significant value of a trail surrogate.
 503      */
 504     private static final int TRAIL_SURROGATE_MASK_     = 0x3FF;
 505 
 506     /**
 507      * Lead surrogate offset: lead minimum less shifted supplementary minimum.
 508      */
 509     private static final int LEAD_SURROGATE_OFFSET_ =
 510         LEAD_SURROGATE_MIN_VALUE -
 511         (SUPPLEMENTARY_MIN_VALUE
 512          >> LEAD_SURROGATE_SHIFT_);
 513 
 514     // private methods ------------------------------------------------------
 515 
 516     /**
 517      * <p>Returns the UTF16 form of the argument code point as a String.
 518      * <p>Values below {@code SUPPLEMENTARY_MIN_VALUE} are narrowed to a single
 519      * {@code char}; any other value is written as its lead surrogate followed
 520      * by its trail surrogate.
 521      * <p>The code point is not validated, so the result for an invalid code
 522      * point is not guaranteed.
 523      * <p>The result has length 1 for non-supplementary code points, else 2.
 524      * @param ch code point
 525      * @return string representation of the code point
 526      */
 527     private static String toString(int ch)
 528     {
 529         if (ch >= SUPPLEMENTARY_MIN_VALUE) {
 530             // supplementary code point: lead unit first, then trail unit
 531             return new StringBuilder()
 532                 .append(getLeadSurrogate(ch))
 533                 .append(getTrailSurrogate(ch))
 534                 .toString();
 535         }
 536         return String.valueOf((char)ch);
 537     }
 538 }
< prev index next >