< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java

Print this page




  38 
  39 import java.text.CharacterIterator;
  40 import java.text.Normalizer;
  41 
  42 /**
  43  * Unicode Normalization
  44  *
  45  * <h2>Unicode normalization API</h2>
  46  *
  47  * <code>normalize</code> transforms Unicode text into an equivalent composed or
  48  * decomposed form, allowing for easier sorting and searching of text.
  49  * <code>normalize</code> supports the standard normalization forms described in
  50  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  51  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  52  *
  53  * Characters with accents or other adornments can be encoded in
  54  * several different ways in Unicode.  For example, take the character A-acute.
  55  * In Unicode, this can be encoded as a single character (the
  56  * "composed" form):
  57  *
  58  * <p>
  59  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  60  * </p>
  61  *
  62  * or as two separate characters (the "decomposed" form):
  63  *
  64  * <p>
  65  *      0041    LATIN CAPITAL LETTER A
  66  *      0301    COMBINING ACUTE ACCENT
  67  * </p>
  68  *
  69  * To a user of your program, however, both of these sequences should be
  70  * treated as the same "user-level" character "A with acute accent".  When you
  71  * are searching or comparing text, you must ensure that these two sequences are
  72  * treated equivalently.  In addition, you must handle characters with more than
  73  * one accent.  Sometimes the order of a character's combining accents is
  74  * significant, while in other cases accent sequences in different orders are
  75  * really equivalent.
  76  *
  77  * Similarly, the string "ffi" can be encoded as three separate letters:
  78  *
  79  * <p>
  80  *      0066    LATIN SMALL LETTER F
  81  *      0066    LATIN SMALL LETTER F
  82  *      0069    LATIN SMALL LETTER I
  83  * </p>
  84  *
  85  * or as the single character
  86  *
  87  * <p>
  88  *      FB03    LATIN SMALL LIGATURE FFI
  89  * </p>
  90  *
  91  * The ffi ligature is not a distinct semantic character, and strictly speaking
  92  * it shouldn't be in Unicode at all, but it was included for compatibility
  93  * with existing character sets that already provided it.  The Unicode standard
  94  * identifies such characters by giving them "compatibility" decompositions
  95  * into the corresponding semantic characters.  When sorting and searching, you
  96  * will often want to use these mappings.
  97  *
  98  * <code>normalize</code> helps solve these problems by transforming text into
  99  * the canonical composed and decomposed forms as shown in the first example
 100  * above. In addition, you can have it perform compatibility decompositions so
 101  * that you can treat compatibility characters the same as their equivalents.
 102  * Finally, <code>normalize</code> rearranges accents into the proper canonical
 103  * order, so that you do not have to worry about accent rearrangement on your
 104  * own.
 105  *
 106  * Form FCD, "Fast C or D", is also designed for collation.
 107  * It allows working on strings that are not necessarily normalized
 108  * with an algorithm (like in collation) that works under "canonical closure",
 109  * i.e., it treats precomposed characters and their decomposed equivalents the


 538     public static final QuickCheckResult NO = new QuickCheckResult(0);
 539 
 540     /**
 541      * Indicates that the string is in the normalized format.
 542      * @stable ICU 2.8
 543      */
 544     public static final QuickCheckResult YES = new QuickCheckResult(1);
 545 
 546     /**
 547      * Indicates that it cannot be determined whether the string is in the
 548      * normalized format without further thorough checks.
 549      * @stable ICU 2.8
 550      */
 551     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
 552 
 553     //-------------------------------------------------------------------------
 554     // Constructors
 555     //-------------------------------------------------------------------------
 556 
 557     /**
 558      * Creates a new {@code Normalizer} object for iterating over the
 559      * normalized form of a given string.
 560      * <p>
 561      * The {@code opt} parameter specifies which optional
 562      * {@code Normalizer} features are to be enabled for this object.
 563      *
 564      * @param str  The string to be normalized.  The normalization
 565      *              will start at the beginning of the string.
 566      *
 567      * @param mode The normalization mode.
 568      *
 569      * @param opt Any optional features to be enabled.
 570      *            Currently the only available option is {@link #UNICODE_3_2}.
 571      *            If you want the default behavior corresponding to one of the
 572      *            standard Unicode Normalization Forms, use 0 for this argument.
 573      * @stable ICU 2.6
 574      */
 575     public NormalizerBase(String str, Mode mode, int opt) {
 576         this.text = UCharacterIterator.getInstance(str);
 577         this.mode = mode;
 578         this.options=opt;
 579     }
 580 
 581     /**
 582      * Creates a new {@code Normalizer} object for iterating over the
 583      * normalized form of the given text, using the {@code UNICODE_LATEST} options.
 584      *
 585      * @param iter  The input text to be normalized.  The normalization
 586      *              will start at the beginning of the text.
 587      *
 588      * @param mode  The normalization mode.
 589      */
 590     public NormalizerBase(CharacterIterator iter, Mode mode) {
 591           this(iter, mode, UNICODE_LATEST);
 592     }
 593 
 594     /**
 595      * Creates a new {@code Normalizer} object for iterating over the
 596      * normalized form of the given text.  A clone of {@code iter} is used.
 597      *
 598      * @param iter  The input text to be normalized.  The normalization
 599      *              will start at the beginning of the text.
 600      *
 601      * @param mode  The normalization mode.
 602      *
 603      * @param opt Any optional features to be enabled.
 604      *            Currently the only available option is {@link #UNICODE_3_2}.
 605      *            If you want the default behavior corresponding to one of the
 606      *            standard Unicode Normalization Forms, use 0 for this argument.
 607      * @stable ICU 2.6
 608      */
 609     public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
 610         this.text = UCharacterIterator.getInstance(
 611                                                    (CharacterIterator)iter.clone()
 612                                                    );
 613         this.mode = mode;
 614         this.options = opt;
 615     }
 616 
 617     /**
 618      * Clones this {@code Normalizer} object.  All properties of this
 619      * object are duplicated in the new object, including the cloning of any
 620      * {@link CharacterIterator} that was passed in to the constructor
 621      * or to {@link #setText(CharacterIterator) setText}.
 622      * However, the text storage underlying the {@code CharacterIterator}
 623      * is not duplicated unless the iterator's {@code clone} method does so.
 624      * @return a copy of this object with its own text iterator and buffer
 625      * @stable ICU 2.8
 626      */
 627     public Object clone() {
 628         try {
 629             NormalizerBase copy = (NormalizerBase) super.clone();
 630             copy.text = (UCharacterIterator) text.clone();
 631             // copy the internal buffer so the two objects do not share one array
 632             if (buffer != null) {
 633                 copy.buffer = new char[buffer.length];
 634                 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
 635             }
 636             return copy;
 637         }
 638         catch (CloneNotSupportedException e) {
 639             throw new InternalError(e.toString(), e);
 640         }
 641     }
 642 
 643     //--------------------------------------------------------------------------
 644     // Static Utility methods


 774      *                   less than the required length
 775      * @stable ICU 2.6
 776      */
 777     public static int normalize(char[] src,int srcStart, int srcLimit,
 778                                 char[] dest,int destStart, int destLimit,
 779                                 Mode  mode, int options) {
 780         int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); // delegate to the chosen mode
 781 
 782         if(length<=(destLimit-destStart)) { // reported length fits in dest[destStart, destLimit)
 783             return length;
 784         } else {
 785             throw new IndexOutOfBoundsException(Integer.toString(length)); // message is the reported length
 786         }
 787     }
 788 
 789     //-------------------------------------------------------------------------
 790     // Iteration API
 791     //-------------------------------------------------------------------------
 792 
 793     /**
 794      * Return the current character in the normalized text.
 795      * @return The code point as an int, or {@link #DONE} if there is none
 796      * @stable ICU 2.8
 797      */
 798     public int current() {
 799         if(bufferPos<bufferLimit || nextNormalize()) { // nextNormalize() runs only when bufferPos has reached bufferLimit
 800             return getCodePointAt(bufferPos);
 801         } else {
 802             return DONE;
 803         }
 804     }
 805 
 806     /**
 807      * Return the next character in the normalized text and advance
 808      * the iteration position by one.  If the end
 809      * of the text has already been reached, {@link #DONE} is returned.
 810      * @return The codepoint as an int
 811      * @stable ICU 2.8
 812      */
 813     public int next() {
 814         if(bufferPos<bufferLimit ||  nextNormalize()) {


 855      * After setIndexOnly(), getIndex() will return the same index that is
 856      * specified here.
 857      *
 858      * @param index the desired index in the input text.
 859      * @stable ICU 2.8
 860      */
 861     public void setIndexOnly(int index) {
 862         text.setIndex(index); // NOTE(review): index validation presumably happens here - confirm in UCharacterIterator
 863         currentIndex=nextIndex=index; // plain assignment; performs no validation itself
 864         clearBuffer();
 865     }
 866 
 867     /**
 868      * Set the iteration position in the input text that is being normalized
 869      * and return the first normalized character at that position.
 870      * <p>
 871      * <b>Note:</b> This method sets the position in the <em>input</em> text,
 872      * while {@link #next} and {@link #previous} iterate through characters
 873      * in the normalized <em>output</em>.  This means that there is not
 874      * necessarily a one-to-one correspondence between characters returned
 875      * by {@code next} and {@code previous} and the indices passed to and
 876      * returned from {@code setIndex} and {@link #getIndex}.
 877      *
 878      * @param index the desired index in the input text.
 879      *
 880      * @return   the first normalized character (a code point, as an int) that
 881      *            is the result of iterating forward starting at the given index.
 882      *
 883      * @throws IllegalArgumentException if the given index is less than
 884      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 885      *
 886      * @deprecated ICU 3.2
 887      * @obsolete ICU 3.2
 888      */
 889      @Deprecated
 890      public int setIndex(int index) {
 891          setIndexOnly(index);
 892          return current();
 893      }
 894 
 895     /**
 896      * Retrieve the index of the start of the input text. This is the begin
 897      * index of the {@code CharacterIterator} or the start (i.e. 0) of the
 898      * {@code String} over which this {@code Normalizer} is iterating.
 899      * @deprecated ICU 2.2. Use startIndex() instead.
 900      * @return The start index of the input text; this implementation always returns 0
 901      * @see #startIndex
 902      */
 903     @Deprecated
 904     public int getBeginIndex() {
 905         return 0;
 906     }
 907 
 908     /**
 909      * Retrieve the index of the end of the input text.  This is the end index
 910      * of the {@code CharacterIterator} or the length of the {@code String}
 911      * over which this {@code Normalizer} is iterating.
 912      * @deprecated ICU 2.2. Use endIndex() instead.
 913      * @return The end index of the input text, as returned by {@link #endIndex()}
 914      * @see #endIndex
 915      */
 916     @Deprecated
 917     public int getEndIndex() {
 918         return endIndex();
 919     }
 920 
 921     /**
 922      * Retrieve the current iteration position in the input text that is
 923      * being normalized.  This method is useful in applications such as
 924      * searching, where you need to be able to determine the position in
 925      * the input text that corresponds to a given normalized output character.
 926      * <p>
 927      * <b>Note:</b> This method returns a position in the <em>input</em>, while
 928      * {@link #next} and {@link #previous} iterate through characters in the
 929      * <em>output</em>.  This means that there is not necessarily a one-to-one
 930      * correspondence between characters returned by {@code next} and
 931      * {@code previous} and the indices passed to and returned from
 932      * {@code setIndex} and {@link #getIndex}.
 933      * @return {@code currentIndex} while {@code bufferPos &lt; bufferLimit}, otherwise {@code nextIndex}
 934      * @stable ICU 2.8
 935      */
 936     public int getIndex() {
 937         if(bufferPos<bufferLimit) {
 938             return currentIndex;
 939         } else {
 940             return nextIndex;
 941         }
 942     }
 943 
 944     /**
 945      * Retrieve the index of the end of the input text.  This is the end index
 946      * of the {@code CharacterIterator} or the length of the {@code String}
 947      * over which this {@code Normalizer} is iterating.
 948      * @return The end index of the input text (the length reported by the text iterator)
 949      * @stable ICU 2.8
 950      */
 951     public int endIndex() {
 952         return text.getLength();
 953     }
 954 
 955     //-------------------------------------------------------------------------
 956     // Property access methods
 957     //-------------------------------------------------------------------------
 958     /**
 959      * Set the normalization mode for this object.
 960      * <p>
 961      * <b>Note:</b> If the normalization mode is changed while iterating
 962      * over a string, calls to {@link #next} and {@link #previous} may
 963      * return previously buffered characters in the old normalization mode
 964      * until the iteration is able to re-sync at the next base character.
 965      * It is safest to call {@link #setText setText()}, {@link #first},
 966      * {@link #last}, etc. after calling {@code setMode}.
 967      *
 968      * @param newMode the new mode for this {@code Normalizer}.
 969      * The supported modes are:
 970      * <ul>
 971      *  <li>{@link #COMPOSE}        - Unicode canonical decomposition
 972      *                                  followed by canonical composition.
 973      *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
 974      *                                  followed by canonical composition.
 975      *  <li>{@link #DECOMP}         - Unicode canonical decomposition
 976      *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
 977      *  <li>{@link #NO_OP}          - Do nothing but return characters
 978      *                                  from the underlying input text.
 979      * </ul>
 980      *
 981      * @see #getMode
 982      * @stable ICU 2.8
 983      */
 984     public void setMode(Mode newMode) {
 985         mode = newMode;
 986     }
 987     /**
 988      * Return the basic operation performed by this {@code Normalizer}.
 989      * @return the normalization mode currently set on this object
 990      * @see #setMode
 991      * @stable ICU 2.8
 992      */
 993     public Mode getMode() {
 994         return mode;
 995     }
 996 
 997     /**
 998      * Set the input text over which this {@code Normalizer} will iterate.
 999      * The iteration position is set to the beginning of the input text.
1000      * @param newText   The new string to be normalized.
1001      * @stable ICU 2.8
1002      */
1003     public void setText(String newText) {
1004 
1005         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1006         if (newIter == null) {
1007             throw new InternalError("Could not create a new UCharacterIterator");
1008         }
1009         text = newIter;
1010         reset(); // NOTE(review): presumably rewinds the iteration state - reset() is defined elsewhere, confirm
1011     }
1012 
1013     /**
1014      * Set the input text over which this {@code Normalizer} will iterate.
1015      * The iteration position is set to the beginning of the input text.
1016      * @param newText   The iterator over the new text to be normalized.
1017      * @stable ICU 2.8
1018      */
1019     public void setText(CharacterIterator newText) {
1020 
1021         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1022         if (newIter == null) {
1023             throw new InternalError("Could not create a new UCharacterIterator");
1024         }
1025         text = newIter;
1026         currentIndex=nextIndex=0; // restart at input index 0
1027         clearBuffer();
1028     }
1029 
1030     //-------------------------------------------------------------------------
1031     // Private utility methods
1032     //-------------------------------------------------------------------------
1033 
1034 
1035     /* backward iteration --------------------------------------------------- */


1554     public static final int UNICODE_3_2_0_ORIGINAL =
1555                                UNICODE_3_2 |
1556                                NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
1557                                NormalizerImpl.BEFORE_PRI_29;
1558 
1559     /**
1560      * Default option for the latest Unicode normalization. This option is
1561      * provided mainly for testing.
1562      * The value zero means that normalization is done with the fixes for
1563      * <ul><li>Corrigendum 4 (Five CJK Canonical Mapping Errors)</li>
1564      * <li>Corrigendum 5 (Normalization Idempotency)</li></ul>
1565      */
1566     public static final int UNICODE_LATEST = 0x00;
1567 
1568     //
1569     // public constructor and methods for java.text.Normalizer and
1570     // sun.text.Normalizer
1571     //
1572 
1573     /**
1574      * Creates a new {@code Normalizer} object for iterating over the
1575      * normalized form of a given string, using the {@code UNICODE_LATEST} options.
1576      *
1577      * @param str  The string to be normalized.  The normalization
1578      *              will start at the beginning of the string.
1579      *
1580      * @param mode The normalization mode.
1581      */
1582     public NormalizerBase(String str, Mode mode) {
1583           this(str, mode, UNICODE_LATEST);
1584     }
1585 
1586     /**
1587      * Normalizes a <code>String</code> using the given normalization form
1588      * and the {@code UNICODE_LATEST} options, and returns the result.
1589      * @param str      the input string to be normalized.
1590      * @param form     the normalization form
1591      */
1592     public static String normalize(String str, Normalizer.Form form) {
1593         return normalize(str, form, UNICODE_LATEST);
1594     }


1629             return asciiOnly ? str : NFKC.normalize(str, options);
1630         case NFKD :
1631             return asciiOnly ? str : NFKD.normalize(str, options);
1632         }
1633 
1634         throw new IllegalArgumentException("Unexpected normalization form: " +
1635                                            form);
1636     }
1637 
1638     /**
1639      * Test if a string is in a given normalization form.
1640      * This is semantically equivalent to str.equals(normalize(str, form)).
1641      *
1642      * Unlike quickCheck(), this function returns a definitive result,
1643      * never a "maybe".
1644      * For NFD, NFKD, and FCD, both functions work exactly the same.
1645      * For NFC and NFKC where quickCheck may return "maybe", this function will
1646      * perform further tests to arrive at a true/false result.
1647      * @param str       the input string to be checked to see if it is normalized
1648      * @param form      the normalization form
1649      * @return {@code true} if {@code str} is in the given form (checked with the {@code UNICODE_LATEST} options)
1650      */
1651     public static boolean isNormalized(String str, Normalizer.Form form) {
1652         return isNormalized(str, form, UNICODE_LATEST);
1653     }
1654 
1655     /**
1656      * Test if a string is in a given normalization form.
1657      * This is semantically equivalent to source.equals(normalize(source, mode)).
1658      *
1659      * Unlike quickCheck(), this function returns a definitive result,
1660      * never a "maybe".
1661      * For NFD, NFKD, and FCD, both functions work exactly the same.
1662      * For NFC and NFKC where quickCheck may return "maybe", this function will
1663      * perform further tests to arrive at a true/false result.
1664      * @param str       the input string to be checked to see if it is normalized
1665      * @param form      the normalization form
1666      * @param options   the optional features to be enabled.
1667      */
1668     public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1669         switch (form) {


  38 
  39 import java.text.CharacterIterator;
  40 import java.text.Normalizer;
  41 
  42 /**
  43  * Unicode Normalization
  44  *
  45  * <h2>Unicode normalization API</h2>
  46  *
  47  * <code>normalize</code> transforms Unicode text into an equivalent composed or
  48  * decomposed form, allowing for easier sorting and searching of text.
  49  * <code>normalize</code> supports the standard normalization forms described in
  50  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  51  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  52  *
  53  * Characters with accents or other adornments can be encoded in
  54  * several different ways in Unicode.  For example, take the character A-acute.
  55  * In Unicode, this can be encoded as a single character (the
  56  * "composed" form):
  57  *
  58  * <pre>
  59  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  60  * </pre>
  61  *
  62  * or as two separate characters (the "decomposed" form):
  63  *
  64  * <pre>
  65  *      0041    LATIN CAPITAL LETTER A
  66  *      0301    COMBINING ACUTE ACCENT
  67  * </pre>
  68  *
  69  * To a user of your program, however, both of these sequences should be
  70  * treated as the same "user-level" character "A with acute accent".  When you
  71  * are searching or comparing text, you must ensure that these two sequences are
  72  * treated equivalently.  In addition, you must handle characters with more than
  73  * one accent.  Sometimes the order of a character's combining accents is
  74  * significant, while in other cases accent sequences in different orders are
  75  * really equivalent.
  76  *
  77  * Similarly, the string "ffi" can be encoded as three separate letters:
  78  *
  79  * <pre>
  80  *      0066    LATIN SMALL LETTER F
  81  *      0066    LATIN SMALL LETTER F
  82  *      0069    LATIN SMALL LETTER I
  83  * </pre>
  84  *
  85  * or as the single character
  86  *
  87  * <pre>
  88  *      FB03    LATIN SMALL LIGATURE FFI
  89  * </pre>
  90  *
  91  * The ffi ligature is not a distinct semantic character, and strictly speaking
  92  * it shouldn't be in Unicode at all, but it was included for compatibility
  93  * with existing character sets that already provided it.  The Unicode standard
  94  * identifies such characters by giving them "compatibility" decompositions
  95  * into the corresponding semantic characters.  When sorting and searching, you
  96  * will often want to use these mappings.
  97  *
  98  * <code>normalize</code> helps solve these problems by transforming text into
  99  * the canonical composed and decomposed forms as shown in the first example
 100  * above. In addition, you can have it perform compatibility decompositions so
 101  * that you can treat compatibility characters the same as their equivalents.
 102  * Finally, <code>normalize</code> rearranges accents into the proper canonical
 103  * order, so that you do not have to worry about accent rearrangement on your
 104  * own.
 105  *
 106  * Form FCD, "Fast C or D", is also designed for collation.
 107  * It allows working on strings that are not necessarily normalized
 108  * with an algorithm (like in collation) that works under "canonical closure",
 109  * i.e., it treats precomposed characters and their decomposed equivalents the


 538     public static final QuickCheckResult NO = new QuickCheckResult(0);
 539 
 540     /**
 541      * Indicates that the string is in the normalized format.
 542      * @stable ICU 2.8
 543      */
 544     public static final QuickCheckResult YES = new QuickCheckResult(1);
 545 
 546     /**
 547      * Indicates that it cannot be determined whether the string is in the
 548      * normalized format without further thorough checks.
 549      * @stable ICU 2.8
 550      */
 551     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
 552 
 553     //-------------------------------------------------------------------------
 554     // Constructors
 555     //-------------------------------------------------------------------------
 556 
 557     /**
 558      * Creates a new {@code Normalizer} object for iterating over the
 559      * normalized form of a given string.
 560      * <p>
 561      * The {@code opt} parameter specifies which optional
 562      * {@code Normalizer} features are to be enabled for this object.
 563      *
 564      * @param str  The string to be normalized.  The normalization
 565      *              will start at the beginning of the string.
 566      *
 567      * @param mode The normalization mode.
 568      *
 569      * @param opt Any optional features to be enabled.
 570      *            Currently the only available option is {@link #UNICODE_3_2}.
 571      *            If you want the default behavior corresponding to one of the
 572      *            standard Unicode Normalization Forms, use 0 for this argument.
 573      * @stable ICU 2.6
 574      */
 575     public NormalizerBase(String str, Mode mode, int opt) {
 576         this.text = UCharacterIterator.getInstance(str);
 577         this.mode = mode;
 578         this.options=opt;
 579     }
 580 
 581     /**
 582      * Creates a new {@code Normalizer} object for iterating over the
 583      * normalized form of the given text, using the {@code UNICODE_LATEST} options.
 584      *
 585      * @param iter  The input text to be normalized.  The normalization
 586      *              will start at the beginning of the text.
 587      *
 588      * @param mode  The normalization mode.
 589      */
 590     public NormalizerBase(CharacterIterator iter, Mode mode) {
 591           this(iter, mode, UNICODE_LATEST);
 592     }
 593 
 594     /**
 595      * Creates a new {@code Normalizer} object for iterating over the
 596      * normalized form of the given text.  A clone of {@code iter} is used.
 597      *
 598      * @param iter  The input text to be normalized.  The normalization
 599      *              will start at the beginning of the text.
 600      *
 601      * @param mode  The normalization mode.
 602      *
 603      * @param opt Any optional features to be enabled.
 604      *            Currently the only available option is {@link #UNICODE_3_2}.
 605      *            If you want the default behavior corresponding to one of the
 606      *            standard Unicode Normalization Forms, use 0 for this argument.
 607      * @stable ICU 2.6
 608      */
 609     public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
 610         this.text = UCharacterIterator.getInstance(
 611                                                    (CharacterIterator)iter.clone()
 612                                                    );
 613         this.mode = mode;
 614         this.options = opt;
 615     }
 616 
 617     /**
 618      * Clones this {@code Normalizer} object.  All properties of this
 619      * object are duplicated in the new object, including the cloning of any
 620      * {@link CharacterIterator} that was passed in to the constructor
 621      * or to {@link #setText(CharacterIterator) setText}.
 622      * However, the text storage underlying the {@code CharacterIterator}
 623      * is not duplicated unless the iterator's {@code clone} method does so.
 624      * @return a copy of this object with its own text iterator and buffer
 625      * @stable ICU 2.8
 626      */
 627     public Object clone() {
 628         try {
 629             NormalizerBase copy = (NormalizerBase) super.clone();
 630             copy.text = (UCharacterIterator) text.clone();
 631             // copy the internal buffer so the two objects do not share one array
 632             if (buffer != null) {
 633                 copy.buffer = new char[buffer.length];
 634                 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
 635             }
 636             return copy;
 637         }
 638         catch (CloneNotSupportedException e) {
 639             throw new InternalError(e.toString(), e);
 640         }
 641     }
 642 
 643     //--------------------------------------------------------------------------
 644     // Static Utility methods


 774      *                   less than the required length
 775      * @stable ICU 2.6
 776      */
 777     public static int normalize(char[] src,int srcStart, int srcLimit,
 778                                 char[] dest,int destStart, int destLimit,
 779                                 Mode  mode, int options) {
 780         int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); // delegate to the chosen mode
 781 
 782         if(length<=(destLimit-destStart)) { // reported length fits in dest[destStart, destLimit)
 783             return length;
 784         } else {
 785             throw new IndexOutOfBoundsException(Integer.toString(length)); // message is the reported length
 786         }
 787     }
 788 
 789     //-------------------------------------------------------------------------
 790     // Iteration API
 791     //-------------------------------------------------------------------------
 792 
 793     /**
 794      * Return the current character in the normalized text.
 795      * @return The code point as an int, or {@link #DONE} if there is none
 796      * @stable ICU 2.8
 797      */
 798     public int current() {
 799         if(bufferPos<bufferLimit || nextNormalize()) { // nextNormalize() runs only when bufferPos has reached bufferLimit
 800             return getCodePointAt(bufferPos);
 801         } else {
 802             return DONE;
 803         }
 804     }
 805 
 806     /**
 807      * Return the next character in the normalized text and advance
 808      * the iteration position by one.  If the end
 809      * of the text has already been reached, {@link #DONE} is returned.
 810      * @return The codepoint as an int
 811      * @stable ICU 2.8
 812      */
 813     public int next() {
 814         if(bufferPos<bufferLimit ||  nextNormalize()) {


 855      * After setIndexOnly(), getIndex() will return the same index that is
 856      * specified here.
 857      *
 858      * @param index the desired index in the input text.
 859      * @stable ICU 2.8
 860      */
 861     public void setIndexOnly(int index) {
 862         text.setIndex(index); // NOTE(review): index validation presumably happens here - confirm in UCharacterIterator
 863         currentIndex=nextIndex=index; // plain assignment; performs no validation itself
 864         clearBuffer();
 865     }
 866 
 867     /**
 868      * Set the iteration position in the input text that is being normalized
 869      * and return the first normalized character at that position.
 870      * <p>
 871      * <b>Note:</b> This method sets the position in the <em>input</em> text,
 872      * while {@link #next} and {@link #previous} iterate through characters
 873      * in the normalized <em>output</em>.  This means that there is not
 874      * necessarily a one-to-one correspondence between characters returned
 875      * by {@code next} and {@code previous} and the indices passed to and
 876      * returned from {@code setIndex} and {@link #getIndex}.
 877      *
 878      * @param index the desired index in the input text.
 879      *
 880      * @return   the first normalized character (its code point, as an int)
 881      *            that is the result of iterating forward starting at the
 882      *            given index.
 883      *
 884      * @throws IllegalArgumentException if the given index is less than
 885      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 886      * @deprecated ICU 3.2
 887      * @obsolete ICU 3.2
 888      */
 889      @Deprecated
 890      public int setIndex(int index) {
 891          this.setIndexOnly(index);   // reposition in the input text first
 892          return this.current();      // then hand back whatever current() yields
 893      }
 894 
 895     /**
 896      * Retrieve the index of the start of the input text. This is the begin
 897      * index of the {@code CharacterIterator} or the start (i.e. 0) of the
 898      * {@code String} over which this {@code Normalizer} is iterating
 899      * @deprecated ICU 2.2. Use startIndex() instead.
 900      * @return The index of the start of the input text
 901      * @see #startIndex
 902      */
 903     @Deprecated
 904     public int getBeginIndex() {
 905         return 0; // constant: the start index reported here is always zero
 906     }
 907 
 908     /**
 909      * Retrieve the index of the end of the input text.  This is the end index
 910      * of the {@code CharacterIterator} or the length of the {@code String}
 911      * over which this {@code Normalizer} is iterating
 912      * @deprecated ICU 2.2. Use endIndex() instead.
 913      * @return The index of the end of the input text
 914      * @see #endIndex
 915      */
 916     @Deprecated
 917     public int getEndIndex() {
 918         return this.endIndex(); // deprecated alias; endIndex() supplies the value
 919     }
 920 
 921     /**
 922      * Retrieve the current iteration position in the input text that is
 923      * being normalized.  This method is useful in applications such as
 924      * searching, where you need to be able to determine the position in
 925      * the input text that corresponds to a given normalized output character.
 926      * <p>
 927      * <b>Note:</b> This method returns the position in the <em>input</em>, while
 928      * {@link #next} and {@link #previous} iterate through characters in the
 929      * <em>output</em>.  This means that there is not necessarily a one-to-one
 930      * correspondence between characters returned by {@code next} and
 931      * {@code previous} and the indices passed to and returned from
 932      * {@code setIndex} and {@link #getIndex}.
 933      * @return The current iteration position
 934      * @stable ICU 2.8
 935      */
 936     public int getIndex() {
 937         // Two input positions are tracked. While unread characters are
 938         // still sitting in the buffer, currentIndex is reported; once the
 939         // buffer has been consumed, nextIndex is reported instead.
 940         final boolean bufferConsumed = bufferPos>=bufferLimit;
 941         return bufferConsumed ? nextIndex : currentIndex;
 942     }
 943 
 944     /**
 945      * Retrieve the index of the end of the input text. This is the end index
 946      * of the {@code CharacterIterator} or the length of the {@code String}
 947      * over which this {@code Normalizer} is iterating
 948      * @return The index of the end of the input text
 949      * @stable ICU 2.8
 950      */
 951     public int endIndex() {
 952         return this.text.getLength(); // length as reported by the wrapped input text
 953     }
 954 
 955     //-------------------------------------------------------------------------
 956     // Property access methods
 957     //-------------------------------------------------------------------------
 958     /**
 959      * Set the normalization mode for this object.
 960      * <p>
 961      * <b>Note:</b> If the normalization mode is changed while iterating
 962      * over a string, calls to {@link #next} and {@link #previous} may
 963      * return previously buffered characters in the old normalization mode
 964      * until the iteration is able to re-sync at the next base character.
 965      * It is safest to call {@link #setText setText()}, {@link #first},
 966      * {@link #last}, etc. after calling {@code setMode}.
 967      *
 968      * @param newMode the new mode for this {@code Normalizer}.
 969      * The supported modes are:
 970      * <ul>
 971      *  <li>{@link #COMPOSE}        - Unicode canonical decomposition
 972      *                                  followed by canonical composition.
 973      *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
 974      *                                  followed by canonical composition.
 975      *  <li>{@link #DECOMP}         - Unicode canonical decomposition
 976      *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
 977      *  <li>{@link #NO_OP}          - Do nothing but return characters
 978      *                                  from the underlying input text.
 979      * </ul>
 980      *
 981      * @see #getMode
 982      * @stable ICU 2.8
 983      */
 984     public void setMode(Mode newMode) {
 985         this.mode = newMode; // stored only; nothing else is touched here
 986     }
 987     /**
 988      * Return the basic operation performed by this {@code Normalizer}.
 989      * @return the normalization mode currently in effect
 990      * @see #setMode
 991      * @stable ICU 2.8
 992      */
 993     public Mode getMode() {
 994         return this.mode; // plain field read
 995     }
 996 
 997     /**
 998      * Set the input text over which this {@code Normalizer} will iterate.
 999      * The iteration position is set to the beginning of the input text.
1000      * @param newText   The new string to be normalized.
1001      * @stable ICU 2.8
1002      */
1003     public void setText(String newText) {
1004         // Wrap the string first; no state is modified until wrapping has succeeded.
1005         final UCharacterIterator wrapped = UCharacterIterator.getInstance(newText);
1006         if (null == wrapped) {
1007             throw new InternalError("Could not create a new UCharacterIterator");
1008         }
1009         this.text = wrapped;
1010         this.reset();
1011     }
1012 
1013     /**
1014      * Set the input text over which this {@code Normalizer} will iterate.
1015      * The iteration position is set to the beginning of the input text.
1016      * @param newText   The new text to be normalized.
1017      * @stable ICU 2.8
1018      */
1019     public void setText(CharacterIterator newText) {
1020         // NOTE(review): the String overload ends with reset(); this one resets indices and buffer inline - confirm intended.
1021         final UCharacterIterator wrapped = UCharacterIterator.getInstance(newText);
1022         if (null == wrapped) {
1023             throw new InternalError("Could not create a new UCharacterIterator");
1024         }
1025         this.text = wrapped;
1026         this.currentIndex = this.nextIndex = 0;
1027         this.clearBuffer();
1028     }
1029 
1030     //-------------------------------------------------------------------------
1031     // Private utility methods
1032     //-------------------------------------------------------------------------
1033 
1034 
1035     /* backward iteration --------------------------------------------------- */


1554     public static final int UNICODE_3_2_0_ORIGINAL = // bitwise OR of the three option values below
1555                                UNICODE_3_2 |
1556                                NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
1557                                NormalizerImpl.BEFORE_PRI_29;
1558 
1559     /*
1560      * Default option for the latest Unicode normalization. This option is
1561      * provided mainly for testing.
1562      * The value zero means that normalization is done with the fixes for
1563      *   - Corrigendum 4 (Five CJK Canonical Mapping Errors)
1564      *   - Corrigendum 5 (Normalization Idempotency)
1565      */
1566     public static final int UNICODE_LATEST = 0x00; // zero: no option bits set
1567 
1568     //
1569     // public constructor and methods for java.text.Normalizer and
1570     // sun.text.Normalizer
1571     //
1572 
1573     /**
1574      * Creates a new {@code Normalizer} object for iterating over the
1575      * normalized form of a given string.
1576      *
1577      * @param str  The string to be normalized.  The normalization
1578      *              will start at the beginning of the string.
1579      *
1580      * @param mode The normalization mode.
1581      */
1582     public NormalizerBase(String str, Mode mode) {
1583           this(str, mode, UNICODE_LATEST); // delegate to the three-argument constructor, passing UNICODE_LATEST as options
1584     }
1585 
1586     /**
1587      * Normalizes a <code>String</code> using the given normalization form.
1588      *
1589      * @param str      the input string to be normalized.
1590      * @param form     the normalization form
1591      */
1592     public static String normalize(String str, Normalizer.Form form) {
1593         return NormalizerBase.normalize(str, form, NormalizerBase.UNICODE_LATEST); // shortcut: options fixed to UNICODE_LATEST
1594     }


1629             return asciiOnly ? str : NFKC.normalize(str, options);
1630         case NFKD :
1631             return asciiOnly ? str : NFKD.normalize(str, options);
1632         }
1633 
1634         throw new IllegalArgumentException("Unexpected normalization form: " +
1635                                            form);
1636     }
1637 
1638     /**
1639      * Test if a string is in a given normalization form.
1640      * This is semantically equivalent to str.equals(normalize(str, form)).
1641      *
1642      * Unlike quickCheck(), this function returns a definitive result,
1643      * never a "maybe".
1644      * For NFD, NFKD, and FCD, both functions work exactly the same.
1645      * For NFC and NFKC where quickCheck may return "maybe", this function will
1646      * perform further tests to arrive at a true/false result.
1647      * @param str       the input string to be checked to see if it is normalized
1648      * @param form      the normalization form

1649      */
1650     public static boolean isNormalized(String str, Normalizer.Form form) {
1651         return NormalizerBase.isNormalized(str, form, NormalizerBase.UNICODE_LATEST); // shortcut: options fixed to UNICODE_LATEST
1652     }
1653 
1654     /**
1655      * Test if a string is in a given normalization form.
1656      * This is semantically equivalent to str.equals(normalize(str, form, options)).
1657      *
1658      * Unlike quickCheck(), this function returns a definitive result,
1659      * never a "maybe".
1660      * For NFD, NFKD, and FCD, both functions work exactly the same.
1661      * For NFC and NFKC where quickCheck may return "maybe", this function will
1662      * perform further tests to arrive at a true/false result.
1663      * @param str       the input string to be checked to see if it is normalized
1664      * @param form      the normalization form
1665      * @param options   the optional features to be enabled.
1666      */
1667     public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1668         switch (form) {
< prev index next >