38
39 import java.text.CharacterIterator;
40 import java.text.Normalizer;
41
42 /**
43 * Unicode Normalization
44 *
45 * <h2>Unicode normalization API</h2>
46 *
47 * <code>normalize</code> transforms Unicode text into an equivalent composed or
48 * decomposed form, allowing for easier sorting and searching of text.
49 * <code>normalize</code> supports the standard normalization forms described in
50 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
51 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
52 *
53 * Characters with accents or other adornments can be encoded in
54 * several different ways in Unicode. For example, take the character A-acute.
55 * In Unicode, this can be encoded as a single character (the
56 * "composed" form):
57 *
58 * <p>
59 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
60 * </p>
61 *
62 * or as two separate characters (the "decomposed" form):
63 *
64 * <p>
65 * 0041 LATIN CAPITAL LETTER A
66 * 0301 COMBINING ACUTE ACCENT
67 * </p>
68 *
69 * To a user of your program, however, both of these sequences should be
70 * treated as the same "user-level" character "A with acute accent". When you
71 * are searching or comparing text, you must ensure that these two sequences are
72 * treated equivalently. In addition, you must handle characters with more than
73 * one accent. Sometimes the order of a character's combining accents is
74 * significant, while in other cases accent sequences in different orders are
75 * really equivalent.
76 *
77 * Similarly, the string "ffi" can be encoded as three separate letters:
78 *
79 * <p>
80 * 0066 LATIN SMALL LETTER F
81 * 0066 LATIN SMALL LETTER F
82 * 0069 LATIN SMALL LETTER I
83 * </p>
84 *
85 * or as the single character
86 *
87 * <p>
88 * FB03 LATIN SMALL LIGATURE FFI
89 * </p>
90 *
91 * The ffi ligature is not a distinct semantic character, and strictly speaking
92 * it shouldn't be in Unicode at all, but it was included for compatibility
93 * with existing character sets that already provided it. The Unicode standard
94 * identifies such characters by giving them "compatibility" decompositions
95 * into the corresponding semantic characters. When sorting and searching, you
96 * will often want to use these mappings.
97 *
98 * <code>normalize</code> helps solve these problems by transforming text into
99 * the canonical composed and decomposed forms as shown in the first example
100 * above. In addition, you can have it perform compatibility decompositions so
101 * that you can treat compatibility characters the same as their equivalents.
102 * Finally, <code>normalize</code> rearranges accents into the proper canonical
103 * order, so that you do not have to worry about accent rearrangement on your
104 * own.
105 *
106 * Form FCD, "Fast C or D", is also designed for collation.
107 * It allows one to work on strings that are not necessarily normalized
108 * with an algorithm (like in collation) that works under "canonical closure",
109 * i.e., it treats precomposed characters and their decomposed equivalents the
538 public static final QuickCheckResult NO = new QuickCheckResult(0);
539
540 /**
541 * Indicates that string is in the normalized format
542 * @stable ICU 2.8
543 */
544 public static final QuickCheckResult YES = new QuickCheckResult(1);
545
546 /**
547 * Indicates it cannot be determined if string is in the normalized
548 * format without further thorough checks.
549 * @stable ICU 2.8
550 */
551 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
552
553 //-------------------------------------------------------------------------
554 // Constructors
555 //-------------------------------------------------------------------------
556
557 /**
558 * Creates a new <tt>Normalizer</tt> object for iterating over the
559 * normalized form of a given string.
560 * <p>
561 * The <tt>opt</tt> parameter specifies which optional
562 * <tt>Normalizer</tt> features are to be enabled for this object.
563 *
564 * @param str The string to be normalized. The normalization
565 * will start at the beginning of the string.
566 *
567 * @param mode The normalization mode.
568 *
569 * @param opt Any optional features to be enabled.
570 * Currently the only available option is {@link #UNICODE_3_2}.
571 * If you want the default behavior corresponding to one of the
572 * standard Unicode Normalization Forms, use 0 ({@link #UNICODE_LATEST}).
573 * @stable ICU 2.6
574 */
575 public NormalizerBase(String str, Mode mode, int opt) {
576 this.text = UCharacterIterator.getInstance(str); // wrap the string in an iterator
577 this.mode = mode;
578 this.options=opt;
579 }
580
581 /**
582 * Creates a new <tt>Normalizer</tt> object for iterating over the
583 * normalized form of the given text.
584 * Uses the default options ({@link #UNICODE_LATEST}).
585 * @param iter The input text to be normalized. The normalization
586 * will start at the beginning of the text.
587 *
588 * @param mode The normalization mode.
589 */
590 public NormalizerBase(CharacterIterator iter, Mode mode) {
591 this(iter, mode, UNICODE_LATEST);
592 }
593
594 /**
595 * Creates a new <tt>Normalizer</tt> object for iterating over the
596 * normalized form of the given text.
597 * A clone of <tt>iter</tt> is used, not the iterator object itself.
598 * @param iter The input text to be normalized. The normalization
599 * will start at the beginning of the text.
600 *
601 * @param mode The normalization mode.
602 *
603 * @param opt Any optional features to be enabled.
604 * Currently the only available option is {@link #UNICODE_3_2}.
605 * If you want the default behavior corresponding to one of the
606 * standard Unicode Normalization Forms, use 0 ({@link #UNICODE_LATEST}).
607 * @stable ICU 2.6
608 */
609 public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
610 this.text = UCharacterIterator.getInstance(
611 (CharacterIterator)iter.clone()
612 );
613 this.mode = mode;
614 this.options = opt;
615 }
616
617 /**
618 * Clones this <tt>Normalizer</tt> object. All properties of this
619 * object are duplicated in the new object, including the cloning of any
620 * {@link CharacterIterator} that was passed in to the constructor
621 * or to {@link #setText(CharacterIterator) setText}.
622 * However, the text storage underlying
623 * the <tt>CharacterIterator</tt> is not duplicated unless the
624 * iterator's <tt>clone</tt> method does so.
625 * @stable ICU 2.8
626 */
627 public Object clone() {
628 try {
629 NormalizerBase copy = (NormalizerBase) super.clone();
630 copy.text = (UCharacterIterator) text.clone(); // the copy gets its own iterator clone
631 // give the copy its own buffer array so the two objects do not share it
632 if (buffer != null) {
633 copy.buffer = new char[buffer.length];
634 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
635 }
636 return copy;
637 }
638 catch (CloneNotSupportedException e) {
639 throw new InternalError(e.toString(), e); // rethrown unchecked, original exception kept as cause
640 }
641 }
642
643 //--------------------------------------------------------------------------
644 // Static Utility methods
774 * less than the required length
775 * @stable ICU 2.6
776 */
777 public static int normalize(char[] src,int srcStart, int srcLimit,
778 char[] dest,int destStart, int destLimit,
779 Mode mode, int options) {
780 int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); // the mode does the actual work
781
782 if(length<=(destLimit-destStart)) { // reported length fits between destStart and destLimit
783 return length;
784 } else {
785 throw new IndexOutOfBoundsException(Integer.toString(length)); // message is the reported length
786 }
787 }
788
789 //-------------------------------------------------------------------------
790 // Iteration API
791 //-------------------------------------------------------------------------
792
793 /**
794 * Return the current character in the normalized text.
795 * @return The codepoint as an int, or {@link #DONE} if there is none
796 * @stable ICU 2.8
797 */
798 public int current() {
799 if(bufferPos<bufferLimit || nextNormalize()) {
800 return getCodePointAt(bufferPos);
801 } else {
802 return DONE;
803 }
804 }
805
806 /**
807 * Return the next character in the normalized text and advance
808 * the iteration position by one. If the end
809 * of the text has already been reached, {@link #DONE} is returned.
810 * @return The codepoint as an int
811 * @stable ICU 2.8
812 */
813 public int next() {
814 if(bufferPos<bufferLimit || nextNormalize()) {
855 * After setIndexOnly(), getIndex() will return the same index that is
856 * specified here.
857 *
858 * @param index the desired index in the input text.
859 * @stable ICU 2.8
860 */
861 public void setIndexOnly(int index) {
862 text.setIndex(index); // NOTE(review): index validation presumably happens here - confirm
863 currentIndex=nextIndex=index; // plain assignment; no validation on this line
864 clearBuffer();
865 }
866
867 /**
868 * Set the iteration position in the input text that is being normalized
869 * and return the first normalized character at that position.
870 * <p>
871 * <b>Note:</b> This method sets the position in the <em>input</em> text,
872 * while {@link #next} and {@link #previous} iterate through characters
873 * in the normalized <em>output</em>. This means that there is not
874 * necessarily a one-to-one correspondence between characters returned
875 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
876 * returned from <tt>setIndex</tt> and {@link #getIndex}.
877 *
878 * @param index the desired index in the input text.
879 *
880 * @return the first normalized character (the codepoint as an int) that
881 * is the result of iterating forward starting at the given index.
882 *
883 * @throws IllegalArgumentException if the given index is less than
884 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
885 *
886 * @deprecated ICU 3.2
887 * @obsolete ICU 3.2
888 */
889 @Deprecated
890 public int setIndex(int index) {
891 setIndexOnly(index);
892 return current();
893 }
894
895 /**
896 * Retrieve the index of the start of the input text. This is the begin
897 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
898 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating.
899 * @deprecated ICU 2.2. Use startIndex() instead.
900 * @return The begin index of the input text; this implementation always returns 0
901 * @see #startIndex
902 */
903 @Deprecated
904 public int getBeginIndex() {
905 return 0;
906 }
907
908 /**
909 * Retrieve the index of the end of the input text. This is the end index
910 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
911 * over which this <tt>Normalizer</tt> is iterating.
912 * @deprecated ICU 2.2. Use endIndex() instead.
913 * @return The end index of the input text, as returned by {@link #endIndex}
914 * @see #endIndex
915 */
916 @Deprecated
917 public int getEndIndex() {
918 return endIndex();
919 }
920
921 /**
922 * Retrieve the current iteration position in the input text that is
923 * being normalized. This method is useful in applications such as
924 * searching, where you need to be able to determine the position in
925 * the input text that corresponds to a given normalized output character.
926 * <p>
927 * <b>Note:</b> This method returns the position in the <em>input</em>, while
928 * {@link #next} and {@link #previous} iterate through characters in the
929 * <em>output</em>. This means that there is not necessarily a one-to-one
930 * correspondence between characters returned by <tt>next</tt> and
931 * <tt>previous</tt> and the indices passed to and returned from
932 * <tt>setIndex</tt> and {@link #getIndex}.
933 * @return The current iteration position
934 * @stable ICU 2.8
935 */
936 public int getIndex() {
937 if(bufferPos<bufferLimit) {
938 return currentIndex;
939 } else {
940 return nextIndex;
941 }
942 }
943
944 /**
945 * Retrieve the index of the end of the input text. This is the end index
946 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
947 * over which this <tt>Normalizer</tt> is iterating.
948 * @return The end index of the input text (the length reported by the underlying iterator)
949 * @stable ICU 2.8
950 */
951 public int endIndex() {
952 return text.getLength();
953 }
954
955 //-------------------------------------------------------------------------
956 // Property access methods
957 //-------------------------------------------------------------------------
958 /**
959 * Set the normalization mode for this object.
960 * <p>
961 * <b>Note:</b> If the normalization mode is changed while iterating
962 * over a string, calls to {@link #next} and {@link #previous} may
963 * return previously buffered characters in the old normalization mode
964 * until the iteration is able to re-sync at the next base character.
965 * It is safest to call {@link #setText setText()}, {@link #first},
966 * {@link #last}, etc. after calling <tt>setMode</tt>.
967 *
968 * @param newMode the new mode for this <tt>Normalizer</tt>.
969 * The supported modes are:
970 * <ul>
971 * <li>{@link #COMPOSE} - Unicode canonical decomposition
972 * followed by canonical composition.
973 * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
974 * followed by canonical composition.
975 * <li>{@link #DECOMP} - Unicode canonical decomposition
976 * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
977 * <li>{@link #NO_OP} - Do nothing but return characters
978 * from the underlying input text.
979 * </ul>
980 *
981 * @see #getMode
982 * @stable ICU 2.8
983 */
984 public void setMode(Mode newMode) {
985 mode = newMode; // only the mode field changes; the buffer is not touched here
986 }
987 /**
988 * Return the basic operation performed by this <tt>Normalizer</tt>.
989 * @return the normalization mode currently set on this object
990 * @see #setMode
991 * @stable ICU 2.8
992 */
993 public Mode getMode() {
994 return mode;
995 }
996
997 /**
998 * Set the input text over which this <tt>Normalizer</tt> will iterate.
999 * The iteration position is set to the beginning of the input text.
1000 * @param newText The new string to be normalized.
1001 * @stable ICU 2.8
1002 */
1003 public void setText(String newText) {
1004
1005 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1006 if (newIter == null) {
1007 throw new InternalError("Could not create a new UCharacterIterator"); // fail fast if no iterator was created
1008 }
1009 text = newIter;
1010 reset();
1011 }
1012
1013 /**
1014 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1015 * The iteration position is set to the beginning of the input text.
1016 * @param newText The new text to be normalized.
1017 * @stable ICU 2.8
1018 */
1019 public void setText(CharacterIterator newText) {
1020
1021 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1022 if (newIter == null) {
1023 throw new InternalError("Could not create a new UCharacterIterator"); // fail fast if no iterator was created
1024 }
1025 text = newIter;
1026 currentIndex=nextIndex=0; // both input positions go back to 0
1027 clearBuffer();
1028 }
1029
1030 //-------------------------------------------------------------------------
1031 // Private utility methods
1032 //-------------------------------------------------------------------------
1033
1034
1035 /* backward iteration --------------------------------------------------- */
1554 public static final int UNICODE_3_2_0_ORIGINAL =
1555 UNICODE_3_2 |
1556 NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
1557 NormalizerImpl.BEFORE_PRI_29;
1558
1559 /*
1560 * Default option for the latest Unicode normalization. This option is
1561 * provided mainly for testing.
1562 * The value zero means that normalization is done with the fixes for
1563 * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
1564 * - Corrigendum 5 (Normalization Idempotency)
1565 */
1566 public static final int UNICODE_LATEST = 0x00;
1567
1568 //
1569 // public constructor and methods for java.text.Normalizer and
1570 // sun.text.Normalizer
1571 //
1572
1573 /**
1574 * Creates a new <tt>Normalizer</tt> object for iterating over the
1575 * normalized form of a given string.
1576 * Uses the default options ({@link #UNICODE_LATEST}).
1577 * @param str The string to be normalized. The normalization
1578 * will start at the beginning of the string.
1579 *
1580 * @param mode The normalization mode.
1581 */
1582 public NormalizerBase(String str, Mode mode) {
1583 this(str, mode, UNICODE_LATEST);
1584 }
1585
1586 /**
1587 * Normalizes a <code>String</code> using the given normalization form.
1588 * Uses the default options ({@link #UNICODE_LATEST}).
1589 * @param str the input string to be normalized.
1590 * @param form the normalization form
1591 */
1592 public static String normalize(String str, Normalizer.Form form) {
1593 return normalize(str, form, UNICODE_LATEST);
1594 }
1629 return asciiOnly ? str : NFKC.normalize(str, options);
1630 case NFKD :
1631 return asciiOnly ? str : NFKD.normalize(str, options);
1632 }
1633
1634 throw new IllegalArgumentException("Unexpected normalization form: " +
1635 form);
1636 }
1637
1638 /**
1639 * Test if a string is in a given normalization form.
1640 * This is semantically equivalent to str.equals(normalize(str, form)).
1641 *
1642 * Unlike quickCheck(), this function returns a definitive result,
1643 * never a "maybe".
1644 * For NFD, NFKD, and FCD, both functions work exactly the same.
1645 * For NFC and NFKC where quickCheck may return "maybe", this function will
1646 * perform further tests to arrive at a true/false result.
1647 * @param str the input string to be checked to see if it is normalized
1648 * @param form the normalization form
1649 * @return true if the string is in the given normalization form
1650 */
1651 public static boolean isNormalized(String str, Normalizer.Form form) {
1652 return isNormalized(str, form, UNICODE_LATEST);
1653 }
1654
1655 /**
1656 * Test if a string is in a given normalization form.
1657 * This is semantically equivalent to str.equals(normalize(str, form, options)).
1658 *
1659 * Unlike quickCheck(), this function returns a definitive result,
1660 * never a "maybe".
1661 * For NFD, NFKD, and FCD, both functions work exactly the same.
1662 * For NFC and NFKC where quickCheck may return "maybe", this function will
1663 * perform further tests to arrive at a true/false result.
1664 * @param str the input string to be checked to see if it is normalized
1665 * @param form the normalization form
1666 * @param options the optional features to be enabled.
1667 */
1668 public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1669 switch (form) {
|
38
39 import java.text.CharacterIterator;
40 import java.text.Normalizer;
41
42 /**
43 * Unicode Normalization
44 *
45 * <h2>Unicode normalization API</h2>
46 *
47 * <code>normalize</code> transforms Unicode text into an equivalent composed or
48 * decomposed form, allowing for easier sorting and searching of text.
49 * <code>normalize</code> supports the standard normalization forms described in
50 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
51 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
52 *
53 * Characters with accents or other adornments can be encoded in
54 * several different ways in Unicode. For example, take the character A-acute.
55 * In Unicode, this can be encoded as a single character (the
56 * "composed" form):
57 *
58 * <pre>
59 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
60 * </pre>
61 *
62 * or as two separate characters (the "decomposed" form):
63 *
64 * <pre>
65 * 0041 LATIN CAPITAL LETTER A
66 * 0301 COMBINING ACUTE ACCENT
67 * </pre>
68 *
69 * To a user of your program, however, both of these sequences should be
70 * treated as the same "user-level" character "A with acute accent". When you
71 * are searching or comparing text, you must ensure that these two sequences are
72 * treated equivalently. In addition, you must handle characters with more than
73 * one accent. Sometimes the order of a character's combining accents is
74 * significant, while in other cases accent sequences in different orders are
75 * really equivalent.
76 *
77 * Similarly, the string "ffi" can be encoded as three separate letters:
78 *
79 * <pre>
80 * 0066 LATIN SMALL LETTER F
81 * 0066 LATIN SMALL LETTER F
82 * 0069 LATIN SMALL LETTER I
83 * </pre>
84 *
85 * or as the single character
86 *
87 * <pre>
88 * FB03 LATIN SMALL LIGATURE FFI
89 * </pre>
90 *
91 * The ffi ligature is not a distinct semantic character, and strictly speaking
92 * it shouldn't be in Unicode at all, but it was included for compatibility
93 * with existing character sets that already provided it. The Unicode standard
94 * identifies such characters by giving them "compatibility" decompositions
95 * into the corresponding semantic characters. When sorting and searching, you
96 * will often want to use these mappings.
97 *
98 * <code>normalize</code> helps solve these problems by transforming text into
99 * the canonical composed and decomposed forms as shown in the first example
100 * above. In addition, you can have it perform compatibility decompositions so
101 * that you can treat compatibility characters the same as their equivalents.
102 * Finally, <code>normalize</code> rearranges accents into the proper canonical
103 * order, so that you do not have to worry about accent rearrangement on your
104 * own.
105 *
106 * Form FCD, "Fast C or D", is also designed for collation.
107 * It allows one to work on strings that are not necessarily normalized
108 * with an algorithm (like in collation) that works under "canonical closure",
109 * i.e., it treats precomposed characters and their decomposed equivalents the
538 public static final QuickCheckResult NO = new QuickCheckResult(0);
539
540 /**
541 * Indicates that string is in the normalized format
542 * @stable ICU 2.8
543 */
544 public static final QuickCheckResult YES = new QuickCheckResult(1);
545
546 /**
547 * Indicates it cannot be determined if string is in the normalized
548 * format without further thorough checks.
549 * @stable ICU 2.8
550 */
551 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
552
553 //-------------------------------------------------------------------------
554 // Constructors
555 //-------------------------------------------------------------------------
556
557 /**
558 * Creates a new {@code Normalizer} object for iterating over the
559 * normalized form of a given string.
560 * <p>
561 * The {@code opt} parameter specifies which optional
562 * {@code Normalizer} features are to be enabled for this object.
563 *
564 * @param str The string to be normalized. The normalization
565 * will start at the beginning of the string.
566 *
567 * @param mode The normalization mode.
568 *
569 * @param opt Any optional features to be enabled.
570 * Currently the only available option is {@link #UNICODE_3_2}.
571 * If you want the default behavior corresponding to one of the
572 * standard Unicode Normalization Forms, use 0 ({@link #UNICODE_LATEST}).
573 * @stable ICU 2.6
574 */
575 public NormalizerBase(String str, Mode mode, int opt) {
576 this.text = UCharacterIterator.getInstance(str); // wrap the string in an iterator
577 this.mode = mode;
578 this.options=opt;
579 }
580
581 /**
582 * Creates a new {@code Normalizer} object for iterating over the
583 * normalized form of the given text.
584 * Uses the default options ({@link #UNICODE_LATEST}).
585 * @param iter The input text to be normalized. The normalization
586 * will start at the beginning of the text.
587 *
588 * @param mode The normalization mode.
589 */
590 public NormalizerBase(CharacterIterator iter, Mode mode) {
591 this(iter, mode, UNICODE_LATEST);
592 }
593
594 /**
595 * Creates a new {@code Normalizer} object for iterating over the
596 * normalized form of the given text.
597 * A clone of {@code iter} is used, not the iterator object itself.
598 * @param iter The input text to be normalized. The normalization
599 * will start at the beginning of the text.
600 *
601 * @param mode The normalization mode.
602 *
603 * @param opt Any optional features to be enabled.
604 * Currently the only available option is {@link #UNICODE_3_2}.
605 * If you want the default behavior corresponding to one of the
606 * standard Unicode Normalization Forms, use 0 ({@link #UNICODE_LATEST}).
607 * @stable ICU 2.6
608 */
609 public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
610 this.text = UCharacterIterator.getInstance(
611 (CharacterIterator)iter.clone()
612 );
613 this.mode = mode;
614 this.options = opt;
615 }
616
617 /**
618 * Clones this {@code Normalizer} object. All properties of this
619 * object are duplicated in the new object, including the cloning of any
620 * {@link CharacterIterator} that was passed in to the constructor
621 * or to {@link #setText(CharacterIterator) setText}.
622 * However, the text storage underlying
623 * the {@code CharacterIterator} is not duplicated unless the
624 * iterator's {@code clone} method does so.
625 * @stable ICU 2.8
626 */
627 public Object clone() {
628 try {
629 NormalizerBase copy = (NormalizerBase) super.clone();
630 copy.text = (UCharacterIterator) text.clone(); // the copy gets its own iterator clone
631 // give the copy its own buffer array so the two objects do not share it
632 if (buffer != null) {
633 copy.buffer = new char[buffer.length];
634 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
635 }
636 return copy;
637 }
638 catch (CloneNotSupportedException e) {
639 throw new InternalError(e.toString(), e); // rethrown unchecked, original exception kept as cause
640 }
641 }
642
643 //--------------------------------------------------------------------------
644 // Static Utility methods
774 * less than the required length
775 * @stable ICU 2.6
776 */
777 public static int normalize(char[] src,int srcStart, int srcLimit,
778 char[] dest,int destStart, int destLimit,
779 Mode mode, int options) {
780 int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); // the mode does the actual work
781
782 if(length<=(destLimit-destStart)) { // reported length fits between destStart and destLimit
783 return length;
784 } else {
785 throw new IndexOutOfBoundsException(Integer.toString(length)); // message is the reported length
786 }
787 }
788
789 //-------------------------------------------------------------------------
790 // Iteration API
791 //-------------------------------------------------------------------------
792
793 /**
794 * Return the current character in the normalized text.
795 * @return The codepoint as an int, or {@link #DONE} if there is none
796 * @stable ICU 2.8
797 */
798 public int current() {
799 if(bufferPos<bufferLimit || nextNormalize()) {
800 return getCodePointAt(bufferPos);
801 } else {
802 return DONE;
803 }
804 }
805
806 /**
807 * Return the next character in the normalized text and advance
808 * the iteration position by one. If the end
809 * of the text has already been reached, {@link #DONE} is returned.
810 * @return The codepoint as an int
811 * @stable ICU 2.8
812 */
813 public int next() {
814 if(bufferPos<bufferLimit || nextNormalize()) {
855 * After setIndexOnly(), getIndex() will return the same index that is
856 * specified here.
857 *
858 * @param index the desired index in the input text.
859 * @stable ICU 2.8
860 */
861 public void setIndexOnly(int index) {
862 text.setIndex(index); // NOTE(review): index validation presumably happens here - confirm
863 currentIndex=nextIndex=index; // plain assignment; no validation on this line
864 clearBuffer();
865 }
866
867 /**
868 * Set the iteration position in the input text that is being normalized
869 * and return the first normalized character at that position.
870 * <p>
871 * <b>Note:</b> This method sets the position in the <em>input</em> text,
872 * while {@link #next} and {@link #previous} iterate through characters
873 * in the normalized <em>output</em>. This means that there is not
874 * necessarily a one-to-one correspondence between characters returned
875 * by {@code next} and {@code previous} and the indices passed to and
876 * returned from {@code setIndex} and {@link #getIndex}.
877 *
878 * @param index the desired index in the input text.
879 *
880 * @return the first normalized character (the codepoint as an int) that
881 * is the result of iterating forward starting at the given index.
882 *
883 * @throws IllegalArgumentException if the given index is less than
884 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
885 *
886 * @deprecated ICU 3.2
887 * @obsolete ICU 3.2
888 */
889 @Deprecated
890 public int setIndex(int index) {
891 setIndexOnly(index);
892 return current();
893 }
894
895 /**
896 * Retrieve the index of the start of the input text. This is the begin
897 * index of the {@code CharacterIterator} or the start (i.e. 0) of the
898 * {@code String} over which this {@code Normalizer} is iterating.
899 * @deprecated ICU 2.2. Use startIndex() instead.
900 * @return The begin index of the input text; this implementation always returns 0
901 * @see #startIndex
902 */
903 @Deprecated
904 public int getBeginIndex() {
905 return 0;
906 }
907
908 /**
909 * Retrieve the index of the end of the input text. This is the end index
910 * of the {@code CharacterIterator} or the length of the {@code String}
911 * over which this {@code Normalizer} is iterating.
912 * @deprecated ICU 2.2. Use endIndex() instead.
913 * @return The end index of the input text, as returned by {@link #endIndex}
914 * @see #endIndex
915 */
916 @Deprecated
917 public int getEndIndex() {
918 return endIndex();
919 }
920
921 /**
922 * Retrieve the current iteration position in the input text that is
923 * being normalized. This method is useful in applications such as
924 * searching, where you need to be able to determine the position in
925 * the input text that corresponds to a given normalized output character.
926 * <p>
927 * <b>Note:</b> This method returns the position in the <em>input</em>, while
928 * {@link #next} and {@link #previous} iterate through characters in the
929 * <em>output</em>. This means that there is not necessarily a one-to-one
930 * correspondence between characters returned by {@code next} and
931 * {@code previous} and the indices passed to and returned from
932 * {@code setIndex} and {@link #getIndex}.
933 * @return The current iteration position
934 * @stable ICU 2.8
935 */
936 public int getIndex() {
937 if(bufferPos<bufferLimit) {
938 return currentIndex;
939 } else {
940 return nextIndex;
941 }
942 }
943
944 /**
945 * Retrieve the index of the end of the input text. This is the end index
946 * of the {@code CharacterIterator} or the length of the {@code String}
947 * over which this {@code Normalizer} is iterating.
948 * @return The end index of the input text (the length reported by the underlying iterator)
949 * @stable ICU 2.8
950 */
951 public int endIndex() {
952 return text.getLength();
953 }
954
955 //-------------------------------------------------------------------------
956 // Property access methods
957 //-------------------------------------------------------------------------
958 /**
959 * Set the normalization mode for this object.
960 * <p>
961 * <b>Note:</b> If the normalization mode is changed while iterating
962 * over a string, calls to {@link #next} and {@link #previous} may
963 * return previously buffered characters in the old normalization mode
964 * until the iteration is able to re-sync at the next base character.
965 * It is safest to call {@link #setText setText()}, {@link #first},
966 * {@link #last}, etc. after calling {@code setMode}.
967 *
968 * @param newMode the new mode for this {@code Normalizer}.
969 * The supported modes are:
970 * <ul>
971 * <li>{@link #COMPOSE} - Unicode canonical decomposition
972 * followed by canonical composition.
973 * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decomposition
974 * followed by canonical composition.
975 * <li>{@link #DECOMP} - Unicode canonical decomposition
976 * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
977 * <li>{@link #NO_OP} - Do nothing but return characters
978 * from the underlying input text.
979 * </ul>
980 *
981 * @see #getMode
982 * @stable ICU 2.8
983 */
984 public void setMode(Mode newMode) {
985 mode = newMode; // only the mode field changes; the buffer is not touched here
986 }
987 /**
988 * Return the basic operation performed by this {@code Normalizer}.
989 * @return the normalization mode currently set on this object
990 * @see #setMode
991 * @stable ICU 2.8
992 */
993 public Mode getMode() {
994 return mode;
995 }
996
997 /**
998 * Set the input text over which this {@code Normalizer} will iterate.
999 * The iteration position is set to the beginning of the input text.
1000 * @param newText The new string to be normalized.
1001 * @stable ICU 2.8
1002 */
1003 public void setText(String newText) {
1004
1005 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1006 if (newIter == null) {
1007 throw new InternalError("Could not create a new UCharacterIterator"); // fail fast if no iterator was created
1008 }
1009 text = newIter;
1010 reset();
1011 }
1012
1013 /**
1014 * Set the input text over which this {@code Normalizer} will iterate.
1015 * The iteration position is set to the beginning of the input text.
1016 * @param newText The new text to be normalized.
1017 * @stable ICU 2.8
1018 */
1019 public void setText(CharacterIterator newText) {
1020
1021 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1022 if (newIter == null) {
1023 throw new InternalError("Could not create a new UCharacterIterator"); // fail fast if no iterator was created
1024 }
1025 text = newIter;
1026 currentIndex=nextIndex=0; // both input positions go back to 0
1027 clearBuffer();
1028 }
1029
1030 //-------------------------------------------------------------------------
1031 // Private utility methods
1032 //-------------------------------------------------------------------------
1033
1034
1035 /* backward iteration --------------------------------------------------- */
1554 public static final int UNICODE_3_2_0_ORIGINAL =
1555 UNICODE_3_2 |
1556 NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
1557 NormalizerImpl.BEFORE_PRI_29;
1558
1559 /*
1560 * Default option for the latest Unicode normalization. This option is
1561 * provided mainly for testing.
1562 * The value zero means that normalization is done with the fixes for
1563 * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
1564 * - Corrigendum 5 (Normalization Idempotency)
1565 */
1566 public static final int UNICODE_LATEST = 0x00;
1567
1568 //
1569 // public constructor and methods for java.text.Normalizer and
1570 // sun.text.Normalizer
1571 //
1572
1573 /**
1574 * Creates a new {@code Normalizer} object for iterating over the
1575 * normalized form of a given string.
1576 * Uses the default options ({@link #UNICODE_LATEST}).
1577 * @param str The string to be normalized. The normalization
1578 * will start at the beginning of the string.
1579 *
1580 * @param mode The normalization mode.
1581 */
1582 public NormalizerBase(String str, Mode mode) {
1583 this(str, mode, UNICODE_LATEST);
1584 }
1585
1586 /**
1587 * Normalizes a <code>String</code> using the given normalization form.
1588 * Uses the default options ({@link #UNICODE_LATEST}).
1589 * @param str the input string to be normalized.
1590 * @param form the normalization form
1591 */
1592 public static String normalize(String str, Normalizer.Form form) {
1593 return normalize(str, form, UNICODE_LATEST);
1594 }
1629 return asciiOnly ? str : NFKC.normalize(str, options);
1630 case NFKD :
1631 return asciiOnly ? str : NFKD.normalize(str, options);
1632 }
1633
1634 throw new IllegalArgumentException("Unexpected normalization form: " +
1635 form);
1636 }
1637
1638 /**
1639 * Test if a string is in a given normalization form.
1640 * This is semantically equivalent to str.equals(normalize(str, form)).
1641 *
1642 * Unlike quickCheck(), this function returns a definitive result,
1643 * never a "maybe".
1644 * For NFD, NFKD, and FCD, both functions work exactly the same.
1645 * For NFC and NFKC where quickCheck may return "maybe", this function will
1646 * perform further tests to arrive at a true/false result.
1647 * @param str the input string to be checked to see if it is normalized
1648 * @param form the normalization form
1649 */
1650 public static boolean isNormalized(String str, Normalizer.Form form) {
1651 return isNormalized(str, form, UNICODE_LATEST); // delegate with the default options
1652 }
1653
1654 /**
1655 * Test if a string is in a given normalization form.
1656 * This is semantically equivalent to str.equals(normalize(str, form, options)).
1657 *
1658 * Unlike quickCheck(), this function returns a definitive result,
1659 * never a "maybe".
1660 * For NFD, NFKD, and FCD, both functions work exactly the same.
1661 * For NFC and NFKC where quickCheck may return "maybe", this function will
1662 * perform further tests to arrive at a true/false result.
1663 * @param str the input string to be checked to see if it is normalized
1664 * @param form the normalization form
1665 * @param options the optional features to be enabled.
1666 */
1667 public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1668 switch (form) {
|