--- old/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2020-01-10 13:50:38.000000000 -0800 +++ /dev/null 2020-01-10 13:50:38.000000000 -0800 @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2000-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -import java.text.CharacterIterator; -import java.text.Normalizer; - -/** - * Unicode Normalization - * - *

Unicode normalization API

- * - * normalize transforms Unicode text into an equivalent composed or - * decomposed form, allowing for easier sorting and searching of text. - * normalize supports the standard normalization forms described in - * - * Unicode Standard Annex #15 — Unicode Normalization Forms. - * - * Characters with accents or other adornments can be encoded in - * several different ways in Unicode. For example, take the character A-acute. - * In Unicode, this can be encoded as a single character (the - * "composed" form): - * - *
- *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
- * 
- * - * or as two separate characters (the "decomposed" form): - * - *
- *      0041    LATIN CAPITAL LETTER A
- *      0301    COMBINING ACUTE ACCENT
- * 
- * - * To a user of your program, however, both of these sequences should be - * treated as the same "user-level" character "A with acute accent". When you - * are searching or comparing text, you must ensure that these two sequences are - * treated equivalently. In addition, you must handle characters with more than - * one accent. Sometimes the order of a character's combining accents is - * significant, while in other cases accent sequences in different orders are - * really equivalent. - * - * Similarly, the string "ffi" can be encoded as three separate letters: - * - *
- *      0066    LATIN SMALL LETTER F
- *      0066    LATIN SMALL LETTER F
- *      0069    LATIN SMALL LETTER I
- * 
- * - * or as the single character - * - *
- *      FB03    LATIN SMALL LIGATURE FFI
- * 
- * - * The ffi ligature is not a distinct semantic character, and strictly speaking - * it shouldn't be in Unicode at all, but it was included for compatibility - * with existing character sets that already provided it. The Unicode standard - * identifies such characters by giving them "compatibility" decompositions - * into the corresponding semantic characters. When sorting and searching, you - * will often want to use these mappings. - * - * normalize helps solve these problems by transforming text into - * the canonical composed and decomposed forms as shown in the first example - * above. In addition, you can have it perform compatibility decompositions so - * that you can treat compatibility characters the same as their equivalents. - * Finally, normalize rearranges accents into the proper canonical - * order, so that you do not have to worry about accent rearrangement on your - * own. - * - * Form FCD, "Fast C or D", is also designed for collation. - * It allows to work on strings that are not necessarily normalized - * with an algorithm (like in collation) that works under "canonical closure", - * i.e., it treats precomposed characters and their decomposed equivalents the - * same. - * - * It is not a normalization form because it does not provide for uniqueness of - * representation. Multiple strings may be canonically equivalent (their NFDs - * are identical) and may all conform to FCD without being identical themselves. - * - * The form is defined such that the "raw decomposition", the recursive - * canonical decomposition of each character, results in a string that is - * canonically ordered. This means that precomposed characters are allowed for - * as long as their decompositions do not need canonical reordering. - * - * Its advantage for a process like collation is that all NFD and most NFC texts - * - and many unnormalized texts - already conform to FCD and do not need to be - * normalized (NFD) for such a process. 
The FCD quick check will return YES for - * most strings in practice. - * - * normalize(FCD) may be implemented with NFD. - * - * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): - * http://www.unicode.org/notes/tn5/#FCD - * - * ICU collation performs either NFD or FCD normalization automatically if - * normalization is turned on for the collator object. Beyond collation and - * string search, normalized strings may be useful for string equivalence - * comparisons, transliteration/transcription, unique representations, etc. - * - * The W3C generally recommends to exchange texts in NFC. - * Note also that most legacy character encodings use only precomposed forms and - * often do not encode any combining marks by themselves. For conversion to such - * character encodings the Unicode text needs to be normalized to NFC. - * For more usage examples, see the Unicode Standard Annex. - * - * Note: The Normalizer class also provides API for iterative normalization. - * While the setIndex() and getIndex() refer to indices in the - * underlying Unicode input text, the next() and previous() methods - * iterate through characters in the normalized output. - * This means that there is not necessarily a one-to-one correspondence - * between characters returned by next() and previous() and the indices - * passed to and returned from setIndex() and getIndex(). - * It is for this reason that Normalizer does not implement the CharacterIterator interface. - * - * @stable ICU 2.8 - */ -// Original filename in ICU4J: Normalizer.java -public final class NormalizerBase implements Cloneable { - - // The input text and our position in it - private UCharacterIterator text; - private Normalizer2 norm2; - private Mode mode; - private int options; - - // The normalization buffer is the result of normalization - // of the source in [currentIndex..nextIndex] . 
- private int currentIndex; - private int nextIndex; - - // A buffer for holding intermediate results - private StringBuilder buffer; - private int bufferPos; - - // Helper classes to defer loading of normalization data. - private static final class ModeImpl { - private ModeImpl(Normalizer2 n2) { - normalizer2 = n2; - } - private final Normalizer2 normalizer2; - } - - private static final class NFDModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); - } - - private static final class NFKDModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); - } - - private static final class NFCModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); - } - - private static final class NFKCModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); - } - - private static final class Unicode32 { - private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); - } - - private static final class NFD32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFKD32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFC32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFKC32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), - Unicode32.INSTANCE)); - } - - /** - * Options bit set value to select Unicode 3.2 normalization - * (except NormalizationCorrections). - * At most one Unicode version can be selected at a time. 
- * @stable ICU 2.6 - */ - public static final int UNICODE_3_2=0x20; - - public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; - - /* - * Default option for the latest Unicode normalization. This option is - * provided mainly for testing. - * The value zero means that normalization is done with the fixes for - * - Corrigendum 4 (Five CJK Canonical Mapping Errors) - * - Corrigendum 5 (Normalization Idempotency) - */ - public static final int UNICODE_LATEST = 0x00; - - /** - * Constant indicating that the end of the iteration has been reached. - * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. - * @stable ICU 2.8 - */ - public static final int DONE = UCharacterIterator.DONE; - - /** - * Constants for normalization modes. - *

- * The Mode class is not intended for public subclassing. - * Only the Mode constants provided by the Normalizer class should be used, - * and any fields or methods should not be called or overridden by users. - * @stable ICU 2.8 - */ - public abstract static class Mode { - - /** - * Sole constructor - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected Mode() { - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected abstract Normalizer2 getNormalizer2(int options); - } - - private static Mode toMode(Normalizer.Form form) { - switch (form) { - case NFC : - return NFC; - case NFD : - return NFD; - case NFKC : - return NFKC; - case NFKD : - return NFKD; - } - - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } - - private static final class NONEMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } - } - - private static final class NFDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFD32ModeImpl.INSTANCE.normalizer2 : - NFDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKD32ModeImpl.INSTANCE.normalizer2 : - NFKDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFC32ModeImpl.INSTANCE.normalizer2 : - NFCModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKC32ModeImpl.INSTANCE.normalizer2 : - NFKCModeImpl.INSTANCE.normalizer2; - } - } - - /** - * No decomposition/composition. 
- * @stable ICU 2.8 - */ - public static final Mode NONE = new NONEMode(); - - /** - * Canonical decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFD = new NFDMode(); - - /** - * Compatibility decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFKD = new NFKDMode(); - - /** - * Canonical decomposition followed by canonical composition. - * @stable ICU 2.8 - */ - public static final Mode NFC = new NFCMode(); - - public static final Mode NFKC =new NFKCMode(); - - //------------------------------------------------------------------------- - // Iterator constructors - //------------------------------------------------------------------------- - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of a given string. - *

- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this object. - *

- * @param str The string to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(String str, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance(str); - this.mode = mode; - this.options=opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(String str, Mode mode) { - this(str, mode, 0); - } - - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of the given text. - *

- * @param iter The input text to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); - this.mode = mode; - this.options = opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(CharacterIterator iter, Mode mode) { - this(iter, mode, 0); - } - - /** - * Clones this {@code NormalizerBase} object. All properties of this - * object are duplicated in the new object, including the cloning of any - * {@link CharacterIterator} that was passed in to the constructor - * or to {@link #setText(CharacterIterator) setText}. - * However, the text storage underlying - * the {@code CharacterIterator} is not duplicated unless the - * iterator's {@code clone} method does so. - * @stable ICU 2.8 - */ - public Object clone() { - try { - NormalizerBase copy = (NormalizerBase) super.clone(); - copy.text = (UCharacterIterator) text.clone(); - copy.mode = mode; - copy.options = options; - copy.norm2 = norm2; - copy.buffer = new StringBuilder(buffer); - copy.bufferPos = bufferPos; - copy.currentIndex = currentIndex; - copy.nextIndex = nextIndex; - return copy; - } - catch (CloneNotSupportedException e) { - throw new InternalError(e.toString(), e); - } - } - - /** - * Normalizes a {@code String} using the given normalization operation. - *

- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this operation. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the standard - * Unicode Normalization Forms, use 0 for this argument. - *

- * @param str the input string to be normalized. - * @param mode the normalization mode - * @param options the optional features to be enabled. - * @return String the normalized string - * @stable ICU 2.6 - */ - public static String normalize(String str, Mode mode, int options) { - return mode.getNormalizer2(options).normalize(str); - } - - public static String normalize(String str, Normalizer.Form form) { - return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); - } - - public static String normalize(String str, Normalizer.Form form, int options) { - return NormalizerBase.normalize(str, toMode(form), options); - } - - /** - * Test if a string is in a given normalization form. - * This is semantically equivalent to source.equals(normalize(source, mode)). - * - * Unlike quickCheck(), this function returns a definitive result, - * never a "maybe". - * For NFD, NFKD, and FCD, both functions work exactly the same. - * For NFC and NFKC where quickCheck may return "maybe", this function will - * perform further tests to arrive at a true/false result. 
- * @param str the input string to be checked to see if it is - * normalized - * @param mode the normalization mode - * @param options Options for use with exclusion set and tailored Normalization - * The only option that is currently recognized is UNICODE_3_2 - * @see #isNormalized - * @stable ICU 2.6 - */ - public static boolean isNormalized(String str, Mode mode, int options) { - return mode.getNormalizer2(options).isNormalized(str); - } - - public static boolean isNormalized(String str, Normalizer.Form form) { - return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); - } - - public static boolean isNormalized(String str, Normalizer.Form form, int options) { - return NormalizerBase.isNormalized(str, toMode(form), options); - } - - //------------------------------------------------------------------------- - // Iteration API - //------------------------------------------------------------------------- - - /** - * Return the current character in the normalized text. - * @return The codepoint as an int - * @stable ICU 2.8 - */ - public int current() { - if(bufferPos0 || previousNormalize()) { - int c=buffer.codePointBefore(bufferPos); - bufferPos-=Character.charCount(c); - return c; - } else { - return DONE; - } - } - - /** - * Reset the index to the beginning of the text. - * This is equivalent to setIndexOnly(startIndex)). - * @stable ICU 2.8 - */ - public void reset() { - text.setIndex(0); - currentIndex=nextIndex=0; - clearBuffer(); - } - - /** - * Set the iteration position in the input text that is being normalized, - * without any immediate normalization. - * After setIndexOnly(), getIndex() will return the same index that is - * specified here. - * - * @param index the desired index in the input text. 
- * @stable ICU 2.8 - */ - public void setIndexOnly(int index) { - text.setIndex(index); // validates index - currentIndex=nextIndex=index; - clearBuffer(); - } - - /** - * Set the iteration position in the input text that is being normalized - * and return the first normalized character at that position. - *

- * Note: This method sets the position in the input text, - * while {@link #next} and {@link #previous} iterate through characters - * in the normalized output. This means that there is not - * necessarily a one-to-one correspondence between characters returned - * by {@code next} and {@code previous} and the indices passed to and - * returned from {@code setIndex} and {@link #getIndex}. - *

- * @param index the desired index in the input text. - * - * @return the first normalized character that is the result of iterating - * forward starting at the given index. - * - * @throws IllegalArgumentException if the given index is less than - * {@link #getBeginIndex} or greater than {@link #getEndIndex}. - * deprecated ICU 3.2 - * @obsolete ICU 3.2 - */ - public int setIndex(int index) { - setIndexOnly(index); - return current(); - } - - /** - * Retrieve the index of the start of the input text. This is the begin - * index of the {@code CharacterIterator} or the start (i.e. 0) of the - * {@code String} over which this {@code NormalizerBase} is iterating - * @deprecated ICU 2.2. Use startIndex() instead. - * @return The codepoint as an int - * @see #startIndex - */ - @Deprecated - public int getBeginIndex() { - return 0; - } - - /** - * Retrieve the index of the end of the input text. This is the end index - * of the {@code CharacterIterator} or the length of the {@code String} - * over which this {@code NormalizerBase} is iterating - * @deprecated ICU 2.2. Use endIndex() instead. - * @return The codepoint as an int - * @see #endIndex - */ - @Deprecated - public int getEndIndex() { - return endIndex(); - } - - /** - * Retrieve the current iteration position in the input text that is - * being normalized. This method is useful in applications such as - * searching, where you need to be able to determine the position in - * the input text that corresponds to a given normalized output character. - *

- * Note: This method sets the position in the input, while - * {@link #next} and {@link #previous} iterate through characters in the - * output. This means that there is not necessarily a one-to-one - * correspondence between characters returned by {@code next} and - * {@code previous} and the indices passed to and returned from - * {@code setIndex} and {@link #getIndex}. - * @return The current iteration position - * @stable ICU 2.8 - */ - public int getIndex() { - if(bufferPos - * Note:If the normalization mode is changed while iterating - * over a string, calls to {@link #next} and {@link #previous} may - * return previously buffers characters in the old normalization mode - * until the iteration is able to re-sync at the next base character. - * It is safest to call {@link #setText setText()}, {@link #first}, - * {@link #last}, etc. after calling {@code setMode}. - *

- * @param newMode the new mode for this {@code NormalizerBase}. - * The supported modes are: - *

- * - * @see #getMode - * @stable ICU 2.8 - */ - public void setMode(Mode newMode) { - mode = newMode; - norm2 = mode.getNormalizer2(options); - } - - /** - * Return the basic operation performed by this {@code NormalizerBase} - * - * @see #setMode - * @stable ICU 2.8 - */ - public Mode getMode() { - return mode; - } - - /** - * Set the input text over which this {@code NormalizerBase} will iterate. - * The iteration position is set to the beginning of the input text. - * @param newText The new string to be normalized. - * @stable ICU 2.8 - */ - public void setText(String newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); - if (newIter == null) { - throw new IllegalStateException("Could not create a new UCharacterIterator"); - } - text = newIter; - reset(); - } - - /** - * Set the input text over which this {@code NormalizerBase} will iterate. - * The iteration position is set to the beginning of the input text. - * @param newText The new string to be normalized. - * @stable ICU 2.8 - */ - public void setText(CharacterIterator newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); - if (newIter == null) { - throw new IllegalStateException("Could not create a new UCharacterIterator"); - } - text = newIter; - currentIndex=nextIndex=0; - clearBuffer(); - } - - private void clearBuffer() { - buffer.setLength(0); - bufferPos=0; - } - - private boolean nextNormalize() { - clearBuffer(); - currentIndex=nextIndex; - text.setIndex(nextIndex); - // Skip at least one character so we make progress. 
- int c=text.nextCodePoint(); - if(c<0) { - return false; - } - StringBuilder segment=new StringBuilder().appendCodePoint(c); - while((c=text.nextCodePoint())>=0) { - if(norm2.hasBoundaryBefore(c)) { - text.moveCodePointIndex(-1); - break; - } - segment.appendCodePoint(c); - } - nextIndex=text.getIndex(); - norm2.normalize(segment, buffer); - return buffer.length()!=0; - } - - private boolean previousNormalize() { - clearBuffer(); - nextIndex=currentIndex; - text.setIndex(currentIndex); - StringBuilder segment=new StringBuilder(); - int c; - while((c=text.previousCodePoint())>=0) { - if(c<=0xffff) { - segment.insert(0, (char)c); - } else { - segment.insert(0, Character.toChars(c)); - } - if(norm2.hasBoundaryBefore(c)) { - break; - } - } - currentIndex=text.getIndex(); - norm2.normalize(segment, buffer); - bufferPos=buffer.length(); - return buffer.length()!=0; - } - -} --- /dev/null 2020-01-10 13:50:38.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/NormalizerBase.java 2020-01-10 13:50:38.000000000 -0800 @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2000-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package jdk.internal.icu.text; + +import jdk.internal.icu.impl.Norm2AllModes; + +import java.text.CharacterIterator; +import java.text.Normalizer; + +/** + * Unicode Normalization + * + *

Unicode normalization API

+ * + * normalize transforms Unicode text into an equivalent composed or + * decomposed form, allowing for easier sorting and searching of text. + * normalize supports the standard normalization forms described in + * + * Unicode Standard Annex #15 — Unicode Normalization Forms. + * + * Characters with accents or other adornments can be encoded in + * several different ways in Unicode. For example, take the character A-acute. + * In Unicode, this can be encoded as a single character (the + * "composed" form): + * + *
+ *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
+ * 
+ * + * or as two separate characters (the "decomposed" form): + * + *
+ *      0041    LATIN CAPITAL LETTER A
+ *      0301    COMBINING ACUTE ACCENT
+ * 
+ * + * To a user of your program, however, both of these sequences should be + * treated as the same "user-level" character "A with acute accent". When you + * are searching or comparing text, you must ensure that these two sequences are + * treated equivalently. In addition, you must handle characters with more than + * one accent. Sometimes the order of a character's combining accents is + * significant, while in other cases accent sequences in different orders are + * really equivalent. + * + * Similarly, the string "ffi" can be encoded as three separate letters: + * + *
+ *      0066    LATIN SMALL LETTER F
+ *      0066    LATIN SMALL LETTER F
+ *      0069    LATIN SMALL LETTER I
+ * 
+ * + * or as the single character + * + *
+ *      FB03    LATIN SMALL LIGATURE FFI
+ * 
+ * + * The ffi ligature is not a distinct semantic character, and strictly speaking + * it shouldn't be in Unicode at all, but it was included for compatibility + * with existing character sets that already provided it. The Unicode standard + * identifies such characters by giving them "compatibility" decompositions + * into the corresponding semantic characters. When sorting and searching, you + * will often want to use these mappings. + * + * normalize helps solve these problems by transforming text into + * the canonical composed and decomposed forms as shown in the first example + * above. In addition, you can have it perform compatibility decompositions so + * that you can treat compatibility characters the same as their equivalents. + * Finally, normalize rearranges accents into the proper canonical + * order, so that you do not have to worry about accent rearrangement on your + * own. + * + * Form FCD, "Fast C or D", is also designed for collation. + * It allows working on strings that are not necessarily normalized + * with an algorithm (like in collation) that works under "canonical closure", + * i.e., it treats precomposed characters and their decomposed equivalents the + * same. + * + * It is not a normalization form because it does not provide for uniqueness of + * representation. Multiple strings may be canonically equivalent (their NFDs + * are identical) and may all conform to FCD without being identical themselves. + * + * The form is defined such that the "raw decomposition", the recursive + * canonical decomposition of each character, results in a string that is + * canonically ordered. This means that precomposed characters are allowed + * as long as their decompositions do not need canonical reordering. + * + * Its advantage for a process like collation is that all NFD and most NFC texts + * - and many unnormalized texts - already conform to FCD and do not need to be + * normalized (NFD) for such a process. 
The FCD quick check will return YES for + * most strings in practice. + * + * normalize(FCD) may be implemented with NFD. + * + * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): + * http://www.unicode.org/notes/tn5/#FCD + * + * ICU collation performs either NFD or FCD normalization automatically if + * normalization is turned on for the collator object. Beyond collation and + * string search, normalized strings may be useful for string equivalence + * comparisons, transliteration/transcription, unique representations, etc. + * + * The W3C generally recommends exchanging text in NFC. + * Note also that most legacy character encodings use only precomposed forms and + * often do not encode any combining marks by themselves. For conversion to such + * character encodings the Unicode text needs to be normalized to NFC. + * For more usage examples, see the Unicode Standard Annex. + * + * Note: The Normalizer class also provides an API for iterative normalization. + * While the setIndex() and getIndex() refer to indices in the + * underlying Unicode input text, the next() and previous() methods + * iterate through characters in the normalized output. + * This means that there is not necessarily a one-to-one correspondence + * between characters returned by next() and previous() and the indices + * passed to and returned from setIndex() and getIndex(). + * It is for this reason that Normalizer does not implement the CharacterIterator interface. + * + * @stable ICU 2.8 + */ +// Original filename in ICU4J: Normalizer.java +public final class NormalizerBase implements Cloneable { + + // The input text and our position in it + private UCharacterIterator text; + private Normalizer2 norm2; + private Mode mode; + private int options; + + // The normalization buffer is the result of normalization + // of the source in [currentIndex..nextIndex]. 
+ private int currentIndex; + private int nextIndex; + + // Buffer holding the normalized text of the current source segment; bufferPos is the read position within it + private StringBuilder buffer; + private int bufferPos; + + // Helper classes to defer loading of normalization data: each holder's static INSTANCE is created only when that holder class is first initialized, i.e. when a Mode first asks for it. + private static final class ModeImpl { + private ModeImpl(Normalizer2 n2) { + normalizer2 = n2; + } + private final Normalizer2 normalizer2; + } + + private static final class NFDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); + } + + private static final class NFKDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); + } + + private static final class NFCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); + } + + private static final class NFKCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); + } + + private static final class Unicode32 { + private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); + } + + private static final class NFD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFKD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFKC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), + Unicode32.INSTANCE)); + } + + /** + * Options bit set value to select Unicode 3.2 normalization + * (except NormalizationCorrections). + * At most one Unicode version can be selected at a time. 
+ * @stable ICU 2.6 + */ + public static final int UNICODE_3_2=0x20; + + public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; + + /* + * Default option for the latest Unicode normalization. This option is + * provided mainly for testing. + * The value zero means that normalization is done with the fixes for + * - Corrigendum 4 (Five CJK Canonical Mapping Errors) + * - Corrigendum 5 (Normalization Idempotency) + */ + public static final int UNICODE_LATEST = 0x00; + + /** + * Constant indicating that the end of the iteration has been reached. + * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. + * @stable ICU 2.8 + */ + public static final int DONE = UCharacterIterator.DONE; + + /** + * Constants for normalization modes. + *

+ * The Mode class is not intended for public subclassing. + * Only the Mode constants provided by the Normalizer class should be used, + * and any fields or methods should not be called or overridden by users. + * @stable ICU 2.8 + */ + public abstract static class Mode { + + /** + * Sole constructor + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected Mode() { + } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected abstract Normalizer2 getNormalizer2(int options); + } + + private static Mode toMode(Normalizer.Form form) { + switch (form) { + case NFC : + return NFC; + case NFD : + return NFD; + case NFKC : + return NFKC; + case NFKD : + return NFKD; + } + + throw new IllegalArgumentException("Unexpected normalization form: " + + form); + } + + private static final class NONEMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } + } + + private static final class NFDMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFD32ModeImpl.INSTANCE.normalizer2 : + NFDModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFKDMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFKD32ModeImpl.INSTANCE.normalizer2 : + NFKDModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFC32ModeImpl.INSTANCE.normalizer2 : + NFCModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFKCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFKC32ModeImpl.INSTANCE.normalizer2 : + NFKCModeImpl.INSTANCE.normalizer2; + } + } + + /** + * No decomposition/composition. 
+ * @stable ICU 2.8 + */ + public static final Mode NONE = new NONEMode(); + + /** + * Canonical decomposition. + * @stable ICU 2.8 + */ + public static final Mode NFD = new NFDMode(); + + /** + * Compatibility decomposition. + * @stable ICU 2.8 + */ + public static final Mode NFKD = new NFKDMode(); + + /** + * Canonical decomposition followed by canonical composition. + * @stable ICU 2.8 + */ + public static final Mode NFC = new NFCMode(); + + public static final Mode NFKC =new NFKCMode(); + + //------------------------------------------------------------------------- + // Iterator constructors + //------------------------------------------------------------------------- + + /** + * Creates a new {@code NormalizerBase} object for iterating over the + * normalized form of a given string. + *

+ * The {@code options} parameter specifies which optional + * {@code NormalizerBase} features are to be enabled for this object. + *

+ * @param str The string to be normalized. The normalization + * will start at the beginning of the string. + * + * @param mode The normalization mode. + * + * @param opt Any optional features to be enabled. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the + * standard Unicode Normalization Forms, use 0 for this argument. + * @stable ICU 2.6 + */ + public NormalizerBase(String str, Mode mode, int opt) { + this.text = UCharacterIterator.getInstance(str); + this.mode = mode; + this.options=opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(String str, Mode mode) { + this(str, mode, 0); + } + + + /** + * Creates a new {@code NormalizerBase} object for iterating over the + * normalized form of the given text. + *

+ * @param iter The input text to be normalized. The normalization + * will start at the beginning of the string. + * + * @param mode The normalization mode. + * + * @param opt Any optional features to be enabled. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the + * standard Unicode Normalization Forms, use 0 for this argument. + * @stable ICU 2.6 + */ + public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { + this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); + this.mode = mode; + this.options = opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(CharacterIterator iter, Mode mode) { + this(iter, mode, 0); + } + + /** + * Clones this {@code NormalizerBase} object. All properties of this + * object are duplicated in the new object, including the cloning of any + * {@link CharacterIterator} that was passed in to the constructor + * or to {@link #setText(CharacterIterator) setText}. + * However, the text storage underlying + * the {@code CharacterIterator} is not duplicated unless the + * iterator's {@code clone} method does so. + * @stable ICU 2.8 + */ + public Object clone() { + try { + NormalizerBase copy = (NormalizerBase) super.clone(); + copy.text = (UCharacterIterator) text.clone(); + copy.mode = mode; + copy.options = options; + copy.norm2 = norm2; + copy.buffer = new StringBuilder(buffer); + copy.bufferPos = bufferPos; + copy.currentIndex = currentIndex; + copy.nextIndex = nextIndex; + return copy; + } + catch (CloneNotSupportedException e) { + throw new InternalError(e.toString(), e); + } + } + + /** + * Normalizes a {@code String} using the given normalization operation. + *

+ * The {@code options} parameter specifies which optional + * {@code NormalizerBase} features are to be enabled for this operation. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the standard + * Unicode Normalization Forms, use 0 for this argument. + *

+ * @param str the input string to be normalized. + * @param mode the normalization mode + * @param options the optional features to be enabled. + * @return String the normalized string + * @stable ICU 2.6 + */ + public static String normalize(String str, Mode mode, int options) { + return mode.getNormalizer2(options).normalize(str); + } + + public static String normalize(String str, Normalizer.Form form) { + return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); + } + + public static String normalize(String str, Normalizer.Form form, int options) { + return NormalizerBase.normalize(str, toMode(form), options); + } + + /** + * Test if a string is in a given normalization form. + * This is semantically equivalent to source.equals(normalize(source, mode)). + * + * Unlike quickCheck(), this function returns a definitive result, + * never a "maybe". + * For NFD, NFKD, and FCD, both functions work exactly the same. + * For NFC and NFKC where quickCheck may return "maybe", this function will + * perform further tests to arrive at a true/false result. 
+ * @param str the input string to be checked to see if it is + * normalized + * @param mode the normalization mode + * @param options Options for use with exclusion set and tailored Normalization + * The only option that is currently recognized is UNICODE_3_2 + * @see #isNormalized + * @stable ICU 2.6 + */ + public static boolean isNormalized(String str, Mode mode, int options) { + return mode.getNormalizer2(options).isNormalized(str); + } + + public static boolean isNormalized(String str, Normalizer.Form form) { + return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); + } + + public static boolean isNormalized(String str, Normalizer.Form form, int options) { + return NormalizerBase.isNormalized(str, toMode(form), options); + } + + //------------------------------------------------------------------------- + // Iteration API + //------------------------------------------------------------------------- + + /** + * Return the current character in the normalized text. + * @return The codepoint as an int + * @stable ICU 2.8 + */ + public int current() { + if(bufferPos0 || previousNormalize()) { + int c=buffer.codePointBefore(bufferPos); + bufferPos-=Character.charCount(c); + return c; + } else { + return DONE; + } + } + + /** + * Reset the index to the beginning of the text. + * This is equivalent to setIndexOnly(startIndex)). + * @stable ICU 2.8 + */ + public void reset() { + text.setIndex(0); + currentIndex=nextIndex=0; + clearBuffer(); + } + + /** + * Set the iteration position in the input text that is being normalized, + * without any immediate normalization. + * After setIndexOnly(), getIndex() will return the same index that is + * specified here. + * + * @param index the desired index in the input text. 
+ * @stable ICU 2.8 + */ + public void setIndexOnly(int index) { + text.setIndex(index); // validates index + currentIndex=nextIndex=index; + clearBuffer(); + } + + /** + * Set the iteration position in the input text that is being normalized + * and return the first normalized character at that position. + *

+ * Note: This method sets the position in the input text, + * while {@link #next} and {@link #previous} iterate through characters + * in the normalized output. This means that there is not + * necessarily a one-to-one correspondence between characters returned + * by {@code next} and {@code previous} and the indices passed to and + * returned from {@code setIndex} and {@link #getIndex}. + *

+ * @param index the desired index in the input text. + * + * @return the first normalized character that is the result of iterating + * forward starting at the given index. + * + * @throws IllegalArgumentException if the given index is less than + * {@link #getBeginIndex} or greater than {@link #getEndIndex}. + * deprecated ICU 3.2 + * @obsolete ICU 3.2 + */ + public int setIndex(int index) { + setIndexOnly(index); + return current(); + } + + /** + * Retrieve the index of the start of the input text. This is the begin + * index of the {@code CharacterIterator} or the start (i.e. 0) of the + * {@code String} over which this {@code NormalizerBase} is iterating + * @deprecated ICU 2.2. Use startIndex() instead. + * @return The codepoint as an int + * @see #startIndex + */ + @Deprecated + public int getBeginIndex() { + return 0; + } + + /** + * Retrieve the index of the end of the input text. This is the end index + * of the {@code CharacterIterator} or the length of the {@code String} + * over which this {@code NormalizerBase} is iterating + * @deprecated ICU 2.2. Use endIndex() instead. + * @return The codepoint as an int + * @see #endIndex + */ + @Deprecated + public int getEndIndex() { + return endIndex(); + } + + /** + * Retrieve the current iteration position in the input text that is + * being normalized. This method is useful in applications such as + * searching, where you need to be able to determine the position in + * the input text that corresponds to a given normalized output character. + *

+ * Note: This method sets the position in the input, while + * {@link #next} and {@link #previous} iterate through characters in the + * output. This means that there is not necessarily a one-to-one + * correspondence between characters returned by {@code next} and + * {@code previous} and the indices passed to and returned from + * {@code setIndex} and {@link #getIndex}. + * @return The current iteration position + * @stable ICU 2.8 + */ + public int getIndex() { + if(bufferPos + * Note:If the normalization mode is changed while iterating + * over a string, calls to {@link #next} and {@link #previous} may + * return previously buffers characters in the old normalization mode + * until the iteration is able to re-sync at the next base character. + * It is safest to call {@link #setText setText()}, {@link #first}, + * {@link #last}, etc. after calling {@code setMode}. + *

+ * @param newMode the new mode for this {@code NormalizerBase}. + * The supported modes are: + *

+ * + * @see #getMode + * @stable ICU 2.8 + */ + public void setMode(Mode newMode) { + mode = newMode; + norm2 = mode.getNormalizer2(options); + } + + /** + * Return the basic operation performed by this {@code NormalizerBase} + * + * @see #setMode + * @stable ICU 2.8 + */ + public Mode getMode() { + return mode; + } + + /** + * Set the input text over which this {@code NormalizerBase} will iterate. + * The iteration position is set to the beginning of the input text. + * @param newText The new string to be normalized. + * @stable ICU 2.8 + */ + public void setText(String newText) { + UCharacterIterator newIter = UCharacterIterator.getInstance(newText); + if (newIter == null) { + throw new IllegalStateException("Could not create a new UCharacterIterator"); + } + text = newIter; + reset(); + } + + /** + * Set the input text over which this {@code NormalizerBase} will iterate. + * The iteration position is set to the beginning of the input text. + * @param newText The new string to be normalized. + * @stable ICU 2.8 + */ + public void setText(CharacterIterator newText) { + UCharacterIterator newIter = UCharacterIterator.getInstance(newText); + if (newIter == null) { + throw new IllegalStateException("Could not create a new UCharacterIterator"); + } + text = newIter; + currentIndex=nextIndex=0; + clearBuffer(); + } + + private void clearBuffer() { + buffer.setLength(0); + bufferPos=0; + } + + private boolean nextNormalize() { + clearBuffer(); + currentIndex=nextIndex; + text.setIndex(nextIndex); + // Skip at least one character so we make progress. 
+ int c=text.nextCodePoint(); + if(c<0) { + return false; + } + StringBuilder segment=new StringBuilder().appendCodePoint(c); + while((c=text.nextCodePoint())>=0) { + if(norm2.hasBoundaryBefore(c)) { + text.moveCodePointIndex(-1); + break; + } + segment.appendCodePoint(c); + } + nextIndex=text.getIndex(); + norm2.normalize(segment, buffer); + return buffer.length()!=0; + } + + private boolean previousNormalize() { + clearBuffer(); + nextIndex=currentIndex; + text.setIndex(currentIndex); + StringBuilder segment=new StringBuilder(); + int c; + while((c=text.previousCodePoint())>=0) { + if(c<=0xffff) { + segment.insert(0, (char)c); + } else { + segment.insert(0, Character.toChars(c)); + } + if(norm2.hasBoundaryBefore(c)) { + break; + } + } + currentIndex=text.getIndex(); + norm2.normalize(segment, buffer); + bufferPos=buffer.length(); + return buffer.length()!=0; + } + +}