--- old/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2020-01-10 15:58:00.000000000 -0800 +++ /dev/null 2020-01-10 15:58:00.000000000 -0800 @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2000-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -import java.text.CharacterIterator; -import java.text.Normalizer; - -/** - * Unicode Normalization - * - *
normalize
transforms Unicode text into an equivalent composed or
- * decomposed form, allowing for easier sorting and searching of text.
- * normalize
supports the standard normalization forms described in
- *
- * Unicode Standard Annex #15 — Unicode Normalization Forms.
- *
- * Characters with accents or other adornments can be encoded in
- * several different ways in Unicode. For example, take the character A-acute.
- * In Unicode, this can be encoded as a single character (the
- * "composed" form):
- *
- * - * 00C1 LATIN CAPITAL LETTER A WITH ACUTE - *- * - * or as two separate characters (the "decomposed" form): - * - *
- * 0041 LATIN CAPITAL LETTER A - * 0301 COMBINING ACUTE ACCENT - *- * - * To a user of your program, however, both of these sequences should be - * treated as the same "user-level" character "A with acute accent". When you - * are searching or comparing text, you must ensure that these two sequences are - * treated equivalently. In addition, you must handle characters with more than - * one accent. Sometimes the order of a character's combining accents is - * significant, while in other cases accent sequences in different orders are - * really equivalent. - * - * Similarly, the string "ffi" can be encoded as three separate letters: - * - *
- * 0066 LATIN SMALL LETTER F - * 0066 LATIN SMALL LETTER F - * 0069 LATIN SMALL LETTER I - *- * - * or as the single character - * - *
- * FB03 LATIN SMALL LIGATURE FFI - *- * - * The ffi ligature is not a distinct semantic character, and strictly speaking - * it shouldn't be in Unicode at all, but it was included for compatibility - * with existing character sets that already provided it. The Unicode standard - * identifies such characters by giving them "compatibility" decompositions - * into the corresponding semantic characters. When sorting and searching, you - * will often want to use these mappings. - * - *
normalize
helps solve these problems by transforming text into
- * the canonical composed and decomposed forms as shown in the first example
- * above. In addition, you can have it perform compatibility decompositions so
- * that you can treat compatibility characters the same as their equivalents.
- * Finally, normalize
rearranges accents into the proper canonical
- * order, so that you do not have to worry about accent rearrangement on your
- * own.
- *
- * Form FCD, "Fast C or D", is also designed for collation.
- * It allows to work on strings that are not necessarily normalized
- * with an algorithm (like in collation) that works under "canonical closure",
- * i.e., it treats precomposed characters and their decomposed equivalents the
- * same.
- *
- * It is not a normalization form because it does not provide for uniqueness of
- * representation. Multiple strings may be canonically equivalent (their NFDs
- * are identical) and may all conform to FCD without being identical themselves.
- *
- * The form is defined such that the "raw decomposition", the recursive
- * canonical decomposition of each character, results in a string that is
- * canonically ordered. This means that precomposed characters are allowed for
- * as long as their decompositions do not need canonical reordering.
- *
- * Its advantage for a process like collation is that all NFD and most NFC texts
- * - and many unnormalized texts - already conform to FCD and do not need to be
- * normalized (NFD) for such a process. The FCD quick check will return YES for
- * most strings in practice.
- *
- * normalize(FCD) may be implemented with NFD.
- *
- * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
- * http://www.unicode.org/notes/tn5/#FCD
- *
- * ICU collation performs either NFD or FCD normalization automatically if
- * normalization is turned on for the collator object. Beyond collation and
- * string search, normalized strings may be useful for string equivalence
- * comparisons, transliteration/transcription, unique representations, etc.
- *
- * The W3C generally recommends to exchange texts in NFC.
- * Note also that most legacy character encodings use only precomposed forms and
- * often do not encode any combining marks by themselves. For conversion to such
- * character encodings the Unicode text needs to be normalized to NFC.
- * For more usage examples, see the Unicode Standard Annex.
- *
- * Note: The Normalizer class also provides API for iterative normalization.
- * While the setIndex() and getIndex() refer to indices in the
- * underlying Unicode input text, the next() and previous() methods
- * iterate through characters in the normalized output.
- * This means that there is not necessarily a one-to-one correspondence
- * between characters returned by next() and previous() and the indices
- * passed to and returned from setIndex() and getIndex().
- * It is for this reason that Normalizer does not implement the CharacterIterator interface.
- *
- * @stable ICU 2.8
- */
-// Original filename in ICU4J: Normalizer.java
-public final class NormalizerBase implements Cloneable {
-
- // The input text and our position in it
- private UCharacterIterator text;
- private Normalizer2 norm2;
- private Mode mode;
- private int options;
-
- // The normalization buffer is the result of normalization
- // of the source in [currentIndex..nextIndex] .
- private int currentIndex;
- private int nextIndex;
-
- // A buffer for holding intermediate results
- private StringBuilder buffer;
- private int bufferPos;
-
- // Helper classes to defer loading of normalization data.
- private static final class ModeImpl {
- private ModeImpl(Normalizer2 n2) {
- normalizer2 = n2;
- }
- private final Normalizer2 normalizer2;
- }
-
- private static final class NFDModeImpl {
- private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
- }
-
- private static final class NFKDModeImpl {
- private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
- }
-
- private static final class NFCModeImpl {
- private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
- }
-
- private static final class NFKCModeImpl {
- private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
- }
-
- private static final class Unicode32 {
- private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
- }
-
- private static final class NFD32ModeImpl {
- private static final ModeImpl INSTANCE =
- new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
- Unicode32.INSTANCE));
- }
-
- private static final class NFKD32ModeImpl {
- private static final ModeImpl INSTANCE =
- new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
- Unicode32.INSTANCE));
- }
-
- private static final class NFC32ModeImpl {
- private static final ModeImpl INSTANCE =
- new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
- Unicode32.INSTANCE));
- }
-
- private static final class NFKC32ModeImpl {
- private static final ModeImpl INSTANCE =
- new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
- Unicode32.INSTANCE));
- }
-
- /**
- * Options bit set value to select Unicode 3.2 normalization
- * (except NormalizationCorrections).
- * At most one Unicode version can be selected at a time.
- * @stable ICU 2.6
- */
- public static final int UNICODE_3_2=0x20;
-
- public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
-
- /*
- * Default option for the latest Unicode normalization. This option is
- * provided mainly for testing.
- * The value zero means that normalization is done with the fixes for
- * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
- * - Corrigendum 5 (Normalization Idempotency)
- */
- public static final int UNICODE_LATEST = 0x00;
-
- /**
- * Constant indicating that the end of the iteration has been reached.
- * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
- * @stable ICU 2.8
- */
- public static final int DONE = UCharacterIterator.DONE;
-
- /**
- * Constants for normalization modes.
- * - * The Mode class is not intended for public subclassing. - * Only the Mode constants provided by the Normalizer class should be used, - * and any fields or methods should not be called or overridden by users. - * @stable ICU 2.8 - */ - public abstract static class Mode { - - /** - * Sole constructor - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected Mode() { - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected abstract Normalizer2 getNormalizer2(int options); - } - - private static Mode toMode(Normalizer.Form form) { - switch (form) { - case NFC : - return NFC; - case NFD : - return NFD; - case NFKC : - return NFKC; - case NFKD : - return NFKD; - } - - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } - - private static final class NONEMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } - } - - private static final class NFDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFD32ModeImpl.INSTANCE.normalizer2 : - NFDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKD32ModeImpl.INSTANCE.normalizer2 : - NFKDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFC32ModeImpl.INSTANCE.normalizer2 : - NFCModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKC32ModeImpl.INSTANCE.normalizer2 : - NFKCModeImpl.INSTANCE.normalizer2; - } - } - - /** - * No decomposition/composition. 
- * @stable ICU 2.8 - */ - public static final Mode NONE = new NONEMode(); - - /** - * Canonical decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFD = new NFDMode(); - - /** - * Compatibility decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFKD = new NFKDMode(); - - /** - * Canonical decomposition followed by canonical composition. - * @stable ICU 2.8 - */ - public static final Mode NFC = new NFCMode(); - - public static final Mode NFKC =new NFKCMode(); - - //------------------------------------------------------------------------- - // Iterator constructors - //------------------------------------------------------------------------- - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of a given string. - *
- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this object. - *
- * @param str The string to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(String str, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance(str); - this.mode = mode; - this.options=opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(String str, Mode mode) { - this(str, mode, 0); - } - - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of the given text. - *
- * @param iter The input text to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); - this.mode = mode; - this.options = opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(CharacterIterator iter, Mode mode) { - this(iter, mode, 0); - } - - /** - * Clones this {@code NormalizerBase} object. All properties of this - * object are duplicated in the new object, including the cloning of any - * {@link CharacterIterator} that was passed in to the constructor - * or to {@link #setText(CharacterIterator) setText}. - * However, the text storage underlying - * the {@code CharacterIterator} is not duplicated unless the - * iterator's {@code clone} method does so. - * @stable ICU 2.8 - */ - public Object clone() { - try { - NormalizerBase copy = (NormalizerBase) super.clone(); - copy.text = (UCharacterIterator) text.clone(); - copy.mode = mode; - copy.options = options; - copy.norm2 = norm2; - copy.buffer = new StringBuilder(buffer); - copy.bufferPos = bufferPos; - copy.currentIndex = currentIndex; - copy.nextIndex = nextIndex; - return copy; - } - catch (CloneNotSupportedException e) { - throw new InternalError(e.toString(), e); - } - } - - /** - * Normalizes a {@code String} using the given normalization operation. - *
- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this operation. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the standard - * Unicode Normalization Forms, use 0 for this argument. - *
- * @param str the input string to be normalized.
- * @param mode the normalization mode
- * @param options the optional features to be enabled.
- * @return String the normalized string
- * @stable ICU 2.6
- */
- public static String normalize(String str, Mode mode, int options) {
- return mode.getNormalizer2(options).normalize(str);
- }
-
- public static String normalize(String str, Normalizer.Form form) {
- return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
- }
-
- public static String normalize(String str, Normalizer.Form form, int options) {
- return NormalizerBase.normalize(str, toMode(form), options);
- }
-
- /**
- * Test if a string is in a given normalization form.
- * This is semantically equivalent to source.equals(normalize(source, mode)).
- *
- * Unlike quickCheck(), this function returns a definitive result,
- * never a "maybe".
- * For NFD, NFKD, and FCD, both functions work exactly the same.
- * For NFC and NFKC where quickCheck may return "maybe", this function will
- * perform further tests to arrive at a true/false result.
- * @param str the input string to be checked to see if it is
- * normalized
- * @param mode the normalization mode
- * @param options Options for use with exclusion set and tailored Normalization
- * The only option that is currently recognized is UNICODE_3_2
- * @see #isNormalized
- * @stable ICU 2.6
- */
- public static boolean isNormalized(String str, Mode mode, int options) {
- return mode.getNormalizer2(options).isNormalized(str);
- }
-
- public static boolean isNormalized(String str, Normalizer.Form form) {
- return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
- }
-
- public static boolean isNormalized(String str, Normalizer.Form form, int options) {
- return NormalizerBase.isNormalized(str, toMode(form), options);
- }
-
- //-------------------------------------------------------------------------
- // Iteration API
- //-------------------------------------------------------------------------
-
- /**
- * Return the current character in the normalized text.
- * @return The codepoint as an int
- * @stable ICU 2.8
- */
- public int current() {
- if(bufferPos
- * Note: This method sets the position in the input text,
- * while {@link #next} and {@link #previous} iterate through characters
- * in the normalized output. This means that there is not
- * necessarily a one-to-one correspondence between characters returned
- * by {@code next} and {@code previous} and the indices passed to and
- * returned from {@code setIndex} and {@link #getIndex}.
- *
- * @param index the desired index in the input text.
- *
- * @return the first normalized character that is the result of iterating
- * forward starting at the given index.
- *
- * @throws IllegalArgumentException if the given index is less than
- * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
- * deprecated ICU 3.2
- * @obsolete ICU 3.2
- */
- public int setIndex(int index) {
- setIndexOnly(index);
- return current();
- }
-
- /**
- * Retrieve the index of the start of the input text. This is the begin
- * index of the {@code CharacterIterator} or the start (i.e. 0) of the
- * {@code String} over which this {@code NormalizerBase} is iterating
- * @deprecated ICU 2.2. Use startIndex() instead.
- * @return The codepoint as an int
- * @see #startIndex
- */
- @Deprecated
- public int getBeginIndex() {
- return 0;
- }
-
- /**
- * Retrieve the index of the end of the input text. This is the end index
- * of the {@code CharacterIterator} or the length of the {@code String}
- * over which this {@code NormalizerBase} is iterating
- * @deprecated ICU 2.2. Use endIndex() instead.
- * @return The codepoint as an int
- * @see #endIndex
- */
- @Deprecated
- public int getEndIndex() {
- return endIndex();
- }
-
- /**
- * Retrieve the current iteration position in the input text that is
- * being normalized. This method is useful in applications such as
- * searching, where you need to be able to determine the position in
- * the input text that corresponds to a given normalized output character.
- *
- * Note: This method sets the position in the input, while
- * {@link #next} and {@link #previous} iterate through characters in the
- * output. This means that there is not necessarily a one-to-one
- * correspondence between characters returned by {@code next} and
- * {@code previous} and the indices passed to and returned from
- * {@code setIndex} and {@link #getIndex}.
- * @return The current iteration position
- * @stable ICU 2.8
- */
- public int getIndex() {
- if(bufferPos
- * @param newMode the new mode for this {@code NormalizerBase}.
- * The supported modes are:
- *
+ * The Mode class is not intended for public subclassing.
+ * Only the Mode constants provided by the Normalizer class should be used,
+ * and any fields or methods should not be called or overridden by users.
+ * @stable ICU 2.8
+ */
+ public abstract static class Mode {
+
+ /**
+ * Sole constructor
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ protected Mode() {
+ }
+
+ /**
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ protected abstract Normalizer2 getNormalizer2(int options);
+ }
+
+ private static Mode toMode(Normalizer.Form form) {
+ switch (form) {
+ case NFC :
+ return NFC;
+ case NFD :
+ return NFD;
+ case NFKC :
+ return NFKC;
+ case NFKD :
+ return NFKD;
+ }
+
+ throw new IllegalArgumentException("Unexpected normalization form: " +
+ form);
+ }
+
+ private static final class NONEMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
+ }
+
+ private static final class NFDMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFD32ModeImpl.INSTANCE.normalizer2 :
+ NFDModeImpl.INSTANCE.normalizer2;
+ }
+ }
+
+ private static final class NFKDMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKD32ModeImpl.INSTANCE.normalizer2 :
+ NFKDModeImpl.INSTANCE.normalizer2;
+ }
+ }
+
+ private static final class NFCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFC32ModeImpl.INSTANCE.normalizer2 :
+ NFCModeImpl.INSTANCE.normalizer2;
+ }
+ }
+
+ private static final class NFKCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKC32ModeImpl.INSTANCE.normalizer2 :
+ NFKCModeImpl.INSTANCE.normalizer2;
+ }
+ }
+
+ /**
+ * No decomposition/composition.
+ * @stable ICU 2.8
+ */
+ public static final Mode NONE = new NONEMode();
+
+ /**
+ * Canonical decomposition.
+ * @stable ICU 2.8
+ */
+ public static final Mode NFD = new NFDMode();
+
+ /**
+ * Compatibility decomposition.
+ * @stable ICU 2.8
+ */
+ public static final Mode NFKD = new NFKDMode();
+
+ /**
+ * Canonical decomposition followed by canonical composition.
+ * @stable ICU 2.8
+ */
+ public static final Mode NFC = new NFCMode();
+
+ public static final Mode NFKC =new NFKCMode();
+
+ //-------------------------------------------------------------------------
+ // Iterator constructors
+ //-------------------------------------------------------------------------
+
+ /**
+ * Creates a new {@code NormalizerBase} object for iterating over the
+ * normalized form of a given string.
+ *
+ * The {@code opt} parameter specifies which optional
+ * {@code NormalizerBase} features are to be enabled for this object.
+ *
+ * @param str The string to be normalized. The normalization
+ * will start at the beginning of the string.
+ *
+ * @param mode The normalization mode.
+ *
+ * @param opt Any optional features to be enabled.
+ * Currently the only available option is {@link #UNICODE_3_2}.
+ * If you want the default behavior corresponding to one of the
+ * standard Unicode Normalization Forms, use 0 for this argument.
+ * @stable ICU 2.6
+ */
+ public NormalizerBase(String str, Mode mode, int opt) {
+ this.text = UCharacterIterator.getInstance(str);
+ this.mode = mode;
+ this.options=opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
+ }
+
+ public NormalizerBase(String str, Mode mode) {
+ this(str, mode, 0);
+ }
+
+
+ /**
+ * Creates a new {@code NormalizerBase} object for iterating over the
+ * normalized form of the given text.
+ *
+ * @param iter The input text to be normalized. The normalization
+ * will start at the beginning of the string.
+ *
+ * @param mode The normalization mode.
+ *
+ * @param opt Any optional features to be enabled.
+ * Currently the only available option is {@link #UNICODE_3_2}.
+ * If you want the default behavior corresponding to one of the
+ * standard Unicode Normalization Forms, use 0 for this argument.
+ * @stable ICU 2.6
+ */
+ public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
+ this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
+ this.mode = mode;
+ this.options = opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
+ }
+
+ public NormalizerBase(CharacterIterator iter, Mode mode) {
+ this(iter, mode, 0);
+ }
+
+ /**
+ * Clones this {@code NormalizerBase} object. All properties of this
+ * object are duplicated in the new object, including the cloning of any
+ * {@link CharacterIterator} that was passed in to the constructor
+ * or to {@link #setText(CharacterIterator) setText}.
+ * However, the text storage underlying
+ * the {@code CharacterIterator} is not duplicated unless the
+ * iterator's {@code clone} method does so.
+ * @stable ICU 2.8
+ */
+ public Object clone() {
+ try {
+ NormalizerBase copy = (NormalizerBase) super.clone();
+ copy.text = (UCharacterIterator) text.clone();
+ copy.mode = mode;
+ copy.options = options;
+ copy.norm2 = norm2;
+ copy.buffer = new StringBuilder(buffer);
+ copy.bufferPos = bufferPos;
+ copy.currentIndex = currentIndex;
+ copy.nextIndex = nextIndex;
+ return copy;
+ }
+ catch (CloneNotSupportedException e) {
+ throw new InternalError(e.toString(), e);
+ }
+ }
+
+ /**
+ * Normalizes a {@code String} using the given normalization operation.
+ *
+ * The {@code options} parameter specifies which optional
+ * {@code NormalizerBase} features are to be enabled for this operation.
+ * Currently the only available option is {@link #UNICODE_3_2}.
+ * If you want the default behavior corresponding to one of the standard
+ * Unicode Normalization Forms, use 0 for this argument.
+ *
+ * @param str the input string to be normalized.
+ * @param mode the normalization mode
+ * @param options the optional features to be enabled.
+ * @return the normalized string
+ * @stable ICU 2.6
+ */
+ public static String normalize(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).normalize(str);
+ }
+
+ public static String normalize(String str, Normalizer.Form form) {
+ return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
+ }
+
+ public static String normalize(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.normalize(str, toMode(form), options);
+ }
+
+ /**
+ * Test if a string is in a given normalization form.
+ * This is semantically equivalent to str.equals(normalize(str, mode, options)).
+ *
+ * Unlike quickCheck(), this function returns a definitive result,
+ * never a "maybe".
+ * For NFD, NFKD, and FCD, both functions work exactly the same.
+ * For NFC and NFKC where quickCheck may return "maybe", this function will
+ * perform further tests to arrive at a true/false result.
+ * @param str the input string to be checked to see if it is
+ * normalized
+ * @param mode the normalization mode
+ * @param options Options for use with exclusion set and tailored Normalization
+ * The only option that is currently recognized is UNICODE_3_2
+ * @see #isNormalized
+ * @stable ICU 2.6
+ */
+ public static boolean isNormalized(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).isNormalized(str);
+ }
+
+ public static boolean isNormalized(String str, Normalizer.Form form) {
+ return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
+ }
+
+ public static boolean isNormalized(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.isNormalized(str, toMode(form), options);
+ }
+
+ //-------------------------------------------------------------------------
+ // Iteration API
+ //-------------------------------------------------------------------------
+
+ /**
+ * Return the current character in the normalized text.
+ * @return The codepoint as an int
+ * @stable ICU 2.8
+ */
+ public int current() {
+ if(bufferPos
+ * Note: This method sets the position in the input text,
+ * while {@link #next} and {@link #previous} iterate through characters
+ * in the normalized output. This means that there is not
+ * necessarily a one-to-one correspondence between characters returned
+ * by {@code next} and {@code previous} and the indices passed to and
+ * returned from {@code setIndex} and {@link #getIndex}.
+ *
+ * @param index the desired index in the input text.
+ *
+ * @return the first normalized character that is the result of iterating
+ * forward starting at the given index.
+ *
+ * @throws IllegalArgumentException if the given index is less than
+ * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
+ * deprecated ICU 3.2
+ * @obsolete ICU 3.2
+ */
+ public int setIndex(int index) {
+ setIndexOnly(index);
+ return current();
+ }
+
+ /**
+ * Retrieve the index of the start of the input text. This is the begin
+ * index of the {@code CharacterIterator} or the start (i.e. 0) of the
+ * {@code String} over which this {@code NormalizerBase} is iterating
+ * @deprecated ICU 2.2. Use startIndex() instead.
+ * @return The begin index of the input text, which is always 0
+ * @see #startIndex
+ */
+ @Deprecated
+ public int getBeginIndex() {
+ return 0;
+ }
+
+ /**
+ * Retrieve the index of the end of the input text. This is the end index
+ * of the {@code CharacterIterator} or the length of the {@code String}
+ * over which this {@code NormalizerBase} is iterating
+ * @deprecated ICU 2.2. Use endIndex() instead.
+ * @return The end index of the input text
+ * @see #endIndex
+ */
+ @Deprecated
+ public int getEndIndex() {
+ return endIndex();
+ }
+
+ /**
+ * Retrieve the current iteration position in the input text that is
+ * being normalized. This method is useful in applications such as
+ * searching, where you need to be able to determine the position in
+ * the input text that corresponds to a given normalized output character.
+ *
+ * Note: This method returns the position in the input, while
+ * {@link #next} and {@link #previous} iterate through characters in the
+ * output. This means that there is not necessarily a one-to-one
+ * correspondence between characters returned by {@code next} and
+ * {@code previous} and the indices passed to and returned from
+ * {@code setIndex} and {@link #getIndex}.
+ * @return The current iteration position
+ * @stable ICU 2.8
+ */
+ public int getIndex() {
+ if(bufferPos
+ * @param newMode the new mode for this {@code NormalizerBase}.
+ * The supported modes are:
+ *
- *
- *
- * @see #getMode
- * @stable ICU 2.8
- */
- public void setMode(Mode newMode) {
- mode = newMode;
- norm2 = mode.getNormalizer2(options);
- }
-
- /**
- * Return the basic operation performed by this {@code NormalizerBase}
- *
- * @see #setMode
- * @stable ICU 2.8
- */
- public Mode getMode() {
- return mode;
- }
-
- /**
- * Set the input text over which this {@code NormalizerBase} will iterate.
- * The iteration position is set to the beginning of the input text.
- * @param newText The new string to be normalized.
- * @stable ICU 2.8
- */
- public void setText(String newText) {
- UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
- if (newIter == null) {
- throw new IllegalStateException("Could not create a new UCharacterIterator");
- }
- text = newIter;
- reset();
- }
-
- /**
- * Set the input text over which this {@code NormalizerBase} will iterate.
- * The iteration position is set to the beginning of the input text.
- * @param newText The new string to be normalized.
- * @stable ICU 2.8
- */
- public void setText(CharacterIterator newText) {
- UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
- if (newIter == null) {
- throw new IllegalStateException("Could not create a new UCharacterIterator");
- }
- text = newIter;
- currentIndex=nextIndex=0;
- clearBuffer();
- }
-
- private void clearBuffer() {
- buffer.setLength(0);
- bufferPos=0;
- }
-
- private boolean nextNormalize() {
- clearBuffer();
- currentIndex=nextIndex;
- text.setIndex(nextIndex);
- // Skip at least one character so we make progress.
- int c=text.nextCodePoint();
- if(c<0) {
- return false;
- }
- StringBuilder segment=new StringBuilder().appendCodePoint(c);
- while((c=text.nextCodePoint())>=0) {
- if(norm2.hasBoundaryBefore(c)) {
- text.moveCodePointIndex(-1);
- break;
- }
- segment.appendCodePoint(c);
- }
- nextIndex=text.getIndex();
- norm2.normalize(segment, buffer);
- return buffer.length()!=0;
- }
-
- private boolean previousNormalize() {
- clearBuffer();
- nextIndex=currentIndex;
- text.setIndex(currentIndex);
- StringBuilder segment=new StringBuilder();
- int c;
- while((c=text.previousCodePoint())>=0) {
- if(c<=0xffff) {
- segment.insert(0, (char)c);
- } else {
- segment.insert(0, Character.toChars(c));
- }
- if(norm2.hasBoundaryBefore(c)) {
- break;
- }
- }
- currentIndex=text.getIndex();
- norm2.normalize(segment, buffer);
- bufferPos=buffer.length();
- return buffer.length()!=0;
- }
-
-}
--- /dev/null 2020-01-10 15:58:00.000000000 -0800
+++ new/src/java.base/share/classes/jdk/internal/icu/text/NormalizerBase.java 2020-01-10 15:58:00.000000000 -0800
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2000-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+package jdk.internal.icu.text;
+
+import jdk.internal.icu.impl.Norm2AllModes;
+
+import java.text.CharacterIterator;
+import java.text.Normalizer;
+
+/**
+ * Unicode Normalization
+ *
+ * Unicode normalization API
+ *
+ * {@code normalize} transforms Unicode text into an equivalent composed or
+ * decomposed form, allowing for easier sorting and searching of text.
+ * {@code normalize} supports the standard normalization forms described in
+ *
+ * Unicode Standard Annex #15 — Unicode Normalization Forms.
+ *
+ * Characters with accents or other adornments can be encoded in
+ * several different ways in Unicode. For example, take the character A-acute.
+ * In Unicode, this can be encoded as a single character (the
+ * "composed" form):
+ *
+ *
+ * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
+ *
+ *
+ * or as two separate characters (the "decomposed" form):
+ *
+ *
+ * 0041 LATIN CAPITAL LETTER A
+ * 0301 COMBINING ACUTE ACCENT
+ *
+ *
+ * To a user of your program, however, both of these sequences should be
+ * treated as the same "user-level" character "A with acute accent". When you
+ * are searching or comparing text, you must ensure that these two sequences are
+ * treated equivalently. In addition, you must handle characters with more than
+ * one accent. Sometimes the order of a character's combining accents is
+ * significant, while in other cases accent sequences in different orders are
+ * really equivalent.
+ *
+ * Similarly, the string "ffi" can be encoded as three separate letters:
+ *
+ *
+ * 0066 LATIN SMALL LETTER F
+ * 0066 LATIN SMALL LETTER F
+ * 0069 LATIN SMALL LETTER I
+ *
+ *
+ * or as the single character
+ *
+ *
+ * FB03 LATIN SMALL LIGATURE FFI
+ *
+ *
+ * The ffi ligature is not a distinct semantic character, and strictly speaking
+ * it shouldn't be in Unicode at all, but it was included for compatibility
+ * with existing character sets that already provided it. The Unicode standard
+ * identifies such characters by giving them "compatibility" decompositions
+ * into the corresponding semantic characters. When sorting and searching, you
+ * will often want to use these mappings.
+ *
+ * {@code normalize} helps solve these problems by transforming text into
+ * the canonical composed and decomposed forms as shown in the first example
+ * above. In addition, you can have it perform compatibility decompositions so
+ * that you can treat compatibility characters the same as their equivalents.
+ * Finally, {@code normalize} rearranges accents into the proper canonical
+ * order, so that you do not have to worry about accent rearrangement on your
+ * own.
+ *
+ * Form FCD, "Fast C or D", is also designed for collation.
+ * It allows working on strings that are not necessarily normalized
+ * with an algorithm (like in collation) that works under "canonical closure",
+ * i.e., it treats precomposed characters and their decomposed equivalents the
+ * same.
+ *
+ * It is not a normalization form because it does not provide for uniqueness of
+ * representation. Multiple strings may be canonically equivalent (their NFDs
+ * are identical) and may all conform to FCD without being identical themselves.
+ *
+ * The form is defined such that the "raw decomposition", the recursive
+ * canonical decomposition of each character, results in a string that is
+ * canonically ordered. This means that precomposed characters are allowed for
+ * as long as their decompositions do not need canonical reordering.
+ *
+ * Its advantage for a process like collation is that all NFD and most NFC texts
+ * - and many unnormalized texts - already conform to FCD and do not need to be
+ * normalized (NFD) for such a process. The FCD quick check will return YES for
+ * most strings in practice.
+ *
+ * normalize(FCD) may be implemented with NFD.
+ *
+ * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
+ * http://www.unicode.org/notes/tn5/#FCD
+ *
+ * ICU collation performs either NFD or FCD normalization automatically if
+ * normalization is turned on for the collator object. Beyond collation and
+ * string search, normalized strings may be useful for string equivalence
+ * comparisons, transliteration/transcription, unique representations, etc.
+ *
+ * The W3C generally recommends exchanging text in NFC.
+ * Note also that most legacy character encodings use only precomposed forms and
+ * often do not encode any combining marks by themselves. For conversion to such
+ * character encodings the Unicode text needs to be normalized to NFC.
+ * For more usage examples, see the Unicode Standard Annex.
+ *
+ * Note: The Normalizer class also provides API for iterative normalization.
+ * While the setIndex() and getIndex() refer to indices in the
+ * underlying Unicode input text, the next() and previous() methods
+ * iterate through characters in the normalized output.
+ * This means that there is not necessarily a one-to-one correspondence
+ * between characters returned by next() and previous() and the indices
+ * passed to and returned from setIndex() and getIndex().
+ * It is for this reason that Normalizer does not implement the CharacterIterator interface.
+ *
+ * @stable ICU 2.8
+ */
+// Original filename in ICU4J: Normalizer.java
+public final class NormalizerBase implements Cloneable {
+
+ // The input text and our position in it
+ private UCharacterIterator text;
+ private Normalizer2 norm2; // assigned from mode.getNormalizer2(options) in setMode
+ private Mode mode;
+ private int options; // option bits passed to mode.getNormalizer2
+
+ // The normalization buffer is the result of normalization
+ // of the source in [currentIndex..nextIndex] .
+ private int currentIndex;
+ private int nextIndex;
+
+ // A buffer for holding intermediate results
+ private StringBuilder buffer;
+ private int bufferPos; // position within buffer: 0 after clearBuffer(), buffer.length() after previousNormalize()
+
+ // Helper classes to defer loading of normalization data.
+ /** Immutable wrapper around the {@code Normalizer2} that backs one normalization mode. */
+ private static final class ModeImpl {
+     private final Normalizer2 normalizer2;
+
+     private ModeImpl(Normalizer2 n2) {
+         this.normalizer2 = n2;
+     }
+ }
+
+ /** Holder class: the NFD {@code ModeImpl} is created when this class is first initialized. */
+ private static final class NFDModeImpl {
+     private static final ModeImpl INSTANCE =
+             new ModeImpl(Normalizer2.getNFDInstance());
+ }
+
+ /** Holder class: the NFKD {@code ModeImpl} is created when this class is first initialized. */
+ private static final class NFKDModeImpl {
+     private static final ModeImpl INSTANCE =
+             new ModeImpl(Normalizer2.getNFKDInstance());
+ }
+
+ /** Holder class: the NFC {@code ModeImpl} is created when this class is first initialized. */
+ private static final class NFCModeImpl {
+     private static final ModeImpl INSTANCE =
+             new ModeImpl(Normalizer2.getNFCInstance());
+ }
+
+ /** Holder class: the NFKC {@code ModeImpl} is created when this class is first initialized. */
+ private static final class NFKCModeImpl {
+     private static final ModeImpl INSTANCE =
+             new ModeImpl(Normalizer2.getNFKCInstance());
+ }
+
+ /** Holder class for the frozen {@code UnicodeSet} built from the pattern {@code [:age=3.2:]}. */
+ private static final class Unicode32 {
+     private static final UnicodeSet INSTANCE =
+             new UnicodeSet("[:age=3.2:]").freeze();
+ }
+
+ /**
+  * Holder class: wraps the NFD normalizer in a {@code FilteredNormalizer2}
+  * that uses {@code Unicode32.INSTANCE} as its filter set.
+  */
+ private static final class NFD32ModeImpl {
+     private static final ModeImpl INSTANCE = new ModeImpl(
+             new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE));
+ }
+
+ /**
+  * Holder class: wraps the NFKD normalizer in a {@code FilteredNormalizer2}
+  * that uses {@code Unicode32.INSTANCE} as its filter set.
+  */
+ private static final class NFKD32ModeImpl {
+     private static final ModeImpl INSTANCE = new ModeImpl(
+             new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE));
+ }
+
+ /**
+  * Holder class: wraps the NFC normalizer in a {@code FilteredNormalizer2}
+  * that uses {@code Unicode32.INSTANCE} as its filter set.
+  */
+ private static final class NFC32ModeImpl {
+     private static final ModeImpl INSTANCE = new ModeImpl(
+             new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE));
+ }
+
+ /**
+  * Holder class: wraps the NFKC normalizer in a {@code FilteredNormalizer2}
+  * that uses {@code Unicode32.INSTANCE} as its filter set.
+  */
+ private static final class NFKC32ModeImpl {
+     private static final ModeImpl INSTANCE = new ModeImpl(
+             new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE));
+ }
+
+ /**
+ * Options bit set value to select Unicode 3.2 normalization
+ * (except NormalizationCorrections).
+ * At most one Unicode version can be selected at a time.
+ * @stable ICU 2.6
+ */
+ public static final int UNICODE_3_2=0x20;
+
+ public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; // alias: same value as UNICODE_3_2
+
+ /*
+ * Default option for the latest Unicode normalization. This option is
+ * provided mainly for testing.
+ * The value zero means that normalization is done with the fixes for
+ * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
+ * - Corrigendum 5 (Normalization Idempotency)
+ */
+ public static final int UNICODE_LATEST = 0x00;
+
+ /**
+ * Constant indicating that the end of the iteration has been reached.
+ * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
+ * @stable ICU 2.8
+ */
+ public static final int DONE = UCharacterIterator.DONE;
+
+ /**
+  * Sets the normalization mode of this {@code NormalizerBase} and replaces
+  * the active {@code Normalizer2} with the one the new mode supplies for
+  * the current options.
+  *
+  * @param newMode the mode to switch to
+  * @see #getMode
+  * @stable ICU 2.8
+  */
+ public void setMode(Mode newMode) {
+     this.mode = newMode;
+     this.norm2 = newMode.getNormalizer2(this.options);
+ }
+
+ /**
+  * Returns the normalization mode currently held by this
+  * {@code NormalizerBase}.
+  *
+  * @return the current mode
+  * @see #setMode
+  * @stable ICU 2.8
+  */
+ public Mode getMode() {
+     return this.mode;
+ }
+
+ /**
+  * Replaces the input text with the given string and calls {@code reset()}
+  * so that iteration restarts at the beginning of the new text.
+  *
+  * @param newText the new string to be normalized
+  * @throws IllegalStateException if no {@code UCharacterIterator} could be
+  *         created for {@code newText}
+  * @stable ICU 2.8
+  */
+ public void setText(String newText) {
+     final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
+     if (replacement == null) {
+         throw new IllegalStateException("Could not create a new UCharacterIterator");
+     }
+     this.text = replacement;
+     reset();
+ }
+
+ /**
+  * Replaces the input text with the characters supplied by the given
+  * iterator. Both source indices are set back to zero and the
+  * normalization buffer is emptied.
+  *
+  * @param newText the character iterator over the new text to be normalized
+  * @throws IllegalStateException if no {@code UCharacterIterator} could be
+  *         created for {@code newText}
+  * @stable ICU 2.8
+  */
+ public void setText(CharacterIterator newText) {
+     final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
+     if (replacement == null) {
+         throw new IllegalStateException("Could not create a new UCharacterIterator");
+     }
+     this.text = replacement;
+     this.nextIndex = 0;
+     this.currentIndex = 0;
+     clearBuffer();
+ }
+
+ /** Empties the normalization buffer and moves the buffer position back to zero. */
+ private void clearBuffer() {
+     this.buffer.setLength(0);
+     this.bufferPos = 0;
+ }
+
+ private boolean nextNormalize() {
+ clearBuffer();
+ currentIndex=nextIndex;
+ text.setIndex(nextIndex);
+ // Skip at least one character so we make progress.
+ int c=text.nextCodePoint();
+ if(c<0) {
+ return false;
+ }
+ StringBuilder segment=new StringBuilder().appendCodePoint(c);
+ while((c=text.nextCodePoint())>=0) {
+ if(norm2.hasBoundaryBefore(c)) {
+ text.moveCodePointIndex(-1);
+ break;
+ }
+ segment.appendCodePoint(c);
+ }
+ nextIndex=text.getIndex();
+ norm2.normalize(segment, buffer);
+ return buffer.length()!=0;
+ }
+
+ private boolean previousNormalize() {
+ clearBuffer();
+ nextIndex=currentIndex;
+ text.setIndex(currentIndex);
+ StringBuilder segment=new StringBuilder();
+ int c;
+ while((c=text.previousCodePoint())>=0) {
+ if(c<=0xffff) {
+ segment.insert(0, (char)c);
+ } else {
+ segment.insert(0, Character.toChars(c));
+ }
+ if(norm2.hasBoundaryBefore(c)) {
+ break;
+ }
+ }
+ currentIndex=text.getIndex();
+ norm2.normalize(segment, buffer);
+ bufferPos=buffer.length();
+ return buffer.length()!=0;
+ }
+
+}