< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 20,41 **** * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* ! * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * ! * * ! * The original version of this source code and documentation is copyrighted * ! * and owned by IBM, These materials are provided under terms of a License * ! * Agreement between IBM and Sun. This technology is protected by multiple * ! * US and International patents. This notice and attribution to IBM may not * ! * to removed. * ******************************************************************************* */ - package sun.text.normalizer; import java.text.CharacterIterator; import java.text.Normalizer; --- 20,36 ---- * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* ! * Copyright (C) 2000-2014, International Business Machines Corporation and ! * others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; import java.text.CharacterIterator; import java.text.Normalizer;
*** 123,134 **** * normalized (NFD) for such a process. The FCD quick check will return YES for * most strings in practice. * * normalize(FCD) may be implemented with NFD. * ! * For more details on FCD see the collation design document: ! * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm * * ICU collation performs either NFD or FCD normalization automatically if * normalization is turned on for the collator object. Beyond collation and * string search, normalized strings may be useful for string equivalence * comparisons, transliteration/transcription, unique representations, etc. --- 118,129 ---- * normalized (NFD) for such a process. The FCD quick check will return YES for * most strings in practice. * * normalize(FCD) may be implemented with NFD. * ! * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): ! * http://www.unicode.org/notes/tn5/#FCD * * ICU collation performs either NFD or FCD normalization automatically if * normalization is turned on for the collator object. Beyond collation and * string search, normalized strings may be useful for string equivalence * comparisons, transliteration/transcription, unique representations, etc.
*** 136,568 **** * The W3C generally recommends to exchange texts in NFC. * Note also that most legacy character encodings use only precomposed forms and * often do not encode any combining marks by themselves. For conversion to such * character encodings the Unicode text needs to be normalized to NFC. * For more usage examples, see the Unicode Standard Annex. * @stable ICU 2.8 */ ! public final class NormalizerBase implements Cloneable { - //------------------------------------------------------------------------- - // Private data - //------------------------------------------------------------------------- - private char[] buffer = new char[100]; - private int bufferStart = 0; - private int bufferPos = 0; - private int bufferLimit = 0; - // The input text and our position in it private UCharacterIterator text; ! private Mode mode = NFC; ! private int options = 0; private int currentIndex; private int nextIndex; ! /** ! * Options bit set value to select Unicode 3.2 normalization ! * (except NormalizationCorrections). ! * At most one Unicode version can be selected at a time. ! * @stable ICU 2.6 ! */ ! public static final int UNICODE_3_2=0x20; ! /** ! * Constant indicating that the end of the iteration has been reached. ! * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. ! * @stable ICU 2.8 ! */ ! public static final int DONE = UCharacterIterator.DONE; ! /** ! * Constants for normalization modes. ! * @stable ICU 2.8 ! */ ! public static class Mode { ! private int modeValue; ! private Mode(int value) { ! modeValue = value; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.6 ! */ ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! UnicodeSet nx) { ! int srcLen = (srcLimit - srcStart); ! int destLen = (destLimit - destStart); ! if( srcLen > destLen ) { ! return srcLen; } ! System.arraycopy(src,srcStart,dest,destStart,srcLen); ! return srcLen; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.6 ! */ ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! int options) { ! return normalize( src, srcStart, srcLimit, ! dest,destStart,destLimit, ! NormalizerImpl.getNX(options) ! ); } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.6 ! */ ! protected String normalize(String src, int options) { ! return src; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.8 ! */ ! protected int getMinC() { ! return -1; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.8 ! */ ! protected int getMask() { ! return -1; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.8 ! */ ! protected IsPrevBoundary getPrevBoundary() { ! return null; } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.8 ! */ ! protected IsNextBoundary getNextBoundary() { ! return null; } /** ! * This method is used for method dispatch * @stable ICU 2.6 */ ! protected QuickCheckResult quickCheck(char[] src,int start, int limit, ! boolean allowMaybe,UnicodeSet nx) { ! if(allowMaybe) { ! return MAYBE; ! } ! return NO; ! } ! /** ! * This method is used for method dispatch ! * @stable ICU 2.8 */ ! protected boolean isNFSkippable(int c) { ! return true; ! } ! } /** ! * No decomposition/composition. * @stable ICU 2.8 */ ! public static final Mode NONE = new Mode(1); /** ! * Canonical decomposition. * @stable ICU 2.8 */ ! public static final Mode NFD = new NFDMode(2); ! ! private static final class NFDMode extends Mode { ! private NFDMode(int value) { ! super(value); ! } ! ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! UnicodeSet nx) { ! int[] trailCC = new int[1]; ! return NormalizerImpl.decompose(src, srcStart,srcLimit, ! dest, destStart,destLimit, ! false, trailCC,nx); ! } ! protected String normalize( String src, int options) { ! return decompose(src,false,options); ! } ! ! protected int getMinC() { ! return NormalizerImpl.MIN_WITH_LEAD_CC; } ! protected IsPrevBoundary getPrevBoundary() { ! return new IsPrevNFDSafe(); } ! protected IsNextBoundary getNextBoundary() { ! return new IsNextNFDSafe(); } ! protected int getMask() { ! return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD); } ! protected QuickCheckResult quickCheck(char[] src,int start, ! int limit,boolean allowMaybe, ! UnicodeSet nx) { ! return NormalizerImpl.quickCheck( ! src, start,limit, ! NormalizerImpl.getFromIndexesArr( ! NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE ! ), ! NormalizerImpl.QC_NFD, ! 0, ! allowMaybe, ! nx ! ); } ! protected boolean isNFSkippable(int c) { ! return NormalizerImpl.isNFSkippable(c,this, ! (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD) ! ); } } - /** - * Compatibility decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFKD = new NFKDMode(3); - private static final class NFKDMode extends Mode { ! private NFKDMode(int value) { ! super(value); ! } ! ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! UnicodeSet nx) { ! int[] trailCC = new int[1]; ! return NormalizerImpl.decompose(src, srcStart,srcLimit, ! dest, destStart,destLimit, ! true, trailCC, nx); ! } ! ! protected String normalize( String src, int options) { ! return decompose(src,true,options); ! } ! ! protected int getMinC() { ! return NormalizerImpl.MIN_WITH_LEAD_CC; ! } ! ! protected IsPrevBoundary getPrevBoundary() { ! return new IsPrevNFDSafe(); } - - protected IsNextBoundary getNextBoundary() { - return new IsNextNFDSafe(); } ! protected int getMask() { ! return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD); } - - protected QuickCheckResult quickCheck(char[] src,int start, - int limit,boolean allowMaybe, - UnicodeSet nx) { - return NormalizerImpl.quickCheck( - src,start,limit, - NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE - ), - NormalizerImpl.QC_NFKD, - NormalizerImpl.OPTIONS_COMPAT, - allowMaybe, - nx - ); } ! protected boolean isNFSkippable(int c) { ! return NormalizerImpl.isNFSkippable(c, this, ! (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD) ! ); } } /** ! * Canonical decomposition followed by canonical composition. * @stable ICU 2.8 */ ! public static final Mode NFC = new NFCMode(4); ! ! private static final class NFCMode extends Mode{ ! private NFCMode(int value) { ! super(value); ! } ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! UnicodeSet nx) { ! return NormalizerImpl.compose( src, srcStart, srcLimit, ! dest,destStart,destLimit, ! 0, nx); ! } ! ! protected String normalize( String src, int options) { ! return compose(src, false, options); ! } ! ! protected int getMinC() { ! return NormalizerImpl.getFromIndexesArr( ! NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE ! ); ! } ! protected IsPrevBoundary getPrevBoundary() { ! return new IsPrevTrueStarter(); ! } ! protected IsNextBoundary getNextBoundary() { ! return new IsNextTrueStarter(); ! } ! protected int getMask() { ! return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC); ! } ! protected QuickCheckResult quickCheck(char[] src,int start, ! int limit,boolean allowMaybe, ! UnicodeSet nx) { ! return NormalizerImpl.quickCheck( ! src,start,limit, ! NormalizerImpl.getFromIndexesArr( ! NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE ! ), ! NormalizerImpl.QC_NFC, ! 0, ! allowMaybe, ! nx ! ); ! } ! protected boolean isNFSkippable(int c) { ! return NormalizerImpl.isNFSkippable(c,this, ! ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY| ! (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO) ! ) ! ); ! } ! }; /** ! * Compatibility decomposition followed by canonical composition. * @stable ICU 2.8 */ ! public static final Mode NFKC =new NFKCMode(5); ! ! private static final class NFKCMode extends Mode{ ! private NFKCMode(int value) { ! super(value); ! } ! protected int normalize(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! UnicodeSet nx) { ! return NormalizerImpl.compose(src, srcStart,srcLimit, ! dest, destStart,destLimit, ! NormalizerImpl.OPTIONS_COMPAT, nx); ! } ! ! protected String normalize( String src, int options) { ! return compose(src, true, options); ! } ! protected int getMinC() { ! return NormalizerImpl.getFromIndexesArr( ! NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE ! ); ! } ! protected IsPrevBoundary getPrevBoundary() { ! return new IsPrevTrueStarter(); ! } ! protected IsNextBoundary getNextBoundary() { ! return new IsNextTrueStarter(); ! } ! protected int getMask() { ! return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC); ! } ! protected QuickCheckResult quickCheck(char[] src,int start, ! int limit,boolean allowMaybe, ! UnicodeSet nx) { ! return NormalizerImpl.quickCheck( ! src,start,limit, ! NormalizerImpl.getFromIndexesArr( ! NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE ! ), ! NormalizerImpl.QC_NFKC, ! NormalizerImpl.OPTIONS_COMPAT, ! allowMaybe, ! nx ! ); ! } ! protected boolean isNFSkippable(int c) { ! return NormalizerImpl.isNFSkippable(c, this, ! ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY| ! (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO) ! ) ! ); ! } ! }; /** ! * Result values for quickCheck(). ! * For details see Unicode Technical Report 15. ! * @stable ICU 2.8 ! */ ! public static final class QuickCheckResult{ ! private int resultValue; ! private QuickCheckResult(int value) { ! resultValue=value; ! } ! } ! /** ! * Indicates that string is not in the normalized format * @stable ICU 2.8 */ ! public static final QuickCheckResult NO = new QuickCheckResult(0); /** ! * Indicates that string is in the normalized format * @stable ICU 2.8 */ ! public static final QuickCheckResult YES = new QuickCheckResult(1); ! /** ! * Indicates it cannot be determined if string is in the normalized ! * format without further thorough checks. ! * @stable ICU 2.8 ! */ ! public static final QuickCheckResult MAYBE = new QuickCheckResult(2); //------------------------------------------------------------------------- ! // Constructors //------------------------------------------------------------------------- /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of a given string. * <p> * The {@code options} parameter specifies which optional ! * {@code Normalizer} features are to be enabled for this object. ! * * @param str The string to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * --- 131,364 ---- * The W3C generally recommends to exchange texts in NFC. * Note also that most legacy character encodings use only precomposed forms and * often do not encode any combining marks by themselves. For conversion to such * character encodings the Unicode text needs to be normalized to NFC. * For more usage examples, see the Unicode Standard Annex. + * + * Note: The Normalizer class also provides API for iterative normalization. + * While the setIndex() and getIndex() refer to indices in the + * underlying Unicode input text, the next() and previous() methods + * iterate through characters in the normalized output. + * This means that there is not necessarily a one-to-one correspondence + * between characters returned by next() and previous() and the indices + * passed to and returned from setIndex() and getIndex(). + * It is for this reason that Normalizer does not implement the CharacterIterator interface. + * * @stable ICU 2.8 */ ! // Original filename in ICU4J: Normalizer.java public final class NormalizerBase implements Cloneable { // The input text and our position in it private UCharacterIterator text; ! private Normalizer2 norm2; ! private Mode mode; ! private int options; ! ! // The normalization buffer is the result of normalization ! // of the source in [currentIndex..nextIndex] . private int currentIndex; private int nextIndex; ! // A buffer for holding intermediate results ! private StringBuilder buffer; ! private int bufferPos; ! // Helper classes to defer loading of normalization data. ! private static final class ModeImpl { ! private ModeImpl(Normalizer2 n2) { ! normalizer2 = n2; ! } ! private final Normalizer2 normalizer2; ! } ! private static final class NFDModeImpl { ! private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); } ! private static final class NFKDModeImpl { ! private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); } ! ! private static final class NFCModeImpl { ! private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); } ! private static final class NFKCModeImpl { ! private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); } ! private static final class Unicode32 { ! private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); } ! private static final class NFD32ModeImpl { ! private static final ModeImpl INSTANCE = ! new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), ! Unicode32.INSTANCE)); } ! private static final class NFKD32ModeImpl { ! private static final ModeImpl INSTANCE = ! new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), ! Unicode32.INSTANCE)); } ! private static final class NFC32ModeImpl { ! private static final ModeImpl INSTANCE = ! new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), ! Unicode32.INSTANCE)); } ! private static final class NFKC32ModeImpl { ! private static final ModeImpl INSTANCE = ! new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), ! Unicode32.INSTANCE)); } /** ! * Options bit set value to select Unicode 3.2 normalization ! * (except NormalizationCorrections). ! * At most one Unicode version can be selected at a time. * @stable ICU 2.6 */ ! public static final int UNICODE_3_2=0x20; ! public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; ! ! /* ! * Default option for the latest Unicode normalization. This option is ! * provided mainly for testing. ! * The value zero means that normalization is done with the fixes for ! * - Corrigendum 4 (Five CJK Canonical Mapping Errors) ! * - Corrigendum 5 (Normalization Idempotency) */ ! public static final int UNICODE_LATEST = 0x00; /** ! * Constant indicating that the end of the iteration has been reached. ! * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. * @stable ICU 2.8 */ ! public static final int DONE = UCharacterIterator.DONE; /** ! * Constants for normalization modes. ! * <p> ! * The Mode class is not intended for public subclassing. ! * Only the Mode constants provided by the Normalizer class should be used, ! * and any fields or methods should not be called or overridden by users. * @stable ICU 2.8 */ ! public static abstract class Mode { ! /** ! * Sole constructor ! * @internal ! * @deprecated This API is ICU internal only. ! */ ! @Deprecated ! protected Mode() { } ! /** ! * @internal ! * @deprecated This API is ICU internal only. ! */ ! @Deprecated ! protected abstract Normalizer2 getNormalizer2(int options); } ! private static Mode toMode(Normalizer.Form form) { ! switch (form) { ! case NFC : ! return NFC; ! case NFD : ! return NFD; ! case NFKC : ! return NFKC; ! case NFKD : ! return NFKD; } ! throw new IllegalArgumentException("Unexpected normalization form: " + ! form); } ! private static final class NONEMode extends Mode { ! protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } } ! private static final class NFDMode extends Mode { ! protected Normalizer2 getNormalizer2(int options) { ! return (options&UNICODE_3_2) != 0 ? ! NFD32ModeImpl.INSTANCE.normalizer2 : ! NFDModeImpl.INSTANCE.normalizer2; } } private static final class NFKDMode extends Mode { ! protected Normalizer2 getNormalizer2(int options) { ! return (options&UNICODE_3_2) != 0 ? ! NFKD32ModeImpl.INSTANCE.normalizer2 : ! NFKDModeImpl.INSTANCE.normalizer2; } } ! private static final class NFCMode extends Mode { ! protected Normalizer2 getNormalizer2(int options) { ! return (options&UNICODE_3_2) != 0 ? ! NFC32ModeImpl.INSTANCE.normalizer2 : ! NFCModeImpl.INSTANCE.normalizer2; } } ! private static final class NFKCMode extends Mode { ! protected Normalizer2 getNormalizer2(int options) { ! return (options&UNICODE_3_2) != 0 ? ! NFKC32ModeImpl.INSTANCE.normalizer2 : ! NFKCModeImpl.INSTANCE.normalizer2; } } /** ! * No decomposition/composition. * @stable ICU 2.8 */ ! public static final Mode NONE = new NONEMode(); /** ! * Canonical decomposition. * @stable ICU 2.8 */ ! public static final Mode NFD = new NFDMode(); /** ! * Compatibility decomposition. * @stable ICU 2.8 */ ! public static final Mode NFKD = new NFKDMode(); /** ! * Canonical decomposition followed by canonical composition. * @stable ICU 2.8 */ ! public static final Mode NFC = new NFCMode(); ! public static final Mode NFKC =new NFKCMode(); //------------------------------------------------------------------------- ! // Iterator constructors //------------------------------------------------------------------------- /** ! * Creates a new {@code NormalizerBase} object for iterating over the * normalized form of a given string. * <p> * The {@code options} parameter specifies which optional ! * {@code NormalizerBase} features are to be enabled for this object. ! * <p> * @param str The string to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. *
*** 574,602 **** */ public NormalizerBase(String str, Mode mode, int opt) { this.text = UCharacterIterator.getInstance(str); this.mode = mode; this.options=opt; } ! /** ! * Creates a new {@code Normalizer} object for iterating over the ! * normalized form of the given text. ! * ! * @param iter The input text to be normalized. The normalization ! * will start at the beginning of the string. ! * ! * @param mode The normalization mode. ! */ ! public NormalizerBase(CharacterIterator iter, Mode mode) { ! this(iter, mode, UNICODE_LATEST); } /** ! * Creates a new {@code Normalizer} object for iterating over the * normalized form of the given text. ! * * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. * --- 370,392 ---- */ public NormalizerBase(String str, Mode mode, int opt) { this.text = UCharacterIterator.getInstance(str); this.mode = mode; this.options=opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); } ! public NormalizerBase(String str, Mode mode) { ! this(str, mode, 0); } + /** ! * Creates a new {@code NormalizerBase} object for iterating over the * normalized form of the given text. ! * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * * @param mode The normalization mode. *
*** 605,623 **** * If you want the default behavior corresponding to one of the * standard Unicode Normalization Forms, use 0 for this argument. * @stable ICU 2.6 */ public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { ! this.text = UCharacterIterator.getInstance( ! (CharacterIterator)iter.clone() ! ); this.mode = mode; this.options = opt; } /** ! * Clones this {@code Normalizer} object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. * However, the text storage underlying * the {@code CharacterIterator} is not duplicated unless the --- 395,417 ---- * If you want the default behavior corresponding to one of the * standard Unicode Normalization Forms, use 0 for this argument. * @stable ICU 2.6 */ public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { ! this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); this.mode = mode; this.options = opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(CharacterIterator iter, Mode mode) { + this(iter, mode, 0); } /** ! * Clones this {@code NormalizerBase} object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. * However, the text storage underlying * the {@code CharacterIterator} is not duplicated unless the
*** 626,791 **** */ public Object clone() { try { NormalizerBase copy = (NormalizerBase) super.clone(); copy.text = (UCharacterIterator) text.clone(); ! //clone the internal buffer ! if (buffer != null) { ! copy.buffer = new char[buffer.length]; ! System.arraycopy(buffer,0,copy.buffer,0,buffer.length); ! } return copy; } catch (CloneNotSupportedException e) { throw new InternalError(e.toString(), e); } } - //-------------------------------------------------------------------------- - // Static Utility methods - //-------------------------------------------------------------------------- - /** ! * Compose a string. ! * The string will be composed according to the specified mode. ! * @param str The string to compose. ! * @param compat If true the string will be composed according to ! * NFKC rules and if false will be composed according to ! * NFC rules. ! * @param options The only recognized option is UNICODE_3_2 ! * @return String The composed string * @stable ICU 2.6 */ ! public static String compose(String str, boolean compat, int options) { ! ! char[] dest, src; ! if (options == UNICODE_3_2_0_ORIGINAL) { ! String mappedStr = NormalizerImpl.convert(str); ! dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE]; ! src = mappedStr.toCharArray(); ! } else { ! dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE]; ! src = str.toCharArray(); } - int destSize=0; - - UnicodeSet nx = NormalizerImpl.getNX(options); ! /* reset options bits that should only be set here or inside compose() */ ! options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS); ! ! if(compat) { ! options|=NormalizerImpl.OPTIONS_COMPAT; ! } ! ! for(;;) { ! destSize=NormalizerImpl.compose(src,0,src.length, ! dest,0,dest.length,options, ! nx); ! if(destSize<=dest.length) { ! return new String(dest,0,destSize); ! } else { ! dest = new char[destSize]; ! } ! } } ! private static final int MAX_BUF_SIZE_COMPOSE = 2; ! private static final int MAX_BUF_SIZE_DECOMPOSE = 3; ! ! /** ! * Decompose a string. ! * The string will be decomposed according to the specified mode. ! * @param str The string to decompose. ! * @param compat If true the string will be decomposed according to NFKD ! * rules and if false will be decomposed according to NFD ! * rules. ! * @return String The decomposed string ! * @stable ICU 2.8 ! */ ! public static String decompose(String str, boolean compat) { ! return decompose(str,compat,UNICODE_LATEST); } /** ! * Decompose a string. ! * The string will be decomposed according to the specified mode. ! * @param str The string to decompose. ! * @param compat If true the string will be decomposed according to NFKD ! * rules and if false will be decomposed according to NFD ! * rules. ! * @param options The normalization options, ORed together (0 for no options). ! * @return String The decomposed string * @stable ICU 2.6 */ ! public static String decompose(String str, boolean compat, int options) { ! ! int[] trailCC = new int[1]; ! int destSize=0; ! UnicodeSet nx = NormalizerImpl.getNX(options); ! char[] dest; ! ! if (options == UNICODE_3_2_0_ORIGINAL) { ! String mappedStr = NormalizerImpl.convert(str); ! dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE]; ! ! for(;;) { ! destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(), ! dest,0,dest.length, ! compat,trailCC, nx); ! if(destSize<=dest.length) { ! return new String(dest,0,destSize); ! } else { ! dest = new char[destSize]; } - } - } else { - dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE]; ! for(;;) { ! destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(), ! dest,0,dest.length, ! compat,trailCC, nx); ! if(destSize<=dest.length) { ! return new String(dest,0,destSize); ! } else { ! dest = new char[destSize]; ! } ! } ! } } ! /** ! * Normalize a string. ! * The string will be normalized according to the specified normalization ! * mode and options. ! * @param src The char array to compose. ! * @param srcStart Start index of the source ! * @param srcLimit Limit index of the source ! * @param dest The char buffer to fill in ! * @param destStart Start index of the destination buffer ! * @param destLimit End index of the destination buffer ! * @param mode The normalization mode; one of Normalizer.NONE, ! * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, ! * Normalizer.NFKD, Normalizer.DEFAULT ! * @param options The normalization options, ORed together (0 for no options). ! * @return int The total buffer size needed;if greater than length of ! * result, the output was truncated. ! * @exception IndexOutOfBoundsException if the target capacity is ! * less than the required length ! * @stable ICU 2.6 ! */ ! public static int normalize(char[] src,int srcStart, int srcLimit, ! char[] dest,int destStart, int destLimit, ! Mode mode, int options) { ! int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); ! ! if(length<=(destLimit-destStart)) { ! return length; ! } else { ! throw new IndexOutOfBoundsException(Integer.toString(length)); ! } } //------------------------------------------------------------------------- // Iteration API //------------------------------------------------------------------------- --- 420,497 ---- */ public Object clone() { try { NormalizerBase copy = (NormalizerBase) super.clone(); copy.text = (UCharacterIterator) text.clone(); ! copy.mode = mode; ! copy.options = options; ! copy.norm2 = norm2; ! copy.buffer = new StringBuilder(buffer); ! copy.bufferPos = bufferPos; ! copy.currentIndex = currentIndex; ! copy.nextIndex = nextIndex; return copy; } catch (CloneNotSupportedException e) { throw new InternalError(e.toString(), e); } } /** ! * Normalizes a {@code String} using the given normalization operation. ! * <p> ! * The {@code options} parameter specifies which optional ! * {@code NormalizerBase} features are to be enabled for this operation. ! * Currently the only available option is {@link #UNICODE_3_2}. ! * If you want the default behavior corresponding to one of the standard ! * Unicode Normalization Forms, use 0 for this argument. ! * <p> ! * @param str the input string to be normalized. ! * @param mode the normalization mode ! * @param options the optional features to be enabled. ! * @return String the normalized string * @stable ICU 2.6 */ ! public static String normalize(String str, Mode mode, int options) { ! return mode.getNormalizer2(options).normalize(str); } ! public static String normalize(String str, Normalizer.Form form) { ! return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); } ! public static String normalize(String str, Normalizer.Form form, int options) { ! return NormalizerBase.normalize(str, toMode(form), options); } /** ! * Test if a string is in a given normalization form. ! * This is semantically equivalent to source.equals(normalize(source, mode)). ! * ! * Unlike quickCheck(), this function returns a definitive result, ! * never a "maybe". ! * For NFD, NFKD, and FCD, both functions work exactly the same. ! * For NFC and NFKC where quickCheck may return "maybe", this function will ! * perform further tests to arrive at a true/false result. ! * @param str the input string to be checked to see if it is ! * normalized ! * @param mode the normalization mode ! * @param options Options for use with exclusion set and tailored Normalization ! * The only option that is currently recognized is UNICODE_3_2 ! * @see #isNormalized * @stable ICU 2.6 */ ! public static boolean isNormalized(String str, Mode mode, int options) { ! return mode.getNormalizer2(options).isNormalized(str); } ! public static boolean isNormalized(String str, Normalizer.Form form) { ! return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); } ! public static boolean isNormalized(String str, Normalizer.Form form, int options) { ! return NormalizerBase.isNormalized(str, toMode(form), options); } //------------------------------------------------------------------------- // Iteration API //-------------------------------------------------------------------------
*** 794,805 **** * Return the current character in the normalized text. * @return The codepoint as an int * @stable ICU 2.8 */ public int current() { ! if(bufferPos<bufferLimit || nextNormalize()) { ! return getCodePointAt(bufferPos); } else { return DONE; } } --- 500,511 ---- * Return the current character in the normalized text. * @return The codepoint as an int * @stable ICU 2.8 */ public int current() { ! if(bufferPos<buffer.length() || nextNormalize()) { ! return buffer.codePointAt(bufferPos); } else { return DONE; } }
*** 809,839 **** * of the text has already been reached, {@link #DONE} is returned. * @return The codepoint as an int * @stable ICU 2.8 */ public int next() { ! if(bufferPos<bufferLimit || nextNormalize()) { ! int c=getCodePointAt(bufferPos); ! bufferPos+=(c>0xFFFF) ? 2 : 1; return c; } else { return DONE; } } - /** * Return the previous character in the normalized text and decrement * the iteration position by one. If the beginning * of the text has already been reached, {@link #DONE} is returned. * @return The codepoint as an int * @stable ICU 2.8 */ public int previous() { if(bufferPos>0 || previousNormalize()) { ! int c=getCodePointAt(bufferPos-1); ! bufferPos-=(c>0xFFFF) ? 2 : 1; return c; } else { return DONE; } } --- 515,544 ---- * of the text has already been reached, {@link #DONE} is returned. * @return The codepoint as an int * @stable ICU 2.8 */ public int next() { ! if(bufferPos<buffer.length() || nextNormalize()) { ! int c=buffer.codePointAt(bufferPos); ! bufferPos+=Character.charCount(c); return c; } else { return DONE; } } /** * Return the previous character in the normalized text and decrement * the iteration position by one. If the beginning * of the text has already been reached, {@link #DONE} is returned. * @return The codepoint as an int * @stable ICU 2.8 */ public int previous() { if(bufferPos>0 || previousNormalize()) { ! int c=buffer.codePointBefore(bufferPos); ! bufferPos-=Character.charCount(c); return c; } else { return DONE; } }
*** 857,868 **** * * @param index the desired index in the input text. * @stable ICU 2.8 */ public void setIndexOnly(int index) { ! text.setIndex(index); ! currentIndex=nextIndex=index; // validates index clearBuffer(); } /** * Set the iteration position in the input text that is being normalized --- 562,573 ---- * * @param index the desired index in the input text. * @stable ICU 2.8 */ public void setIndexOnly(int index) { ! text.setIndex(index); // validates index ! currentIndex=nextIndex=index; clearBuffer(); } /** * Set the iteration position in the input text that is being normalized
*** 872,903 **** * while {@link #next} and {@link #previous} iterate through characters * in the normalized <em>output</em>. This means that there is not * necessarily a one-to-one correspondence between characters returned * by {@code next} and {@code previous} and the indices passed to and * returned from {@code setIndex} and {@link #getIndex}. ! * * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than {@link #getEndIndex}. ! * @return The codepoint as an int ! * @deprecated ICU 3.2 * @obsolete ICU 3.2 */ - @Deprecated public int setIndex(int index) { setIndexOnly(index); return current(); } /** * Retrieve the index of the start of the input text. This is the begin * index of the {@code CharacterIterator} or the start (i.e. 0) of the ! * {@code String} over which this {@code Normalizer} is iterating * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex */ @Deprecated --- 577,606 ---- * while {@link #next} and {@link #previous} iterate through characters * in the normalized <em>output</em>. This means that there is not * necessarily a one-to-one correspondence between characters returned * by {@code next} and {@code previous} and the indices passed to and * returned from {@code setIndex} and {@link #getIndex}. ! * <p> * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than {@link #getEndIndex}. ! * deprecated ICU 3.2 * @obsolete ICU 3.2 */ public int setIndex(int index) { setIndexOnly(index); return current(); } /** * Retrieve the index of the start of the input text. This is the begin * index of the {@code CharacterIterator} or the start (i.e. 0) of the ! * {@code String} over which this {@code NormalizerBase} is iterating * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex */ @Deprecated
*** 906,916 **** } /** * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code Normalizer} is iterating * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex */ @Deprecated --- 609,619 ---- } /** * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code NormalizerBase} is iterating * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex */ @Deprecated
*** 932,1682 **** * {@code setIndex} and {@link #getIndex}. * @return The current iteration position * @stable ICU 2.8 */ public int getIndex() { ! if(bufferPos<bufferLimit) { return currentIndex; } else { return nextIndex; } } /** * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code Normalizer} is iterating * @return The current iteration position * @stable ICU 2.8 */ public int endIndex() { return text.getLength(); } //------------------------------------------------------------------------- ! // Property access methods //------------------------------------------------------------------------- /** * Set the normalization mode for this object. * <p> * <b>Note:</b>If the normalization mode is changed while iterating * over a string, calls to {@link #next} and {@link #previous} may * return previously buffers characters in the old normalization mode * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling {@code setMode}. ! * ! * @param newMode the new mode for this {@code Normalizer}. * The supported modes are: * <ul> ! * <li>{@link #COMPOSE} - Unicode canonical decompositiion * followed by canonical composition. ! * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion * follwed by canonical composition. ! * <li>{@link #DECOMP} - Unicode canonical decomposition ! * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition. ! * <li>{@link #NO_OP} - Do nothing but return characters * from the underlying input text. * </ul> * * @see #getMode * @stable ICU 2.8 */ public void setMode(Mode newMode) { mode = newMode; } /** ! * Return the basic operation performed by this {@code Normalizer} * * @see #setMode * @stable ICU 2.8 */ public Mode getMode() { return mode; } /** ! * Set the input text over which this {@code Normalizer} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { ! throw new InternalError("Could not create a new UCharacterIterator"); } text = newIter; reset(); } /** ! * Set the input text over which this {@code Normalizer} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(CharacterIterator newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { ! throw new InternalError("Could not create a new UCharacterIterator"); } text = newIter; currentIndex=nextIndex=0; clearBuffer(); } - //------------------------------------------------------------------------- - // Private utility methods - //------------------------------------------------------------------------- - - - /* backward iteration --------------------------------------------------- */ - - /* - * read backwards and get norm32 - * return 0 if the character is <minC - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - - private static long getPrevNorm32(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] chars) { - long norm32; - int ch=0; - /* need src.hasPrevious() */ - if((ch=src.previous()) == UCharacterIterator.DONE) { - return 0; - } - chars[0]=(char)ch; - chars[1]=0; - - /* check for a surrogate before getting norm32 to see if we need to - * predecrement further */ - if(chars[0]<minC) { - return 0; - } else if(!UTF16.isSurrogate(chars[0])) { - return NormalizerImpl.getNorm32(chars[0]); - } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) { - /* unpaired surrogate */ - chars[1]=(char)src.current(); - return 0; - } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) { - norm32=NormalizerImpl.getNorm32(chars[1]); - if((norm32&mask)==0) { - /* all surrogate pairs with this lead surrogate have irrelevant - * data */ - return 0; - } else { - /* norm32 must be a surrogate special */ - return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]); - } - } else { - /* unpaired second surrogate, undo the c2=src.previous() movement */ - src.moveIndex( 1); - return 0; - } - } - - private interface IsPrevBoundary{ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] chars); - } - private static final class IsPrevNFDSafe implements IsPrevBoundary{ - /* - * for NF*D: - * read backwards and check if the lead combining class is 0 - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - char[] chars) { - - return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC, - ccOrQCMask, chars), - ccOrQCMask, - ccOrQCMask& NormalizerImpl.QC_MASK); - } - } - - private static final class IsPrevTrueStarter implements IsPrevBoundary{ - /* - * read backwards and check if the character is (or its decomposition - * begins with) a "true starter" (cc==0 and NF*C_YES) - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - char[] chars) { - long norm32; - int/*unsigned*/ decompQCMask; - - decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/ - norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars); - return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask); - } - } - - private static int findPreviousIterationBoundary(UCharacterIterator src, - IsPrevBoundary obj, - int/*unsigned*/ minC, - int/*mask*/ mask, - char[] buffer, - int[] startIndex) { - char[] chars=new char[2]; - boolean isBoundary; - - /* fill the buffer from the end backwards */ - startIndex[0] = buffer.length; - chars[0]=0; - while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) { - isBoundary=obj.isPrevBoundary(src, minC, mask, chars); - - /* always write this character to the front of the buffer */ - /* make sure there is enough space in the buffer */ - if(startIndex[0] < (chars[1]==0 ? 1 : 2)) { - - // grow the buffer - char[] newBuf = new char[buffer.length*2]; - /* move the current buffer contents up */ - System.arraycopy(buffer,startIndex[0],newBuf, - newBuf.length-(buffer.length-startIndex[0]), - buffer.length-startIndex[0]); - //adjust the startIndex - startIndex[0]+=newBuf.length-buffer.length; - - buffer=newBuf; - newBuf=null; - - } - - buffer[--startIndex[0]]=chars[0]; - if(chars[1]!=0) { - buffer[--startIndex[0]]=chars[1]; - } - - /* stop if this just-copied character is a boundary */ - if(isBoundary) { - break; - } - } - - /* return the length of the buffer contents */ - return buffer.length-startIndex[0]; - } - - private static int previous(UCharacterIterator src, - char[] dest, int destStart, int destLimit, - Mode mode, - boolean doNormalize, - boolean[] pNeededToNormalize, - int options) { - - IsPrevBoundary isPreviousBoundary; - int destLength, bufferLength; - int/*unsigned*/ mask; - int c,c2; - - char minC; - int destCapacity = destLimit-destStart; - destLength=0; - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=false; - } - minC = (char)mode.getMinC(); - mask = mode.getMask(); - isPreviousBoundary = mode.getPrevBoundary(); - - if(isPreviousBoundary==null) { - destLength=0; - if((c=src.previous())>=0) { - destLength=1; - if(UTF16.isTrailSurrogate((char)c)) { - c2= src.previous(); - if(c2!= UCharacterIterator.DONE) { - if(UTF16.isLeadSurrogate((char)c2)) { - if(destCapacity>=2) { - dest[1]=(char)c; // trail surrogate - destLength=2; - } - // lead surrogate to be written below - c=c2; - } else { - src.moveIndex(1); - } - } - } - - if(destCapacity>0) { - dest[0]=(char)c; - } - } - return destLength; - } - - char[] buffer = new char[100]; - int[] startIndex= new int[1]; - bufferLength=findPreviousIterationBoundary(src, - isPreviousBoundary, - minC, mask,buffer, - startIndex); - if(bufferLength>0) { - if(doNormalize) { - destLength=NormalizerBase.normalize(buffer,startIndex[0], - startIndex[0]+bufferLength, - dest, destStart,destLimit, - mode, options); - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=destLength!=bufferLength || - Utility.arrayRegionMatches( - buffer,0,dest, - destStart,destLimit - ); - } - } else { - /* just copy the source characters */ - if(destCapacity>0) { - System.arraycopy(buffer,startIndex[0],dest,0, - (bufferLength<destCapacity) ? - bufferLength : destCapacity - ); - } - } - } - - - return destLength; - } - - - - /* forward iteration ---------------------------------------------------- */ - /* - * read forward and check if the character is a next-iteration boundary - * if c2!=0 then (c, c2) is a surrogate pair - */ - private interface IsNextBoundary{ - boolean isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - int[] chars); - } - /* - * read forward and get norm32 - * return 0 if the character is <minC - * if c2!=0 then (c2, c) is a surrogate pair - * always reads complete characters - */ - private static long /*unsigned*/ getNextNorm32(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - int[] chars) { - long norm32; - - /* need src.hasNext() to be true */ - chars[0]=src.next(); - chars[1]=0; - - if(chars[0]<minC) { - return 0; - } - - norm32=NormalizerImpl.getNorm32((char)chars[0]); - if(UTF16.isLeadSurrogate((char)chars[0])) { - if(src.current()!=UCharacterIterator.DONE && - UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) { - src.moveIndex(1); /* skip the c2 surrogate */ - if((norm32&mask)==0) { - /* irrelevant data */ - return 0; - } else { - /* norm32 must be a surrogate special */ - return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]); - } - } else { - /* unmatched surrogate */ - return 0; - } - } - return norm32; - } - - - /* - * for NF*D: - * read forward and check if the lead combining class is 0 - * if c2!=0 then (c, c2) is a surrogate pair - */ - private static final class IsNextNFDSafe implements IsNextBoundary{ - public boolean isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - int[] chars) { - return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars), - ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK); - } - } - - /* - * for NF*C: - * read forward and check if the character is (or its decomposition begins - * with) a "true starter" (cc==0 and NF*C_YES) - * if c2!=0 then (c, c2) is a surrogate pair - */ - private static final class IsNextTrueStarter implements IsNextBoundary{ - public boolean isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - int[] chars) { - long norm32; - int/*unsigned*/ decompQCMask; - - decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/ - norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars); - return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask); - } - } - - private static int findNextIterationBoundary(UCharacterIterator src, - IsNextBoundary obj, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] buffer) { - if(src.current()==UCharacterIterator.DONE) { - return 0; - } - - /* get one character and ignore its properties */ - int[] chars = new int[2]; - chars[0]=src.next(); - buffer[0]=(char)chars[0]; - int bufferIndex = 1; - - if(UTF16.isLeadSurrogate((char)chars[0])&& - src.current()!=UCharacterIterator.DONE) { - if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) { - buffer[bufferIndex++]=(char)chars[1]; - } else { - src.moveIndex(-1); /* back out the non-trail-surrogate */ - } - } - - /* get all following characters until we see a boundary */ - /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff - * is part of the string */ - while( src.current()!=UCharacterIterator.DONE) { - if(obj.isNextBoundary(src, minC, mask, chars)) { - /* back out the latest movement to stop at the boundary */ - src.moveIndex(chars[1]==0 ? -1 : -2); - break; - } else { - if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) { - buffer[bufferIndex++]=(char)chars[0]; - if(chars[1]!=0) { - buffer[bufferIndex++]=(char)chars[1]; - } - } else { - char[] newBuf = new char[buffer.length*2]; - System.arraycopy(buffer,0,newBuf,0,bufferIndex); - buffer = newBuf; - buffer[bufferIndex++]=(char)chars[0]; - if(chars[1]!=0) { - buffer[bufferIndex++]=(char)chars[1]; - } - } - } - } - - /* return the length of the buffer contents */ - return bufferIndex; - } - - private static int next(UCharacterIterator src, - char[] dest, int destStart, int destLimit, - NormalizerBase.Mode mode, - boolean doNormalize, - boolean[] pNeededToNormalize, - int options) { - - IsNextBoundary isNextBoundary; - int /*unsigned*/ mask; - int /*unsigned*/ bufferLength; - int c,c2; - char minC; - int destCapacity = destLimit - destStart; - int destLength = 0; - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=false; - } - - minC = (char)mode.getMinC(); - mask = mode.getMask(); - isNextBoundary = mode.getNextBoundary(); - - if(isNextBoundary==null) { - destLength=0; - c=src.next(); - if(c!=UCharacterIterator.DONE) { - destLength=1; - if(UTF16.isLeadSurrogate((char)c)) { - c2= src.next(); - if(c2!= UCharacterIterator.DONE) { - if(UTF16.isTrailSurrogate((char)c2)) { - if(destCapacity>=2) { - dest[1]=(char)c2; // trail surrogate - destLength=2; - } - // lead surrogate to be written below - } else { - src.moveIndex(-1); - } - } - } - - if(destCapacity>0) { - dest[0]=(char)c; - } - } - return destLength; - } - - char[] buffer=new char[100]; - int[] startIndex = new int[1]; - bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask, - buffer); - if(bufferLength>0) { - if(doNormalize) { - destLength=mode.normalize(buffer,startIndex[0],bufferLength, - dest,destStart,destLimit, options); - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=destLength!=bufferLength || - Utility.arrayRegionMatches(buffer,startIndex[0], - dest,destStart, - destLength); - } - } else { - /* just copy the source characters */ - if(destCapacity>0) { - System.arraycopy(buffer,0,dest,destStart, - Math.min(bufferLength,destCapacity) - ); - } - - - } - } - return destLength; - } - private void clearBuffer() { ! bufferLimit=bufferStart=bufferPos=0; } private boolean nextNormalize() { - clearBuffer(); currentIndex=nextIndex; text.setIndex(nextIndex); ! ! bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options); ! nextIndex=text.getIndex(); ! return (bufferLimit>0); } private boolean previousNormalize() { - clearBuffer(); nextIndex=currentIndex; text.setIndex(currentIndex); ! bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options); ! ! currentIndex=text.getIndex(); ! bufferPos = bufferLimit; ! return bufferLimit>0; ! } ! ! private int getCodePointAt(int index) { ! if( UTF16.isSurrogate(buffer[index])) { ! if(UTF16.isLeadSurrogate(buffer[index])) { ! if((index+1)<bufferLimit && ! UTF16.isTrailSurrogate(buffer[index+1])) { ! return UCharacterProperty.getRawSupplementary( ! buffer[index], ! buffer[index+1] ! ); ! } ! }else if(UTF16.isTrailSurrogate(buffer[index])) { ! if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) { ! return UCharacterProperty.getRawSupplementary( ! buffer[index-1], ! buffer[index] ! ); ! } ! } ! } ! return buffer[index]; ! ! } ! ! /** ! * Internal API ! * @internal ! */ ! public static boolean isNFSkippable(int c, Mode mode) { ! return mode.isNFSkippable(c); ! } ! ! // ! // Options ! // ! ! /* ! * Default option for Unicode 3.2.0 normalization. ! * Corrigendum 4 was fixed in Unicode 3.2.0 but isn't supported in ! * IDNA/StringPrep. ! * The public review issue #29 was fixed in Unicode 4.1.0. Corrigendum 5 ! * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but it isn't ! * supported by IDNA/StringPrep as well as Corrigendum 4. ! */ ! public static final int UNICODE_3_2_0_ORIGINAL = ! UNICODE_3_2 | ! NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS | ! NormalizerImpl.BEFORE_PRI_29; ! ! /* ! * Default option for the latest Unicode normalization. This option is ! * provided mainly for testing. ! * The value zero means that normalization is done with the fixes for ! * - Corrigendum 4 (Five CJK Canonical Mapping Errors) ! * - Corrigendum 5 (Normalization Idempotency) ! */ ! public static final int UNICODE_LATEST = 0x00; ! ! // ! // public constructor and methods for java.text.Normalizer and ! // sun.text.Normalizer ! // ! ! /** ! * Creates a new {@code Normalizer} object for iterating over the ! * normalized form of a given string. ! * ! * @param str The string to be normalized. The normalization ! * will start at the beginning of the string. ! * ! * @param mode The normalization mode. ! */ ! public NormalizerBase(String str, Mode mode) { ! this(str, mode, UNICODE_LATEST); ! } ! ! /** ! * Normalizes a <code>String</code> using the given normalization form. ! * ! * @param str the input string to be normalized. ! * @param form the normalization form ! */ ! public static String normalize(String str, Normalizer.Form form) { ! return normalize(str, form, UNICODE_LATEST); ! } ! ! /** ! * Normalizes a <code>String</code> using the given normalization form. ! * ! * @param str the input string to be normalized. ! * @param form the normalization form ! * @param options the optional features to be enabled. ! */ ! public static String normalize(String str, Normalizer.Form form, int options) { ! int len = str.length(); ! boolean asciiOnly = true; ! if (len < 80) { ! for (int i = 0; i < len; i++) { ! if (str.charAt(i) > 127) { ! asciiOnly = false; ! break; ! } ! } } else { ! char[] a = str.toCharArray(); ! for (int i = 0; i < len; i++) { ! if (a[i] > 127) { ! asciiOnly = false; ! break; } } } ! ! switch (form) { ! case NFC : ! return asciiOnly ? str : NFC.normalize(str, options); ! case NFD : ! return asciiOnly ? str : NFD.normalize(str, options); ! case NFKC : ! return asciiOnly ? str : NFKC.normalize(str, options); ! case NFKD : ! return asciiOnly ? str : NFKD.normalize(str, options); ! } ! ! throw new IllegalArgumentException("Unexpected normalization form: " + ! form); ! } ! ! /** ! * Test if a string is in a given normalization form. ! * This is semantically equivalent to source.equals(normalize(source, mode)). ! * ! * Unlike quickCheck(), this function returns a definitive result, ! * never a "maybe". ! * For NFD, NFKD, and FCD, both functions work exactly the same. ! * For NFC and NFKC where quickCheck may return "maybe", this function will ! * perform further tests to arrive at a true/false result. ! * @param str the input string to be checked to see if it is normalized ! * @param form the normalization form ! */ ! public static boolean isNormalized(String str, Normalizer.Form form) { ! return isNormalized(str, form, UNICODE_LATEST); ! } ! ! /** ! * Test if a string is in a given normalization form. ! * This is semantically equivalent to source.equals(normalize(source, mode)). ! * ! * Unlike quickCheck(), this function returns a definitive result, ! * never a "maybe". ! * For NFD, NFKD, and FCD, both functions work exactly the same. ! * For NFC and NFKC where quickCheck may return "maybe", this function will ! * perform further tests to arrive at a true/false result. ! * @param str the input string to be checked to see if it is normalized ! * @param form the normalization form ! * @param options the optional features to be enabled. ! */ ! public static boolean isNormalized(String str, Normalizer.Form form, int options) { ! switch (form) { ! case NFC: ! return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); ! case NFD: ! return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); ! case NFKC: ! return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); ! case NFKD: ! return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); } - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } } --- 635,782 ---- * {@code setIndex} and {@link #getIndex}. * @return The current iteration position * @stable ICU 2.8 */ public int getIndex() { ! if(bufferPos<buffer.length()) { return currentIndex; } else { return nextIndex; } } /** * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} ! * over which this {@code NormalizerBase} is iterating * @return The current iteration position * @stable ICU 2.8 */ public int endIndex() { return text.getLength(); } //------------------------------------------------------------------------- ! // Iterator attributes //------------------------------------------------------------------------- /** * Set the normalization mode for this object. * <p> * <b>Note:</b>If the normalization mode is changed while iterating * over a string, calls to {@link #next} and {@link #previous} may * return previously buffers characters in the old normalization mode * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling {@code setMode}. ! * <p> ! * @param newMode the new mode for this {@code NormalizerBase}. * The supported modes are: * <ul> ! * <li>{@link #NFC} - Unicode canonical decompositiion * followed by canonical composition. ! * <li>{@link #NFKC} - Unicode compatibility decompositiion * follwed by canonical composition. ! * <li>{@link #NFD} - Unicode canonical decomposition ! * <li>{@link #NFKD} - Unicode compatibility decomposition. ! * <li>{@link #NONE} - Do nothing but return characters * from the underlying input text. * </ul> * * @see #getMode * @stable ICU 2.8 */ public void setMode(Mode newMode) { mode = newMode; + norm2 = mode.getNormalizer2(options); } + /** ! * Return the basic operation performed by this {@code NormalizerBase} * * @see #setMode * @stable ICU 2.8 */ public Mode getMode() { return mode; } /** ! * Set the input text over which this {@code NormalizerBase} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) { UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { ! throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; reset(); } /** ! * Set the input text over which this {@code NormalizerBase} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(CharacterIterator newText) { UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { ! throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; currentIndex=nextIndex=0; clearBuffer(); } private void clearBuffer() { ! buffer.setLength(0); ! bufferPos=0; } private boolean nextNormalize() { clearBuffer(); currentIndex=nextIndex; text.setIndex(nextIndex); ! // Skip at least one character so we make progress. ! int c=text.nextCodePoint(); ! if(c<0) { ! return false; ! } ! StringBuilder segment=new StringBuilder().appendCodePoint(c); ! while((c=text.nextCodePoint())>=0) { ! if(norm2.hasBoundaryBefore(c)) { ! text.moveCodePointIndex(-1); ! break; ! } ! segment.appendCodePoint(c); ! } nextIndex=text.getIndex(); ! norm2.normalize(segment, buffer); ! return buffer.length()!=0; } private boolean previousNormalize() { clearBuffer(); nextIndex=currentIndex; text.setIndex(currentIndex); ! StringBuilder segment=new StringBuilder(); ! int c; ! while((c=text.previousCodePoint())>=0) { ! if(c<=0xffff) { ! segment.insert(0, (char)c); } else { ! segment.insert(0, Character.toChars(c)); } + if(norm2.hasBoundaryBefore(c)) { + break; } } ! currentIndex=text.getIndex(); ! norm2.normalize(segment, buffer); ! bufferPos=buffer.length(); ! return buffer.length()!=0; } }
< prev index next >