--- old/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2020-01-10 15:57:31.000000000 -0800 +++ /dev/null 2020-01-10 15:57:31.000000000 -0800 @@ -1,2188 +0,0 @@ -/* - * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.text.Normalizer; - -// Original filename in ICU4J: Normalizer2Impl.java -public final class NormalizerImpl { - public static final class Hangul { - /* Korean Hangul and Jamo constants */ - public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ - public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ - public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ - - public static final int HANGUL_BASE=0xac00; - public static final int HANGUL_END=0xd7a3; - - public static final int JAMO_L_COUNT=19; - public static final int JAMO_V_COUNT=21; - public static final int JAMO_T_COUNT=28; - - public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; - public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; - - public static boolean isHangul(int c) { - return HANGUL_BASE<=c && c - * If dest is a StringBuilder, then the buffer writes directly to it. - * Otherwise, the buffer maintains a StringBuilder for intermediate text segments - * until no further changes are necessary and whole segments are appended. - * append() methods that take combining-class values always write to the StringBuilder. - * Other append() methods flush and append to the Appendable. - */ - public static final class ReorderingBuffer implements Appendable { - public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { - impl=ni; - app=dest; - if (app instanceof StringBuilder) { - appIsStringBuilder=true; - str=(StringBuilder)dest; - // In Java, the constructor subsumes public void init(int destCapacity) - str.ensureCapacity(destCapacity); - reorderStart=0; - if(str.length()==0) { - lastCC=0; - } else { - setIterator(); - lastCC=previousCC(); - // Set reorderStart after the last code point with cc<=1 if there is one. - if(lastCC>1) { - while(previousCC()>1) {} - } - reorderStart=codePointLimit; - } - } else { - appIsStringBuilder=false; - str=new StringBuilder(); - reorderStart=0; - lastCC=0; - } - } - - public boolean isEmpty() { return str.length()==0; } - public int length() { return str.length(); } - public int getLastCC() { return lastCC; } - - public StringBuilder getStringBuilder() { return str; } - - public boolean equals(CharSequence s, int start, int limit) { - return UTF16Plus.equal(str, 0, str.length(), s, start, limit); - } - - public void append(int c, int cc) { - if(lastCC<=cc || cc==0) { - str.appendCodePoint(c); - lastCC=cc; - if(cc<=1) { - reorderStart=str.length(); - } - } else { - insert(c, cc); - } - } - public void append(CharSequence s, int start, int limit, boolean isNFD, - int leadCC, int trailCC) { - if(start==limit) { - return; - } - if(lastCC<=leadCC || leadCC==0) { - if(trailCC<=1) { - reorderStart=str.length()+(limit-start); - } else if(leadCC<=1) { - reorderStart=str.length()+1; // Ok if not a code point boundary. - } - str.append(s, start, limit); - lastCC=trailCC; - } else { - int c=Character.codePointAt(s, start); - start+=Character.charCount(c); - insert(c, leadCC); // insert first code point - while(startcc;) {} - // insert c at codePointLimit, after the character with prevCC<=cc - if(c<=0xffff) { - str.insert(codePointLimit, (char)c); - if(cc<=1) { - reorderStart=codePointLimit+1; - } - } else { - str.insert(codePointLimit, Character.toChars(c)); - if(cc<=1) { - reorderStart=codePointLimit+2; - } - } - } - - private final NormalizerImpl impl; - private final Appendable app; - private final StringBuilder str; - private final boolean appIsStringBuilder; - private int reorderStart; - private int lastCC; - - // private backward iterator - private void setIterator() { codePointStart=str.length(); } - private void skipPrevious() { // Requires 0=codePointStart) { - return 0; - } - int c=str.codePointBefore(codePointStart); - codePointStart-=Character.charCount(c); - return impl.getCCFromYesOrMaybeCP(c); - } - private int codePointStart, codePointLimit; - } - - // TODO: Propose as public API on the UTF16 class. - // TODO: Propose widening UTF16 methods that take char to take int. - // TODO: Propose widening UTF16 methods that take String to take CharSequence. - public static final class UTF16Plus { - /** - * Is this code point a lead surrogate (U+d800..U+dbff)? - * @param c code unit or code point - * @return true or false - */ - public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } - /** - * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), - * is it a lead surrogate? - * @param c code unit or code point - * @return true or false - */ - public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } - - /** - * Compares two CharSequence subsequences for binary equality. - * @param s1 first sequence - * @param start1 start offset in first sequence - * @param limit1 limit offset in first sequence - * @param s2 second sequence - * @param start2 start offset in second sequence - * @param limit2 limit offset in second sequence - * @return true if s1.subSequence(start1, limit1) contains the same text - * as s2.subSequence(start2, limit2) - */ - public static boolean equal(CharSequence s1, int start1, int limit1, - CharSequence s2, int start2, int limit2) { - if((limit1-start1)!=(limit2-start2)) { - return false; - } - if(s1==s2 && start1==start2) { - return true; - } - while(start1>DELTA_SHIFT)-MAX_DELTA-1; - - // Read the normTrie. - int offset=inIndexes[IX_NORM_TRIE_OFFSET]; - int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; - int triePosition = bytes.position(); - normTrie = CodePointTrie.Fast16.fromBinary(bytes); - int trieLength = bytes.position() - triePosition; - if(trieLength>(nextOffset-offset)) { - throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); - } - ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes - - // Read the composition and mapping data. - offset=nextOffset; - nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; - int numChars=(nextOffset-offset)/2; - if(numChars!=0) { - maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); - extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); - } - - // smallFCD: new in formatVersion 2 - offset=nextOffset; - smallFCD=new byte[0x100]; - bytes.get(smallFCD); - - return this; - } catch(IOException e) { - throw new InternalError(e); - } - } - public NormalizerImpl load(String name) { - return load(ICUBinary.getRequiredData(name)); - } - - // The trie stores values for lead surrogate code *units*. - // Surrogate code *points* are inert. - public int getNorm16(int c) { - return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); - } - public int getRawNorm16(int c) { return normTrie.get(c); } - public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16=MIN_NORMAL_MAYBE_YES) { - return getCCFromNormalYesOrMaybe(norm16); - } - if(norm16> OFFSET_SHIFT) & 0xff; - } - public static int getCCFromYesOrMaybe(int norm16) { - return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; - } - public int getCCFromYesOrMaybeCP(int c) { - if (c < minCompNoMaybeCP) { return 0; } - return getCCFromYesOrMaybe(getNorm16(c)); - } - - /** - * Returns the FCD data for code point c. - * @param c A Unicode code point. - * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. - */ - public int getFCD16(int c) { - if(c>8]; - if(bits==0) { return false; } - return ((bits>>((lead>>5)&7))&1)!=0; - } - - /** Gets the FCD value from the regular normalization data. */ - public int getFCD16FromNormData(int c) { - int norm16=getNorm16(c); - if (norm16 >= limitNoNo) { - if(norm16>=MIN_NORMAL_MAYBE_YES) { - // combining mark - norm16=getCCFromNormalYesOrMaybe(norm16); - return norm16|(norm16<<8); - } else if(norm16>=minMaybeYes) { - return 0; - } else { // isDecompNoAlgorithmic(norm16) - int deltaTrailCC = norm16 & DELTA_TCCC_MASK; - if (deltaTrailCC <= DELTA_TCCC_1) { - return deltaTrailCC >> OFFSET_SHIFT; - } - // Maps to an isCompYesAndZeroCC. - c=mapAlgorithmic(c, norm16); - norm16=getRawNorm16(c); - } - } - if(norm16<=minYesNo || isHangulLVT(norm16)) { - // no decomposition or Hangul syllable, all zeros - return 0; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - int fcd16=firstUnit>>8; // tccc - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc - } - return fcd16; - } - - /** - * Gets the decomposition for one code point. - * @param c code point - * @return c's decomposition, if it has one; returns null if it does not have a decomposition - */ - public String getDecomposition(int c) { - int norm16; - if(c>OFFSET_SHIFT; - int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; - return extraData.substring(mapping, mapping+length); - } - - // Fixed norm16 values. - public static final int MIN_YES_YES_WITH_CC=0xfe02; - public static final int JAMO_VT=0xfe00; - public static final int MIN_NORMAL_MAYBE_YES=0xfc00; - public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE - public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE - - // norm16 bit 0 is comp-boundary-after. - public static final int HAS_COMP_BOUNDARY_AFTER=1; - public static final int OFFSET_SHIFT=1; - - // For algorithmic one-way mappings, norm16 bits 2..1 indicate the - // tccc (0, 1, >1) for quick FCC boundary-after tests. - public static final int DELTA_TCCC_0=0; - public static final int DELTA_TCCC_1=2; - public static final int DELTA_TCCC_GT_1=4; - public static final int DELTA_TCCC_MASK=6; - public static final int DELTA_SHIFT=3; - - public static final int MAX_DELTA=0x40; - - // Byte offsets from the start of the data, after the generic header. - public static final int IX_NORM_TRIE_OFFSET=0; - public static final int IX_EXTRA_DATA_OFFSET=1; - public static final int IX_SMALL_FCD_OFFSET=2; - public static final int IX_RESERVED3_OFFSET=3; - public static final int IX_TOTAL_SIZE=7; - public static final int MIN_CCC_LCCC_CP=0x300; - // Code point thresholds for quick check codes. - public static final int IX_MIN_DECOMP_NO_CP=8; - public static final int IX_MIN_COMP_NO_MAYBE_CP=9; - - // Norm16 value thresholds for quick check combinations and types of extra data. - - /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ - public static final int IX_MIN_YES_NO=10; - /** Mappings are comp-normalized. */ - public static final int IX_MIN_NO_NO=11; - public static final int IX_LIMIT_NO_NO=12; - public static final int IX_MIN_MAYBE_YES=13; - - /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ - public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; - /** Mappings are not comp-normalized but have a comp boundary before. */ - public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; - /** Mappings do not have a comp boundary before. */ - public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; - /** Mappings to the empty string. */ - public static final int IX_MIN_NO_NO_EMPTY=17; - - public static final int IX_MIN_LCCC_CP=18; - public static final int IX_COUNT=20; - - public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; - public static final int MAPPING_HAS_RAW_MAPPING=0x40; - // unused bit 0x20; - public static final int MAPPING_LENGTH_MASK=0x1f; - - public static final int COMP_1_LAST_TUPLE=0x8000; - public static final int COMP_1_TRIPLE=1; - public static final int COMP_1_TRAIL_LIMIT=0x3400; - public static final int COMP_1_TRAIL_MASK=0x7ffe; - public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit - public static final int COMP_2_TRAIL_SHIFT=6; - public static final int COMP_2_TRAIL_MASK=0xffc0; - - // higher-level functionality ------------------------------------------ *** - - /** - * Decomposes s[src, limit[ and writes the result to dest. - * limit can be NULL if src is NUL-terminated. - * destLengthEstimate is the initial dest buffer capacity and can be -1. - */ - public void decompose(CharSequence s, int src, int limit, StringBuilder dest, - int destLengthEstimate) { - if(destLengthEstimate<0) { - destLengthEstimate=limit-src; - } - dest.setLength(0); - ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); - decompose(s, src, limit, buffer); - } - - // Dual functionality: - // buffer!=NULL: normalize - // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes - public int decompose(CharSequence s, int src, int limit, - ReorderingBuffer buffer) { - int minNoCP=minDecompNoCP; - - int prevSrc; - int c=0; - int norm16=0; - - // only for quick check - int prevBoundary=src; - int prevCC=0; - - for(;;) { - // count code units below the minimum or with irrelevant data for the quick check - for(prevSrc=src; src!=limit;) { - if( (c=s.charAt(src))=limit) { - break; - } - c=Character.codePointAt(s, src); - cc=getCC(getNorm16(c)); - }; - buffer.append(s, 0, src, false, firstCC, prevCC); - buffer.append(s, src, limit); - } - - // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. - // doCompose: normalize - // !doCompose: isNormalized (buffer must be empty and initialized) - public boolean compose(CharSequence s, int src, int limit, - boolean onlyContiguous, - boolean doCompose, - ReorderingBuffer buffer) { - int prevBoundary=src; - int minNoMaybeCP=minCompNoMaybeCP; - - for (;;) { - // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, - // or with (compYes && ccc==0) properties. - int prevSrc; - int c = 0; - int norm16 = 0; - for (;;) { - if (src == limit) { - if (prevBoundary != limit && doCompose) { - buffer.append(s, prevBoundary, limit); - } - return true; - } - if( (c=s.charAt(src))=minNoNo. - // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) - // or a "yesYes" with ccc!=0. - // It is not a Hangul syllable or Jamo L because those have "yes" properties. - - // Medium-fast path: Handle cases that do not require full decomposition and recomposition. - if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes - if (!doCompose) { - return false; - } - // Fast path for mapping a character that is immediately surrounded by boundaries. - // In this case, we need not decompose around the current character. - if (isDecompNoAlgorithmic(norm16)) { - // Maps to a single isCompYesAndZeroCC character - // which also implies hasCompBoundaryBefore. - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || - hasCompBoundaryBefore(s, src, limit)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append(mapAlgorithmic(c, norm16), 0); - prevBoundary = src; - continue; - } - } else if (norm16 < minNoNoCompBoundaryBefore) { - // The mapping is comp-normalized which also implies hasCompBoundaryBefore. - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || - hasCompBoundaryBefore(s, src, limit)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - int mapping = norm16 >> OFFSET_SHIFT; - int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; - buffer.append(extraData, mapping, mapping + length); - prevBoundary = src; - continue; - } - } else if (norm16 >= minNoNoEmpty) { - // The current character maps to nothing. - // Simply omit it from the output if there is a boundary before _or_ after it. - // The character itself implies no boundaries. - if (hasCompBoundaryBefore(s, src, limit) || - hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - prevBoundary = src; - continue; - } - } - // Other "noNo" type, or need to examine more text around this character: - // Fall through to the slow path. - } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { - char prev=s.charAt(prevSrc-1); - if(c= 0) { - int syllable = Hangul.HANGUL_BASE + - (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * - Hangul.JAMO_T_COUNT + t; - --prevSrc; // Replace the Jamo L as well. - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append((char)syllable); - prevBoundary = src; - continue; - } - // If we see L+V+x where x!=T then we drop to the slow path, - // decompose and recompose. - // This is to deal with NFKC finding normal L and V but a - // compatibility variant of a T. - // We need to either fully compose that combination here - // (which would complicate the code and may not work with strange custom data) - // or use the slow path. - } - } else if (Hangul.isHangulLV(prev)) { - // The current character is a Jamo Trailing consonant, - // compose with previous Hangul LV that does not contain a Jamo T. - if (!doCompose) { - return false; - } - int syllable = prev + c - Hangul.JAMO_T_BASE; - --prevSrc; // Replace the Hangul LV as well. - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append((char)syllable); - prevBoundary = src; - continue; - } - // No matching context, or may need to decompose surrounding text first: - // Fall through to the slow path. - } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC - // One or more combining marks that do not combine-back: - // Check for canonical order, copy unchanged if ok and - // if followed by a character with a boundary-before. - int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 - if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { - // Fails FCD test, need to decompose and contiguously recompose. - if (!doCompose) { - return false; - } - } else { - // If !onlyContiguous (not FCC), then we ignore the tccc of - // the previous character which passed the quick check "yes && ccc==0" test. - int n16; - for (;;) { - if (src == limit) { - if (doCompose) { - buffer.append(s, prevBoundary, limit); - } - return true; - } - int prevCC = cc; - c = Character.codePointAt(s, src); - n16 = normTrie.get(c); - if (n16 >= MIN_YES_YES_WITH_CC) { - cc = getCCFromNormalYesOrMaybe(n16); - if (prevCC > cc) { - if (!doCompose) { - return false; - } - break; - } - } else { - break; - } - src += Character.charCount(c); - } - // p is after the last in-order combining mark. - // If there is a boundary here, then we continue with no change. - if (norm16HasCompBoundaryBefore(n16)) { - if (isCompYesAndZeroCC(n16)) { - src += Character.charCount(c); - } - continue; - } - // Use the slow path. There is no boundary in [prevSrc, src[. - } - } - - // Slow path: Find the nearest boundaries around the current character, - // decompose and recompose. - if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { - c = Character.codePointBefore(s, prevSrc); - norm16 = normTrie.get(c); - if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { - prevSrc -= Character.charCount(c); - } - } - if (doCompose && prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - int recomposeStartIndex=buffer.length(); - // We know there is not a boundary here. - decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, - buffer); - // Decompose until the next boundary. - src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, - buffer); - recompose(buffer, recomposeStartIndex, onlyContiguous); - if(!doCompose) { - if(!buffer.equals(s, prevSrc, src)) { - return false; - } - buffer.remove(); - } - prevBoundary=src; - } - } - - /** - * Very similar to compose(): Make the same changes in both places if relevant. - * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) - * !doSpan: quickCheck - * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and - * bit 0: set if "maybe"; otherwise, if the span length<s.length() - * then the quick check result is "no" - */ - public int composeQuickCheck(CharSequence s, int src, int limit, - boolean onlyContiguous, boolean doSpan) { - int qcResult=0; - int prevBoundary=src; - int minNoMaybeCP=minCompNoMaybeCP; - - for(;;) { - // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, - // or with (compYes && ccc==0) properties. - int prevSrc; - int c = 0; - int norm16 = 0; - for (;;) { - if(src==limit) { - return (src<<1)|qcResult; // "yes" or "maybe" - } - if( (c=s.charAt(src))=minNoNo. - // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) - // or a "yesYes" with ccc!=0. - // It is not a Hangul syllable or Jamo L because those have "yes" properties. - - int prevNorm16 = INERT; - if (prevBoundary != prevSrc) { - prevBoundary = prevSrc; - if (!norm16HasCompBoundaryBefore(norm16)) { - c = Character.codePointBefore(s, prevSrc); - int n16 = getNorm16(c); - if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { - prevBoundary -= Character.charCount(c); - prevNorm16 = n16; - } - } - } - - if(isMaybeOrNonZeroCC(norm16)) { - int cc=getCCFromYesOrMaybe(norm16); - if (onlyContiguous /* FCC */ && cc != 0 && - getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { - // The [prevBoundary..prevSrc[ character - // passed the quick check "yes && ccc==0" test - // but is out of canonical order with the current combining mark. - } else { - // If !onlyContiguous (not FCC), then we ignore the tccc of - // the previous character which passed the quick check "yes && ccc==0" test. - for (;;) { - if (norm16 < MIN_YES_YES_WITH_CC) { - if (!doSpan) { - qcResult = 1; - } else { - return prevBoundary << 1; // spanYes does not care to know it's "maybe" - } - } - if (src == limit) { - return (src<<1) | qcResult; // "yes" or "maybe" - } - int prevCC = cc; - c = Character.codePointAt(s, src); - norm16 = getNorm16(c); - if (isMaybeOrNonZeroCC(norm16)) { - cc = getCCFromYesOrMaybe(norm16); - if (!(prevCC <= cc || cc == 0)) { - break; - } - } else { - break; - } - src += Character.charCount(c); - } - // src is after the last in-order combining mark. - if (isCompYesAndZeroCC(norm16)) { - prevBoundary = src; - src += Character.charCount(c); - continue; - } - } - } - return prevBoundary<<1; // "no" - } - } - public void composeAndAppend(CharSequence s, - boolean doCompose, - boolean onlyContiguous, - ReorderingBuffer buffer) { - int src=0, limit=s.length(); - if(!buffer.isEmpty()) { - int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); - if(0!=firstStarterInSrc) { - int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), - buffer.length(), onlyContiguous); - StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ - firstStarterInSrc+16); - middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); - buffer.removeSuffix(buffer.length()-lastStarterInDest); - middle.append(s, 0, firstStarterInSrc); - compose(middle, 0, middle.length(), onlyContiguous, true, buffer); - src=firstStarterInSrc; - } - } - if(doCompose) { - compose(s, src, limit, onlyContiguous, true, buffer); - } else { - buffer.append(s, src, limit); - } - } - // Dual functionality: - // buffer!=NULL: normalize - // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes - public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { - // Note: In this function we use buffer->appendZeroCC() because we track - // the lead and trail combining classes here, rather than leaving it to - // the ReorderingBuffer. - // The exception is the call to decomposeShort() which uses the buffer - // in the normal way. - - // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. - // Similar to the prevBoundary in the compose() implementation. - int prevBoundary=src; - int prevSrc; - int c=0; - int prevFCD16=0; - int fcd16=0; - - for(;;) { - // count code units with lccc==0 - for(prevSrc=src; src!=limit;) { - if((c=s.charAt(src))1) { - --prevBoundary; - } - } - } else { - int p=src-1; - if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { - prevBoundary=p; - } - } - if(buffer!=null) { - // The last lccc==0 character is excluded from the - // flush-and-append call in case it needs to be modified. - buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); - buffer.append(s, prevBoundary, src); - } - // The start of the current character (c). - prevSrc=src; - } else if(src==limit) { - break; - } - - src+=Character.charCount(c); - // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. - // Check for proper order, and decompose locally if necessary. - if((prevFCD16&0xff)<=(fcd16>>8)) { - // proper order: prev tccc <= current lccc - if((fcd16&0xff)<=1) { - prevBoundary=src; - } - if(buffer!=null) { - buffer.appendZeroCC(c); - } - prevFCD16=fcd16; - continue; - } else if(buffer==null) { - return prevBoundary; // quick check "no" - } else { - /* - * Back out the part of the source that we copied or appended - * already but is now going to be decomposed. - * prevSrc is set to after what was copied/appended. - */ - buffer.removeSuffix(prevSrc-prevBoundary); - /* - * Find the part of the source that needs to be decomposed, - * up to the next safe boundary. - */ - src=findNextFCDBoundary(s, src, limit); - /* - * The source text does not fulfill the conditions for FCD. - * Decompose and reorder a limited piece of the text. - */ - decomposeShort(s, prevBoundary, src, false, false, buffer); - prevBoundary=src; - prevFCD16=0; - } - } - return src; - } - - public boolean hasDecompBoundaryBefore(int c) { - return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || - norm16HasDecompBoundaryBefore(getNorm16(c)); - } - public boolean norm16HasDecompBoundaryBefore(int norm16) { - if (norm16 < minNoNoCompNoMaybeCC) { - return true; - } - if (norm16 >= limitNoNo) { - return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - // true if leadCC==0 (hasFCDBoundaryBefore()) - return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; - } - public boolean hasDecompBoundaryAfter(int c) { - if (c < minDecompNoCP) { - return true; - } - if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { - return true; - } - return norm16HasDecompBoundaryAfter(getNorm16(c)); - } - public boolean norm16HasDecompBoundaryAfter(int norm16) { - if(norm16 <= minYesNo || isHangulLVT(norm16)) { - return true; - } - if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; - } - // Maps to an isCompYesAndZeroCC. - return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - // decomp after-boundary: same as hasFCDBoundaryAfter(), - // fcd16<=1 || trailCC==0 - if(firstUnit>0x1ff) { - return false; // trailCC>1 - } - if(firstUnit<=0xff) { - return true; // trailCC==0 - } - // if(trailCC==1) test leadCC==0, same as checking for before-boundary - // true if leadCC==0 (hasFCDBoundaryBefore()) - return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; - } - public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } - - public boolean hasCompBoundaryBefore(int c) { - return c=minMaybeYes; } - private static boolean isInert(int norm16) { return norm16==INERT; } - private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } - private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } - private boolean isHangulLV(int norm16) { return norm16==minYesNo; } - private boolean isHangulLVT(int norm16) { - return norm16==hangulLVT(); - } - private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } - - // For use with isCompYes(). - // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. - // static uint8_t getCCFromYes(uint16_t norm16) { - // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; - // } - private int getCCFromNoNo(int norm16) { - int mapping=norm16>>OFFSET_SHIFT; - if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - return extraData.charAt(mapping-1)&0xff; - } else { - return 0; - } - } - int getTrailCCFromCompYesAndZeroCC(int norm16) { - if(norm16<=minYesNo) { - return 0; // yesYes and Hangul LV have ccc=tccc=0 - } else { - // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. - return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo - } - } - - // Requires algorithmic-NoNo. - private int mapAlgorithmic(int c, int norm16) { - return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; - } - - // Requires minYesNo>OFFSET_SHIFT); } - - /** - * @return index into maybeYesCompositions, or -1 - */ - private int getCompositionsListForDecompYes(int norm16) { - if(norm16>OFFSET_SHIFT; - } - } - /** - * @return index into maybeYesCompositions - */ - private int getCompositionsListForComposite(int norm16) { - // A composite has both mapping & compositions list. - int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; - int firstUnit=maybeYesCompositions.charAt(list); - return list+ // mapping in maybeYesCompositions - 1+ // +1 to skip the first unit with the mapping length - (firstUnit&MAPPING_LENGTH_MASK); // + mapping length - } - - // Decompose a short piece of text which is likely to contain characters that - // fail the quick check loop and/or where the quick check loop's overhead - // is unlikely to be amortized. - // Called by the compose() and makeFCD() implementations. - // Public in Java for collation implementation code. - private int decomposeShort( - CharSequence s, int src, int limit, - boolean stopAtCompBoundary, boolean onlyContiguous, - ReorderingBuffer buffer) { - while(src= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - buffer.append(c, getCCFromYesOrMaybe(norm16)); - return; - } - // Maps to an isCompYesAndZeroCC. - c=mapAlgorithmic(c, norm16); - norm16=getRawNorm16(c); - } - if (norm16 < minYesNo) { - // c does not decompose - buffer.append(c, 0); - } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { - // Hangul syllable: decompose algorithmically - Hangul.decompose(c, buffer); - } else { - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - int length=firstUnit&MAPPING_LENGTH_MASK; - int leadCC, trailCC; - trailCC=firstUnit>>8; - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - leadCC=extraData.charAt(mapping-1)>>8; - } else { - leadCC=0; - } - ++mapping; // skip over the firstUnit - buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); - } - } - - /** - * Finds the recomposition result for - * a forward-combining "lead" character, - * specified with a pointer to its compositions list, - * and a backward-combining "trail" character. - * - *

If the lead and trail characters combine, then this function returns - * the following "compositeAndFwd" value: - *

-     * Bits 21..1  composite character
-     * Bit      0  set if the composite is a forward-combining starter
-     * 
- * otherwise it returns -1. - * - *

The compositions list has (trail, compositeAndFwd) pair entries, - * encoded as either pairs or triples of 16-bit units. - * The last entry has the high bit of its first unit set. - * - *

The list is sorted by ascending trail characters (there are no duplicates). - * A linear search is used. - * - *

See normalizer2impl.h for a more detailed description - * of the compositions list format. - */ - private static int combine(String compositions, int list, int trail) { - int key1, firstUnit; - if(trail(firstUnit=compositions.charAt(list))) { - list+=2+(firstUnit&COMP_1_TRIPLE); - } - if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { - if((firstUnit&COMP_1_TRIPLE)!=0) { - return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); - } else { - return compositions.charAt(list+1); - } - } - } else { - // trail character is 3400..10FFFF - // result entry has 3 units - key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); - int key2=(trail<(firstUnit=compositions.charAt(list))) { - list+=2+(firstUnit&COMP_1_TRIPLE); - } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { - if(key2>(secondUnit=compositions.charAt(list+1))) { - if((firstUnit&COMP_1_LAST_TUPLE)!=0) { - break; - } else { - list+=3; - } - } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { - return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); - } else { - break; - } - } else { - break; - } - } - } - return -1; - } - - /* - * Recomposes the buffer text starting at recomposeStartIndex - * (which is in NFD - decomposed and canonically ordered), - * and truncates the buffer contents. - * - * Note that recomposition never lengthens the text: - * Any character consists of either one or two code units; - * a composition may contain at most one more code unit than the original starter, - * while the combining mark that is removed has at least one code unit. - */ - private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, - boolean onlyContiguous) { - StringBuilder sb=buffer.getStringBuilder(); - int p=recomposeStartIndex; - if(p==sb.length()) { - return; - } - - int starter, pRemove; - int compositionsList; - int c, compositeAndFwd; - int norm16; - int cc, prevCC; - boolean starterIsSupplementary; - - // Some of the following variables are not used until we have a forward-combining starter - // and are only initialized now to avoid compiler warnings. - compositionsList=-1; // used as indicator for whether we have a forward-combining starter - starter=-1; - starterIsSupplementary=false; - prevCC=0; - - for(;;) { - c=sb.codePointAt(p); - p+=Character.charCount(c); - norm16=getNorm16(c); - cc=getCCFromYesOrMaybe(norm16); - if( // this character combines backward and - isMaybe(norm16) && - // we have seen a starter that combines forward and - compositionsList>=0 && - // the backward-combining character is not blocked - (prevCC=0) { - // The starter and the combining mark (c) do combine. - int composite=compositeAndFwd>>1; - - // Remove the combining mark. - pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark - sb.delete(pRemove, p); - p=pRemove; - // Replace the starter with the composite. - if(starterIsSupplementary) { - if(composite>0xffff) { - // both are supplementary - sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); - sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); - } else { - sb.setCharAt(starter, (char)c); - sb.deleteCharAt(starter+1); - // The composite is shorter than the starter, - // move the intermediate characters forward one. - starterIsSupplementary=false; - --p; - } - } else if(composite>0xffff) { - // The composite is longer than the starter, - // move the intermediate characters back one. - starterIsSupplementary=true; - sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); - sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); - ++p; - } else { - // both are on the BMP - sb.setCharAt(starter, (char)composite); - } - - // Keep prevCC because we removed the combining mark. - - if(p==sb.length()) { - break; - } - // Is the composite a starter that combines forward? - if((compositeAndFwd&1)!=0) { - compositionsList= - getCompositionsListForComposite(getRawNorm16(composite)); - } else { - compositionsList=-1; - } - - // We combined; continue with looking for compositions. - continue; - } - } - - // no combination this time - prevCC=cc; - if(p==sb.length()) { - break; - } - - // If c did not combine, then check if it is a starter. - if(cc==0) { - // Found a new starter. - if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { - // It may combine with something, prepare for it. - if(c<=0xffff) { - starterIsSupplementary=false; - starter=p-1; - } else { - starterIsSupplementary=true; - starter=p-2; - } - } - } else if(onlyContiguous) { - // FCC: no discontiguous compositions; any intervening character blocks. - compositionsList=-1; - } - } - buffer.flush(); - } - - /** - * Does c have a composition boundary before it? - * True if its decomposition begins with a character that has - * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). - * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes - * (isCompYesAndZeroCC()) so we need not decompose. - */ - private boolean hasCompBoundaryBefore(int c, int norm16) { - return c> OFFSET_SHIFT) <= 0x1ff); - } - - private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { - while(p>0) { - int c=Character.codePointBefore(s, p); - int norm16 = getNorm16(c); - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { - break; - } - p-=Character.charCount(c); - if(hasCompBoundaryBefore(c, norm16)) { - break; - } - } - return p; - } - private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { - while(p= 0x0009 && c <= 0x000D) || - (c >= 0x0020 && c <= 0x002F) || - (c >= 0x003A && c <= 0x0040) || - (c >= 0x005B && c <= 0x0060) || - (c >= 0x007B && c <= 0x007E); - } - - public static String canonicalDecomposeWithSingleQuotation(String string) { - Normalizer2 impl = Normalizer2.getNFDInstance(); - char[] src = string.toCharArray(); - int srcIndex = 0; - int srcLimit = src.length; - char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 - int destIndex = 0; - int destLimit = dest.length; - - int prevSrc; - String norm; - int reorderStartIndex, length; - char c1, c2; - int cp; - int minNoMaybe = 0x00c0; - int cc, prevCC, trailCC; - char[] p; - int pStart; - - // initialize - reorderStartIndex = 0; - prevCC = 0; - norm = null; - cp = 0; - pStart = 0; - - cc = trailCC = -1; // initialize to bogus value - c1 = 0; - for (;;) { - prevSrc=srcIndex; - //quick check (1)less than minNoMaybe (2)no decomp (3)hangual - while (srcIndex != srcLimit && - ((c1 = src[srcIndex]) < minNoMaybe || - (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || - (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables - prevCC = 0; - srcIndex += (cp < 0x10000) ? 1 : 2; - } - - // copy these code units all at once - if (srcIndex != prevSrc) { - length = srcIndex - prevSrc; - if ((destIndex + length) <= destLimit) { - System.arraycopy(src,prevSrc,dest,destIndex,length); - } - - destIndex += length; - reorderStartIndex = destIndex; - } - - // end of source reached? - if (srcIndex == srcLimit) { - break; - } - - // cp already contains *src and norm32 is set for it, increment src - srcIndex += (cp < 0x10000) ? 1 : 2; - - if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - c2 = 0; - length = 1; - - if (Character.isHighSurrogate(c1) - || Character.isLowSurrogate(c1)) { - norm = null; - } - } else { - length = 2; - c2 = src[srcIndex-1]; - } - - // get the decomposition and the lead and trail cc's - if (norm == null) { - // cp does not decompose - cc = trailCC = UCharacter.getCombiningClass(cp); - p = null; - pStart = -1; - } else { - - pStart = 0; - p = norm.toCharArray(); - length = p.length; - int cpNum = norm.codePointCount(0, length); - cc= UCharacter.getCombiningClass(norm.codePointAt(0)); - trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); - if (length == 1) { - // fastpath a single code unit from decomposition - c1 = p[pStart]; - c2 = 0; - p = null; - pStart = -1; - } - } - - if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations - // buffer overflow - char[] tmpBuf = new char[destLimit * 2]; - System.arraycopy(dest, 0, tmpBuf, 0, destIndex); - dest = tmpBuf; - destLimit = dest.length; - } - - // append the decomposition to the destination buffer, assume length>0 - { - int reorderSplit = destIndex; - if (p == null) { - // fastpath: single code point - if (needSingleQuotation(c1)) { - //if we need single quotation, no need to consider "prevCC" - //and it must NOT be a supplementary pair - dest[destIndex++] = '\''; - dest[destIndex++] = c1; - dest[destIndex++] = '\''; - trailCC = 0; - } else if(cc != 0 && cc < prevCC) { - // (c1, c2) is out of order with respect to the preceding - // text - destIndex += length; - trailCC = insertOrdered(dest, reorderStartIndex, - reorderSplit, destIndex, c1, c2, cc); - } else { - // just append (c1, c2) - dest[destIndex++] = c1; - if(c2 != 0) { - dest[destIndex++] = c2; - } - } - } else { - // general: multiple code points (ordered by themselves) - // from decomposition - if (needSingleQuotation(p[pStart])) { - dest[destIndex++] = '\''; - dest[destIndex++] = p[pStart++]; - dest[destIndex++] = '\''; - length--; - do { - dest[destIndex++] = p[pStart++]; - } while(--length > 0); - } else if (cc != 0 && cc < prevCC) { - destIndex += length; - trailCC = mergeOrdered(dest, reorderStartIndex, - reorderSplit, p, pStart, - pStart+length); - } else { - // just append the decomposition - do { - dest[destIndex++] = p[pStart++]; - } while (--length > 0); - } - } - } - prevCC = trailCC; - if(prevCC == 0) { - reorderStartIndex = destIndex; - } - } - - return new String(dest, 0, destIndex); - } - - /** - * simpler, single-character version of mergeOrdered() - - * bubble-insert one single code point into the preceding string - * which is already canonically ordered - * (c, c2) may or may not yet have been inserted at src[current]..src[p] - * - * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) - * - * before: src[start]..src[current] is already ordered, and - * src[current]..src[p] may or may not hold (c, c2) but - * must be exactly the same length as (c, c2) - * after: src[start]..src[p] is ordered - * - * @return the trailing combining class - */ - private static int/*unsigned byte*/ insertOrdered(char[] source, - int start, - int current, int p, - char c1, char c2, - int/*unsigned byte*/ cc) { - int back, preBack; - int r; - int prevCC, trailCC=cc; - - if (start=prevCC - preBack=back=current; - - PrevArgs prevArgs = new PrevArgs(); - prevArgs.current = current; - prevArgs.start = start; - prevArgs.src = source; - prevArgs.c1 = c1; - prevArgs.c2 = c2; - - // get the prevCC - prevCC=getPrevCC(prevArgs); - preBack = prevArgs.current; - - if(cc=prevCC) { - break; - } - back=preBack; - } - - // this is where we are right now with all these indicies: - // [start]..[pPreBack] 0..? code points that we can ignore - // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc - // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) - // [current]..[p] 1 code point (c, c2) with cc - - // move the code units in between up - r=p; - do { - source[--r]=source[--current]; - } while (back!=current); - } - } - - // insert (c1, c2) - source[current] = c1; - if (c2!=0) { - source[(current+1)] = c2; - } - - // we know the cc of the last code point - return trailCC; - } - /** - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation - * - * the two strings may already be adjacent, so that the merging is done - * in-place if the two strings are not adjacent, then the buffer holding the - * first one must be large enough - * the second string may or may not be ordered in itself - * - * before: [start]..[current] is already ordered, and - * [next]..[limit] may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the algorithm is a simple bubble-sort that takes the characters from - * src[next++] and inserts them in correct combining class order into the - * preceding part of the string - * - * since this function is called much less often than the single-code point - * insertOrdered(), it just uses that for easier maintenance - * - * @return the trailing combining class - */ - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - int next, - int limit) { - int r; - int /*unsigned byte*/ cc, trailCC=0; - boolean adjacent; - - adjacent= current==next; - NextCCArgs ncArgs = new NextCCArgs(); - ncArgs.source = data; - ncArgs.next = next; - ncArgs.limit = limit; - - if(start!=current) { - - while(ncArgs.next + * If dest is a StringBuilder, then the buffer writes directly to it. + * Otherwise, the buffer maintains a StringBuilder for intermediate text segments + * until no further changes are necessary and whole segments are appended. + * append() methods that take combining-class values always write to the StringBuilder. + * Other append() methods flush and append to the Appendable. + */ + public static final class ReorderingBuffer implements Appendable { + public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { + impl=ni; + app=dest; + if (app instanceof StringBuilder) { + appIsStringBuilder=true; + str=(StringBuilder)dest; + // In Java, the constructor subsumes public void init(int destCapacity) + str.ensureCapacity(destCapacity); + reorderStart=0; + if(str.length()==0) { + lastCC=0; + } else { + setIterator(); + lastCC=previousCC(); + // Set reorderStart after the last code point with cc<=1 if there is one. + if(lastCC>1) { + while(previousCC()>1) {} + } + reorderStart=codePointLimit; + } + } else { + appIsStringBuilder=false; + str=new StringBuilder(); + reorderStart=0; + lastCC=0; + } + } + + public boolean isEmpty() { return str.length()==0; } + public int length() { return str.length(); } + public int getLastCC() { return lastCC; } + + public StringBuilder getStringBuilder() { return str; } + + public boolean equals(CharSequence s, int start, int limit) { + return UTF16Plus.equal(str, 0, str.length(), s, start, limit); + } + + public void append(int c, int cc) { + if(lastCC<=cc || cc==0) { + str.appendCodePoint(c); + lastCC=cc; + if(cc<=1) { + reorderStart=str.length(); + } + } else { + insert(c, cc); + } + } + public void append(CharSequence s, int start, int limit, boolean isNFD, + int leadCC, int trailCC) { + if(start==limit) { + return; + } + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. + } + str.append(s, start, limit); + lastCC=trailCC; + } else { + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(startcc;) {} + // insert c at codePointLimit, after the character with prevCC<=cc + if(c<=0xffff) { + str.insert(codePointLimit, (char)c); + if(cc<=1) { + reorderStart=codePointLimit+1; + } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; + } + } + } + + private final NormalizerImpl impl; + private final Appendable app; + private final StringBuilder str; + private final boolean appIsStringBuilder; + private int reorderStart; + private int lastCC; + + // private backward iterator + private void setIterator() { codePointStart=str.length(); } + private void skipPrevious() { // Requires 0=codePointStart) { + return 0; + } + int c=str.codePointBefore(codePointStart); + codePointStart-=Character.charCount(c); + return impl.getCCFromYesOrMaybeCP(c); + } + private int codePointStart, codePointLimit; + } + + // TODO: Propose as public API on the UTF16 class. + // TODO: Propose widening UTF16 methods that take char to take int. + // TODO: Propose widening UTF16 methods that take String to take CharSequence. + public static final class UTF16Plus { + /** + * Is this code point a lead surrogate (U+d800..U+dbff)? + * @param c code unit or code point + * @return true or false + */ + public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } + /** + * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), + * is it a lead surrogate? + * @param c code unit or code point + * @return true or false + */ + public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } + + /** + * Compares two CharSequence subsequences for binary equality. + * @param s1 first sequence + * @param start1 start offset in first sequence + * @param limit1 limit offset in first sequence + * @param s2 second sequence + * @param start2 start offset in second sequence + * @param limit2 limit offset in second sequence + * @return true if s1.subSequence(start1, limit1) contains the same text + * as s2.subSequence(start2, limit2) + */ + public static boolean equal(CharSequence s1, int start1, int limit1, + CharSequence s2, int start2, int limit2) { + if((limit1-start1)!=(limit2-start2)) { + return false; + } + if(s1==s2 && start1==start2) { + return true; + } + while(start1>DELTA_SHIFT)-MAX_DELTA-1; + + // Read the normTrie. + int offset=inIndexes[IX_NORM_TRIE_OFFSET]; + int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; + int triePosition = bytes.position(); + normTrie = CodePointTrie.Fast16.fromBinary(bytes); + int trieLength = bytes.position() - triePosition; + if(trieLength>(nextOffset-offset)) { + throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); + } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes + + // Read the composition and mapping data. + offset=nextOffset; + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; + int numChars=(nextOffset-offset)/2; + if(numChars!=0) { + maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); + extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); + } + + // smallFCD: new in formatVersion 2 + offset=nextOffset; + smallFCD=new byte[0x100]; + bytes.get(smallFCD); + + return this; + } catch(IOException e) { + throw new InternalError(e); + } + } + public NormalizerImpl load(String name) { + return load(ICUBinary.getRequiredData(name)); + } + + // The trie stores values for lead surrogate code *units*. + // Surrogate code *points* are inert. + public int getNorm16(int c) { + return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); + } + public int getRawNorm16(int c) { return normTrie.get(c); } + public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16=MIN_NORMAL_MAYBE_YES) { + return getCCFromNormalYesOrMaybe(norm16); + } + if(norm16> OFFSET_SHIFT) & 0xff; + } + public static int getCCFromYesOrMaybe(int norm16) { + return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; + } + public int getCCFromYesOrMaybeCP(int c) { + if (c < minCompNoMaybeCP) { return 0; } + return getCCFromYesOrMaybe(getNorm16(c)); + } + + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + public int getFCD16(int c) { + if(c>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. */ + public int getFCD16FromNormData(int c) { + int norm16=getNorm16(c); + if (norm16 >= limitNoNo) { + if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16=getCCFromNormalYesOrMaybe(norm16); + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else { // isDecompNoAlgorithmic(norm16) + int deltaTrailCC = norm16 & DELTA_TCCC_MASK; + if (deltaTrailCC <= DELTA_TCCC_1) { + return deltaTrailCC >> OFFSET_SHIFT; + } + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + } + if(norm16<=minYesNo || isHangulLVT(norm16)) { + // no decomposition or Hangul syllable, all zeros + return 0; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int fcd16=firstUnit>>8; // tccc + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc + } + return fcd16; + } + + /** + * Gets the decomposition for one code point. + * @param c code point + * @return c's decomposition, if it has one; returns null if it does not have a decomposition + */ + public String getDecomposition(int c) { + int norm16; + if(c>OFFSET_SHIFT; + int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; + return extraData.substring(mapping, mapping+length); + } + + // Fixed norm16 values. + public static final int MIN_YES_YES_WITH_CC=0xfe02; + public static final int JAMO_VT=0xfe00; + public static final int MIN_NORMAL_MAYBE_YES=0xfc00; + public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE + public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE + + // norm16 bit 0 is comp-boundary-after. + public static final int HAS_COMP_BOUNDARY_AFTER=1; + public static final int OFFSET_SHIFT=1; + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. + public static final int DELTA_TCCC_0=0; + public static final int DELTA_TCCC_1=2; + public static final int DELTA_TCCC_GT_1=4; + public static final int DELTA_TCCC_MASK=6; + public static final int DELTA_SHIFT=3; + + public static final int MAX_DELTA=0x40; + + // Byte offsets from the start of the data, after the generic header. + public static final int IX_NORM_TRIE_OFFSET=0; + public static final int IX_EXTRA_DATA_OFFSET=1; + public static final int IX_SMALL_FCD_OFFSET=2; + public static final int IX_RESERVED3_OFFSET=3; + public static final int IX_TOTAL_SIZE=7; + public static final int MIN_CCC_LCCC_CP=0x300; + // Code point thresholds for quick check codes. + public static final int IX_MIN_DECOMP_NO_CP=8; + public static final int IX_MIN_COMP_NO_MAYBE_CP=9; + + // Norm16 value thresholds for quick check combinations and types of extra data. + + /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ + public static final int IX_MIN_YES_NO=10; + /** Mappings are comp-normalized. */ + public static final int IX_MIN_NO_NO=11; + public static final int IX_LIMIT_NO_NO=12; + public static final int IX_MIN_MAYBE_YES=13; + + /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ + public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; + /** Mappings are not comp-normalized but have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; + /** Mappings do not have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; + /** Mappings to the empty string. */ + public static final int IX_MIN_NO_NO_EMPTY=17; + + public static final int IX_MIN_LCCC_CP=18; + public static final int IX_COUNT=20; + + public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; + public static final int MAPPING_HAS_RAW_MAPPING=0x40; + // unused bit 0x20; + public static final int MAPPING_LENGTH_MASK=0x1f; + + public static final int COMP_1_LAST_TUPLE=0x8000; + public static final int COMP_1_TRIPLE=1; + public static final int COMP_1_TRAIL_LIMIT=0x3400; + public static final int COMP_1_TRAIL_MASK=0x7ffe; + public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit + public static final int COMP_2_TRAIL_SHIFT=6; + public static final int COMP_2_TRAIL_MASK=0xffc0; + + // higher-level functionality ------------------------------------------ *** + + /** + * Decomposes s[src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. + */ + public void decompose(CharSequence s, int src, int limit, StringBuilder dest, + int destLengthEstimate) { + if(destLengthEstimate<0) { + destLengthEstimate=limit-src; + } + dest.setLength(0); + ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); + decompose(s, src, limit, buffer); + } + + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int decompose(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + int minNoCP=minDecompNoCP; + + int prevSrc; + int c=0; + int norm16=0; + + // only for quick check + int prevBoundary=src; + int prevCC=0; + + for(;;) { + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src; src!=limit;) { + if( (c=s.charAt(src))=limit) { + break; + } + c=Character.codePointAt(s, src); + cc=getCC(getNorm16(c)); + }; + buffer.append(s, 0, src, false, firstCC, prevCC); + buffer.append(s, src, limit); + } + + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. + // doCompose: normalize + // !doCompose: isNormalized (buffer must be empty and initialized) + public boolean compose(CharSequence s, int src, int limit, + boolean onlyContiguous, + boolean doCompose, + ReorderingBuffer buffer) { + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for (;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. + int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if (src == limit) { + if (prevBoundary != limit && doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + if( (c=s.charAt(src))=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. + + // Medium-fast path: Handle cases that do not require full decomposition and recomposition. + if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (!doCompose) { + return false; + } + // Fast path for mapping a character that is immediately surrounded by boundaries. + // In this case, we need not decompose around the current character. + if (isDecompNoAlgorithmic(norm16)) { + // Maps to a single isCompYesAndZeroCC character + // which also implies hasCompBoundaryBefore. + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append(mapAlgorithmic(c, norm16), 0); + prevBoundary = src; + continue; + } + } else if (norm16 < minNoNoCompBoundaryBefore) { + // The mapping is comp-normalized which also implies hasCompBoundaryBefore. + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int mapping = norm16 >> OFFSET_SHIFT; + int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; + buffer.append(extraData, mapping, mapping + length); + prevBoundary = src; + continue; + } + } else if (norm16 >= minNoNoEmpty) { + // The current character maps to nothing. + // Simply omit it from the output if there is a boundary before _or_ after it. + // The character itself implies no boundaries. + if (hasCompBoundaryBefore(s, src, limit) || + hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + prevBoundary = src; + continue; + } + } + // Other "noNo" type, or need to examine more text around this character: + // Fall through to the slow path. + } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { + char prev=s.charAt(prevSrc-1); + if(c= 0) { + int syllable = Hangul.HANGUL_BASE + + (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * + Hangul.JAMO_T_COUNT + t; + --prevSrc; // Replace the Jamo L as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // If we see L+V+x where x!=T then we drop to the slow path, + // decompose and recompose. + // This is to deal with NFKC finding normal L and V but a + // compatibility variant of a T. + // We need to either fully compose that combination here + // (which would complicate the code and may not work with strange custom data) + // or use the slow path. + } + } else if (Hangul.isHangulLV(prev)) { + // The current character is a Jamo Trailing consonant, + // compose with previous Hangul LV that does not contain a Jamo T. + if (!doCompose) { + return false; + } + int syllable = prev + c - Hangul.JAMO_T_BASE; + --prevSrc; // Replace the Hangul LV as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // No matching context, or may need to decompose surrounding text first: + // Fall through to the slow path. + } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC + // One or more combining marks that do not combine-back: + // Check for canonical order, copy unchanged if ok and + // if followed by a character with a boundary-before. + int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 + if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { + // Fails FCD test, need to decompose and contiguously recompose. + if (!doCompose) { + return false; + } + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. + int n16; + for (;;) { + if (src == limit) { + if (doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + int prevCC = cc; + c = Character.codePointAt(s, src); + n16 = normTrie.get(c); + if (n16 >= MIN_YES_YES_WITH_CC) { + cc = getCCFromNormalYesOrMaybe(n16); + if (prevCC > cc) { + if (!doCompose) { + return false; + } + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // p is after the last in-order combining mark. + // If there is a boundary here, then we continue with no change. + if (norm16HasCompBoundaryBefore(n16)) { + if (isCompYesAndZeroCC(n16)) { + src += Character.charCount(c); + } + continue; + } + // Use the slow path. There is no boundary in [prevSrc, src[. + } + } + + // Slow path: Find the nearest boundaries around the current character, + // decompose and recompose. + if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + norm16 = normTrie.get(c); + if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + prevSrc -= Character.charCount(c); + } + } + if (doCompose && prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int recomposeStartIndex=buffer.length(); + // We know there is not a boundary here. + decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, + buffer); + // Decompose until the next boundary. + src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, + buffer); + recompose(buffer, recomposeStartIndex, onlyContiguous); + if(!doCompose) { + if(!buffer.equals(s, prevSrc, src)) { + return false; + } + buffer.remove(); + } + prevBoundary=src; + } + } + + /** + * Very similar to compose(): Make the same changes in both places if relevant. + * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) + * !doSpan: quickCheck + * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and + * bit 0: set if "maybe"; otherwise, if the span length<s.length() + * then the quick check result is "no" + */ + public int composeQuickCheck(CharSequence s, int src, int limit, + boolean onlyContiguous, boolean doSpan) { + int qcResult=0; + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for(;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. + int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if(src==limit) { + return (src<<1)|qcResult; // "yes" or "maybe" + } + if( (c=s.charAt(src))=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. + + int prevNorm16 = INERT; + if (prevBoundary != prevSrc) { + prevBoundary = prevSrc; + if (!norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + int n16 = getNorm16(c); + if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { + prevBoundary -= Character.charCount(c); + prevNorm16 = n16; + } + } + } + + if(isMaybeOrNonZeroCC(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if (onlyContiguous /* FCC */ && cc != 0 && + getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { + // The [prevBoundary..prevSrc[ character + // passed the quick check "yes && ccc==0" test + // but is out of canonical order with the current combining mark. + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. + for (;;) { + if (norm16 < MIN_YES_YES_WITH_CC) { + if (!doSpan) { + qcResult = 1; + } else { + return prevBoundary << 1; // spanYes does not care to know it's "maybe" + } + } + if (src == limit) { + return (src<<1) | qcResult; // "yes" or "maybe" + } + int prevCC = cc; + c = Character.codePointAt(s, src); + norm16 = getNorm16(c); + if (isMaybeOrNonZeroCC(norm16)) { + cc = getCCFromYesOrMaybe(norm16); + if (!(prevCC <= cc || cc == 0)) { + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // src is after the last in-order combining mark. + if (isCompYesAndZeroCC(norm16)) { + prevBoundary = src; + src += Character.charCount(c); + continue; + } + } + } + return prevBoundary<<1; // "no" + } + } + public void composeAndAppend(CharSequence s, + boolean doCompose, + boolean onlyContiguous, + ReorderingBuffer buffer) { + int src=0, limit=s.length(); + if(!buffer.isEmpty()) { + int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); + if(0!=firstStarterInSrc) { + int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), + buffer.length(), onlyContiguous); + StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ + firstStarterInSrc+16); + middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); + buffer.removeSuffix(buffer.length()-lastStarterInDest); + middle.append(s, 0, firstStarterInSrc); + compose(middle, 0, middle.length(), onlyContiguous, true, buffer); + src=firstStarterInSrc; + } + } + if(doCompose) { + compose(s, src, limit, onlyContiguous, true, buffer); + } else { + buffer.append(s, src, limit); + } + } + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { + // Note: In this function we use buffer->appendZeroCC() because we track + // the lead and trail combining classes here, rather than leaving it to + // the ReorderingBuffer. + // The exception is the call to decomposeShort() which uses the buffer + // in the normal way. + + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. + // Similar to the prevBoundary in the compose() implementation. + int prevBoundary=src; + int prevSrc; + int c=0; + int prevFCD16=0; + int fcd16=0; + + for(;;) { + // count code units with lccc==0 + for(prevSrc=src; src!=limit;) { + if((c=s.charAt(src))1) { + --prevBoundary; + } + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { + prevBoundary=p; + } + } + if(buffer!=null) { + // The last lccc==0 character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; + } + + src+=Character.charCount(c); + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. + // Check for proper order, and decompose locally if necessary. + if((prevFCD16&0xff)<=(fcd16>>8)) { + // proper order: prev tccc <= current lccc + if((fcd16&0xff)<=1) { + prevBoundary=src; + } + if(buffer!=null) { + buffer.appendZeroCC(c); + } + prevFCD16=fcd16; + continue; + } else if(buffer==null) { + return prevBoundary; // quick check "no" + } else { + /* + * Back out the part of the source that we copied or appended + * already but is now going to be decomposed. + * prevSrc is set to after what was copied/appended. + */ + buffer.removeSuffix(prevSrc-prevBoundary); + /* + * Find the part of the source that needs to be decomposed, + * up to the next safe boundary. + */ + src=findNextFCDBoundary(s, src, limit); + /* + * The source text does not fulfill the conditions for FCD. + * Decompose and reorder a limited piece of the text. + */ + decomposeShort(s, prevBoundary, src, false, false, buffer); + prevBoundary=src; + prevFCD16=0; + } + } + return src; + } + + public boolean hasDecompBoundaryBefore(int c) { + return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || + norm16HasDecompBoundaryBefore(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryBefore(int norm16) { + if (norm16 < minNoNoCompNoMaybeCC) { + return true; + } + if (norm16 >= limitNoNo) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean hasDecompBoundaryAfter(int c) { + if (c < minDecompNoCP) { + return true; + } + if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { + return true; + } + return norm16HasDecompBoundaryAfter(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryAfter(int norm16) { + if(norm16 <= minYesNo || isHangulLVT(norm16)) { + return true; + } + if (norm16 >= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // Maps to an isCompYesAndZeroCC. + return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // decomp after-boundary: same as hasFCDBoundaryAfter(), + // fcd16<=1 || trailCC==0 + if(firstUnit>0x1ff) { + return false; // trailCC>1 + } + if(firstUnit<=0xff) { + return true; // trailCC==0 + } + // if(trailCC==1) test leadCC==0, same as checking for before-boundary + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } + + public boolean hasCompBoundaryBefore(int c) { + return c=minMaybeYes; } + private static boolean isInert(int norm16) { return norm16==INERT; } + private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } + private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } + private boolean isHangulLV(int norm16) { return norm16==minYesNo; } + private boolean isHangulLVT(int norm16) { + return norm16==hangulLVT(); + } + private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; + // } + private int getCCFromNoNo(int norm16) { + int mapping=norm16>>OFFSET_SHIFT; + if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + return extraData.charAt(mapping-1)&0xff; + } else { + return 0; + } + } + int getTrailCCFromCompYesAndZeroCC(int norm16) { + if(norm16<=minYesNo) { + return 0; // yesYes and Hangul LV have ccc=tccc=0 + } else { + // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. + return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo + } + } + + // Requires algorithmic-NoNo. + private int mapAlgorithmic(int c, int norm16) { + return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; + } + + // Requires minYesNo>OFFSET_SHIFT); } + + /** + * @return index into maybeYesCompositions, or -1 + */ + private int getCompositionsListForDecompYes(int norm16) { + if(norm16>OFFSET_SHIFT; + } + } + /** + * @return index into maybeYesCompositions + */ + private int getCompositionsListForComposite(int norm16) { + // A composite has both mapping & compositions list. + int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; + int firstUnit=maybeYesCompositions.charAt(list); + return list+ // mapping in maybeYesCompositions + 1+ // +1 to skip the first unit with the mapping length + (firstUnit&MAPPING_LENGTH_MASK); // + mapping length + } + + // Decompose a short piece of text which is likely to contain characters that + // fail the quick check loop and/or where the quick check loop's overhead + // is unlikely to be amortized. + // Called by the compose() and makeFCD() implementations. + // Public in Java for collation implementation code. + private int decomposeShort( + CharSequence s, int src, int limit, + boolean stopAtCompBoundary, boolean onlyContiguous, + ReorderingBuffer buffer) { + while(src= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + buffer.append(c, getCCFromYesOrMaybe(norm16)); + return; + } + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + if (norm16 < minYesNo) { + // c does not decompose + buffer.append(c, 0); + } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { + // Hangul syllable: decompose algorithmically + Hangul.decompose(c, buffer); + } else { + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int length=firstUnit&MAPPING_LENGTH_MASK; + int leadCC, trailCC; + trailCC=firstUnit>>8; + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + leadCC=extraData.charAt(mapping-1)>>8; + } else { + leadCC=0; + } + ++mapping; // skip over the firstUnit + buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); + } + } + + /** + * Finds the recomposition result for + * a forward-combining "lead" character, + * specified with a pointer to its compositions list, + * and a backward-combining "trail" character. + * + *

If the lead and trail characters combine, then this function returns + * the following "compositeAndFwd" value: + *

+     * Bits 21..1  composite character
+     * Bit      0  set if the composite is a forward-combining starter
+     * 
+ * otherwise it returns -1. + * + *

The compositions list has (trail, compositeAndFwd) pair entries, + * encoded as either pairs or triples of 16-bit units. + * The last entry has the high bit of its first unit set. + * + *

The list is sorted by ascending trail characters (there are no duplicates). + * A linear search is used. + * + *

See normalizer2impl.h for a more detailed description + * of the compositions list format. + */ + private static int combine(String compositions, int list, int trail) { + int key1, firstUnit; + if(trail(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if((firstUnit&COMP_1_TRIPLE)!=0) { + return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + } else { + return compositions.charAt(list+1); + } + } + } else { + // trail character is 3400..10FFFF + // result entry has 3 units + key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); + int key2=(trail<(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if(key2>(secondUnit=compositions.charAt(list+1))) { + if((firstUnit&COMP_1_LAST_TUPLE)!=0) { + break; + } else { + list+=3; + } + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + } else { + break; + } + } else { + break; + } + } + } + return -1; + } + + /* + * Recomposes the buffer text starting at recomposeStartIndex + * (which is in NFD - decomposed and canonically ordered), + * and truncates the buffer contents. + * + * Note that recomposition never lengthens the text: + * Any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit. + */ + private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, + boolean onlyContiguous) { + StringBuilder sb=buffer.getStringBuilder(); + int p=recomposeStartIndex; + if(p==sb.length()) { + return; + } + + int starter, pRemove; + int compositionsList; + int c, compositeAndFwd; + int norm16; + int cc, prevCC; + boolean starterIsSupplementary; + + // Some of the following variables are not used until we have a forward-combining starter + // and are only initialized now to avoid compiler warnings. + compositionsList=-1; // used as indicator for whether we have a forward-combining starter + starter=-1; + starterIsSupplementary=false; + prevCC=0; + + for(;;) { + c=sb.codePointAt(p); + p+=Character.charCount(c); + norm16=getNorm16(c); + cc=getCCFromYesOrMaybe(norm16); + if( // this character combines backward and + isMaybe(norm16) && + // we have seen a starter that combines forward and + compositionsList>=0 && + // the backward-combining character is not blocked + (prevCC=0) { + // The starter and the combining mark (c) do combine. + int composite=compositeAndFwd>>1; + + // Remove the combining mark. + pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark + sb.delete(pRemove, p); + p=pRemove; + // Replace the starter with the composite. + if(starterIsSupplementary) { + if(composite>0xffff) { + // both are supplementary + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); + } else { + sb.setCharAt(starter, (char)c); + sb.deleteCharAt(starter+1); + // The composite is shorter than the starter, + // move the intermediate characters forward one. + starterIsSupplementary=false; + --p; + } + } else if(composite>0xffff) { + // The composite is longer than the starter, + // move the intermediate characters back one. + starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); + } + + // Keep prevCC because we removed the combining mark. + + if(p==sb.length()) { + break; + } + // Is the composite a starter that combines forward? + if((compositeAndFwd&1)!=0) { + compositionsList= + getCompositionsListForComposite(getRawNorm16(composite)); + } else { + compositionsList=-1; + } + + // We combined; continue with looking for compositions. + continue; + } + } + + // no combination this time + prevCC=cc; + if(p==sb.length()) { + break; + } + + // If c did not combine, then check if it is a starter. + if(cc==0) { + // Found a new starter. + if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { + // It may combine with something, prepare for it. + if(c<=0xffff) { + starterIsSupplementary=false; + starter=p-1; + } else { + starterIsSupplementary=true; + starter=p-2; + } + } + } else if(onlyContiguous) { + // FCC: no discontiguous compositions; any intervening character blocks. + compositionsList=-1; + } + } + buffer.flush(); + } + + /** + * Does c have a composition boundary before it? + * True if its decomposition begins with a character that has + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes + * (isCompYesAndZeroCC()) so we need not decompose. + */ + private boolean hasCompBoundaryBefore(int c, int norm16) { + return c> OFFSET_SHIFT) <= 0x1ff); + } + + private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { + while(p>0) { + int c=Character.codePointBefore(s, p); + int norm16 = getNorm16(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } + p-=Character.charCount(c); + if(hasCompBoundaryBefore(c, norm16)) { + break; + } + } + return p; + } + private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { + while(p= 0x0009 && c <= 0x000D) || + (c >= 0x0020 && c <= 0x002F) || + (c >= 0x003A && c <= 0x0040) || + (c >= 0x005B && c <= 0x0060) || + (c >= 0x007B && c <= 0x007E); + } + + public static String canonicalDecomposeWithSingleQuotation(String string) { + Normalizer2 impl = Normalizer2.getNFDInstance(); + char[] src = string.toCharArray(); + int srcIndex = 0; + int srcLimit = src.length; + char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 + int destIndex = 0; + int destLimit = dest.length; + + int prevSrc; + String norm; + int reorderStartIndex, length; + char c1, c2; + int cp; + int minNoMaybe = 0x00c0; + int cc, prevCC, trailCC; + char[] p; + int pStart; + + // initialize + reorderStartIndex = 0; + prevCC = 0; + norm = null; + cp = 0; + pStart = 0; + + cc = trailCC = -1; // initialize to bogus value + c1 = 0; + for (;;) { + prevSrc=srcIndex; + //quick check (1)less than minNoMaybe (2)no decomp (3)hangual + while (srcIndex != srcLimit && + ((c1 = src[srcIndex]) < minNoMaybe || + (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || + (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables + prevCC = 0; + srcIndex += (cp < 0x10000) ? 1 : 2; + } + + // copy these code units all at once + if (srcIndex != prevSrc) { + length = srcIndex - prevSrc; + if ((destIndex + length) <= destLimit) { + System.arraycopy(src,prevSrc,dest,destIndex,length); + } + + destIndex += length; + reorderStartIndex = destIndex; + } + + // end of source reached? + if (srcIndex == srcLimit) { + break; + } + + // cp already contains *src and norm32 is set for it, increment src + srcIndex += (cp < 0x10000) ? 1 : 2; + + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + c2 = 0; + length = 1; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; + } + } else { + length = 2; + c2 = src[srcIndex-1]; + } + + // get the decomposition and the lead and trail cc's + if (norm == null) { + // cp does not decompose + cc = trailCC = UCharacter.getCombiningClass(cp); + p = null; + pStart = -1; + } else { + + pStart = 0; + p = norm.toCharArray(); + length = p.length; + int cpNum = norm.codePointCount(0, length); + cc= UCharacter.getCombiningClass(norm.codePointAt(0)); + trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); + if (length == 1) { + // fastpath a single code unit from decomposition + c1 = p[pStart]; + c2 = 0; + p = null; + pStart = -1; + } + } + + if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations + // buffer overflow + char[] tmpBuf = new char[destLimit * 2]; + System.arraycopy(dest, 0, tmpBuf, 0, destIndex); + dest = tmpBuf; + destLimit = dest.length; + } + + // append the decomposition to the destination buffer, assume length>0 + { + int reorderSplit = destIndex; + if (p == null) { + // fastpath: single code point + if (needSingleQuotation(c1)) { + //if we need single quotation, no need to consider "prevCC" + //and it must NOT be a supplementary pair + dest[destIndex++] = '\''; + dest[destIndex++] = c1; + dest[destIndex++] = '\''; + trailCC = 0; + } else if(cc != 0 && cc < prevCC) { + // (c1, c2) is out of order with respect to the preceding + // text + destIndex += length; + trailCC = insertOrdered(dest, reorderStartIndex, + reorderSplit, destIndex, c1, c2, cc); + } else { + // just append (c1, c2) + dest[destIndex++] = c1; + if(c2 != 0) { + dest[destIndex++] = c2; + } + } + } else { + // general: multiple code points (ordered by themselves) + // from decomposition + if (needSingleQuotation(p[pStart])) { + dest[destIndex++] = '\''; + dest[destIndex++] = p[pStart++]; + dest[destIndex++] = '\''; + length--; + do { + dest[destIndex++] = p[pStart++]; + } while(--length > 0); + } else if (cc != 0 && cc < prevCC) { + destIndex += length; + trailCC = mergeOrdered(dest, reorderStartIndex, + reorderSplit, p, pStart, + pStart+length); + } else { + // just append the decomposition + do { + dest[destIndex++] = p[pStart++]; + } while (--length > 0); + } + } + } + prevCC = trailCC; + if(prevCC == 0) { + reorderStartIndex = destIndex; + } + } + + return new String(dest, 0, destIndex); + } + + /** + * simpler, single-character version of mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at src[current]..src[p] + * + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) + * + * before: src[start]..src[current] is already ordered, and + * src[current]..src[p] may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: src[start]..src[p] is ordered + * + * @return the trailing combining class + */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; + + if (start=prevCC + preBack=back=current; + + PrevArgs prevArgs = new PrevArgs(); + prevArgs.current = current; + prevArgs.start = start; + prevArgs.src = source; + prevArgs.c1 = c1; + prevArgs.c2 = c2; + + // get the prevCC + prevCC=getPrevCC(prevArgs); + preBack = prevArgs.current; + + if(cc=prevCC) { + break; + } + back=preBack; + } + + // this is where we are right now with all these indicies: + // [start]..[pPreBack] 0..? code points that we can ignore + // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc + // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) + // [current]..[p] 1 code point (c, c2) with cc + + // move the code units in between up + r=p; + do { + source[--r]=source[--current]; + } while (back!=current); + } + } + + // insert (c1, c2) + source[current] = c1; + if (c2!=0) { + source[(current+1)] = c2; + } + + // we know the cc of the last code point + return trailCC; + } + /** + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done + * in-place if the two strings are not adjacent, then the buffer holding the + * first one must be large enough + * the second string may or may not be ordered in itself + * + * before: [start]..[current] is already ordered, and + * [next]..[limit] may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from + * src[next++] and inserts them in correct combining class order into the + * preceding part of the string + * + * since this function is called much less often than the single-code point + * insertOrdered(), it just uses that for easier maintenance + * + * @return the trailing combining class + */ + private static int /*unsigned byte*/ mergeOrdered(char[] source, + int start, + int current, + char[] data, + int next, + int limit) { + int r; + int /*unsigned byte*/ cc, trailCC=0; + boolean adjacent; + + adjacent= current==next; + NextCCArgs ncArgs = new NextCCArgs(); + ncArgs.source = data; + ncArgs.next = next; + ncArgs.limit = limit; + + if(start!=current) { + + while(ncArgs.next