< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 27,46 **** ******************************************************************************* * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* */ - package sun.text.normalizer; import java.io.IOException; import java.nio.ByteBuffer; import java.text.Normalizer; // Original filename in ICU4J: Normalizer2Impl.java public final class NormalizerImpl { - public static final class Hangul { /* Korean Hangul and Jamo constants */ public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ --- 27,44 ----
*** 56,69 **** public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; public static boolean isHangul(int c) { return HANGUL_BASE<=c && c<HANGUL_LIMIT; } ! ! public static boolean isHangulWithoutJamoT(char c) { c-=HANGUL_BASE; ! return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; } /** * Decomposes c, which must be a Hangul syllable, into buffer * and returns the length of the decomposition (2 or 3). --- 54,66 ---- public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; public static boolean isHangul(int c) { return HANGUL_BASE<=c && c<HANGUL_LIMIT; } ! public static boolean isHangulLV(int c) { c-=HANGUL_BASE; ! return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; } /** * Decomposes c, which must be a Hangul syllable, into buffer * and returns the length of the decomposition (2 or 3).
*** 100,113 **** */ public static final class ReorderingBuffer implements Appendable { public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { impl=ni; app=dest; ! if (app instanceof StringBuilder) { appIsStringBuilder=true; str=(StringBuilder)dest; ! // In Java, the constructor subsumes public void init(int destCapacity) str.ensureCapacity(destCapacity); reorderStart=0; if(str.length()==0) { lastCC=0; } else { --- 97,110 ---- */ public static final class ReorderingBuffer implements Appendable { public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { impl=ni; app=dest; ! if(app instanceof StringBuilder) { appIsStringBuilder=true; str=(StringBuilder)dest; ! // In Java, the constructor subsumes public void init(int destCapacity) { str.ensureCapacity(destCapacity); reorderStart=0; if(str.length()==0) { lastCC=0; } else {
*** 135,149 **** public boolean equals(CharSequence s, int start, int limit) { return UTF16Plus.equal(str, 0, str.length(), s, start, limit); } - // For Hangul composition, replacing the Leading consonant Jamo with the syllable. - public void setLastChar(char c) { - str.setCharAt(str.length()-1, c); - } - public void append(int c, int cc) { if(lastCC<=cc || cc==0) { str.appendCodePoint(c); lastCC=cc; if(cc<=1) { --- 132,141 ----
*** 151,161 **** } } else { insert(c, cc); } } - // s must be in NFD, otherwise change the implementation. public void append(CharSequence s, int start, int limit, int leadCC, int trailCC) { if(start==limit) { return; --- 143,152 ----
*** 183,230 **** } append(c, leadCC); } } } - // The following append() methods work like C++ appendZeroCC(). // They assume that the cc or trailCC of their input is 0. // Most of them implement Appendable interface methods. ! // @Override when we switch to Java 6 public ReorderingBuffer append(char c) { str.append(c); lastCC=0; reorderStart=str.length(); return this; } - public void appendZeroCC(int c) { str.appendCodePoint(c); lastCC=0; reorderStart=str.length(); } ! ! // @Override when we switch to Java 6 public ReorderingBuffer append(CharSequence s) { if(s.length()!=0) { str.append(s); lastCC=0; reorderStart=str.length(); } return this; } ! ! // @Override when we switch to Java 6 public ReorderingBuffer append(CharSequence s, int start, int limit) { if(start!=limit) { str.append(s, start, limit); lastCC=0; reorderStart=str.length(); } return this; } - /** * Flushes from the intermediate StringBuilder to the Appendable, * if they are different objects. * Used after recomposition. * Must be called at the end when writing to a non-StringBuilder Appendable. --- 174,216 ---- } append(c, leadCC); } } } // The following append() methods work like C++ appendZeroCC(). // They assume that the cc or trailCC of their input is 0. // Most of them implement Appendable interface methods. ! @Override public ReorderingBuffer append(char c) { str.append(c); lastCC=0; reorderStart=str.length(); return this; } public void appendZeroCC(int c) { str.appendCodePoint(c); lastCC=0; reorderStart=str.length(); } ! @Override public ReorderingBuffer append(CharSequence s) { if(s.length()!=0) { str.append(s); lastCC=0; reorderStart=str.length(); } return this; } ! @Override public ReorderingBuffer append(CharSequence s, int start, int limit) { if(start!=limit) { str.append(s, start, limit); lastCC=0; reorderStart=str.length(); } return this; } /** * Flushes from the intermediate StringBuilder to the Appendable, * if they are different objects. * Used after recomposition. * Must be called at the end when writing to a non-StringBuilder Appendable.
*** 241,251 **** throw new InternalError(e); // Avoid declaring "throws IOException". } } lastCC=0; } - /** * Flushes from the intermediate StringBuilder to the Appendable, * if they are different objects. * Then appends the new text to the Appendable or StringBuilder. * Normally used after quick check loops find a non-empty sequence. --- 227,236 ----
*** 264,280 **** } } lastCC=0; return this; } - public void remove() { str.setLength(0); lastCC=0; reorderStart=0; } - public void removeSuffix(int suffixLength) { int oldLength=str.length(); str.delete(oldLength-suffixLength, oldLength); lastCC=0; reorderStart=str.length(); --- 249,263 ----
*** 316,331 **** if(reorderStart>=codePointStart) { return 0; } int c=str.codePointBefore(codePointStart); codePointStart-=Character.charCount(c); ! if(c<MIN_CCC_LCCC_CP) { ! return 0; ! } ! return getCCFromYesOrMaybe(impl.getNorm16(c)); } - private int codePointStart, codePointLimit; } // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int. --- 299,310 ---- if(reorderStart>=codePointStart) { return 0; } int c=str.codePointBefore(codePointStart); codePointStart-=Character.charCount(c); ! return impl.getCCFromYesOrMaybeCP(c); } private int codePointStart, codePointLimit; } // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int.
*** 368,415 **** } public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { - // @Override when we switch to Java 6 public boolean isDataVersionAcceptable(byte version[]) { ! return version[0]==2; } } - private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" public NormalizerImpl load(ByteBuffer bytes) { try { dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 ! if(indexesLength<=IX_MIN_MAYBE_YES) { ! throw new IOException("Normalizer2 data: not enough indexes"); } int[] inIndexes=new int[indexesLength]; inIndexes[0]=indexesLength*4; for(int i=1; i<indexesLength; ++i) { inIndexes[i]=bytes.getInt(); } minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; normTrie=Trie2_16.createFromSerialized(bytes); int trieLength=normTrie.getSerializedLength(); if(trieLength>(nextOffset-offset)) { ! throw new IOException("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; --- 347,398 ---- } public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { public boolean isDataVersionAcceptable(byte version[]) { ! return version[0]==3; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" public NormalizerImpl load(ByteBuffer bytes) { try { dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 ! if(indexesLength<=IX_MIN_LCCC_CP) { ! throw new InternalError("Normalizer2 data: not enough indexes"); } int[] inIndexes=new int[indexesLength]; inIndexes[0]=indexesLength*4; for(int i=1; i<indexesLength; ++i) { inIndexes[i]=bytes.getInt(); } minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; + minLcccCP=inIndexes[IX_MIN_LCCC_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; + minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; + minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; + minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; + assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields + centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; normTrie=Trie2_16.createFromSerialized(bytes); int trieLength=normTrie.getSerializedLength(); if(trieLength>(nextOffset-offset)) { ! throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset;
*** 420,606 **** chars=new char[numChars]; for(int i=0; i<numChars; ++i) { chars[i]=bytes.getChar(); } maybeYesCompositions=new String(chars); ! extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); } // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=new byte[0x100]; ! for(int i=0; i<0x100; ++i) { ! smallFCD[i]=bytes.get(); ! } ! ! // Build tccc180[]. ! // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. ! tccc180=new int[0x180]; ! int bits=0; ! for(int c=0; c<0x180; bits>>=1) { ! if((c&0xff)==0) { ! bits=smallFCD[c>>8]; // one byte per 0x100 code points ! } ! if((bits&1)!=0) { ! for(int i=0; i<0x20; ++i, ++c) { ! tccc180[c]=getFCD16FromNormData(c)&0xff; ! } ! } else { ! c+=0x20; ! } ! } return this; } catch(IOException e) { throw new InternalError(e); } } - public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } - public int getNorm16(int c) { - return normTrie.get(c); - } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) { if(norm16>=MIN_NORMAL_MAYBE_YES) { ! return norm16&0xff; } if(norm16<minNoNo || limitNoNo<=norm16) { return 0; } return getCCFromNoNo(norm16); } ! public static int getCCFromYesOrMaybe(int norm16) { ! return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ public int getFCD16(int c) { ! if(c<0) { return 0; - } else if(c<0x180) { - return tccc180[c]; } else if(c<=0xffff) { if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } } return getFCD16FromNormData(c); } - - /** Returns the FCD data for U+0000<=c<U+0180. */ - public int getFCD16FromBelow180(int c) { return tccc180[c]; } /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ public boolean singleLeadMightHaveNonZeroFCD16(int lead) { // 0<=lead<=0xffff byte bits=smallFCD[lead>>8]; if(bits==0) { return false; } return ((bits>>((lead>>5)&7))&1)!=0; } /** Gets the FCD value from the regular normalization data. */ public int getFCD16FromNormData(int c) { - // Only loops for 1:1 algorithmic mappings. - for(;;) { int norm16=getNorm16(c); ! if(norm16<=minYesNo) { ! // no decomposition or Hangul syllable, all zeros ! return 0; ! } else if(norm16>=MIN_NORMAL_MAYBE_YES) { // combining mark ! norm16&=0xff; return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; ! } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); ! } else { // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! // A character that is deleted (maps to an empty string) must ! // get the worst-case lccc and tccc values because arbitrary ! // characters on both sides will become adjacent. ! return 0x1ff; ! } else { int fcd16=firstUnit>>8; // tccc if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc } return fcd16; } - } - } - } /** * Gets the decomposition for one code point. * @param c code point * @return c's decomposition, if it has one; returns null if it does not have a decomposition */ public String getDecomposition(int c) { - int decomp=-1; int norm16; ! for(;;) { ! if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { // c does not decompose ! } else if(isHangul(norm16)) { ! // Hangul syllable: decompose algorithmically ! StringBuilder buffer=new StringBuilder(); ! Hangul.decompose(c, buffer); ! return buffer.toString(); ! } else if(isDecompNoAlgorithmic(norm16)) { decomp=c=mapAlgorithmic(c, norm16); ! continue; ! } else { ! // c decomposes, get everything from the variable-length extra data ! int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; ! return extraData.substring(norm16, norm16+length); } if(decomp<0) { return null; } else { return UTF16.valueOf(decomp); } } ! } ! ! public static final int MIN_CCC_LCCC_CP=0x300; - public static final int MIN_YES_YES_WITH_CC=0xff01; - public static final int JAMO_VT=0xff00; - public static final int MIN_NORMAL_MAYBE_YES=0xfe00; public static final int MAX_DELTA=0x40; // Byte offsets from the start of the data, after the generic header. public static final int IX_NORM_TRIE_OFFSET=0; public static final int IX_EXTRA_DATA_OFFSET=1; public static final int IX_SMALL_FCD_OFFSET=2; ! // Code point thresholds for quick check codes. public static final int IX_MIN_DECOMP_NO_CP=8; public static final int IX_MIN_COMP_NO_MAYBE_CP=9; // Norm16 value thresholds for quick check combinations and types of extra data. ! // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. public static final int IX_MIN_YES_NO=10; public static final int IX_MIN_NO_NO=11; public static final int IX_LIMIT_NO_NO=12; public static final int IX_MIN_MAYBE_YES=13; ! // Mappings only in [minYesNoMappingsOnly..minNoNo[. public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; public static final int MAPPING_LENGTH_MASK=0x1f; public static final int COMP_1_LAST_TUPLE=0x8000; public static final int COMP_1_TRIPLE=1; public static final int COMP_1_TRAIL_LIMIT=0x3400; --- 403,602 ---- chars=new char[numChars]; for(int i=0; i<numChars; ++i) { chars[i]=bytes.getChar(); } maybeYesCompositions=new String(chars); ! extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); } // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=new byte[0x100]; ! bytes.get(smallFCD); return this; } catch(IOException e) { throw new InternalError(e); } } public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } + public int getNorm16(int c) { return normTrie.get(c); } + public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } + public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) { if(norm16>=MIN_NORMAL_MAYBE_YES) { ! return getCCFromNormalYesOrMaybe(norm16); } if(norm16<minNoNo || limitNoNo<=norm16) { return 0; } return getCCFromNoNo(norm16); } ! public static int getCCFromNormalYesOrMaybe(int norm16) { ! return (norm16 >> OFFSET_SHIFT) & 0xff; ! } public static int getCCFromYesOrMaybe(int norm16) { ! return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; ! } ! public int getCCFromYesOrMaybeCP(int c) { ! if (c < minCompNoMaybeCP) { return 0; } ! return getCCFromYesOrMaybe(getNorm16(c)); } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ public int getFCD16(int c) { ! if(c<minDecompNoCP) { return 0; } else if(c<=0xffff) { if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } } return getFCD16FromNormData(c); } /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ public boolean singleLeadMightHaveNonZeroFCD16(int lead) { // 0<=lead<=0xffff byte bits=smallFCD[lead>>8]; if(bits==0) { return false; } return ((bits>>((lead>>5)&7))&1)!=0; } /** Gets the FCD value from the regular normalization data. */ public int getFCD16FromNormData(int c) { int norm16=getNorm16(c); ! if (norm16 >= limitNoNo) { ! if(norm16>=MIN_NORMAL_MAYBE_YES) { // combining mark ! norm16=getCCFromNormalYesOrMaybe(norm16); return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; ! } else { // isDecompNoAlgorithmic(norm16) ! int deltaTrailCC = norm16 & DELTA_TCCC_MASK; ! if (deltaTrailCC <= DELTA_TCCC_1) { ! return deltaTrailCC >> OFFSET_SHIFT; ! } ! // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); ! } ! } ! if(norm16<=minYesNo || isHangulLVT(norm16)) { ! // no decomposition or Hangul syllable, all zeros ! return 0; ! } // c decomposes, get everything from the variable-length extra data ! int mapping=norm16>>OFFSET_SHIFT; ! int firstUnit=extraData.charAt(mapping); int fcd16=firstUnit>>8; // tccc if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc } return fcd16; } /** * Gets the decomposition for one code point. * @param c code point * @return c's decomposition, if it has one; returns null if it does not have a decomposition */ public String getDecomposition(int c) { int norm16; ! if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { // c does not decompose ! return null; ! } ! int decomp = -1; ! if(isDecompNoAlgorithmic(norm16)) { ! // Maps to an isCompYesAndZeroCC. decomp=c=mapAlgorithmic(c, norm16); ! // The mapping might decompose further. ! norm16 = getNorm16(c); } + if (norm16 < minYesNo) { if(decomp<0) { return null; } else { return UTF16.valueOf(decomp); } + } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { + // Hangul syllable: decompose algorithmically + StringBuilder buffer=new StringBuilder(); + Hangul.decompose(c, buffer); + return buffer.toString(); } ! // c decomposes, get everything from the variable-length extra data ! int mapping=norm16>>OFFSET_SHIFT; ! int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; ! return extraData.substring(mapping, mapping+length); ! } ! ! // Fixed norm16 values. ! public static final int MIN_YES_YES_WITH_CC=0xfe02; ! public static final int JAMO_VT=0xfe00; ! public static final int MIN_NORMAL_MAYBE_YES=0xfc00; ! public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE ! public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE ! ! // norm16 bit 0 is comp-boundary-after. ! public static final int HAS_COMP_BOUNDARY_AFTER=1; ! public static final int OFFSET_SHIFT=1; ! ! // For algorithmic one-way mappings, norm16 bits 2..1 indicate the ! // tccc (0, 1, >1) for quick FCC boundary-after tests. ! public static final int DELTA_TCCC_0=0; ! public static final int DELTA_TCCC_1=2; ! public static final int DELTA_TCCC_GT_1=4; ! public static final int DELTA_TCCC_MASK=6; ! public static final int DELTA_SHIFT=3; public static final int MAX_DELTA=0x40; // Byte offsets from the start of the data, after the generic header. public static final int IX_NORM_TRIE_OFFSET=0; public static final int IX_EXTRA_DATA_OFFSET=1; public static final int IX_SMALL_FCD_OFFSET=2; ! public static final int IX_RESERVED3_OFFSET=3; ! public static final int IX_TOTAL_SIZE=7; ! public static final int MIN_CCC_LCCC_CP=0x300; // Code point thresholds for quick check codes. public static final int IX_MIN_DECOMP_NO_CP=8; public static final int IX_MIN_COMP_NO_MAYBE_CP=9; // Norm16 value thresholds for quick check combinations and types of extra data. ! ! /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ public static final int IX_MIN_YES_NO=10; + /** Mappings are comp-normalized. */ public static final int IX_MIN_NO_NO=11; public static final int IX_LIMIT_NO_NO=12; public static final int IX_MIN_MAYBE_YES=13; ! /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; + /** Mappings are not comp-normalized but have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; + /** Mappings do not have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; + /** Mappings to the empty string. */ + public static final int IX_MIN_NO_NO_EMPTY=17; + + public static final int IX_MIN_LCCC_CP=18; + public static final int IX_COUNT=20; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; + public static final int MAPPING_HAS_RAW_MAPPING=0x40; + // unused bit 0x20; public static final int MAPPING_LENGTH_MASK=0x1f; public static final int COMP_1_LAST_TUPLE=0x8000; public static final int COMP_1_TRIPLE=1; public static final int COMP_1_TRAIL_LIMIT=0x3400;
*** 700,710 **** return prevBoundary; // "no" or cc out of order } } return src; } - public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { int limit=s.length(); if(limit==0) { return; } --- 696,705 ----
*** 735,978 **** // !doCompose: isNormalized (buffer must be empty and initialized) public boolean compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer) { int minNoMaybeCP=minCompNoMaybeCP; ! /* ! * prevBoundary points to the last character before the current one ! * that has a composition boundary before it with ccc==0 and quick check "yes". ! * Keeping track of prevBoundary saves us looking for a composition boundary ! * when we find a "no" or "maybe". ! * ! * When we back out from prevSrc back to prevBoundary, ! * then we also remove those same characters (which had been simply copied ! * or canonically-order-inserted) from the ReorderingBuffer. ! * Therefore, at all times, the [prevBoundary..prevSrc[ source units ! * must correspond 1:1 to destination units at the end of the destination buffer. ! */ ! int prevBoundary=src; int prevSrc; ! int c=0; ! int norm16=0; ! ! // only for isNormalized ! int prevCC=0; ! ! for(;;) { ! // count code units below the minimum or with irrelevant data for the quick check ! for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; ! } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; c=Character.toCodePoint(c2, (char)c); } } ! if(isCompYesAndZeroCC(norm16=getNorm16(c))) { ! src+=Character.charCount(c); ! } else { break; } } } - // copy these code units all at once - if(src!=prevSrc) { - if(src==limit) { - if(doCompose) { - buffer.flushAndAppendZeroCC(s, prevSrc, src); } ! break; } ! // Set prevBoundary to the last character in the quick check loop. ! prevBoundary=src-1; ! if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && ! Character.isHighSurrogate(s.charAt(prevBoundary-1)) ! ) { ! --prevBoundary; } ! if(doCompose) { ! // The last "quick check yes" character is excluded from the ! // flush-and-append call in case it needs to be modified. ! buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); ! buffer.append(s, prevBoundary, src); ! } else { ! prevCC=0; } ! // The start of the current character (c). ! prevSrc=src; ! } else if(src==limit) { ! break; } ! ! src+=Character.charCount(c); ! /* ! * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. ! * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) ! * or has ccc!=0. ! * Check for Jamo V/T, then for regular characters. ! * c is not a Hangul syllable or Jamo L because those have "yes" properties. ! */ ! if(isJamoVT(norm16) && prevBoundary!=prevSrc) { char prev=s.charAt(prevSrc-1); - boolean needToDecompose=false; if(c<Hangul.JAMO_T_BASE) { ! // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. ! prev-=Hangul.JAMO_L_BASE; ! if(prev<Hangul.JAMO_L_COUNT) { ! if(!doCompose) { return false; } ! char syllable=(char) ! (Hangul.HANGUL_BASE+ ! (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* ! Hangul.JAMO_T_COUNT); ! char t; ! if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ++src; ! syllable+=t; // The next character was a Jamo T. ! prevBoundary=src; ! buffer.setLastChar(syllable); continue; } // If we see L+V+x where x!=T then we drop to the slow path, // decompose and recompose. // This is to deal with NFKC finding normal L and V but a ! // compatibility variant of a T. We need to either fully compose that ! // combination here (which would complicate the code and may not work ! // with strange custom data) or use the slow path -- or else our replacing ! // two input characters (L+V) with one output character (LV syllable) ! // would violate the invariant that [prevBoundary..prevSrc[ has the same ! // length as what we appended to the buffer since prevBoundary. ! needToDecompose=true; } ! } else if(Hangul.isHangulWithoutJamoT(prev)) { ! // c is a Jamo Trailing consonant, // compose with previous Hangul LV that does not contain a Jamo T. ! if(!doCompose) { return false; } ! buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); ! prevBoundary=src; ! continue; ! } ! if(!needToDecompose) { ! // The Jamo V/T did not compose into a Hangul syllable. ! if(doCompose) { ! buffer.append((char)c); ! } else { ! prevCC=0; } continue; } ! } ! /* ! * Source buffer pointers: ! * ! * all done quick check current char not yet ! * "yes" but (c) processed ! * may combine ! * forward ! * [-------------[-------------[-------------[-------------[ ! * | | | | | ! * orig. src prevBoundary prevSrc src limit ! * ! * ! * Destination buffer pointers inside the ReorderingBuffer: ! * ! * all done might take not filled yet ! * characters for ! * reordering ! * [-------------[-------------[-------------[ ! * | | | | ! * start reorderStart limit | ! * +remainingCap.+ ! */ ! if(norm16>=MIN_YES_YES_WITH_CC) { ! int cc=norm16&0xff; // cc!=0 ! if( onlyContiguous && // FCC ! (doCompose ? buffer.getLastCC() : prevCC)==0 && ! prevBoundary<prevSrc && ! // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that ! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) ! // passed the quick check "yes && ccc==0" test. ! // Check whether the last character was a "yesYes" or a "yesNo". ! // If a "yesNo", then we get its trailing ccc from its ! // mapping and check for canonical order. ! // All other cases are ok. ! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ! ) { // Fails FCD test, need to decompose and contiguously recompose. ! if(!doCompose) { return false; } - } else if(doCompose) { - buffer.append(c, cc); - continue; - } else if(prevCC<=cc) { - prevCC=cc; - continue; } else { ! return false; } ! } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { return false; } ! ! /* ! * Find appropriate boundaries around this character, ! * decompose the source text from between the boundaries, ! * and recompose it. ! * ! * We may need to remove the last few characters from the ReorderingBuffer ! * to account for source text that was copied or appended ! * but needs to take part in the recomposition. ! */ ! ! /* ! * Find the last composition boundary in [prevBoundary..src[. ! * It is either the decomposition of the current character (at prevSrc), ! * or prevBoundary. ! */ ! if(hasCompBoundaryBefore(c, norm16)) { ! prevBoundary=prevSrc; ! } else if(doCompose) { ! buffer.removeSuffix(prevSrc-prevBoundary); } ! // Find the next composition boundary in [src..limit[ - ! // modifies src to point to the next starter. ! src=findNextCompBoundary(s, src, limit); ! ! // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. int recomposeStartIndex=buffer.length(); ! decomposeShort(s, prevBoundary, src, buffer); recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { ! if(!buffer.equals(s, prevBoundary, src)) { return false; } buffer.remove(); - prevCC=0; } - - // Move to the next starter. We never need to look back before this point again. prevBoundary=src; } - return true; } /** * Very similar to compose(): Make the same changes in both places if relevant. * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) --- 730,971 ---- // !doCompose: isNormalized (buffer must be empty and initialized) public boolean compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer) { + int prevBoundary=src; int minNoMaybeCP=minCompNoMaybeCP; ! for (;;) { ! // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, ! // or with (compYes && ccc==0) properties. int prevSrc; ! int c = 0; ! int norm16 = 0; ! for (;;) { ! if (src == limit) { ! if (prevBoundary != limit && doCompose) { ! buffer.append(s, prevBoundary, limit); ! } ! return true; ! } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; ! } else { ! prevSrc = src++; ! if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { ! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ! ++src; c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { ! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { ! --prevSrc; c=Character.toCodePoint(c2, (char)c); } } ! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } ! // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. ! // The current character is either a "noNo" (has a mapping) ! // or a "maybeYes" (combines backward) ! // or a "yesYes" with ccc!=0. ! // It is not a Hangul syllable or Jamo L because those have "yes" properties. ! ! // Medium-fast path: Handle cases that do not require full decomposition and recomposition. ! if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes ! if (!doCompose) { ! return false; } ! // Fast path for mapping a character that is immediately surrounded by boundaries. ! // In this case, we need not decompose around the current character. ! if (isDecompNoAlgorithmic(norm16)) { ! // Maps to a single isCompYesAndZeroCC character ! // which also implies hasCompBoundaryBefore. ! if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || ! hasCompBoundaryBefore(s, src, limit)) { ! if (prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); } ! buffer.append(mapAlgorithmic(c, norm16), 0); ! prevBoundary = src; ! continue; } ! } else if (norm16 < minNoNoCompBoundaryBefore) { ! // The mapping is comp-normalized which also implies hasCompBoundaryBefore. ! if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || ! hasCompBoundaryBefore(s, src, limit)) { ! if (prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); ! } ! int mapping = norm16 >> OFFSET_SHIFT; ! int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; ! buffer.append(extraData, mapping, mapping + length); ! prevBoundary = src; ! continue; } ! } else if (norm16 >= minNoNoEmpty) { ! // The current character maps to nothing. ! // Simply omit it from the output if there is a boundary before _or_ after it. ! // The character itself implies no boundaries. ! if (hasCompBoundaryBefore(s, src, limit) || ! hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { ! if (prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); ! } ! prevBoundary = src; ! continue; ! } ! } ! // Other "noNo" type, or need to examine more text around this character: ! // Fall through to the slow path. ! } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { char prev=s.charAt(prevSrc-1); if(c<Hangul.JAMO_T_BASE) { ! // The current character is a Jamo Vowel, ! // compose with previous Jamo L and following Jamo T. ! char l = (char)(prev-Hangul.JAMO_L_BASE); ! if(l<Hangul.JAMO_L_COUNT) { ! if (!doCompose) { return false; } ! int t; ! if (src != limit && ! 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && ! t < Hangul.JAMO_T_COUNT) { ! // The next character is a Jamo T. ++src; ! } else if (hasCompBoundaryBefore(s, src, limit)) { ! // No Jamo T follows, not even via decomposition. ! t = 0; ! } else { ! t = -1; ! } ! if (t >= 0) { ! int syllable = Hangul.HANGUL_BASE + ! (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * ! Hangul.JAMO_T_COUNT + t; ! --prevSrc; // Replace the Jamo L as well. ! if (prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); ! } ! buffer.append((char)syllable); ! prevBoundary = src; continue; } // If we see L+V+x where x!=T then we drop to the slow path, // decompose and recompose. // This is to deal with NFKC finding normal L and V but a ! // compatibility variant of a T. ! // We need to either fully compose that combination here ! // (which would complicate the code and may not work with strange custom data) ! // or use the slow path. } ! } else if (Hangul.isHangulLV(prev)) { ! // The current character is a Jamo Trailing consonant, // compose with previous Hangul LV that does not contain a Jamo T. ! if (!doCompose) { return false; } ! int syllable = prev + c - Hangul.JAMO_T_BASE; ! --prevSrc; // Replace the Hangul LV as well. ! if (prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); } + buffer.append((char)syllable); + prevBoundary = src; continue; } ! // No matching context, or may need to decompose surrounding text first: ! // Fall through to the slow path. ! } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC ! // One or more combining marks that do not combine-back: ! // Check for canonical order, copy unchanged if ok and ! // if followed by a character with a boundary-before. ! int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 ! if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { // Fails FCD test, need to decompose and contiguously recompose. ! if (!doCompose) { return false; } } else { ! // If !onlyContiguous (not FCC), then we ignore the tccc of ! // the previous character which passed the quick check "yes && ccc==0" test. ! int n16; ! for (;;) { ! if (src == limit) { ! if (doCompose) { ! buffer.append(s, prevBoundary, limit); } ! return true; ! } ! int prevCC = cc; ! c = Character.codePointAt(s, src); ! n16 = normTrie.get(c); ! if (n16 >= MIN_YES_YES_WITH_CC) { ! cc = getCCFromNormalYesOrMaybe(n16); ! if (prevCC > cc) { ! if (!doCompose) { return false; } ! break; ! } ! } else { ! break; ! } ! src += Character.charCount(c); ! } ! // p is after the last in-order combining mark. ! // If there is a boundary here, then we continue with no change. ! if (norm16HasCompBoundaryBefore(n16)) { ! if (isCompYesAndZeroCC(n16)) { ! src += Character.charCount(c); ! } ! continue; ! } ! // Use the slow path. There is no boundary in [prevSrc, src[. ! } } ! // Slow path: Find the nearest boundaries around the current character, ! // decompose and recompose. ! if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { ! c = Character.codePointBefore(s, prevSrc); ! norm16 = normTrie.get(c); ! if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { ! prevSrc -= Character.charCount(c); ! } ! } ! if (doCompose && prevBoundary != prevSrc) { ! buffer.append(s, prevBoundary, prevSrc); ! } int recomposeStartIndex=buffer.length(); ! // We know there is not a boundary here. ! decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, ! buffer); ! // Decompose until the next boundary. ! src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, ! buffer); recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { ! if(!buffer.equals(s, prevSrc, src)) { return false; } buffer.remove(); } prevBoundary=src; } } /** * Very similar to compose(): Make the same changes in both places if relevant. * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
*** 982,1095 **** * then the quick check result is "no" */ public int composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan) { int qcResult=0; - int minNoMaybeCP=minCompNoMaybeCP; - - /* - * prevBoundary points to the last character before the current one - * that has a composition boundary before it with ccc==0 and quick check "yes". - */ int prevBoundary=src; ! int prevSrc; ! int c=0; ! int norm16=0; ! int prevCC=0; for(;;) { ! // count code units below the minimum or with irrelevant data for the quick check ! for(prevSrc=src;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; ! } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; c=Character.toCodePoint(c2, (char)c); } } ! if(isCompYesAndZeroCC(norm16=getNorm16(c))) { ! src+=Character.charCount(c); ! } else { break; } } } - if(src!=prevSrc) { - // Set prevBoundary to the last character in the quick check loop. - prevBoundary=src-1; - if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && - Character.isHighSurrogate(s.charAt(prevBoundary-1)) - ) { - --prevBoundary; } ! prevCC=0; ! // The start of the current character (c). ! prevSrc=src; } - src+=Character.charCount(c); - /* - * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. - * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) - * or has ccc!=0. - */ if(isMaybeOrNonZeroCC(norm16)) { int cc=getCCFromYesOrMaybe(norm16); ! if( onlyContiguous && // FCC ! cc!=0 && ! prevCC==0 && ! prevBoundary<prevSrc && ! // prevCC==0 && prevBoundary<prevSrc tell us that ! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) ! // passed the quick check "yes && ccc==0" test. ! // Check whether the last character was a "yesYes" or a "yesNo". ! // If a "yesNo", then we get its trailing ccc from its ! // mapping and check for canonical order. ! // All other cases are ok. ! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ! ) { ! // Fails FCD test. ! } else if(prevCC<=cc || cc==0) { ! prevCC=cc; ! if(norm16<MIN_YES_YES_WITH_CC) { ! if(!doSpan) { ! qcResult=1; } else { ! return prevBoundary<<1; // spanYes does not care to know it's "maybe" } } continue; } } return prevBoundary<<1; // "no" } } - public void composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { ! int firstStarterInSrc=findNextCompBoundary(s, 0, limit); if(0!=firstStarterInSrc) { int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), ! buffer.length()); StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ firstStarterInSrc+16); middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastStarterInDest); middle.append(s, 0, firstStarterInSrc); --- 975,1098 ---- * then the quick check result is "no" */ public int composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan) { int qcResult=0; int prevBoundary=src; ! int minNoMaybeCP=minCompNoMaybeCP; for(;;) { ! // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, ! // or with (compYes && ccc==0) properties. ! int prevSrc; ! int c = 0; ! int norm16 = 0; ! for (;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; ! } else { ! prevSrc = src++; ! if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { ! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ! ++src; c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { ! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { ! --prevSrc; c=Character.toCodePoint(c2, (char)c); } } ! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } ! // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. ! // The current character is either a "noNo" (has a mapping) ! // or a "maybeYes" (combines backward) ! // or a "yesYes" with ccc!=0. ! // It is not a Hangul syllable or Jamo L because those have "yes" properties. ! ! int prevNorm16 = INERT; ! if (prevBoundary != prevSrc) { ! prevBoundary = prevSrc; ! if (!norm16HasCompBoundaryBefore(norm16)) { ! c = Character.codePointBefore(s, prevSrc); ! int n16 = getNorm16(c); ! if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { ! prevBoundary -= Character.charCount(c); ! prevNorm16 = n16; ! } ! } } if(isMaybeOrNonZeroCC(norm16)) { int cc=getCCFromYesOrMaybe(norm16); ! if (onlyContiguous /* FCC */ && cc != 0 && ! getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { ! // The [prevBoundary..prevSrc[ character ! // passed the quick check "yes && ccc==0" test ! // but is out of canonical order with the current combining mark. ! } else { ! // If !onlyContiguous (not FCC), then we ignore the tccc of ! // the previous character which passed the quick check "yes && ccc==0" test. ! for (;;) { ! if (norm16 < MIN_YES_YES_WITH_CC) { ! if (!doSpan) { ! qcResult = 1; ! } else { ! return prevBoundary << 1; // spanYes does not care to know it's "maybe" ! } ! } ! if (src == limit) { ! return (src<<1) | qcResult; // "yes" or "maybe" ! } ! int prevCC = cc; ! c = Character.codePointAt(s, src); ! norm16 = getNorm16(c); ! if (isMaybeOrNonZeroCC(norm16)) { ! cc = getCCFromYesOrMaybe(norm16); ! if (!(prevCC <= cc || cc == 0)) { ! break; ! } } else { ! break; } + src += Character.charCount(c); } + // src is after the last in-order combining mark. + if (isCompYesAndZeroCC(norm16)) { + prevBoundary = src; + src += Character.charCount(c); continue; } } + } return prevBoundary<<1; // "no" } } public void composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { ! int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); if(0!=firstStarterInSrc) { int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), ! buffer.length(), onlyContiguous); StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ firstStarterInSrc+16); middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastStarterInDest); middle.append(s, 0, firstStarterInSrc);
*** 1101,1111 **** compose(s, src, limit, onlyContiguous, true, buffer); } else { buffer.append(s, src, limit); } } - // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { // Note: In this function we use buffer->appendZeroCC() because we track --- 1104,1113 ----
*** 1123,1133 **** int fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { ! if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { prevFCD16=~c; ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src; --- 1125,1135 ---- int fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { ! if((c=s.charAt(src))<minLcccCP) { prevFCD16=~c; ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src;
*** 1162,1177 **** break; } prevBoundary=src; // We know that the previous character's lccc==0. if(prevFCD16<0) { ! // Fetching the fcd16 value was deferred for this below-U+0300 code point. int prev=~prevFCD16; ! prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); if(prevFCD16>1) { --prevBoundary; } } else { int p=src-1; if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && Character.isHighSurrogate(s.charAt(p-1)) ) { --- 1164,1183 ---- break; } prevBoundary=src; // We know that the previous character's lccc==0. if(prevFCD16<0) { ! // Fetching the fcd16 value was deferred for this below-minLcccCP code point. int prev=~prevFCD16; ! if(prev<minDecompNoCP) { ! prevFCD16=0; ! } else { ! prevFCD16=getFCD16FromNormData(prev); if(prevFCD16>1) { --prevBoundary; } + } } else { int p=src-1; if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && Character.isHighSurrogate(s.charAt(p-1)) ) {
*** 1226,1291 **** src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ ! decomposeShort(s, prevBoundary, src, buffer); prevBoundary=src; prevFCD16=0; } } return src; } ! // Note: hasDecompBoundary() could be implemented as aliases to ! // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() ! // at the cost of building the FCD trie for a decomposition normalizer. ! public boolean hasDecompBoundary(int c, boolean before) { ! for(;;) { ! if(c<minDecompNoCP) { ! return true; } ! int norm16=getNorm16(c); ! if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { return true; ! } else if(norm16>MIN_NORMAL_MAYBE_YES) { ! return false; // ccc!=0 ! } else if(isDecompNoAlgorithmic(norm16)) { ! c=mapAlgorithmic(c, norm16); ! } else { // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! return false; } ! if(!before) { // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary - } // true if leadCC==0 (hasFCDBoundaryBefore()) ! return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; ! } ! } } public boolean hasCompBoundaryBefore(int c) { ! return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } ! private boolean isHangul(int norm16) { return norm16==minYesNo; } private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } - // UBool isCompYes(uint16_t norm16) const { // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; // } // UBool isCompYesOrMaybe(uint16_t norm16) const { // return norm16<minNoNo || minMaybeYes<=norm16; --- 1232,1320 ---- src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ ! decomposeShort(s, prevBoundary, src, false, false, buffer); prevBoundary=src; prevFCD16=0; } } return src; } ! public boolean hasDecompBoundaryBefore(int c) { ! return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || ! norm16HasDecompBoundaryBefore(getNorm16(c)); } ! public boolean norm16HasDecompBoundaryBefore(int norm16) { ! if (norm16 < minNoNoCompNoMaybeCC) { return true; ! } ! if (norm16 >= limitNoNo) { ! return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; ! } // c decomposes, get everything from the variable-length extra data ! int mapping=norm16>>OFFSET_SHIFT; ! int firstUnit=extraData.charAt(mapping); ! // true if leadCC==0 (hasFCDBoundaryBefore()) ! return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; ! } ! public boolean hasDecompBoundaryAfter(int c) { ! if (c < minDecompNoCP) { ! return true; ! } ! if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { ! return true; } ! return norm16HasDecompBoundaryAfter(getNorm16(c)); ! } ! public boolean norm16HasDecompBoundaryAfter(int norm16) { ! if(norm16 <= minYesNo || isHangulLVT(norm16)) { ! return true; ! } ! if (norm16 >= limitNoNo) { ! if (isMaybeOrNonZeroCC(norm16)) { ! return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; ! } ! // Maps to an isCompYesAndZeroCC. ! return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; ! } ! // c decomposes, get everything from the variable-length extra data ! int mapping=norm16>>OFFSET_SHIFT; ! int firstUnit=extraData.charAt(mapping); // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary // true if leadCC==0 (hasFCDBoundaryBefore()) ! return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; } + public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } public boolean hasCompBoundaryBefore(int c) { ! return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); ! } ! public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { ! return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } + private static boolean isInert(int norm16) { return norm16==INERT; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } ! private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } ! private boolean isHangulLV(int norm16) { return norm16==minYesNo; } ! private boolean isHangulLVT(int norm16) { ! return norm16==hangulLVT(); ! } private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } // UBool isCompYes(uint16_t norm16) const { // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; // } // UBool isCompYesOrMaybe(uint16_t norm16) const { // return norm16<minNoNo || minMaybeYes<=norm16;
*** 1296,1426 **** private boolean isDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==JAMO_VT || (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); } - /** * A little faster and simpler than isDecompYesAndZeroCC() but does not include * the MaybeYes which combine-forward and have ccc=0. ! * (Standard Unicode 5.2 normalization does not have such characters.) */ private boolean isMostDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; } - private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { ! // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; // } private int getCCFromNoNo(int norm16) { ! if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! return extraData.charAt(norm16-1)&0xff; } else { return 0; } } ! ! // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() ! int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { ! int c; ! if(cpStart==(cpLimit-1)) { ! c=s.charAt(cpStart); ! } else { ! c=Character.codePointAt(s, cpStart); ! } ! int prevNorm16=getNorm16(c); ! if(prevNorm16<=minYesNo) { ! return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 } else { ! return extraData.charAt(prevNorm16)>>8; // tccc from yesNo } } // Requires algorithmic-NoNo. private int mapAlgorithmic(int c, int norm16) { ! return c+norm16-(minMaybeYes-MAX_DELTA-1); } // Requires minYesNo<norm16<limitNoNo. ! // private int getMapping(int norm16) { return /*extraData+*/norm16; } /** * @return index into maybeYesCompositions, or -1 */ private int getCompositionsListForDecompYes(int norm16) { ! if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { return -1; } else { if((norm16-=minMaybeYes)<0) { // norm16<minMaybeYes: index into extraData which is a substring at // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list } ! return norm16; } } - /** * @return index into maybeYesCompositions */ private int getCompositionsListForComposite(int norm16) { ! // composite has both mapping & compositions list ! int firstUnit=extraData.charAt(norm16); ! return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions ! 1+ // +1 to skip the first unit with the mapping lenth (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. // Public in Java for collation implementation code. ! public void decomposeShort(CharSequence s, int src, int limit, ReorderingBuffer buffer) { while(src<limit) { int c=Character.codePointAt(s, src); src+=Character.charCount(c); ! decompose(c, getNorm16(c), buffer); } } ! ! private void decompose(int c, int norm16, ! ReorderingBuffer buffer) { ! // Only loops for 1:1 algorithmic mappings. ! for(;;) { // get the decomposition and the lead and trail cc's ! if(isDecompYes(norm16)) { ! // c does not decompose buffer.append(c, getCCFromYesOrMaybe(norm16)); ! } else if(isHangul(norm16)) { ! // Hangul syllable: decompose algorithmically ! Hangul.decompose(c, buffer); ! } else if(isDecompNoAlgorithmic(norm16)) { c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); ! continue; } else { // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; trailCC=firstUnit>>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! leadCC=extraData.charAt(norm16-1)>>8; } else { leadCC=0; } ! ++norm16; // skip over the firstUnit ! buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); ! } ! return; } } /** * Finds the recomposition result for --- 1325,1459 ---- private boolean isDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==JAMO_VT || (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); } /** * A little faster and simpler than isDecompYesAndZeroCC() but does not include * the MaybeYes which combine-forward and have ccc=0. ! * (Standard Unicode 10 normalization does not have such characters.) */ private boolean isMostDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; } private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { ! // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; // } private int getCCFromNoNo(int norm16) { ! int mapping=norm16>>OFFSET_SHIFT; ! if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! return extraData.charAt(mapping-1)&0xff; } else { return 0; } } ! int getTrailCCFromCompYesAndZeroCC(int norm16) { ! if(norm16<=minYesNo) { ! return 0; // yesYes and Hangul LV have ccc=tccc=0 } else { ! // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. ! return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo } } // Requires algorithmic-NoNo. private int mapAlgorithmic(int c, int norm16) { ! return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; } // Requires minYesNo<norm16<limitNoNo. ! // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } /** * @return index into maybeYesCompositions, or -1 */ private int getCompositionsListForDecompYes(int norm16) { ! if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { return -1; } else { if((norm16-=minMaybeYes)<0) { // norm16<minMaybeYes: index into extraData which is a substring at // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list } ! return norm16>>OFFSET_SHIFT; } } /** * @return index into maybeYesCompositions */ private int getCompositionsListForComposite(int norm16) { ! // A composite has both mapping & compositions list. ! int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; ! int firstUnit=maybeYesCompositions.charAt(list); ! return list+ // mapping in maybeYesCompositions ! 1+ // +1 to skip the first unit with the mapping length (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. // Public in Java for collation implementation code. ! private int decomposeShort( ! CharSequence s, int src, int limit, ! boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer) { while(src<limit) { int c=Character.codePointAt(s, src); + if (stopAtCompBoundary && c < minCompNoMaybeCP) { + return src; + } + int norm16 = getNorm16(c); + if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { + return src; + } src+=Character.charCount(c); ! decompose(c, norm16, buffer); ! if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { ! return src; } } ! return src; ! } ! private void decompose(int c, int norm16, ReorderingBuffer buffer) { // get the decomposition and the lead and trail cc's ! if (norm16 >= limitNoNo) { ! if (isMaybeOrNonZeroCC(norm16)) { buffer.append(c, getCCFromYesOrMaybe(norm16)); ! return; ! } ! // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); ! } ! if (norm16 < minYesNo) { ! // c does not decompose ! buffer.append(c, 0); ! } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { ! // Hangul syllable: decompose algorithmically ! Hangul.decompose(c, buffer); } else { // c decomposes, get everything from the variable-length extra data ! int mapping=norm16>>OFFSET_SHIFT; ! int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; trailCC=firstUnit>>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! leadCC=extraData.charAt(mapping-1)>>8; } else { leadCC=0; } ! ++mapping; // skip over the firstUnit ! buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); } } /** * Finds the recomposition result for
*** 1455,1465 **** while(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { ! return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else { --- 1488,1498 ---- while(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { ! return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else {
*** 1531,1541 **** if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked ! (prevCC<cc || prevCC==0)) { if(isJamoVT(norm16)) { // c is a Jamo V/T, see if we can compose it with the previous character. if(c<Hangul.JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); --- 1564,1575 ---- if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked ! (prevCC<cc || prevCC==0) ! ) { if(isJamoVT(norm16)) { // c is a Jamo V/T, see if we can compose it with the previous character. if(c<Hangul.JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
*** 1652,1715 **** * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes * (isCompYesAndZeroCC()) so we need not decompose. */ private boolean hasCompBoundaryBefore(int c, int norm16) { ! for(;;) { ! if(isCompYesAndZeroCC(norm16)) { ! return true; ! } else if(isMaybeOrNonZeroCC(norm16)) { ! return false; ! } else if(isDecompNoAlgorithmic(norm16)) { ! c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); ! } else { ! // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! return false; } ! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { ! return false; // non-zero leadCC } ! return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); } } } ! private int findPreviousCompBoundary(CharSequence s, int p) { while(p>0) { int c=Character.codePointBefore(s, p); p-=Character.charCount(c); ! if(hasCompBoundaryBefore(c)) { break; } - // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, - // but that's probably not worth the extra cost. } return p; } ! ! private int findNextCompBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); int norm16=normTrie.get(c); if(hasCompBoundaryBefore(c, norm16)) { break; } p+=Character.charCount(c); } return p; } private int findNextFCDBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); ! if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { break; } p+=Character.charCount(c); } return p; } /** --- 1686,1757 ---- * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes * (isCompYesAndZeroCC()) so we need not decompose. */ private boolean hasCompBoundaryBefore(int c, int norm16) { ! return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); } ! private boolean norm16HasCompBoundaryBefore(int norm16) { ! return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); } ! private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { ! return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); } + private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { + return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && + (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); } + private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { + return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); + } + /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ + private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { + return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? + (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); } ! private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { while(p>0) { int c=Character.codePointBefore(s, p); + int norm16 = getNorm16(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } p-=Character.charCount(c); ! if(hasCompBoundaryBefore(c, norm16)) { break; } } return p; } ! private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { while(p<limit) { int c=Character.codePointAt(s, p); int norm16=normTrie.get(c); if(hasCompBoundaryBefore(c, norm16)) { break; } p+=Character.charCount(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } } return p; } + private int findNextFCDBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); ! int norm16; ! if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { break; } p+=Character.charCount(c); + if (norm16HasDecompBoundaryAfter(norm16)) { + break; + } } return p; } /**
*** 1988,1998 **** } // we know the cc of the last code point return trailCC; } - /** * merge two UTF-16 string parts together * to canonically order (order by combining classes) their concatenation * * the two strings may already be adjacent, so that the merging is done --- 2030,2039 ----
*** 2072,2082 **** prevArgs.current = ncArgs.limit; return getPrevCC(prevArgs); } } - private static final class PrevArgs{ char[] src; int start; int current; char c1; --- 2113,2122 ----
*** 2088,2098 **** --- 2128,2156 ---- int next; int limit; char c1; char c2; } + private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { + args.c1=args.source[args.next++]; + args.c2=0; + if (UTF16.isTrailSurrogate(args.c1)) { + /* unpaired second surrogate */ + return 0; + } else if (!UTF16.isLeadSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.next!=args.limit && + UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ + ++args.next; + return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); + } else { + /* unpaired first surrogate */ + args.c2=0; + return 0; + } + } private static int /*unsigned*/ getPrevCC(PrevArgs args) { args.c1=args.src[--args.current]; args.c2=0; if (args.c1 < MIN_CCC_LCCC_CP) {
*** 2111,2155 **** args.c2=0; return 0; } } ! private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { ! args.c1=args.source[args.next++]; ! args.c2=0; ! ! if (UTF16.isTrailSurrogate(args.c1)) { ! /* unpaired second surrogate */ ! return 0; ! } else if (!UTF16.isLeadSurrogate(args.c1)) { ! return UCharacter.getCombiningClass(args.c1); ! } else if (args.next!=args.limit && ! UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ ! ++args.next; ! return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); ! } else { ! /* unpaired first surrogate */ ! args.c2=0; return 0; } } private VersionInfo dataVersion; ! // Code point thresholds for quick check codes. private int minDecompNoCP; private int minCompNoMaybeCP; // Norm16 value thresholds for quick check combinations and types of extra data. private int minYesNo; private int minYesNoMappingsOnly; private int minNoNo; private int limitNoNo; private int minMaybeYes; private Trie2_16 normTrie; private String maybeYesCompositions; private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 - private int[] tccc180; // [0x180] tccc values for U+0000..U+017F ! } --- 2169,2204 ---- args.c2=0; return 0; } } ! private int getPreviousTrailCC(CharSequence s, int start, int p) { ! if (start == p) { return 0; } + return getFCD16(Character.codePointBefore(s, p)); } private VersionInfo dataVersion; ! // BMP code point thresholds for quick check loops looking at single UTF-16 code units. private int minDecompNoCP; private int minCompNoMaybeCP; + private int minLcccCP; // Norm16 value thresholds for quick check combinations and types of extra data. private int minYesNo; private int minYesNoMappingsOnly; private int minNoNo; + private int minNoNoCompBoundaryBefore; + private int minNoNoCompNoMaybeCC; + private int minNoNoEmpty; private int limitNoNo; + private int centerNoNoDelta; private int minMaybeYes; private Trie2_16 normTrie; private String maybeYesCompositions; private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 ! }
< prev index next >