< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 20,2549 **** * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* ! * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * ! * * ! * The original version of this source code and documentation is copyrighted * ! * and owned by IBM, These materials are provided under terms of a License * ! * Agreement between IBM and Sun. This technology is protected by multiple * ! * US and International patents. This notice and attribution to IBM may not * ! * to removed. * ******************************************************************************* */ package sun.text.normalizer; - import java.io.BufferedInputStream; - import java.io.ByteArrayInputStream; import java.io.IOException; ! import java.io.BufferedInputStream; ! import java.io.InputStream; ! /** ! * @author Ram Viswanadha ! */ public final class NormalizerImpl { - // Static block for the class to initialize its own self - static final NormalizerImpl IMPL; - - static - { - try - { - IMPL = new NormalizerImpl(); - } - catch (Exception e) - { - throw new RuntimeException(e.getMessage()); - } - } - - static final int UNSIGNED_BYTE_MASK =0xFF; - static final long UNSIGNED_INT_MASK = 0xffffffffL; - /* - * This new implementation of the normalization code loads its data from - * unorm.icu, which is generated with the gennorm tool. - * The format of that file is described at the end of this file. - */ - private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu"; - - // norm32 value constants - - // quick check flags 0..3 set mean "no" for their forms - public static final int QC_NFC=0x11; /* no|maybe */ - public static final int QC_NFKC=0x22; /* no|maybe */ - public static final int QC_NFD=4; /* no */ - public static final int QC_NFKD=8; /* no */ - - public static final int QC_ANY_NO=0xf; - - /* quick check flags 4..5 mean "maybe" for their forms; - * test flags>=QC_MAYBE - */ - public static final int QC_MAYBE=0x10; - public static final int QC_ANY_MAYBE=0x30; - - public static final int QC_MASK=0x3f; - - private static final int COMBINES_FWD=0x40; - private static final int COMBINES_BACK=0x80; - public static final int COMBINES_ANY=0xc0; - // UnicodeData.txt combining class in bits 15. - private static final int CC_SHIFT=8; - public static final int CC_MASK=0xff00; - // 16 bits for the index to UChars and other extra data - private static final int EXTRA_SHIFT=16; - - /* norm32 value constants using >16 bits */ - private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; - private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; - private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; - // private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK; - private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; - - - /* indexes[] value names */ - /* number of bytes in normalization trie */ - static final int INDEX_TRIE_SIZE = 0; - /* number of chars in extra data */ - static final int INDEX_CHAR_COUNT = 1; - /* number of uint16_t words for combining data */ - static final int INDEX_COMBINE_DATA_COUNT = 2; - /* first code point with quick check NFC NO/MAYBE */ - public static final int INDEX_MIN_NFC_NO_MAYBE = 6; - /* first code point with quick check NFKC NO/MAYBE */ - public static final int INDEX_MIN_NFKC_NO_MAYBE = 7; - /* first code point with quick check NFD NO/MAYBE */ - public static final int INDEX_MIN_NFD_NO_MAYBE = 8; - /* first code point with quick check NFKD NO/MAYBE */ - public static final int INDEX_MIN_NFKD_NO_MAYBE = 9; - /* number of bytes in FCD trie */ - static final int INDEX_FCD_TRIE_SIZE = 10; - /* number of bytes in the auxiliary trie */ - static final int INDEX_AUX_TRIE_SIZE = 11; - /* changing this requires a new formatVersion */ - static final int INDEX_TOP = 32; - - - /* AUX constants */ - /* value constants for auxTrie */ - private static final int AUX_UNSAFE_SHIFT = 11; - private static final int AUX_COMP_EX_SHIFT = 10; - private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12; - - private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT; - private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK); - private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK); - private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK); - private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT); - - private static final int MAX_BUFFER_SIZE = 20; - - /*******************************/ - - /* Wrappers for Trie implementations */ - static final class NormTrieImpl implements Trie.DataManipulate{ - static IntTrie normTrie= null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. - * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ - public int getFoldingOffset(int value){ - return BMP_INDEX_LENGTH+ - ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))& - (0x3ff<<SURROGATE_BLOCK_BITS)); - } - - } - static final class FCDTrieImpl implements Trie.DataManipulate{ - static CharTrie fcdTrie=null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. - * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* fcdTrie: the folding offset is the lead FCD value itself */ - public int getFoldingOffset(int value){ - return value; - } - } - - static final class AuxTrieImpl implements Trie.DataManipulate{ - static CharTrie auxTrie = null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. - * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */ - public int getFoldingOffset(int value){ - return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS; - } - } - - /****************************************************/ - - - private static FCDTrieImpl fcdTrieImpl; - private static NormTrieImpl normTrieImpl; - private static AuxTrieImpl auxTrieImpl; - private static int[] indexes; - private static char[] combiningTable; - private static char[] extraData; - - private static boolean isDataLoaded; - private static boolean isFormatVersion_2_1; - private static boolean isFormatVersion_2_2; - private static byte[] unicodeVersion; - - /** - * Default buffer size of datafile - */ - private static final int DATA_BUFFER_SIZE = 25000; - - /** - * FCD check: everything below this code point is known to have a 0 - * lead combining class - */ - public static final int MIN_WITH_LEAD_CC=0x300; - - - /** - * Bit 7 of the length byte for a decomposition string in extra data is - * a flag indicating whether the decomposition string is - * preceded by a 16-bit word with the leading and trailing cc - * of the decomposition (like for A-umlaut); - * if not, then both cc's are zero (like for compatibility ideographs). - */ - private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80; - /** - * Bits 6..0 of the length byte contain the actual length. - */ - private static final int DECOMP_LENGTH_MASK=0x7f; - - /** Length of the BMP portion of the index (stage 1) array. */ - private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_; - /** Number of bits of a trail surrogate that are used in index table - * lookups. - */ - private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_; - - - // public utility - public static int getFromIndexesArr(int index){ - return indexes[index]; - } - - // protected constructor --------------------------------------------- - - /** - * Constructor - * @exception thrown when data reading fails or data corrupted - */ - private NormalizerImpl() throws IOException { - //data should be loaded only once - if(!isDataLoaded){ - - // jar access - InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME); - BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE); - NormalizerDataReader reader = new NormalizerDataReader(b); - - // read the indexes - indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP); - - byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]]; - - int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT]; - combiningTable = new char[combiningTableTop]; - - int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT]; - extraData = new char[extraDataTop]; - - byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]]; - byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]]; - - fcdTrieImpl = new FCDTrieImpl(); - normTrieImpl = new NormTrieImpl(); - auxTrieImpl = new AuxTrieImpl(); - - // load the rest of the data data and initialize the data members - reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable); - - NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl ); - FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl ); - AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl ); - - // we reached here without any exceptions so the data is fully - // loaded set the variable to true - isDataLoaded = true; - - // get the data format version - byte[] formatVersion = reader.getDataFormatVersion(); - - isFormatVersion_2_1 =( formatVersion[0]>2 - || - (formatVersion[0]==2 && formatVersion[1]>=1) - ); - isFormatVersion_2_2 =( formatVersion[0]>2 - || - (formatVersion[0]==2 && formatVersion[1]>=2) - ); - unicodeVersion = reader.getUnicodeVersion(); - b.close(); - } - } - - /* ---------------------------------------------------------------------- */ /* Korean Hangul and Jamo constants */ - public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ public static final int HANGUL_BASE=0xac00; public static final int JAMO_L_COUNT=19; public static final int JAMO_V_COUNT=21; ! public static final int JAMO_T_COUNT=28; ! public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; ! ! private static boolean isHangulWithoutJamoT(char c) { ! c-=HANGUL_BASE; ! return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; ! } ! ! /* norm32 helpers */ ! ! /* is this a norm32 with a regular index? */ ! private static boolean isNorm32Regular(long norm32) { ! return norm32<MIN_SPECIAL; ! } ! ! /* is this a norm32 with a special index for a lead surrogate? */ ! private static boolean isNorm32LeadSurrogate(long norm32) { ! return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP; ! } ! ! /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */ ! private static boolean isNorm32HangulOrJamo(long norm32) { ! return norm32>=MIN_HANGUL; ! } ! ! /* ! * Given norm32 for Jamo V or T, ! * is this a Jamo V? ! */ ! private static boolean isJamoVTNorm32JamoV(long norm32) { ! return norm32<JAMO_V_TOP; ! } ! ! /* data access primitives ----------------------------------------------- */ ! ! public static long/*unsigned*/ getNorm32(char c) { ! return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c))); ! } ! ! public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32, ! char c2) { ! /* ! * the surrogate index in norm32 stores only the number of the surrogate ! * index block see gennorm/store.c/getFoldedNormValue() ! */ ! return ((UNSIGNED_INT_MASK) & ! NormTrieImpl.normTrie.getTrailValue((int)norm32, c2)); ! } ! ///CLOVER:OFF ! private static long getNorm32(int c){ ! return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c))); ! } ! ! /* ! * get a norm32 from text with complete code points ! * (like from decompositions) ! */ ! private static long/*unsigned*/ getNorm32(char[] p,int start, ! int/*unsigned*/ mask) { ! long/*unsigned*/ norm32= getNorm32(p[start]); ! if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) { ! /* *p is a lead surrogate, get the real norm32 */ ! norm32=getNorm32FromSurrogatePair(norm32, p[start+1]); ! } ! return norm32; ! } ! ! //// for StringPrep ! public static VersionInfo getUnicodeVersion(){ ! return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1], ! unicodeVersion[2], unicodeVersion[3]); ! } ! ! public static char getFCD16(char c) { ! return FCDTrieImpl.fcdTrie.getLeadValue(c); ! } ! ! public static char getFCD16FromSurrogatePair(char fcd16, char c2) { ! /* the surrogate index in fcd16 is an absolute offset over the ! * start of stage 1 ! * */ ! return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2); ! } ! public static int getFCD16(int c) { ! return FCDTrieImpl.fcdTrie.getCodePointValue(c); ! } ! ! private static int getExtraDataIndex(long norm32) { ! return (int)(norm32>>EXTRA_SHIFT); ! } ! ! private static final class DecomposeArgs{ ! int /*unsigned byte*/ cc; ! int /*unsigned byte*/ trailCC; ! int length; ! } ! /** ! * ! * get the canonical or compatibility decomposition for one character ! * ! * @return index into the extraData array ! */ ! private static int/*index*/ decompose(long/*unsigned*/ norm32, ! int/*unsigned*/ qcMask, ! DecomposeArgs args) { ! int p= getExtraDataIndex(norm32); ! args.length=extraData[p++]; ! ! if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) { ! /* use compatibility decomposition, skip canonical data */ ! p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK); ! args.length>>=8; ! } ! ! if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { ! /* get the lead and trail cc's */ ! char bothCCs=extraData[p++]; ! args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); ! args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; ! } else { ! /* lead and trail cc's are both 0 */ ! args.cc=args.trailCC=0; ! } ! ! args.length&=DECOMP_LENGTH_MASK; ! return p; ! } ! ! ! /** ! * get the canonical decomposition for one character ! * @return index into the extraData array ! */ ! private static int decompose(long/*unsigned*/ norm32, ! DecomposeArgs args) { ! ! int p= getExtraDataIndex(norm32); ! args.length=extraData[p++]; ! ! if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { ! /* get the lead and trail cc's */ ! char bothCCs=extraData[p++]; ! args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); ! args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; ! } else { ! /* lead and trail cc's are both 0 */ ! args.cc=args.trailCC=0; ! } ! ! args.length&=DECOMP_LENGTH_MASK; ! return p; ! } ! ! ! private static final class NextCCArgs{ ! char[] source; ! int next; ! int limit; ! char c; ! char c2; ! } ! ! /* ! * get the combining class of (c, c2)= args.source[args.next++] ! * before: args.next<args.limit after: args.next<=args.limit ! * if only one code unit is used, then c2==0 ! */ ! private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { ! long /*unsigned*/ norm32; ! ! args.c=args.source[args.next++]; ! ! norm32= getNorm32(args.c); ! if((norm32 & CC_MASK)==0) { ! args.c2=0; ! return 0; ! } else { ! if(!isNorm32LeadSurrogate(norm32)) { ! args.c2=0; ! } else { ! /* c is a lead surrogate, get the real norm32 */ ! if(args.next!=args.limit && ! UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ ! ++args.next; ! norm32=getNorm32FromSurrogatePair(norm32, args.c2); ! } else { ! args.c2=0; ! return 0; ! } ! } ! ! return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); ! } ! } ! ! private static final class PrevArgs{ ! char[] src; ! int start; ! int current; ! char c; ! char c2; ! } ! ! /* ! * read backwards and get norm32 ! * return 0 if the character is <minC ! * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first ! * surrogate but read second!) ! */ ! private static long /*unsigned*/ getPrevNorm32(PrevArgs args, ! int/*unsigned*/ minC, ! int/*unsigned*/ mask) { ! long/*unsigned*/ norm32; ! ! args.c=args.src[--args.current]; ! args.c2=0; ! ! /* check for a surrogate before getting norm32 to see if we need to ! * predecrement further ! */ ! if(args.c<minC) { ! return 0; ! } else if(!UTF16.isSurrogate(args.c)) { ! return getNorm32(args.c); ! } else if(UTF16.isLeadSurrogate(args.c)) { ! /* unpaired first surrogate */ ! return 0; ! } else if(args.current!=args.start && ! UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { ! --args.current; ! norm32=getNorm32(args.c2); ! ! if((norm32&mask)==0) { ! /* all surrogate pairs with this lead surrogate have ! * only irrelevant data ! */ ! return 0; ! } else { ! /* norm32 must be a surrogate special */ ! return getNorm32FromSurrogatePair(norm32, args.c); ! } ! } else { ! /* unpaired second surrogate */ ! args.c2=0; ! return 0; ! } ! } ! ! /* ! * get the combining class of (c, c2)=*--p ! * before: start<p after: start<=p ! */ ! private static int /*unsigned byte*/ getPrevCC(PrevArgs args) { ! ! return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC, ! CC_MASK)>>CC_SHIFT)); ! } ! ! /* ! * is this a safe boundary character for NF*D? ! * (lead cc==0) ! */ ! public static boolean isNFDSafe(long/*unsigned*/ norm32, ! int/*unsigned*/ccOrQCMask, ! int/*unsigned*/ decompQCMask) { ! if((norm32&ccOrQCMask)==0) { ! return true; /* cc==0 and no decomposition: this is NF*D safe */ ! } ! ! /* inspect its decomposition - maybe a Hangul but not a surrogate here*/ ! if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { ! DecomposeArgs args=new DecomposeArgs(); ! /* decomposes, get everything from the variable-length extra data */ ! decompose(norm32, decompQCMask, args); ! return args.cc==0; ! } else { ! /* no decomposition (or Hangul), test the cc directly */ ! return (norm32&CC_MASK)==0; ! } ! } ! ! /* ! * is this (or does its decomposition begin with) a "true starter"? ! * (cc==0 and NF*C_YES) ! */ ! public static boolean isTrueStarter(long/*unsigned*/ norm32, ! int/*unsigned*/ ccOrQCMask, ! int/*unsigned*/ decompQCMask) { ! if((norm32&ccOrQCMask)==0) { ! return true; /* this is a true starter (could be Hangul or Jamo L)*/ ! } ! ! /* inspect its decomposition - not a Hangul or a surrogate here */ ! if((norm32&decompQCMask)!=0) { ! int p; /* index into extra data array */ ! DecomposeArgs args=new DecomposeArgs(); ! /* decomposes, get everything from the variable-length extra data */ ! p=decompose(norm32, decompQCMask, args); ! ! if(args.cc==0) { ! int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK; ! ! /* does it begin with NFC_YES? */ ! if((getNorm32(extraData,p, qcMask)&qcMask)==0) { ! /* yes, the decomposition begins with a true starter */ ! return true; ! } ! } ! } ! return false; ! } ! ! /* reorder UTF-16 in-place ---------------------------------------------- */ ! ! /** ! * simpler, single-character version of mergeOrdered() - ! * bubble-insert one single code point into the preceding string ! * which is already canonically ordered ! * (c, c2) may or may not yet have been inserted at src[current]..src[p] ! * ! * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) ! * ! * before: src[start]..src[current] is already ordered, and ! * src[current]..src[p] may or may not hold (c, c2) but ! * must be exactly the same length as (c, c2) ! * after: src[start]..src[p] is ordered ! * ! * @return the trailing combining class ! */ ! private static int/*unsigned byte*/ insertOrdered(char[] source, ! int start, ! int current, int p, ! char c, char c2, ! int/*unsigned byte*/ cc) { ! int back, preBack; ! int r; ! int prevCC, trailCC=cc; ! ! if(start<current && cc!=0) { ! // search for the insertion point where cc>=prevCC ! preBack=back=current; ! PrevArgs prevArgs = new PrevArgs(); ! prevArgs.current = current; ! prevArgs.start = start; ! prevArgs.src = source; ! // get the prevCC ! prevCC=getPrevCC(prevArgs); ! preBack = prevArgs.current; ! ! if(cc<prevCC) { ! // this will be the last code point, so keep its cc ! trailCC=prevCC; ! back=preBack; ! while(start<preBack) { ! prevCC=getPrevCC(prevArgs); ! preBack=prevArgs.current; ! if(cc>=prevCC) { ! break; ! } ! back=preBack; ! } ! ! ! // this is where we are right now with all these indicies: ! // [start]..[pPreBack] 0..? code points that we can ignore ! // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc ! // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) ! // [current]..[p] 1 code point (c, c2) with cc ! ! // move the code units in between up ! r=p; ! do { ! source[--r]=source[--current]; ! } while(back!=current); ! } ! } ! ! // insert (c, c2) ! source[current]=c; ! if(c2!=0) { ! source[(current+1)]=c2; ! } ! ! // we know the cc of the last code point ! return trailCC; ! } ! ! /** ! * merge two UTF-16 string parts together ! * to canonically order (order by combining classes) their concatenation ! * ! * the two strings may already be adjacent, so that the merging is done ! * in-place if the two strings are not adjacent, then the buffer holding the ! * first one must be large enough ! * the second string may or may not be ordered in itself ! * ! * before: [start]..[current] is already ordered, and ! * [next]..[limit] may be ordered in itself, but ! * is not in relation to [start..current[ ! * after: [start..current+(limit-next)[ is ordered ! * ! * the algorithm is a simple bubble-sort that takes the characters from ! * src[next++] and inserts them in correct combining class order into the ! * preceding part of the string ! * ! * since this function is called much less often than the single-code point ! * insertOrdered(), it just uses that for easier maintenance ! * ! * @return the trailing combining class ! */ ! private static int /*unsigned byte*/ mergeOrdered(char[] source, ! int start, ! int current, ! char[] data, ! int next, ! int limit, ! boolean isOrdered) { ! int r; ! int /*unsigned byte*/ cc, trailCC=0; ! boolean adjacent; ! ! adjacent= current==next; ! NextCCArgs ncArgs = new NextCCArgs(); ! ncArgs.source = data; ! ncArgs.next = next; ! ncArgs.limit = limit; ! ! if(start!=current || !isOrdered) { ! ! while(ncArgs.next<ncArgs.limit) { ! cc=getNextCC(ncArgs); ! if(cc==0) { ! // does not bubble back ! trailCC=0; ! if(adjacent) { ! current=ncArgs.next; ! } else { ! data[current++]=ncArgs.c; ! if(ncArgs.c2!=0) { ! data[current++]=ncArgs.c2; } } ! if(isOrdered) { ! break; } else { ! start=current; } ! } else { ! r=current+(ncArgs.c2==0 ? 1 : 2); ! trailCC=insertOrdered(source,start, current, r, ! ncArgs.c, ncArgs.c2, cc); ! current=r; } } } ! if(ncArgs.next==ncArgs.limit) { ! // we know the cc of the last code point ! return trailCC; } else { ! if(!adjacent) { ! // copy the second string part ! do { ! source[current++]=data[ncArgs.next++]; ! } while(ncArgs.next!=ncArgs.limit); ! ncArgs.limit=current; } ! PrevArgs prevArgs = new PrevArgs(); ! prevArgs.src = data; ! prevArgs.start = start; ! prevArgs.current = ncArgs.limit; ! return getPrevCC(prevArgs); } ! } - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - final int next, - final int limit) { - return mergeOrdered(source,start,current,data,next,limit,true); - } - - public static NormalizerBase.QuickCheckResult quickCheck(char[] src, - int srcStart, - int srcLimit, - int minNoMaybe, - int qcMask, - int options, - boolean allowMaybe, - UnicodeSet nx){ - - int ccOrQCMask; - long norm32; - char c, c2; - char cc, prevCC; - long qcNorm32; - NormalizerBase.QuickCheckResult result; - ComposePartArgs args = new ComposePartArgs(); - char[] buffer ; - int start = srcStart; - - if(!isDataLoaded) { - return NormalizerBase.MAYBE; } - // initialize - ccOrQCMask=CC_MASK|qcMask; - result=NormalizerBase.YES; - prevCC=0; ! for(;;) { ! for(;;) { ! if(srcStart==srcLimit) { ! return result; ! } else if((c=src[srcStart++])>=minNoMaybe && ! (( norm32=getNorm32(c)) & ccOrQCMask)!=0) { ! break; ! } ! prevCC=0; ! } ! // check one above-minimum, relevant code unit ! if(isNorm32LeadSurrogate(norm32)) { ! // c is a lead surrogate, get the real norm32 ! if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) { ! ++srcStart; ! norm32=getNorm32FromSurrogatePair(norm32,c2); ! } else { ! norm32=0; ! c2=0; ! } ! }else{ ! c2=0; ! } ! if(nx_contains(nx, c, c2)) { ! /* excluded: norm32==0 */ ! norm32=0; ! } ! ! // check the combining order ! cc=(char)((norm32>>CC_SHIFT)&0xFF); ! if(cc!=0 && cc<prevCC) { ! return NormalizerBase.NO; } - prevCC=cc; - - // check for "no" or "maybe" quick check flags - qcNorm32 = norm32 & qcMask; - if((qcNorm32& QC_ANY_NO)>=1) { - result= NormalizerBase.NO; - break; - } else if(qcNorm32!=0) { - // "maybe" can only occur for NFC and NFKC - if(allowMaybe){ - result=NormalizerBase.MAYBE; - }else{ - // normalize a section around here to see if it is really - // normalized or not - int prevStarter; - int/*unsigned*/ decompQCMask; - - decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask ! // find the previous starter ! ! // set prevStarter to the beginning of the current character ! prevStarter=srcStart-1; ! if(UTF16.isTrailSurrogate(src[prevStarter])) { ! // safe because unpaired surrogates do not result ! // in "maybe" ! --prevStarter; } - prevStarter=findPreviousStarter(src, start, prevStarter, - ccOrQCMask, decompQCMask, - (char)minNoMaybe); - - // find the next true starter in [src..limit[ - modifies - // src to point to the next starter - srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, - decompQCMask,(char) minNoMaybe); - - //set the args for compose part - args.prevCC = prevCC; - - // decompose and recompose [prevStarter..src[ - buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx); ! // compare the normalized version with the original ! if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) { ! result=NormalizerBase.NO; // normalization differs ! break; } ! // continue after the next starter } } } ! return result; } - - - //------------------------------------------------------ - // make NFD & NFKD - //------------------------------------------------------ - - public static int decompose(char[] src,int srcStart,int srcLimit, - char[] dest,int destStart,int destLimit, - boolean compat,int[] outTrailCC, - UnicodeSet nx) { - - char[] buffer = new char[3]; - int prevSrc; - long norm32; - int ccOrQCMask, qcMask; - int reorderStartIndex, length; - char c, c2, minNoMaybe; - int/*unsigned byte*/ cc, prevCC, trailCC; - char[] p; - int pStart; - int destIndex = destStart; - int srcIndex = srcStart; - if(!compat) { - minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; - qcMask=QC_NFD; - } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=QC_NFKD; } - - /* initialize */ - ccOrQCMask=CC_MASK|qcMask; - reorderStartIndex=0; - prevCC=0; - norm32=0; - c=0; - pStart=0; - - cc=trailCC=-1;//initialize to bogus value - - for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check - */ - prevSrc=srcIndex; - - while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe || - ((norm32=getNorm32(c))&ccOrQCMask)==0)){ - prevCC=0; - ++srcIndex; } ! /* copy these code units all at once */ ! if(srcIndex!=prevSrc) { ! length=srcIndex-prevSrc; ! if((destIndex+length)<=destLimit) { ! System.arraycopy(src,prevSrc,dest,destIndex,length); } ! destIndex+=length; ! reorderStartIndex=destIndex; } ! /* end of source reached? */ ! if(srcIndex==srcLimit) { ! break; } ! ! /* c already contains *src and norm32 is set for it, increment src*/ ! ++srcIndex; ! ! /* check one above-minimum, relevant code unit */ ! /* ! * generally, set p and length to the decomposition string ! * in simple cases, p==NULL and (c, c2) will hold the length code ! * units to append in all cases, set cc to the lead and trailCC to ! * the trail combining class ! * ! * the following merge-sort of the current character into the ! * preceding, canonically ordered result text will use the ! * optimized insertOrdered() ! * if there is only one single code point to process; ! * this is indicated with p==NULL, and (c, c2) is the character to ! * insert ! * ((c, 0) for a BMP character and (lead surrogate, trail surrogate) ! * for a supplementary character) ! * otherwise, p[length] is merged in with _mergeOrdered() ! */ ! if(isNorm32HangulOrJamo(norm32)) { ! if(nx_contains(nx, c)) { ! c2=0; ! p=null; ! length=1; ! } else { ! // Hangul syllable: decompose algorithmically ! p=buffer; ! pStart=0; ! cc=trailCC=0; ! ! c-=HANGUL_BASE; ! ! c2=(char)(c%JAMO_T_COUNT); ! c/=JAMO_T_COUNT; ! if(c2>0) { ! buffer[2]=(char)(JAMO_T_BASE+c2); ! length=3; ! } else { ! length=2; } ! buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT); ! buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT); ! } ! } else { ! if(isNorm32Regular(norm32)) { ! c2=0; ! length=1; ! } else { ! // c is a lead surrogate, get the real norm32 ! if(srcIndex!=srcLimit && ! UTF16.isTrailSurrogate(c2=src[srcIndex])) { ! ++srcIndex; ! length=2; ! norm32=getNorm32FromSurrogatePair(norm32, c2); ! } else { ! c2=0; ! length=1; ! norm32=0; } } ! /* get the decomposition and the lead and trail cc's */ ! if(nx_contains(nx, c, c2)) { ! /* excluded: norm32==0 */ ! cc=trailCC=0; ! p=null; ! } else if((norm32&qcMask)==0) { ! /* c does not decompose */ ! cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); ! p=null; ! pStart=-1; ! } else { ! DecomposeArgs arg = new DecomposeArgs(); ! /* c decomposes, get everything from the variable-length ! * extra data ! */ ! pStart=decompose(norm32, qcMask, arg); ! p=extraData; ! length=arg.length; ! cc=arg.cc; ! trailCC=arg.trailCC; ! if(length==1) { ! /* fastpath a single code unit from decomposition */ ! c=p[pStart]; ! c2=0; ! p=null; ! pStart=-1; } } } ! /* append the decomposition to the destination buffer, assume ! * length>0 ! */ ! if((destIndex+length)<=destLimit) { ! int reorderSplit=destIndex; ! if(p==null) { ! /* fastpath: single code point */ ! if(cc!=0 && cc<prevCC) { ! /* (c, c2) is out of order with respect to the preceding ! * text */ ! destIndex+=length; ! trailCC=insertOrdered(dest,reorderStartIndex, ! reorderSplit, destIndex, c, c2, cc); } else { ! /* just append (c, c2) */ ! dest[destIndex++]=c; ! if(c2!=0) { ! dest[destIndex++]=c2; } } ! } else { ! /* general: multiple code points (ordered by themselves) ! * from decomposition ! */ ! if(cc!=0 && cc<prevCC) { ! /* the decomposition is out of order with respect to the ! * preceding text ! */ ! destIndex+=length; ! trailCC=mergeOrdered(dest,reorderStartIndex, ! reorderSplit,p, pStart,pStart+length); ! } else { ! /* just append the decomposition */ ! do { ! dest[destIndex++]=p[pStart++]; ! } while(--length>0); } } ! } else { ! /* buffer overflow */ ! /* keep incrementing the destIndex for preflighting */ ! destIndex+=length; } ! prevCC=trailCC; ! if(prevCC==0) { ! reorderStartIndex=destIndex; } } - - outTrailCC[0]=prevCC; - - return destIndex - destStart; } - - /* make NFC & NFKC ------------------------------------------------------ */ - private static final class NextCombiningArgs{ - char[] source; - int start; - //int limit; - char c; - char c2; - int/*unsigned*/ combiningIndex; - char /*unsigned byte*/ cc; } ! /* get the composition properties of the next character */ ! private static int /*unsigned*/ getNextCombining(NextCombiningArgs args, ! int limit, ! UnicodeSet nx) { ! long/*unsigned*/ norm32; ! int combineFlags; ! /* get properties */ ! args.c=args.source[args.start++]; ! norm32=getNorm32(args.c); ! ! /* preset output values for most characters */ ! args.c2=0; ! args.combiningIndex=0; ! args.cc=0; ! if((norm32&(CC_MASK|COMBINES_ANY))==0) { return 0; ! } else { ! if(isNorm32Regular(norm32)) { ! /* set cc etc. below */ ! } else if(isNorm32HangulOrJamo(norm32)) { ! /* a compatibility decomposition contained Jamos */ ! args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0| ! (norm32>>EXTRA_SHIFT))); ! return (int)(norm32&COMBINES_ANY); ! } else { ! /* c is a lead surrogate, get the real norm32 */ ! if(args.start!=limit && UTF16.isTrailSurrogate(args.c2= ! args.source[args.start])) { ! ++args.start; ! norm32=getNorm32FromSurrogatePair(norm32, args.c2); ! } else { ! args.c2=0; return 0; } } ! if(nx_contains(nx, args.c, args.c2)) { ! return 0; /* excluded: norm32==0 */ } ! args.cc= (char)((norm32>>CC_SHIFT)&0xff); ! combineFlags=(int)(norm32&COMBINES_ANY); ! if(combineFlags!=0) { ! int index = getExtraDataIndex(norm32); ! args.combiningIndex=index>0 ? extraData[(index-1)] :0; } ! return combineFlags; } } ! /* ! * given a composition-result starter (c, c2) - which means its cc==0, ! * it combines forward, it has extra data, its norm32!=0, ! * it is not a Hangul or Jamo, ! * get just its combineFwdIndex ! * ! * norm32(c) is special if and only if c2!=0 ! */ ! private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){ ! long/*unsigned*/ norm32; ! norm32=getNorm32(c); ! if(c2!=0) { ! norm32=getNorm32FromSurrogatePair(norm32, c2); } ! return extraData[(getExtraDataIndex(norm32)-1)]; } ! /* ! * Find the recomposition result for ! * a forward-combining character ! * (specified with a pointer to its part of the combiningTable[]) ! * and a backward-combining character ! * (specified with its combineBackIndex). ! * ! * If these two characters combine, then set (value, value2) ! * with the code unit(s) of the composition character. ! * ! * Return value: ! * 0 do not combine ! * 1 combine ! * >1 combine, and the composition is a forward-combining starter ! * ! * See unormimp.h for a description of the composition table format. ! */ ! private static int/*unsigned*/ combine(char[]table,int tableStart, ! int/*unsinged*/ combineBackIndex, ! int[] outValues) { ! int/*unsigned*/ key; ! int value,value2; ! if(outValues.length<2){ ! throw new IllegalArgumentException(); } ! /* search in the starter's composition table */ ! for(;;) { ! key=table[tableStart++]; ! if(key>=combineBackIndex) { ! break; } ! tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1; } ! /* mask off bit 15, the last-entry-in-the-list flag */ ! if((key&0x7fff)==combineBackIndex) { ! /* found! combine! */ ! value=table[tableStart]; ! ! /* is the composition a starter that combines forward? */ ! key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1)); ! /* get the composition result code point from the variable-length ! * result value ! */ ! if((value&0x8000) != 0) { ! if((value&0x4000) != 0) { ! /* surrogate pair composition result */ ! value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800)); ! value2=table[tableStart+1]; ! } else { ! /* BMP composition result U+2000..U+ffff */ ! value=table[tableStart+1]; ! value2=0; } ! } else { ! /* BMP composition result U+0000..U+1fff */ ! value&=0x1fff; ! value2=0; } - outValues[0]=value; - outValues[1]=value2; - return key; } else { ! /* not found */ ! return 0; } } ! private static final class RecomposeArgs{ ! char[] source; ! int start; ! int limit; } - /* - * recompose the characters in [p..limit[ - * (which is in NFD - decomposed and canonically ordered), - * adjust limit, and return the trailing cc - * - * since for NFKC we may get Jamos in decompositions, we need to - * recompose those too - * - * note that recomposition never lengthens the text: - * any character consists of either one or two code units; - * a composition may contain at most one more code unit than the original - * starter, while the combining mark that is removed has at least one code - * unit - */ - private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) { - int remove, q, r; - int /*unsigned*/ combineFlags; - int /*unsigned*/ combineFwdIndex, combineBackIndex; - int /*unsigned*/ result, value=0, value2=0; - int /*unsigned byte*/ prevCC; - boolean starterIsSupplementary; - int starter; - int[] outValues = new int[2]; - starter=-1; /* no starter */ - combineFwdIndex=0; /* will not be used until starter!=NULL */ - starterIsSupplementary=false; /* will not be used until starter!=NULL */ - prevCC=0; ! NextCombiningArgs ncArg = new NextCombiningArgs(); ! ncArg.source = args.source; ! ncArg.cc =0; ! ncArg.c2 =0; ! for(;;) { ! ncArg.start = args.start; ! combineFlags=getNextCombining(ncArg,args.limit,nx); ! combineBackIndex=ncArg.combiningIndex; ! args.start = ncArg.start; ! ! if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) { ! if((combineBackIndex&0x8000)!=0) { ! /* c is a Jamo V/T, see if we can compose it with the ! * previous character ! */ ! /* for the PRI #29 fix, check that there is no intervening combining mark */ ! if((options&BEFORE_PRI_29)!=0 || prevCC==0) { ! remove=-1; /* NULL while no Hangul composition */ ! combineFlags=0; ! ncArg.c2=args.source[starter]; ! if(combineBackIndex==0xfff2) { ! /* Jamo V, compose with previous Jamo L and following ! * Jamo T ! */ ! ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE); ! if(ncArg.c2<JAMO_L_COUNT) { ! remove=args.start-1; ! ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+ ! (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT); ! if(args.start!=args.limit && ! (ncArg.c2=(char)(args.source[args.start] ! -JAMO_T_BASE))<JAMO_T_COUNT) { ! ++args.start; ! ncArg.c+=ncArg.c2; ! } else { ! /* the result is an LV syllable, which is a starter (unlike LVT) */ ! combineFlags=COMBINES_FWD; ! } ! if(!nx_contains(nx, ncArg.c)) { ! args.source[starter]=ncArg.c; ! } else { ! /* excluded */ ! if(!isHangulWithoutJamoT(ncArg.c)) { ! --args.start; /* undo the ++args.start from reading the Jamo T */ } ! /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */ ! remove=args.start; } } ! /* ! * Normally, the following can not occur: ! * Since the input is in NFD, there are no Hangul LV syllables that ! * a Jamo T could combine with. ! * All Jamo Ts are combined above when handling Jamo Vs. ! * ! * However, before the PRI #29 fix, this can occur due to ! * an intervening combining mark between the Hangul LV and the Jamo T. */ } else { ! /* Jamo T, compose with previous Hangul that does not have a Jamo T */ ! if(isHangulWithoutJamoT(ncArg.c2)) { ! ncArg.c2+=ncArg.c-JAMO_T_BASE; ! if(!nx_contains(nx, ncArg.c2)) { ! remove=args.start-1; ! args.source[starter]=ncArg.c2; } } } - - if(remove!=-1) { - /* remove the Jamo(s) */ - q=remove; - r=args.start; - while(r<args.limit) { - args.source[q++]=args.source[r++]; } - args.start=remove; - args.limit=q; } ! ncArg.c2=0; /* c2 held *starter temporarily */ ! ! if(combineFlags!=0) { ! /* ! * not starter=NULL because the composition is a Hangul LV syllable ! * and might combine once more (but only before the PRI #29 fix) */ ! ! /* done? */ ! if(args.start==args.limit) { ! return (char)prevCC; ! } ! ! /* the composition is a Hangul LV syllable which is a starter that combines forward */ ! combineFwdIndex=0xfff0; ! ! /* we combined; continue with looking for compositions */ continue; } ! } ! ! /* ! * now: cc==0 and the combining index does not include ! * "forward" -> the rest of the loop body will reset starter ! * to NULL; technically, a composed Hangul syllable is a ! * starter, but it does not combine forward now that we have ! * consumed all eligible Jamos; for Jamo V/T, combineFlags ! * does not contain _NORM_COMBINES_FWD ! */ ! ! } else if( ! /* the starter is not a Hangul LV or Jamo V/T and */ ! !((combineFwdIndex&0x8000)!=0) && ! /* the combining mark is not blocked and */ ! ((options&BEFORE_PRI_29)!=0 ? ! (prevCC!=ncArg.cc || prevCC==0) : ! (prevCC<ncArg.cc || prevCC==0)) && ! /* the starter and the combining mark (c, c2) do combine */ ! 0!=(result=combine(combiningTable,combineFwdIndex, ! combineBackIndex, outValues)) && ! /* the composition result is not excluded */ ! !nx_contains(nx, (char)value, (char)value2) ! ) { ! value=outValues[0]; ! value2=outValues[1]; ! /* replace the starter with the composition, remove the ! * combining mark ! */ ! remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */ ! ! /* replace the starter with the composition */ ! args.source[starter]=(char)value; ! if(starterIsSupplementary) { ! if(value2!=0) { ! /* both are supplementary */ ! args.source[starter+1]=(char)value2; } else { ! /* the composition is shorter than the starter, ! * move the intermediate characters forward one */ ! starterIsSupplementary=false; ! q=starter+1; ! r=q+1; ! while(r<remove) { ! args.source[q++]=args.source[r++]; } - --remove; } - } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB - starterIsSupplementary=true; - args.source[starter+1]=(char)value2; - /* } else { both are on the BMP, nothing more to do */ } ! /* remove the combining mark by moving the following text ! * over it */ ! if(remove<args.start) { ! q=remove; ! r=args.start; ! while(r<args.limit) { ! args.source[q++]=args.source[r++]; ! } ! args.start=remove; ! args.limit=q; ! } ! /* keep prevCC because we removed the combining mark */ ! /* done? */ ! if(args.start==args.limit) { ! return (char)prevCC; ! } ! /* is the composition a starter that combines forward? */ ! if(result>1) { ! combineFwdIndex=getCombiningIndexFromStarter((char)value, ! (char)value2); ! } else { ! starter=-1; ! } ! /* we combined; continue with looking for compositions */ ! continue; ! } ! } ! /* no combination this time */ ! prevCC=ncArg.cc; ! if(args.start==args.limit) { ! return (char)prevCC; ! } ! /* if (c, c2) did not combine, then check if it is a starter */ ! if(ncArg.cc==0) { ! /* found a new starter; combineFlags==0 if (c, c2) is excluded */ ! if((combineFlags&COMBINES_FWD)!=0) { ! /* it may combine with something, prepare for it */ ! if(ncArg.c2==0) { ! starterIsSupplementary=false; ! starter=args.start-1; } else { ! starterIsSupplementary=false; ! starter=args.start-2; } ! combineFwdIndex=combineBackIndex; } else { ! /* it will not combine with anything */ ! starter=-1; } - } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) { - /* FCC: no discontiguous compositions; any intervening character blocks */ - starter=-1; } } } ! ! // find the last true starter between src[start]....src[current] going ! // backwards and return its index ! private static int findPreviousStarter(char[]src, int srcStart, int current, ! int/*unsigned*/ ccOrQCMask, ! int/*unsigned*/ decompQCMask, ! char minNoMaybe) { ! long norm32; ! PrevArgs args = new PrevArgs(); ! args.src = src; ! args.start = srcStart; ! args.current = current; ! ! while(args.start<args.current) { ! norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask); ! if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { break; } - } - return args.current; - } - - /* find the first true starter in [src..limit[ and return the - * pointer to it - */ - private static int/*index*/ findNextStarter(char[] src,int start,int limit, - int/*unsigned*/ qcMask, - int/*unsigned*/ decompQCMask, - char minNoMaybe) { - int p; - long/*unsigned*/ norm32; - int ccOrQCMask; - char c, c2; - - ccOrQCMask=CC_MASK|qcMask; ! DecomposeArgs decompArgs = new DecomposeArgs(); ! ! for(;;) { ! if(start==limit) { ! break; /* end of string */ } ! c=src[start]; ! if(c<minNoMaybe) { ! break; /* catches NUL terminater, too */ } - - norm32=getNorm32(c); - if((norm32&ccOrQCMask)==0) { - break; /* true starter */ } ! ! if(isNorm32LeadSurrogate(norm32)) { ! /* c is a lead surrogate, get the real norm32 */ ! if((start+1)==limit || ! !UTF16.isTrailSurrogate(c2=(src[start+1]))){ ! /* unmatched first surrogate: counts as a true starter */ ! break; } - norm32=getNorm32FromSurrogatePair(norm32, c2); - - if((norm32&ccOrQCMask)==0) { - break; /* true starter */ } ! } else { ! c2=0; } ! /* (c, c2) is not a true starter but its decomposition may be */ ! if((norm32&decompQCMask)!=0) { ! /* (c, c2) decomposes, get everything from the variable-length ! * extra data */ ! p=decompose(norm32, decompQCMask, decompArgs); ! ! /* get the first character's norm32 to check if it is a true ! * starter */ ! if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) { ! break; /* true starter */ } } ! ! start+= c2==0 ? 1 : 2; /* not a true starter, continue */ ! } ! ! return start; } ! ! ! private static final class ComposePartArgs{ ! int prevCC; ! int length; /* length of decomposed part */ } ! /* decompose and recompose [prevStarter..src[ */ ! private static char[] composePart(ComposePartArgs args, ! int prevStarter, ! char[] src, int start, int limit, ! int options, ! UnicodeSet nx) { ! int recomposeLimit; ! boolean compat =((options&OPTIONS_COMPAT)!=0); ! /* decompose [prevStarter..src[ */ ! int[] outTrailCC = new int[1]; ! char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE]; ! for(;;){ ! args.length=decompose(src,prevStarter,(start), ! buffer,0,buffer.length, ! compat,outTrailCC,nx); ! if(args.length<=buffer.length){ break; ! }else{ ! buffer = new char[args.length]; ! } } - - /* recompose the decomposition */ - recomposeLimit=args.length; - - if(args.length>=2) { - RecomposeArgs rcArgs = new RecomposeArgs(); - rcArgs.source = buffer; - rcArgs.start = 0; - rcArgs.limit = recomposeLimit; - args.prevCC=recompose(rcArgs, options, nx); - recomposeLimit = rcArgs.limit; } ! ! /* return with a pointer to the recomposition and its length */ ! args.length=recomposeLimit; ! return buffer; } - - private static boolean composeHangul(char prev, char c, - long/*unsigned*/ norm32, - char[] src,int[] srcIndex, int limit, - boolean compat, - char[] dest,int destIndex, - UnicodeSet nx) { - int start=srcIndex[0]; - if(isJamoVTNorm32JamoV(norm32)) { - /* c is a Jamo V, compose with previous Jamo L and - * following Jamo T */ - prev=(char)(prev-JAMO_L_BASE); - if(prev<JAMO_L_COUNT) { - c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+ - (c-JAMO_V_BASE))*JAMO_T_COUNT); - - /* check if the next character is a Jamo T (normal or - * compatibility) */ - if(start!=limit) { - char next, t; - - next=src[start]; - if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) { - /* normal Jamo T */ - ++start; - c+=t; - } else if(compat) { - /* if NFKC, then check for compatibility Jamo T - * (BMP only) */ - norm32=getNorm32(next); - if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) { - int p /*index into extra data array*/; - DecomposeArgs dcArgs = new DecomposeArgs(); - p=decompose(norm32, QC_NFKD, dcArgs); - if(dcArgs.length==1 && - (t=(char)(extraData[p]-JAMO_T_BASE)) - <JAMO_T_COUNT) { - /* compatibility Jamo T */ - ++start; - c+=t; } } } } ! if(nx_contains(nx, c)) { ! if(!isHangulWithoutJamoT(c)) { ! --start; /* undo ++start from reading the Jamo T */ } ! return false; } ! dest[destIndex]=c; ! srcIndex[0]=start; ! return true; } ! } else if(isHangulWithoutJamoT(prev)) { ! /* c is a Jamo T, compose with previous Hangul LV that does not ! * contain a Jamo T */ ! c=(char)(prev+(c-JAMO_T_BASE)); ! if(nx_contains(nx, c)) { return false; } ! dest[destIndex]=c; ! srcIndex[0]=start; ! return true; } return false; } ! /* ! public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){ ! return compose(src,0,src.length,dest,0,dest.length,compat, nx); } ! */ ! ! public static int compose(char[] src, int srcStart, int srcLimit, ! char[] dest,int destStart,int destLimit, ! int options,UnicodeSet nx) { ! ! int prevSrc, prevStarter; ! long/*unsigned*/ norm32; ! int ccOrQCMask, qcMask; ! int reorderStartIndex, length; ! char c, c2, minNoMaybe; ! int/*unsigned byte*/ cc, prevCC; ! int[] ioIndex = new int[1]; ! int destIndex = destStart; ! int srcIndex = srcStart; ! ! if((options&OPTIONS_COMPAT)!=0) { ! minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; ! qcMask=QC_NFKC; } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; - qcMask=QC_NFC; - } - - /* - * prevStarter points to the last character before the current one - * that is a "true" starter with cc==0 and quick check "yes". - * - * prevStarter will be used instead of looking for a true starter - * while incrementally decomposing [prevStarter..prevSrc[ - * in _composePart(). Having a good prevStarter allows to just decompose - * the entire [prevStarter..prevSrc[. - * - * When _composePart() backs out from prevSrc back to prevStarter, - * then it also backs out destIndex by the same amount. - * Therefore, at all times, the (prevSrc-prevStarter) source units - * must correspond 1:1 to destination units counted with destIndex, - * except for reordering. - * This is true for the qc "yes" characters copied in the fast loop, - * and for pure reordering. - * prevStarter must be set forward to src when this is not true: - * In _composePart() and after composing a Hangul syllable. - * - * This mechanism relies on the assumption that the decomposition of a - * true starter also begins with a true starter. gennorm/store.c checks - * for this. - */ - prevStarter=srcIndex; - - ccOrQCMask=CC_MASK|qcMask; - /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/ - prevCC=0; - - /* avoid compiler warnings */ - norm32=0; - c=0; - - for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check */ - prevSrc=srcIndex; - - while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe || - ((norm32=getNorm32(c))&ccOrQCMask)==0)) { prevCC=0; - ++srcIndex; - } - - - /* copy these code units all at once */ - if(srcIndex!=prevSrc) { - length=srcIndex-prevSrc; - if((destIndex+length)<=destLimit) { - System.arraycopy(src,prevSrc,dest,destIndex,length); - } - destIndex+=length; - reorderStartIndex=destIndex; - - /* set prevStarter to the last character in the quick check - * loop */ - prevStarter=srcIndex-1; - if(UTF16.isTrailSurrogate(src[prevStarter]) && - prevSrc<prevStarter && - UTF16.isLeadSurrogate(src[(prevStarter-1)])) { - --prevStarter; } ! ! prevSrc=srcIndex; } - - /* end of source reached? */ - if(srcIndex==srcLimit) { - break; } - - /* c already contains *src and norm32 is set for it, increment src*/ - ++srcIndex; - /* ! * source buffer pointers: * * all done quick check current char not yet ! * "yes" but (c, c2) processed * may combine * forward * [-------------[-------------[-------------[-------------[ * | | | | | ! * start prevStarter prevSrc src limit * * ! * destination buffer pointers and indexes: * * all done might take not filled yet * characters for * reordering * [-------------[-------------[-------------[ * | | | | ! * dest reorderStartIndex destIndex destCapacity ! */ ! ! /* check one above-minimum, relevant code unit */ ! /* ! * norm32 is for c=*(src-1), and the quick check flag is "no" or ! * "maybe", and/or cc!=0 ! * check for Jamo V/T, then for surrogates and regular characters ! * c is not a Hangul syllable or Jamo L because ! * they are not marked with no/maybe for NFC & NFKC(and their cc==0) */ ! if(isNorm32HangulOrJamo(norm32)) { ! /* ! * c is a Jamo V/T: ! * try to compose with the previous character, Jamo V also with ! * a following Jamo T, and set values here right now in case we ! * just continue with the main loop ! */ ! prevCC=cc=0; ! reorderStartIndex=destIndex; ! ioIndex[0]=srcIndex; ! if( ! destIndex>0 && ! composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex, ! srcLimit, (options&OPTIONS_COMPAT)!=0, dest, ! destIndex<=destLimit ? destIndex-1: 0, ! nx) ) { ! srcIndex=ioIndex[0]; ! prevStarter=srcIndex; ! continue; } ! ! srcIndex = ioIndex[0]; ! ! /* the Jamo V/T did not compose into a Hangul syllable, just ! * append to dest */ ! c2=0; ! length=1; ! prevStarter=prevSrc; ! } else { ! if(isNorm32Regular(norm32)) { ! c2=0; ! length=1; ! } else { ! /* c is a lead surrogate, get the real norm32 */ ! if(srcIndex!=srcLimit && ! UTF16.isTrailSurrogate(c2=src[srcIndex])) { ! ++srcIndex; ! length=2; ! norm32=getNorm32FromSurrogatePair(norm32, c2); ! } else { ! /* c is an unpaired lead surrogate, nothing to do */ ! c2=0; ! length=1; ! norm32=0; ! } ! } ! ComposePartArgs args =new ComposePartArgs(); ! ! /* we are looking at the character (c, c2) at [prevSrc..src[ */ ! if(nx_contains(nx, c, c2)) { ! /* excluded: norm32==0 */ ! cc=0; ! } else if((norm32&qcMask)==0) { ! cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT)); } else { ! char[] p; /* ! * find appropriate boundaries around this character, * decompose the source text from between the boundaries, ! * and recompose it * ! * this puts the intermediate text into the side buffer because ! * it might be longer than the recomposition end result, ! * or the destination buffer may be too short or missing ! * ! * note that destIndex may be adjusted backwards to account ! * for source text that passed the quick check but needed to ! * take part in the recomposition */ ! int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ /* ! * find the last true starter in [prevStarter..src[ ! * it is either the decomposition of the current character (at prevSrc), ! * or prevStarter */ ! if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) { ! prevStarter=prevSrc; ! } else { ! /* adjust destIndex: back out what had been copied with qc "yes" */ ! destIndex-=prevSrc-prevStarter; } ! /* find the next true starter in [src..limit[ */ ! srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, ! decompQCMask, minNoMaybe); ! //args.prevStarter = prevStarter; ! args.prevCC = prevCC; ! //args.destIndex = destIndex; ! args.length = length; ! p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx); ! if(p==null) { ! /* an error occurred (out of memory) */ ! break; } ! prevCC = args.prevCC; ! length = args.length; ! ! /* append the recomposed buffer contents to the destination ! * buffer */ ! if((destIndex+args.length)<=destLimit) { ! int i=0; ! while(i<args.length) { ! dest[destIndex++]=p[i++]; ! --length; } ! } else { ! /* buffer overflow */ ! /* keep incrementing the destIndex for preflighting */ ! destIndex+=length; } ! prevStarter=srcIndex; ! continue; } } ! ! /* append the single code point (c, c2) to the destination buffer */ ! if((destIndex+length)<=destLimit) { ! if(cc!=0 && cc<prevCC) { ! /* (c, c2) is out of order with respect to the preceding ! * text */ ! int reorderSplit= destIndex; ! destIndex+=length; ! prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit, ! destIndex, c, c2, cc); ! } else { ! /* just append (c, c2) */ ! dest[destIndex++]=c; ! if(c2!=0) { ! dest[destIndex++]=c2; } - prevCC=cc; } } else { ! /* buffer overflow */ ! /* keep incrementing the destIndex for preflighting */ ! destIndex+=length; ! prevCC=cc; } } - - return destIndex - destStart; } ! ! public static int getCombiningClass(int c) { ! long norm32; ! norm32=getNorm32(c); ! return (int)((norm32>>CC_SHIFT)&0xFF); } ! public static boolean isFullCompositionExclusion(int c) { ! if(isFormatVersion_2_1) { ! int aux =AuxTrieImpl.auxTrie.getCodePointValue(c); ! return (aux & AUX_COMP_EX_MASK)!=0; } else { ! return false; } } ! ! public static boolean isCanonSafeStart(int c) { ! if(isFormatVersion_2_1) { ! int aux = AuxTrieImpl.auxTrie.getCodePointValue(c); ! return (aux & AUX_UNSAFE_MASK)==0; ! } else { ! return false; } } ! /* Is c an NF<mode>-skippable code point? See unormimp.h. */ ! public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) { ! long /*unsigned int*/ norm32; ! mask = mask & UNSIGNED_INT_MASK; ! char aux; ! /* check conditions (a)..(e), see unormimp.h */ ! norm32 = getNorm32(c); ! if((norm32&mask)!=0) { ! return false; /* fails (a)..(e), not skippable */ } ! ! if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){ ! return true; /* NF*D, passed (a)..(c), is skippable */ } - /* check conditions (a)..(e), see unormimp.h */ - - /* NF*C/FCC, passed (a)..(e) */ - if((norm32& QC_NFD)==0) { - return true; /* no canonical decomposition, is skippable */ } - - /* check Hangul syllables algorithmically */ - if(isNorm32HangulOrJamo(norm32)) { - /* Jamo passed (a)..(e) above, must be Hangul */ - return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */ } ! ! /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ ! /* NF*C, test (f) flag */ ! if(!isFormatVersion_2_2) { ! return false; /* no (f) data, say not skippable to be safe */ } - - - aux = AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */ - - /* } else { FCC, test fcd<=1 instead of the above } */ } - - public static UnicodeSet addPropertyStarts(UnicodeSet set) { - int c; - - /* add the start code point of each same-value range of each trie */ - //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie); - RangeValueIterator.Element normResult = new RangeValueIterator.Element(); - - while(normIter.next(normResult)){ - set.add(normResult.start); } ! ! //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); ! TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie); ! RangeValueIterator.Element fcdResult = new RangeValueIterator.Element(); ! ! while(fcdIter.next(fcdResult)){ ! set.add(fcdResult.start); } ! ! if(isFormatVersion_2_1){ ! //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set); ! TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie); ! RangeValueIterator.Element auxResult = new RangeValueIterator.Element(); ! while(auxIter.next(auxResult)){ ! set.add(auxResult.start); } } ! /* add Hangul LV syllables and LV+1 because of skippables */ ! for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) { ! set.add(c); ! set.add(c+1); } - set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ - return set; // for chaining } ! ! /** ! * Internal API, used in UCharacter.getIntPropertyValue(). ! * @internal ! * @param c code point ! * @param modeValue numeric value compatible with Mode ! * @return numeric value compatible with QuickCheck ! */ ! public static final int quickCheck(int c, int modeValue) { ! final int qcMask[/*UNORM_MODE_COUNT*/]={ ! 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC ! }; ! ! int norm32=(int)getNorm32(c)&qcMask[modeValue]; ! ! if(norm32==0) { ! return 1; // YES ! } else if((norm32&QC_ANY_NO)!=0) { ! return 0; // NO ! } else /* _NORM_QC_ANY_MAYBE */ { ! return 2; // MAYBE; } } ! private static int strCompare(char[] s1, int s1Start, int s1Limit, ! char[] s2, int s2Start, int s2Limit, ! boolean codePointOrder) { ! ! int start1, start2, limit1, limit2; ! ! char c1, c2; ! ! /* setup for fix-up */ ! start1=s1Start; ! start2=s2Start; ! ! int length1, length2; ! ! length1 = s1Limit - s1Start; ! length2 = s2Limit - s2Start; ! ! int lengthResult; ! ! if(length1<length2) { ! lengthResult=-1; ! limit1=start1+length1; ! } else if(length1==length2) { ! lengthResult=0; ! limit1=start1+length1; ! } else /* length1>length2 */ { ! lengthResult=1; ! limit1=start1+length2; } ! ! if(s1==s2) { ! return lengthResult; } for(;;) { ! /* check pseudo-limit */ ! if(s1Start==limit1) { ! return lengthResult; } - - c1=s1[s1Start]; - c2=s2[s2Start]; - if(c1!=c2) { - break; } - ++s1Start; - ++s2Start; } ! /* setup for fix-up */ ! limit1=start1+length1; ! limit2=start2+length2; ! ! /* if both values are in or above the surrogate range, fix them up */ ! if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { ! /* subtract 0x2800 from BMP code points to make them smaller than ! * supplementary ones */ ! if( ! ( c1<=0xdbff && (s1Start+1)!=limit1 && ! UTF16.isTrailSurrogate(s1[(s1Start+1)]) ! ) || ! ( UTF16.isTrailSurrogate(c1) && start1!=s1Start && ! UTF16.isLeadSurrogate(s1[(s1Start-1)]) ! ) ! ) { ! /* part of a surrogate pair, leave >=d800 */ } else { ! /* BMP code point - may be surrogate code point - make <d800 */ ! c1-=0x2800; } ! if( ! ( c2<=0xdbff && (s2Start+1)!=limit2 && ! UTF16.isTrailSurrogate(s2[(s2Start+1)]) ! ) || ! ( UTF16.isTrailSurrogate(c2) && start2!=s2Start && ! UTF16.isLeadSurrogate(s2[(s2Start-1)]) ! ) ! ) { ! /* part of a surrogate pair, leave >=d800 */ } else { ! /* BMP code point - may be surrogate code point - make <d800 */ ! c2-=0x2800; } } ! /* now c1 and c2 are in UTF-32-compatible order */ ! return (int)c1-(int)c2; } ! ! /* ! * Status of tailored normalization ! * ! * This was done initially for investigation on Unicode public review issue 7 ! * (http://www.unicode.org/review/). See Jitterbug 2481. ! * While the UTC at meeting #94 (2003mar) did not take up the issue, this is ! * a permanent feature in ICU 2.6 in support of IDNA which requires true ! * Unicode 3.2 normalization. ! * (NormalizationCorrections are rolled into IDNA mapping tables.) ! * ! * Tailored normalization as implemented here allows to "normalize less" ! * than full Unicode normalization would. ! * Based internally on a UnicodeSet of code points that are ! * "excluded from normalization", the normalization functions leave those ! * code points alone ("inert"). This means that tailored normalization ! * still transforms text into a canonically equivalent form. ! * It does not add decompositions to code points that do not have any or ! * change decomposition results. ! * ! * Any function that searches for a safe boundary has not been touched, ! * which means that these functions will be over-pessimistic when ! * exclusions are applied. ! * This should not matter because subsequent checks and normalizations ! * do apply the exclusions; only a little more of the text may be processed ! * than necessary under exclusions. ! * ! * Normalization exclusions have the following effect on excluded code points c: ! * - c is not decomposed ! * - c is not a composition target ! * - c does not combine forward or backward for composition ! * except that this is not implemented for Jamo ! * - c is treated as having a combining class of 0 ! */ ! ! /* ! * Constants for the bit fields in the options bit set parameter. ! * These need not be public. ! * A user only needs to know the currently assigned values. ! * The number and positions of reserved bits per field can remain private. ! */ ! private static final int OPTIONS_NX_MASK=0x1f; ! private static final int OPTIONS_UNICODE_MASK=0xe0; ! public static final int OPTIONS_SETS_MASK=0xff; ! // private static final int OPTIONS_UNICODE_SHIFT=5; ! private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1]; ! ! /* Constants for options flags for normalization.*/ /** ! * Options bit 0, do not decompose Hangul syllables. ! * @draft ICU 2.6 */ ! private static final int NX_HANGUL = 1; ! /** ! * Options bit 1, do not decompose CJK compatibility characters. ! * @draft ICU 2.6 ! */ ! private static final int NX_CJK_COMPAT=2; ! /** ! * Options bit 8, use buggy recomposition described in ! * Unicode Public Review Issue #29 ! * at http://www.unicode.org/review/resolved-pri.html#pri29 ! * ! * Used in IDNA implementation according to strict interpretation ! * of IDNA definition based on Unicode 3.2 which predates PRI #29. ! * ! * See ICU4C unormimp.h ! * ! * @draft ICU 3.2 ! */ ! public static final int BEFORE_PRI_29=0x100; ! /* ! * The following options are used only in some composition functions. ! * They use bits 12 and up to preserve lower bits for the available options ! * space in unorm_compare() - ! * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. */ ! /** Options bit 12, for compatibility vs. canonical decomposition. */ ! public static final int OPTIONS_COMPAT=0x1000; ! /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ ! public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000; ! /* normalization exclusion sets --------------------------------------------- */ /* ! * Normalization exclusion UnicodeSets are used for tailored normalization; ! * see the comment near the beginning of this file. * ! * By specifying one or several sets of code points, ! * those code points become inert for normalization. ! */ ! private static final synchronized UnicodeSet internalGetNXHangul() { ! /* internal function, does not check for incoming U_FAILURE */ ! if(nxCache[NX_HANGUL]==null) { ! nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3); } - return nxCache[NX_HANGUL]; } ! ! private static final synchronized UnicodeSet internalGetNXCJKCompat() { ! /* internal function, does not check for incoming U_FAILURE */ ! ! if(nxCache[NX_CJK_COMPAT]==null) { ! ! /* build a set from [CJK Ideographs]&[has canonical decomposition] */ ! UnicodeSet set, hasDecomp; ! ! set=new UnicodeSet("[:Ideographic:]"); ! ! /* start with an empty set for [has canonical decomposition] */ ! hasDecomp=new UnicodeSet(); ! ! /* iterate over all ideographs and remember which canonically decompose */ ! UnicodeSetIterator it = new UnicodeSetIterator(set); ! int start, end; ! long norm32; ! ! while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) { ! start=it.codepoint; ! end=it.codepointEnd; ! while(start<=end) { ! norm32 = getNorm32(start); ! if((norm32 & QC_NFD)>0) { ! hasDecomp.add(start); } ! ++start; } } ! /* hasDecomp now contains all ideographs that decompose canonically */ ! nxCache[NX_CJK_COMPAT]=hasDecomp; } ! ! return nxCache[NX_CJK_COMPAT]; } ! private static final synchronized UnicodeSet internalGetNXUnicode(int options) { ! options &= OPTIONS_UNICODE_MASK; ! if(options==0) { ! return null; } ! if(nxCache[options]==null) { ! /* build a set with all code points that were not designated by the specified Unicode version */ ! UnicodeSet set = new UnicodeSet(); ! ! switch(options) { ! case NormalizerBase.UNICODE_3_2: ! set.applyPattern("[:^Age=3.2:]"); break; - default: - return null; } ! nxCache[options]=set; } - - return nxCache[options]; } ! ! /* Get a decomposition exclusion set. The data must be loaded. */ ! private static final synchronized UnicodeSet internalGetNX(int options) { ! options&=OPTIONS_SETS_MASK; ! ! if(nxCache[options]==null) { ! /* return basic sets */ ! if(options==NX_HANGUL) { ! return internalGetNXHangul(); } - if(options==NX_CJK_COMPAT) { - return internalGetNXCJKCompat(); } ! if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) { ! return internalGetNXUnicode(options); } ! /* build a set from multiple subsets */ ! UnicodeSet set; ! UnicodeSet other; ! ! set=new UnicodeSet(); ! ! ! if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) { ! set.addAll(other); } ! if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) { ! set.addAll(other); } ! if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) { ! set.addAll(other); } - - nxCache[options]=set; } - return nxCache[options]; } ! public static final UnicodeSet getNX(int options) { ! if((options&=OPTIONS_SETS_MASK)==0) { ! /* incoming failure, or no decomposition exclusions requested */ ! return null; ! } else { ! return internalGetNX(options); } } ! ! private static final boolean nx_contains(UnicodeSet nx, int c) { ! return nx!=null && nx.contains(c); } ! private static final boolean nx_contains(UnicodeSet nx, char c, char c2) { ! return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2)); } ! /*****************************************************************************/ /** * Get the canonical decomposition * sherman for ComposedCharIter */ - public static int getDecompose(int chars[], String decomps[]) { ! DecomposeArgs args = new DecomposeArgs(); int length=0; ! long norm32 = 0; int ch = -1; - int index = 0; int i = 0; while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff //TBD !!!! the hack code heres save us about 50ms for startup //need a better solution/lookup if (ch == 0x30ff) ch = 0xf900; ! else if (ch == 0x10000) ch = 0x1d15e; else if (ch == 0x1d1c1) ch = 0x2f800; ! norm32 = NormalizerImpl.getNorm32(ch); ! if((norm32 & QC_NFD)!=0 && i < chars.length) { chars[i] = ch; ! index = decompose(norm32, args); ! decomps[i++] = new String(extraData,index, args.length); } } return i; } //------------------------------------------------------ ! // special method for Collation //------------------------------------------------------ private static boolean needSingleQuotation(char c) { return (c >= 0x0009 && c <= 0x000D) || (c >= 0x0020 && c <= 0x002F) || (c >= 0x003A && c <= 0x0040) || (c >= 0x005B && c <= 0x0060) || (c >= 0x007B && c <= 0x007E); } public static String canonicalDecomposeWithSingleQuotation(String string) { char[] src = string.toCharArray(); int srcIndex = 0; int srcLimit = src.length; char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 int destIndex = 0; int destLimit = dest.length; - char[] buffer = new char[3]; int prevSrc; ! long norm32; ! int ccOrQCMask; ! int qcMask = QC_NFD; int reorderStartIndex, length; ! char c, c2; ! char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE]; int cc, prevCC, trailCC; char[] p; int pStart; - // initialize - ccOrQCMask = CC_MASK | qcMask; reorderStartIndex = 0; prevCC = 0; ! norm32 = 0; ! c = 0; pStart = 0; cc = trailCC = -1; // initialize to bogus value ! for(;;) { prevSrc=srcIndex; //quick check (1)less than minNoMaybe (2)no decomp (3)hangual while (srcIndex != srcLimit && ! (( c = src[srcIndex]) < minNoMaybe || ! ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || ! ( c >= '\uac00' && c <= '\ud7a3'))){ ! prevCC = 0; ! ++srcIndex; } // copy these code units all at once if (srcIndex != prevSrc) { length = srcIndex - prevSrc; --- 20,1799 ---- * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* ! * Copyright (C) 2009-2014, International Business Machines ! * Corporation and others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; import java.io.IOException; ! import java.nio.ByteBuffer; ! import java.text.Normalizer; ! // Original filename in ICU4J: Normalizer2Impl.java public final class NormalizerImpl { + public static final class Hangul { /* Korean Hangul and Jamo constants */ public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ public static final int HANGUL_BASE=0xac00; + public static final int HANGUL_END=0xd7a3; public static final int JAMO_L_COUNT=19; public static final int JAMO_V_COUNT=21; ! public static final int JAMO_T_COUNT=28; ! ! public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; ! public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; ! ! public static boolean isHangul(int c) { ! return HANGUL_BASE<=c && c<HANGUL_LIMIT; } + + public static boolean isHangulWithoutJamoT(char c) { + c-=HANGUL_BASE; + return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; } ! ! /** ! * Decomposes c, which must be a Hangul syllable, into buffer ! * and returns the length of the decomposition (2 or 3). ! */ ! public static int decompose(int c, Appendable buffer) { ! try { ! c-=HANGUL_BASE; ! int c2=c%JAMO_T_COUNT; ! c/=JAMO_T_COUNT; ! buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); ! buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); ! if(c2==0) { ! return 2; } else { ! buffer.append((char)(JAMO_T_BASE+c2)); ! return 3; } ! } catch(IOException e) { ! throw new InternalError(e); } } } ! /** ! * Writable buffer that takes care of canonical ordering. ! * Its Appendable methods behave like the C++ implementation's ! * appendZeroCC() methods. ! * <p> ! * If dest is a StringBuilder, then the buffer writes directly to it. ! * Otherwise, the buffer maintains a StringBuilder for intermediate text segments ! * until no further changes are necessary and whole segments are appended. ! * append() methods that take combining-class values always write to the StringBuilder. ! * Other append() methods flush and append to the Appendable. ! */ ! public static final class ReorderingBuffer implements Appendable { ! public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { ! impl=ni; ! app=dest; ! if (app instanceof StringBuilder) { ! appIsStringBuilder=true; ! str=(StringBuilder)dest; ! // In Java, the constructor subsumes public void init(int destCapacity) ! str.ensureCapacity(destCapacity); ! reorderStart=0; ! if(str.length()==0) { ! lastCC=0; } else { ! setIterator(); ! lastCC=previousCC(); ! // Set reorderStart after the last code point with cc<=1 if there is one. ! if(lastCC>1) { ! while(previousCC()>1) {} } ! reorderStart=codePointLimit; } ! } else { ! appIsStringBuilder=false; ! str=new StringBuilder(); ! reorderStart=0; ! lastCC=0; } } ! public boolean isEmpty() { return str.length()==0; } ! public int length() { return str.length(); } ! public int getLastCC() { return lastCC; } + public StringBuilder getStringBuilder() { return str; } ! public boolean equals(CharSequence s, int start, int limit) { ! return UTF16Plus.equal(str, 0, str.length(), s, start, limit); } ! // For Hangul composition, replacing the Leading consonant Jamo with the syllable. ! public void setLastChar(char c) { ! str.setCharAt(str.length()-1, c); } ! public void append(int c, int cc) { ! if(lastCC<=cc || cc==0) { ! str.appendCodePoint(c); ! lastCC=cc; ! if(cc<=1) { ! reorderStart=str.length(); ! } ! } else { ! insert(c, cc); ! } } ! // s must be in NFD, otherwise change the implementation. ! public void append(CharSequence s, int start, int limit, ! int leadCC, int trailCC) { ! if(start==limit) { ! return; } + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. } + str.append(s, start, limit); + lastCC=trailCC; + } else { + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(start<limit) { + c=Character.codePointAt(s, start); + start+=Character.charCount(c); + if(start<limit) { + // s must be in NFD, otherwise we need to use getCC(). + leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); + } else { + leadCC=trailCC; } ! append(c, leadCC); } } } ! // The following append() methods work like C++ appendZeroCC(). ! // They assume that the cc or trailCC of their input is 0. ! // Most of them implement Appendable interface methods. ! // @Override when we switch to Java 6 ! public ReorderingBuffer append(char c) { ! str.append(c); ! lastCC=0; ! reorderStart=str.length(); ! return this; } ! public void appendZeroCC(int c) { ! str.appendCodePoint(c); ! lastCC=0; ! reorderStart=str.length(); } ! // @Override when we switch to Java 6 ! public ReorderingBuffer append(CharSequence s) { ! if(s.length()!=0) { ! str.append(s); ! lastCC=0; ! reorderStart=str.length(); } ! return this; } ! // @Override when we switch to Java 6 ! public ReorderingBuffer append(CharSequence s, int start, int limit) { ! if(start!=limit) { ! str.append(s, start, limit); ! lastCC=0; ! reorderStart=str.length(); } + return this; } ! /** ! * Flushes from the intermediate StringBuilder to the Appendable, ! * if they are different objects. ! * Used after recomposition. ! * Must be called at the end when writing to a non-StringBuilder Appendable. ! */ ! public void flush() { ! if(appIsStringBuilder) { ! reorderStart=str.length(); ! } else { ! try { ! app.append(str); ! str.setLength(0); ! reorderStart=0; ! } catch(IOException e) { ! throw new InternalError(e); // Avoid declaring "throws IOException". } } + lastCC=0; } ! /** ! * Flushes from the intermediate StringBuilder to the Appendable, ! * if they are different objects. ! * Then appends the new text to the Appendable or StringBuilder. ! * Normally used after quick check loops find a non-empty sequence. */ ! public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { ! if(appIsStringBuilder) { ! str.append(s, start, limit); ! reorderStart=str.length(); } else { ! try { ! app.append(str).append(s, start, limit); ! str.setLength(0); ! reorderStart=0; ! } catch(IOException e) { ! throw new InternalError(e); // Avoid declaring "throws IOException". } } ! lastCC=0; ! return this; } + + public void remove() { + str.setLength(0); + lastCC=0; + reorderStart=0; } ! ! public void removeSuffix(int suffixLength) { ! int oldLength=str.length(); ! str.delete(oldLength-suffixLength, oldLength); ! lastCC=0; ! reorderStart=str.length(); } ! // Inserts c somewhere before the last character. ! // Requires 0<cc<lastCC which implies reorderStart<limit. ! private void insert(int c, int cc) { ! for(setIterator(), skipPrevious(); previousCC()>cc;) {} ! // insert c at codePointLimit, after the character with prevCC<=cc ! if(c<=0xffff) { ! str.insert(codePointLimit, (char)c); ! if(cc<=1) { ! reorderStart=codePointLimit+1; } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; } } } ! private final NormalizerImpl impl; ! private final Appendable app; ! private final StringBuilder str; ! private final boolean appIsStringBuilder; ! private int reorderStart; ! private int lastCC; ! // private backward iterator ! private void setIterator() { codePointStart=str.length(); } ! private void skipPrevious() { // Requires 0<codePointStart. ! codePointLimit=codePointStart; ! codePointStart=str.offsetByCodePoints(codePointStart, -1); ! } ! private int previousCC() { // Returns 0 if there is no previous character. ! codePointLimit=codePointStart; ! if(reorderStart>=codePointStart) { return 0; ! } ! int c=str.codePointBefore(codePointStart); ! codePointStart-=Character.charCount(c); ! if(c<MIN_CCC_LCCC_CP) { return 0; } + return getCCFromYesOrMaybe(impl.getNorm16(c)); } ! private int codePointStart, codePointLimit; } ! // TODO: Propose as public API on the UTF16 class. ! // TODO: Propose widening UTF16 methods that take char to take int. ! // TODO: Propose widening UTF16 methods that take String to take CharSequence. ! public static final class UTF16Plus { ! /** ! * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), ! * is it a lead surrogate? ! * @param c code unit or code point ! * @return true or false ! */ ! public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } ! /** ! * Compares two CharSequence subsequences for binary equality. ! * @param s1 first sequence ! * @param start1 start offset in first sequence ! * @param limit1 limit offset in first sequence ! * @param s2 second sequence ! * @param start2 start offset in second sequence ! * @param limit2 limit offset in second sequence ! * @return true if s1.subSequence(start1, limit1) contains the same text ! * as s2.subSequence(start2, limit2) ! */ ! public static boolean equal(CharSequence s1, int start1, int limit1, ! CharSequence s2, int start2, int limit2) { ! if((limit1-start1)!=(limit2-start2)) { ! return false; ! } ! if(s1==s2 && start1==start2) { ! return true; ! } ! while(start1<limit1) { ! if(s1.charAt(start1++)!=s2.charAt(start2++)) { ! return false; ! } ! } ! return true; ! } } ! public NormalizerImpl() {} ! ! private static final class IsAcceptable implements ICUBinary.Authenticate { ! // @Override when we switch to Java 6 ! public boolean isDataVersionAcceptable(byte version[]) { ! return version[0]==2; } } ! private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); ! private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" ! public NormalizerImpl load(ByteBuffer bytes) { ! try { ! dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); ! int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 ! if(indexesLength<=IX_MIN_MAYBE_YES) { ! throw new IOException("Normalizer2 data: not enough indexes"); } ! int[] inIndexes=new int[indexesLength]; ! inIndexes[0]=indexesLength*4; ! for(int i=1; i<indexesLength; ++i) { ! inIndexes[i]=bytes.getInt(); } ! minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; ! minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; ! ! minYesNo=inIndexes[IX_MIN_YES_NO]; ! minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; ! minNoNo=inIndexes[IX_MIN_NO_NO]; ! limitNoNo=inIndexes[IX_LIMIT_NO_NO]; ! minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; ! // Read the normTrie. ! int offset=inIndexes[IX_NORM_TRIE_OFFSET]; ! int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; ! normTrie=Trie2_16.createFromSerialized(bytes); ! int trieLength=normTrie.getSerializedLength(); ! if(trieLength>(nextOffset-offset)) { ! throw new IOException("Normalizer2 data: not enough bytes for normTrie"); } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes ! // Read the composition and mapping data. ! offset=nextOffset; ! nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; ! int numChars=(nextOffset-offset)/2; ! char[] chars; ! if(numChars!=0) { ! chars=new char[numChars]; ! for(int i=0; i<numChars; ++i) { ! chars[i]=bytes.getChar(); } ! maybeYesCompositions=new String(chars); ! extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); } ! // smallFCD: new in formatVersion 2 ! offset=nextOffset; ! smallFCD=new byte[0x100]; ! for(int i=0; i<0x100; ++i) { ! smallFCD[i]=bytes.get(); ! } ! // Build tccc180[]. ! // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. ! tccc180=new int[0x180]; ! int bits=0; ! for(int c=0; c<0x180; bits>>=1) { ! if((c&0xff)==0) { ! bits=smallFCD[c>>8]; // one byte per 0x100 code points } ! if((bits&1)!=0) { ! for(int i=0; i<0x20; ++i, ++c) { ! tccc180[c]=getFCD16FromNormData(c)&0xff; } } else { ! c+=0x20; } } + return this; + } catch(IOException e) { + throw new InternalError(e); + } + } ! public NormalizerImpl load(String name) { ! return load(ICUBinary.getRequiredData(name)); } ! public int getNorm16(int c) { ! return normTrie.get(c); ! } ! public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } ! public int getCC(int norm16) { ! if(norm16>=MIN_NORMAL_MAYBE_YES) { ! return norm16&0xff; } ! if(norm16<minNoNo || limitNoNo<=norm16) { ! return 0; } + return getCCFromNoNo(norm16); } ! public static int getCCFromYesOrMaybe(int norm16) { ! return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; ! } ! ! /** ! * Returns the FCD data for code point c. ! * @param c A Unicode code point. ! * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ + public int getFCD16(int c) { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + + /** Returns the FCD data for U+0000<=c<U+0180. */ + public int getFCD16FromBelow180(int c) { return tccc180[c]; } + /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ + public boolean singleLeadMightHaveNonZeroFCD16(int lead) { + // 0<=lead<=0xffff + byte bits=smallFCD[lead>>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. */ + public int getFCD16FromNormData(int c) { + // Only loops for 1:1 algorithmic mappings. + for(;;) { + int norm16=getNorm16(c); + if(norm16<=minYesNo) { + // no decomposition or Hangul syllable, all zeros + return 0; + } else if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16&=0xff; + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); } else { ! // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! // A character that is deleted (maps to an empty string) must ! // get the worst-case lccc and tccc values because arbitrary ! // characters on both sides will become adjacent. ! return 0x1ff; ! } else { ! int fcd16=firstUnit>>8; // tccc ! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc } + return fcd16; } } } } ! /** ! * Gets the decomposition for one code point. ! * @param c code point ! * @return c's decomposition, if it has one; returns null if it does not have a decomposition */ ! public String getDecomposition(int c) { ! int decomp=-1; ! int norm16; ! for(;;) { ! if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { ! // c does not decompose ! } else if(isHangul(norm16)) { ! // Hangul syllable: decompose algorithmically ! StringBuilder buffer=new StringBuilder(); ! Hangul.decompose(c, buffer); ! return buffer.toString(); ! } else if(isDecompNoAlgorithmic(norm16)) { ! decomp=c=mapAlgorithmic(c, norm16); continue; + } else { + // c decomposes, get everything from the variable-length extra data + int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; + return extraData.substring(norm16, norm16+length); } ! if(decomp<0) { ! return null; } else { ! return UTF16.valueOf(decomp); } } } ! public static final int MIN_CCC_LCCC_CP=0x300; ! public static final int MIN_YES_YES_WITH_CC=0xff01; ! public static final int JAMO_VT=0xff00; ! public static final int MIN_NORMAL_MAYBE_YES=0xfe00; ! public static final int MAX_DELTA=0x40; ! // Byte offsets from the start of the data, after the generic header. ! public static final int IX_NORM_TRIE_OFFSET=0; ! public static final int IX_EXTRA_DATA_OFFSET=1; ! public static final int IX_SMALL_FCD_OFFSET=2; ! // Code point thresholds for quick check codes. ! public static final int IX_MIN_DECOMP_NO_CP=8; ! public static final int IX_MIN_COMP_NO_MAYBE_CP=9; ! // Norm16 value thresholds for quick check combinations and types of extra data. ! // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. ! public static final int IX_MIN_YES_NO=10; ! public static final int IX_MIN_NO_NO=11; ! public static final int IX_LIMIT_NO_NO=12; ! public static final int IX_MIN_MAYBE_YES=13; ! // Mappings only in [minYesNoMappingsOnly..minNoNo[. ! public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; ! public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; ! public static final int MAPPING_LENGTH_MASK=0x1f; ! ! public static final int COMP_1_LAST_TUPLE=0x8000; ! public static final int COMP_1_TRIPLE=1; ! public static final int COMP_1_TRAIL_LIMIT=0x3400; ! public static final int COMP_1_TRAIL_MASK=0x7ffe; ! public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit ! public static final int COMP_2_TRAIL_SHIFT=6; ! public static final int COMP_2_TRAIL_MASK=0xffc0; ! ! // higher-level functionality ------------------------------------------ *** ! ! /** ! * Decomposes s[src, limit[ and writes the result to dest. ! * limit can be NULL if src is NUL-terminated. ! * destLengthEstimate is the initial dest buffer capacity and can be -1. ! */ ! public void decompose(CharSequence s, int src, int limit, StringBuilder dest, ! int destLengthEstimate) { ! if(destLengthEstimate<0) { ! destLengthEstimate=limit-src; ! } ! dest.setLength(0); ! ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); ! decompose(s, src, limit, buffer); ! } ! ! // Dual functionality: ! // buffer!=NULL: normalize ! // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes ! public int decompose(CharSequence s, int src, int limit, ! ReorderingBuffer buffer) { ! int minNoCP=minDecompNoCP; ! ! int prevSrc; ! int c=0; ! int norm16=0; ! ! // only for quick check ! int prevBoundary=src; ! int prevCC=0; ! ! for(;;) { ! // count code units below the minimum or with irrelevant data for the quick check ! for(prevSrc=src; src!=limit;) { ! if( (c=s.charAt(src))<minNoCP || ! isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ! ) { ! ++src; ! } else if(!UTF16.isSurrogate((char)c)) { ! break; } else { ! char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); ! } } ! if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { ! src+=Character.charCount(c); } else { ! break; ! } } } + // copy these code units all at once + if(src!=prevSrc) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); + } else { + prevCC=0; + prevBoundary=src; } } ! if(src==limit) { break; } ! // Check one above-minimum, relevant code point. ! src+=Character.charCount(c); ! if(buffer!=null) { ! decompose(c, norm16, buffer); ! } else { ! if(isDecompYes(norm16)) { ! int cc=getCCFromYesOrMaybe(norm16); ! if(prevCC<=cc || cc==0) { ! prevCC=cc; ! if(cc<=1) { ! prevBoundary=src; } ! continue; } } ! return prevBoundary; // "no" or cc out of order } } ! return src; } ! public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { ! int limit=s.length(); ! if(limit==0) { ! return; } + if(doDecompose) { + decompose(s, 0, limit, buffer); + return; } ! // Just merge the strings at the boundary. ! int c=Character.codePointAt(s, 0); ! int src=0; ! int firstCC, prevCC, cc; ! firstCC=prevCC=cc=getCC(getNorm16(c)); ! while(cc!=0) { ! prevCC=cc; ! src+=Character.charCount(c); ! if(src>=limit) { ! break; } ! c=Character.codePointAt(s, src); ! cc=getCC(getNorm16(c)); ! }; ! buffer.append(s, 0, src, firstCC, prevCC); ! buffer.append(s, src, limit); } ! // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. ! // doCompose: normalize ! // !doCompose: isNormalized (buffer must be empty and initialized) ! public boolean compose(CharSequence s, int src, int limit, ! boolean onlyContiguous, ! boolean doCompose, ! ReorderingBuffer buffer) { ! int minNoMaybeCP=minCompNoMaybeCP; ! ! /* ! * prevBoundary points to the last character before the current one ! * that has a composition boundary before it with ccc==0 and quick check "yes". ! * Keeping track of prevBoundary saves us looking for a composition boundary ! * when we find a "no" or "maybe". ! * ! * When we back out from prevSrc back to prevBoundary, ! * then we also remove those same characters (which had been simply copied ! * or canonically-order-inserted) from the ReorderingBuffer. ! * Therefore, at all times, the [prevBoundary..prevSrc[ source units ! * must correspond 1:1 to destination units at the end of the destination buffer. ! */ ! int prevBoundary=src; ! int prevSrc; ! int c=0; ! int norm16=0; ! // only for isNormalized ! int prevCC=0; ! for(;;) { ! // count code units below the minimum or with irrelevant data for the quick check ! for(prevSrc=src; src!=limit;) { ! if( (c=s.charAt(src))<minNoMaybeCP || ! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ! ) { ! ++src; ! } else if(!UTF16.isSurrogate((char)c)) { break; ! } else { ! char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); } } ! if(isCompYesAndZeroCC(norm16=getNorm16(c))) { ! src+=Character.charCount(c); ! } else { ! break; } } } + // copy these code units all at once + if(src!=prevSrc) { + if(src==limit) { + if(doCompose) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); } + break; } ! // Set prevBoundary to the last character in the quick check loop. ! prevBoundary=src-1; ! if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && ! Character.isHighSurrogate(s.charAt(prevBoundary-1)) ! ) { ! --prevBoundary; } ! if(doCompose) { ! // The last "quick check yes" character is excluded from the ! // flush-and-append call in case it needs to be modified. ! buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); ! buffer.append(s, prevBoundary, src); ! } else { ! prevCC=0; } ! // The start of the current character (c). ! prevSrc=src; ! } else if(src==limit) { ! break; } ! ! src+=Character.charCount(c); ! /* ! * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. ! * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) ! * or has ccc!=0. ! * Check for Jamo V/T, then for regular characters. ! * c is not a Hangul syllable or Jamo L because those have "yes" properties. ! */ ! if(isJamoVT(norm16) && prevBoundary!=prevSrc) { ! char prev=s.charAt(prevSrc-1); ! boolean needToDecompose=false; ! if(c<Hangul.JAMO_T_BASE) { ! // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. ! prev-=Hangul.JAMO_L_BASE; ! if(prev<Hangul.JAMO_L_COUNT) { ! if(!doCompose) { return false; } ! char syllable=(char) ! (Hangul.HANGUL_BASE+ ! (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* ! Hangul.JAMO_T_COUNT); ! char t; ! if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ! ++src; ! syllable+=t; // The next character was a Jamo T. ! prevBoundary=src; ! buffer.setLastChar(syllable); ! continue; } + // If we see L+V+x where x!=T then we drop to the slow path, + // decompose and recompose. + // This is to deal with NFKC finding normal L and V but a + // compatibility variant of a T. We need to either fully compose that + // combination here (which would complicate the code and may not work + // with strange custom data) or use the slow path -- or else our replacing + // two input characters (L+V) with one output character (LV syllable) + // would violate the invariant that [prevBoundary..prevSrc[ has the same + // length as what we appended to the buffer since prevBoundary. + needToDecompose=true; + } + } else if(Hangul.isHangulWithoutJamoT(prev)) { + // c is a Jamo Trailing consonant, + // compose with previous Hangul LV that does not contain a Jamo T. + if(!doCompose) { return false; } ! buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); ! prevBoundary=src; ! continue; } ! if(!needToDecompose) { ! // The Jamo V/T did not compose into a Hangul syllable. ! if(doCompose) { ! buffer.append((char)c); } else { prevCC=0; } ! continue; } } /* ! * Source buffer pointers: * * all done quick check current char not yet ! * "yes" but (c) processed * may combine * forward * [-------------[-------------[-------------[-------------[ * | | | | | ! * orig. src prevBoundary prevSrc src limit * * ! * Destination buffer pointers inside the ReorderingBuffer: * * all done might take not filled yet * characters for * reordering * [-------------[-------------[-------------[ * | | | | ! * start reorderStart limit | ! * +remainingCap.+ */ ! if(norm16>=MIN_YES_YES_WITH_CC) { ! int cc=norm16&0xff; // cc!=0 ! if( onlyContiguous && // FCC ! (doCompose ? buffer.getLastCC() : prevCC)==0 && ! prevBoundary<prevSrc && ! // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that ! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) ! // passed the quick check "yes && ccc==0" test. ! // Check whether the last character was a "yesYes" or a "yesNo". ! // If a "yesNo", then we get its trailing ccc from its ! // mapping and check for canonical order. ! // All other cases are ok. ! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ) { ! // Fails FCD test, need to decompose and contiguously recompose. ! if(!doCompose) { ! return false; } ! } else if(doCompose) { ! buffer.append(c, cc); ! continue; ! } else if(prevCC<=cc) { ! prevCC=cc; ! continue; } else { ! return false; ! } ! } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { ! return false; ! } /* ! * Find appropriate boundaries around this character, * decompose the source text from between the boundaries, ! * and recompose it. * ! * We may need to remove the last few characters from the ReorderingBuffer ! * to account for source text that was copied or appended ! * but needs to take part in the recomposition. */ ! /* ! * Find the last composition boundary in [prevBoundary..src[. ! * It is either the decomposition of the current character (at prevSrc), ! * or prevBoundary. */ ! if(hasCompBoundaryBefore(c, norm16)) { ! prevBoundary=prevSrc; ! } else if(doCompose) { ! buffer.removeSuffix(prevSrc-prevBoundary); } ! // Find the next composition boundary in [src..limit[ - ! // modifies src to point to the next starter. ! src=findNextCompBoundary(s, src, limit); ! // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. ! int recomposeStartIndex=buffer.length(); ! decomposeShort(s, prevBoundary, src, buffer); ! recompose(buffer, recomposeStartIndex, onlyContiguous); ! if(!doCompose) { ! if(!buffer.equals(s, prevBoundary, src)) { ! return false; ! } ! buffer.remove(); ! prevCC=0; } ! // Move to the next starter. We never need to look back before this point again. ! prevBoundary=src; } ! return true; } ! /** ! * Very similar to compose(): Make the same changes in both places if relevant. ! * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) ! * !doSpan: quickCheck ! * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and ! * bit 0: set if "maybe"; otherwise, if the span length&lt;s.length() ! * then the quick check result is "no" ! */ ! public int composeQuickCheck(CharSequence s, int src, int limit, ! boolean onlyContiguous, boolean doSpan) { ! int qcResult=0; ! int minNoMaybeCP=minCompNoMaybeCP; ! ! /* ! * prevBoundary points to the last character before the current one ! * that has a composition boundary before it with ccc==0 and quick check "yes". ! */ ! int prevBoundary=src; ! int prevSrc; ! int c=0; ! int norm16=0; ! int prevCC=0; ! ! for(;;) { ! // count code units below the minimum or with irrelevant data for the quick check ! for(prevSrc=src;;) { ! if(src==limit) { ! return (src<<1)|qcResult; // "yes" or "maybe" } + if( (c=s.charAt(src))<minNoMaybeCP || + isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) + ) { + ++src; + } else if(!UTF16.isSurrogate((char)c)) { + break; + } else { + char c2; + if(UTF16Plus.isSurrogateLead(c)) { + if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { + c=Character.toCodePoint((char)c, c2); } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); } } + if(isCompYesAndZeroCC(norm16=getNorm16(c))) { + src+=Character.charCount(c); } else { ! break; } } } ! if(src!=prevSrc) { ! // Set prevBoundary to the last character in the quick check loop. ! prevBoundary=src-1; ! if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && ! Character.isHighSurrogate(s.charAt(prevBoundary-1)) ! ) { ! --prevBoundary; ! } ! prevCC=0; ! // The start of the current character (c). ! prevSrc=src; } ! src+=Character.charCount(c); ! /* ! * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. ! * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) ! * or has ccc!=0. ! */ ! if(isMaybeOrNonZeroCC(norm16)) { ! int cc=getCCFromYesOrMaybe(norm16); ! if( onlyContiguous && // FCC ! cc!=0 && ! prevCC==0 && ! prevBoundary<prevSrc && ! // prevCC==0 && prevBoundary<prevSrc tell us that ! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) ! // passed the quick check "yes && ccc==0" test. ! // Check whether the last character was a "yesYes" or a "yesNo". ! // If a "yesNo", then we get its trailing ccc from its ! // mapping and check for canonical order. ! // All other cases are ok. ! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ! ) { ! // Fails FCD test. ! } else if(prevCC<=cc || cc==0) { ! prevCC=cc; ! if(norm16<MIN_YES_YES_WITH_CC) { ! if(!doSpan) { ! qcResult=1; } else { ! return prevBoundary<<1; // spanYes does not care to know it's "maybe" } } ! continue; ! } ! } ! return prevBoundary<<1; // "no" } } ! public void composeAndAppend(CharSequence s, ! boolean doCompose, ! boolean onlyContiguous, ! ReorderingBuffer buffer) { ! int src=0, limit=s.length(); ! if(!buffer.isEmpty()) { ! int firstStarterInSrc=findNextCompBoundary(s, 0, limit); ! if(0!=firstStarterInSrc) { ! int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), ! buffer.length()); ! StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ ! firstStarterInSrc+16); ! middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); ! buffer.removeSuffix(buffer.length()-lastStarterInDest); ! middle.append(s, 0, firstStarterInSrc); ! compose(middle, 0, middle.length(), onlyContiguous, true, buffer); ! src=firstStarterInSrc; ! } ! } ! if(doCompose) { ! compose(s, src, limit, onlyContiguous, true, buffer); ! } else { ! buffer.append(s, src, limit); ! } ! } ! // Dual functionality: ! // buffer!=NULL: normalize ! // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes ! public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { ! // Note: In this function we use buffer->appendZeroCC() because we track ! // the lead and trail combining classes here, rather than leaving it to ! // the ReorderingBuffer. ! // The exception is the call to decomposeShort() which uses the buffer ! // in the normal way. ! ! // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. ! // Similar to the prevBoundary in the compose() implementation. ! int prevBoundary=src; ! int prevSrc; ! int c=0; ! int prevFCD16=0; ! int fcd16=0; ! for(;;) { ! // count code units with lccc==0 ! for(prevSrc=src; src!=limit;) { ! if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { ! prevFCD16=~c; ! ++src; ! } else if(!singleLeadMightHaveNonZeroFCD16(c)) { ! prevFCD16=0; ! ++src; ! } else { ! if(UTF16.isSurrogate((char)c)) { ! char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { ! c=Character.toCodePoint((char)c, c2); } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); } } } ! if((fcd16=getFCD16FromNormData(c))<=0xff) { ! prevFCD16=fcd16; ! src+=Character.charCount(c); ! } else { ! break; } } } ! // copy these code units all at once ! if(src!=prevSrc) { ! if(src==limit) { ! if(buffer!=null) { ! buffer.flushAndAppendZeroCC(s, prevSrc, src); } ! break; } + prevBoundary=src; + // We know that the previous character's lccc==0. + if(prevFCD16<0) { + // Fetching the fcd16 value was deferred for this below-U+0300 code point. + int prev=~prevFCD16; + prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); + if(prevFCD16>1) { + --prevBoundary; + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && + Character.isHighSurrogate(s.charAt(p-1)) + ) { + --p; + // Need to fetch the previous character's FCD value because + // prevFCD16 was just for the trail surrogate code point. + prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); + // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } ! if(prevFCD16>1) { ! prevBoundary=p; } } ! if(buffer!=null) { ! // The last lccc==0 character is excluded from the ! // flush-and-append call in case it needs to be modified. ! buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); ! buffer.append(s, prevBoundary, src); } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; } ! src+=Character.charCount(c); ! // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. ! // Check for proper order, and decompose locally if necessary. ! if((prevFCD16&0xff)<=(fcd16>>8)) { ! // proper order: prev tccc <= current lccc ! if((fcd16&0xff)<=1) { ! prevBoundary=src; } ! if(buffer!=null) { ! buffer.appendZeroCC(c); ! } ! prevFCD16=fcd16; ! continue; ! } else if(buffer==null) { ! return prevBoundary; // quick check "no" ! } else { ! /* ! * Back out the part of the source that we copied or appended ! * already but is now going to be decomposed. ! * prevSrc is set to after what was copied/appended. ! */ ! buffer.removeSuffix(prevSrc-prevBoundary); ! /* ! * Find the part of the source that needs to be decomposed, ! * up to the next safe boundary. ! */ ! src=findNextFCDBoundary(s, src, limit); ! /* ! * The source text does not fulfill the conditions for FCD. ! * Decompose and reorder a limited piece of the text. ! */ ! decomposeShort(s, prevBoundary, src, buffer); ! prevBoundary=src; ! prevFCD16=0; ! } ! } ! return src; } + // Note: hasDecompBoundary() could be implemented as aliases to + // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() + // at the cost of building the FCD trie for a decomposition normalizer. + public boolean hasDecompBoundary(int c, boolean before) { for(;;) { ! if(c<minDecompNoCP) { ! return true; ! } ! int norm16=getNorm16(c); ! if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { ! return true; ! } else if(norm16>MIN_NORMAL_MAYBE_YES) { ! return false; // ccc!=0 ! } else if(isDecompNoAlgorithmic(norm16)) { ! c=mapAlgorithmic(c, norm16); ! } else { ! // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! return false; ! } ! if(!before) { ! // decomp after-boundary: same as hasFCDBoundaryAfter(), ! // fcd16<=1 || trailCC==0 ! if(firstUnit>0x1ff) { ! return false; // trailCC>1 ! } ! if(firstUnit<=0xff) { ! return true; // trailCC==0 ! } ! // if(trailCC==1) test leadCC==0, same as checking for before-boundary ! } ! // true if leadCC==0 (hasFCDBoundaryBefore()) ! return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; } } } ! public boolean hasCompBoundaryBefore(int c) { ! return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); ! } ! ! private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } ! private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } ! private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } ! private boolean isHangul(int norm16) { return norm16==minYesNo; } ! private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } ! ! // UBool isCompYes(uint16_t norm16) const { ! // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; ! // } ! // UBool isCompYesOrMaybe(uint16_t norm16) const { ! // return norm16<minNoNo || minMaybeYes<=norm16; ! // } ! // private boolean hasZeroCCFromDecompYes(int norm16) { ! // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; ! // } ! private boolean isDecompYesAndZeroCC(int norm16) { ! return norm16<minYesNo || ! norm16==JAMO_VT || ! (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); ! } ! /** ! * A little faster and simpler than isDecompYesAndZeroCC() but does not include ! * the MaybeYes which combine-forward and have ccc=0. ! * (Standard Unicode 5.2 normalization does not have such characters.) ! */ ! private boolean isMostDecompYesAndZeroCC(int norm16) { ! return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; ! } ! ! private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } ! ! // For use with isCompYes(). ! // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. ! // static uint8_t getCCFromYes(uint16_t norm16) { ! // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; ! // } ! private int getCCFromNoNo(int norm16) { ! if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! return extraData.charAt(norm16-1)&0xff; } else { ! return 0; ! } } ! // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() ! int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { ! int c; ! if(cpStart==(cpLimit-1)) { ! c=s.charAt(cpStart); ! } else { ! c=Character.codePointAt(s, cpStart); ! } ! int prevNorm16=getNorm16(c); ! if(prevNorm16<=minYesNo) { ! return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 } else { ! return extraData.charAt(prevNorm16)>>8; // tccc from yesNo } } ! // Requires algorithmic-NoNo. ! private int mapAlgorithmic(int c, int norm16) { ! return c+norm16-(minMaybeYes-MAX_DELTA-1); } ! // Requires minYesNo<norm16<limitNoNo. ! // private int getMapping(int norm16) { return /*extraData+*/norm16; } /** ! * @return index into maybeYesCompositions, or -1 */ ! private int getCompositionsListForDecompYes(int norm16) { ! if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { ! return -1; ! } else { ! if((norm16-=minMaybeYes)<0) { ! // norm16<minMaybeYes: index into extraData which is a substring at ! // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] ! // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 ! norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list ! } ! return norm16; ! } ! } ! /** ! * @return index into maybeYesCompositions */ + private int getCompositionsListForComposite(int norm16) { + // composite has both mapping & compositions list + int firstUnit=extraData.charAt(norm16); + return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions + 1+ // +1 to skip the first unit with the mapping lenth + (firstUnit&MAPPING_LENGTH_MASK); // + mapping length + } + + // Decompose a short piece of text which is likely to contain characters that + // fail the quick check loop and/or where the quick check loop's overhead + // is unlikely to be amortized. + // Called by the compose() and makeFCD() implementations. + // Public in Java for collation implementation code. + public void decomposeShort(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + while(src<limit) { + int c=Character.codePointAt(s, src); + src+=Character.charCount(c); + decompose(c, getNorm16(c), buffer); + } + } ! private void decompose(int c, int norm16, ! ReorderingBuffer buffer) { ! // Only loops for 1:1 algorithmic mappings. ! for(;;) { ! // get the decomposition and the lead and trail cc's ! if(isDecompYes(norm16)) { ! // c does not decompose ! buffer.append(c, getCCFromYesOrMaybe(norm16)); ! } else if(isHangul(norm16)) { ! // Hangul syllable: decompose algorithmically ! Hangul.decompose(c, buffer); ! } else if(isDecompNoAlgorithmic(norm16)) { ! c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); ! continue; ! } else { ! // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! int length=firstUnit&MAPPING_LENGTH_MASK; ! int leadCC, trailCC; ! trailCC=firstUnit>>8; ! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { ! leadCC=extraData.charAt(norm16-1)>>8; ! } else { ! leadCC=0; ! } ! ++norm16; // skip over the firstUnit ! buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); ! } ! return; ! } ! } ! /** ! * Finds the recomposition result for ! * a forward-combining "lead" character, ! * specified with a pointer to its compositions list, ! * and a backward-combining "trail" character. ! * ! * <p>If the lead and trail characters combine, then this function returns ! * the following "compositeAndFwd" value: ! * <pre> ! * Bits 21..1 composite character ! * Bit 0 set if the composite is a forward-combining starter ! * </pre> ! * otherwise it returns -1. ! * ! * <p>The compositions list has (trail, compositeAndFwd) pair entries, ! * encoded as either pairs or triples of 16-bit units. ! * The last entry has the high bit of its first unit set. ! * ! * <p>The list is sorted by ascending trail characters (there are no duplicates). ! * A linear search is used. ! * ! * <p>See normalizer2impl.h for a more detailed description ! * of the compositions list format. ! */ ! private static int combine(String compositions, int list, int trail) { ! int key1, firstUnit; ! if(trail<COMP_1_TRAIL_LIMIT) { ! // trail character is 0..33FF ! // result entry may have 2 or 3 units ! key1=(trail<<1); ! while(key1>(firstUnit=compositions.charAt(list))) { ! list+=2+(firstUnit&COMP_1_TRIPLE); ! } ! if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { ! if((firstUnit&COMP_1_TRIPLE)!=0) { ! return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); ! } else { ! return compositions.charAt(list+1); ! } ! } ! } else { ! // trail character is 3400..10FFFF ! // result entry has 3 units ! key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); ! int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; ! int secondUnit; ! for(;;) { ! if(key1>(firstUnit=compositions.charAt(list))) { ! list+=2+(firstUnit&COMP_1_TRIPLE); ! } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { ! if(key2>(secondUnit=compositions.charAt(list+1))) { ! if((firstUnit&COMP_1_LAST_TUPLE)!=0) { ! break; ! } else { ! list+=3; ! } ! } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { ! return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); ! } else { ! break; ! } ! } else { ! break; ! } ! } ! } ! return -1; ! } /* ! * Recomposes the buffer text starting at recomposeStartIndex ! * (which is in NFD - decomposed and canonically ordered), ! * and truncates the buffer contents. * ! * Note that recomposition never lengthens the text: ! * Any character consists of either one or two code units; ! * a composition may contain at most one more code unit than the original starter, ! * while the combining mark that is removed has at least one code unit. ! */ ! private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, ! boolean onlyContiguous) { ! StringBuilder sb=buffer.getStringBuilder(); ! int p=recomposeStartIndex; ! if(p==sb.length()) { ! return; ! } ! ! int starter, pRemove; ! int compositionsList; ! int c, compositeAndFwd; ! int norm16; ! int cc, prevCC; ! boolean starterIsSupplementary; ! ! // Some of the following variables are not used until we have a forward-combining starter ! // and are only initialized now to avoid compiler warnings. ! compositionsList=-1; // used as indicator for whether we have a forward-combining starter ! starter=-1; ! starterIsSupplementary=false; ! prevCC=0; ! for(;;) { ! c=sb.codePointAt(p); ! p+=Character.charCount(c); ! norm16=getNorm16(c); ! cc=getCCFromYesOrMaybe(norm16); ! if( // this character combines backward and ! isMaybe(norm16) && ! // we have seen a starter that combines forward and ! compositionsList>=0 && ! // the backward-combining character is not blocked ! (prevCC<cc || prevCC==0)) { ! if(isJamoVT(norm16)) { ! // c is a Jamo V/T, see if we can compose it with the previous character. ! if(c<Hangul.JAMO_T_BASE) { ! // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. ! char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); ! if(prev<Hangul.JAMO_L_COUNT) { ! pRemove=p-1; ! char syllable=(char) ! (Hangul.HANGUL_BASE+ ! (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* ! Hangul.JAMO_T_COUNT); ! char t; ! if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ! ++p; ! syllable+=t; // The next character was a Jamo T. ! } ! sb.setCharAt(starter, syllable); ! // remove the Jamo V/T ! sb.delete(pRemove, p); ! p=pRemove; } } ! /* ! * No "else" for Jamo T: ! * Since the input is in NFD, there are no Hangul LV syllables that ! * a Jamo T could combine with. ! * All Jamo Ts are combined above when handling Jamo Vs. ! */ ! if(p==sb.length()) { ! break; } ! compositionsList=-1; ! continue; ! } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { ! // The starter and the combining mark (c) do combine. ! int composite=compositeAndFwd>>1; ! ! // Remove the combining mark. ! pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark ! sb.delete(pRemove, p); ! p=pRemove; ! // Replace the starter with the composite. ! if(starterIsSupplementary) { ! if(composite>0xffff) { ! // both are supplementary ! sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); ! sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); ! } else { ! sb.setCharAt(starter, (char)c); ! sb.deleteCharAt(starter+1); ! // The composite is shorter than the starter, ! // move the intermediate characters forward one. ! starterIsSupplementary=false; ! --p; } + } else if(composite>0xffff) { + // The composite is longer than the starter, + // move the intermediate characters back one. + starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); } ! // Keep prevCC because we removed the combining mark. + if(p==sb.length()) { + break; } ! // Is the composite a starter that combines forward? ! if((compositeAndFwd&1)!=0) { ! compositionsList= ! getCompositionsListForComposite(getNorm16(composite)); ! } else { ! compositionsList=-1; } ! // We combined; continue with looking for compositions. ! continue; ! } } ! // no combination this time ! prevCC=cc; ! if(p==sb.length()) { break; } ! // If c did not combine, then check if it is a starter. ! if(cc==0) { ! // Found a new starter. ! if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { ! // It may combine with something, prepare for it. ! if(c<=0xffff) { ! starterIsSupplementary=false; ! starter=p-1; ! } else { ! starterIsSupplementary=true; ! starter=p-2; } } ! } else if(onlyContiguous) { ! // FCC: no discontiguous compositions; any intervening character blocks. ! compositionsList=-1; } } ! buffer.flush(); } ! /** ! * Does c have a composition boundary before it? ! * True if its decomposition begins with a character that has ! * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). ! * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes ! * (isCompYesAndZeroCC()) so we need not decompose. ! */ ! private boolean hasCompBoundaryBefore(int c, int norm16) { ! for(;;) { ! if(isCompYesAndZeroCC(norm16)) { ! return true; ! } else if(isMaybeOrNonZeroCC(norm16)) { ! return false; ! } else if(isDecompNoAlgorithmic(norm16)) { ! c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); ! } else { ! // c decomposes, get everything from the variable-length extra data ! int firstUnit=extraData.charAt(norm16); ! if((firstUnit&MAPPING_LENGTH_MASK)==0) { ! return false; } ! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { ! return false; // non-zero leadCC } ! return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); } } } ! private int findPreviousCompBoundary(CharSequence s, int p) { ! while(p>0) { ! int c=Character.codePointBefore(s, p); ! p-=Character.charCount(c); ! if(hasCompBoundaryBefore(c)) { ! break; } + // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, + // but that's probably not worth the extra cost. } ! return p; } ! private int findNextCompBoundary(CharSequence s, int p, int limit) { ! while(p<limit) { ! int c=Character.codePointAt(s, p); ! int norm16=normTrie.get(c); ! if(hasCompBoundaryBefore(c, norm16)) { ! break; ! } ! p+=Character.charCount(c); ! } ! return p; } ! private int findNextFCDBoundary(CharSequence s, int p, int limit) { ! while(p<limit) { ! int c=Character.codePointAt(s, p); ! if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { ! break; ! } ! p+=Character.charCount(c); ! } ! return p; ! } /** * Get the canonical decomposition * sherman for ComposedCharIter */ public static int getDecompose(int chars[], String decomps[]) { ! Normalizer2 impl = Normalizer2.getNFDInstance(); ! int length=0; ! int norm16 = 0; int ch = -1; int i = 0; while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff //TBD !!!! the hack code heres save us about 50ms for startup //need a better solution/lookup if (ch == 0x30ff) ch = 0xf900; ! else if (ch == 0x115bc) ch = 0x1d15e; else if (ch == 0x1d1c1) ch = 0x2f800; ! String s = impl.getDecomposition(ch); ! ! if(s != null && i < chars.length) { chars[i] = ch; ! decomps[i++] = s; } } return i; } //------------------------------------------------------ ! // special method for Collation (RBTableBuilder.build()) //------------------------------------------------------ private static boolean needSingleQuotation(char c) { return (c >= 0x0009 && c <= 0x000D) || (c >= 0x0020 && c <= 0x002F) || (c >= 0x003A && c <= 0x0040) || (c >= 0x005B && c <= 0x0060) || (c >= 0x007B && c <= 0x007E); } public static String canonicalDecomposeWithSingleQuotation(String string) { + Normalizer2 impl = Normalizer2.getNFDInstance(); char[] src = string.toCharArray(); int srcIndex = 0; int srcLimit = src.length; char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 int destIndex = 0; int destLimit = dest.length; int prevSrc; ! String norm; int reorderStartIndex, length; ! char c1, c2; ! int cp; ! int minNoMaybe = 0x00c0; int cc, prevCC, trailCC; char[] p; int pStart; // initialize reorderStartIndex = 0; prevCC = 0; ! norm = null; ! cp = 0; pStart = 0; cc = trailCC = -1; // initialize to bogus value ! c1 = 0; ! for (;;) { prevSrc=srcIndex; //quick check (1)less than minNoMaybe (2)no decomp (3)hangual while (srcIndex != srcLimit && ! ((c1 = src[srcIndex]) < minNoMaybe || ! (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || ! (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables prevCC = 0; ! srcIndex += (cp < 0x10000) ? 1 : 2; } // copy these code units all at once if (srcIndex != prevSrc) { length = srcIndex - prevSrc;
*** 2554,2604 **** destIndex += length; reorderStartIndex = destIndex; } // end of source reached? ! if(srcIndex == srcLimit) { break; } - // c already contains *src and norm32 is set for it, increment src - ++srcIndex; ! if(isNorm32Regular(norm32)) { c2 = 0; length = 1; } else { - // c is a lead surrogate, get the real norm32 - if(srcIndex != srcLimit && - Character.isLowSurrogate(c2 = src[srcIndex])) { - ++srcIndex; length = 2; ! norm32 = getNorm32FromSurrogatePair(norm32, c2); ! } else { ! c2 = 0; ! length = 1; ! norm32 = 0; ! } } // get the decomposition and the lead and trail cc's ! if((norm32 & qcMask) == 0) { ! // c does not decompose ! cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT)); p = null; pStart = -1; } else { ! DecomposeArgs arg = new DecomposeArgs(); ! // c decomposes, get everything from the variable-length ! // extra data ! pStart = decompose(norm32, qcMask, arg); ! p = extraData; ! length = arg.length; ! cc = arg.cc; ! trailCC = arg.trailCC; ! if(length == 1) { // fastpath a single code unit from decomposition ! c = p[pStart]; c2 = 0; p = null; pStart = -1; } } --- 1804,1850 ---- destIndex += length; reorderStartIndex = destIndex; } // end of source reached? ! if (srcIndex == srcLimit) { break; } ! // cp already contains *src and norm32 is set for it, increment src ! srcIndex += (cp < 0x10000) ? 1 : 2; ! ! if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { c2 = 0; length = 1; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; + } } else { length = 2; ! c2 = src[srcIndex-1]; } // get the decomposition and the lead and trail cc's ! if (norm == null) { ! // cp does not decompose ! cc = trailCC = UCharacter.getCombiningClass(cp); p = null; pStart = -1; } else { ! ! pStart = 0; ! p = norm.toCharArray(); ! length = p.length; ! int cpNum = norm.codePointCount(0, length); ! cc= UCharacter.getCombiningClass(norm.codePointAt(0)); ! trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); ! if (length == 1) { // fastpath a single code unit from decomposition ! c1 = p[pStart]; c2 = 0; p = null; pStart = -1; } }
*** 2608,2638 **** char[] tmpBuf = new char[destLimit * 2]; System.arraycopy(dest, 0, tmpBuf, 0, destIndex); dest = tmpBuf; destLimit = dest.length; } // append the decomposition to the destination buffer, assume length>0 { int reorderSplit = destIndex; ! if(p == null) { // fastpath: single code point ! if (needSingleQuotation(c)) { //if we need single quotation, no need to consider "prevCC" //and it must NOT be a supplementary pair dest[destIndex++] = '\''; ! dest[destIndex++] = c; dest[destIndex++] = '\''; trailCC = 0; } else if(cc != 0 && cc < prevCC) { ! // (c, c2) is out of order with respect to the preceding // text destIndex += length; ! trailCC = insertOrdered(dest,reorderStartIndex, ! reorderSplit, destIndex, c, c2, cc); } else { ! // just append (c, c2) ! dest[destIndex++] = c; if(c2 != 0) { dest[destIndex++] = c2; } } } else { --- 1854,1885 ---- char[] tmpBuf = new char[destLimit * 2]; System.arraycopy(dest, 0, tmpBuf, 0, destIndex); dest = tmpBuf; destLimit = dest.length; } + // append the decomposition to the destination buffer, assume length>0 { int reorderSplit = destIndex; ! if (p == null) { // fastpath: single code point ! if (needSingleQuotation(c1)) { //if we need single quotation, no need to consider "prevCC" //and it must NOT be a supplementary pair dest[destIndex++] = '\''; ! dest[destIndex++] = c1; dest[destIndex++] = '\''; trailCC = 0; } else if(cc != 0 && cc < prevCC) { ! // (c1, c2) is out of order with respect to the preceding // text destIndex += length; ! trailCC = insertOrdered(dest, reorderStartIndex, ! reorderSplit, destIndex, c1, c2, cc); } else { ! // just append (c1, c2) ! dest[destIndex++] = c1; if(c2 != 0) { dest[destIndex++] = c2; } } } else {
*** 2644,2736 **** dest[destIndex++] = '\''; length--; do { dest[destIndex++] = p[pStart++]; } while(--length > 0); ! } else ! if(cc != 0 && cc < prevCC) { destIndex += length; ! trailCC = mergeOrdered(dest,reorderStartIndex, ! reorderSplit,p, pStart,pStart+length); } else { // just append the decomposition do { dest[destIndex++] = p[pStart++]; ! } while(--length > 0); } } } prevCC = trailCC; if(prevCC == 0) { reorderStartIndex = destIndex; } } return new String(dest, 0, destIndex); } ! //------------------------------------------------------ ! // mapping method for IDNA/StringPrep ! //------------------------------------------------------ ! ! /* ! * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode ! * 3.2 normalization with Corrigendum 4 corrections. However, normalization ! * without the corrections is necessary for IDNA/StringPrep support. ! * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option ! * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five ! * characters in Corrigendum 4 before normalization in order to avoid ! * incorrect normalization. ! * For the Corrigendum 4 issue, refer ! * http://www.unicode.org/versions/corrigendum4.html */ ! /* ! * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL. ! */ ! public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000; ! private static final char[][] corrigendum4MappingTable = { ! {'\uD844', '\uDF6A'}, // 0x2F868 ! {'\u5F33'}, // 0x2F874 ! {'\u43AB'}, // 0x2F91F ! {'\u7AAE'}, // 0x2F95F ! {'\u4D57'}}; // 0x2F9BF ! /* ! * Removing Corrigendum 4 fix ! * @return normalized text ! */ ! public static String convert(String str) { ! if (str == null) { ! return null; } ! int ch = UCharacterIterator.DONE; ! StringBuffer dest = new StringBuffer(); ! UCharacterIterator iter = UCharacterIterator.getInstance(str); ! while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ ! switch (ch) { ! case 0x2F868: ! dest.append(corrigendum4MappingTable[0]); ! break; ! case 0x2F874: ! dest.append(corrigendum4MappingTable[1]); ! break; ! case 0x2F91F: ! dest.append(corrigendum4MappingTable[2]); ! break; ! case 0x2F95F: ! dest.append(corrigendum4MappingTable[3]); ! break; ! case 0x2F9BF: ! dest.append(corrigendum4MappingTable[4]); ! break; ! default: ! UTF16.append(dest,ch); break; } } ! return dest.toString(); } } --- 1891,2155 ---- dest[destIndex++] = '\''; length--; do { dest[destIndex++] = p[pStart++]; } while(--length > 0); ! } else if (cc != 0 && cc < prevCC) { destIndex += length; ! trailCC = mergeOrdered(dest, reorderStartIndex, ! reorderSplit, p, pStart, ! pStart+length); } else { // just append the decomposition do { dest[destIndex++] = p[pStart++]; ! } while (--length > 0); } } } prevCC = trailCC; if(prevCC == 0) { reorderStartIndex = destIndex; } } + return new String(dest, 0, destIndex); } ! /** ! * simpler, single-character version of mergeOrdered() - ! * bubble-insert one single code point into the preceding string ! * which is already canonically ordered ! * (c, c2) may or may not yet have been inserted at src[current]..src[p] ! * ! * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) ! * ! * before: src[start]..src[current] is already ordered, and ! * src[current]..src[p] may or may not hold (c, c2) but ! * must be exactly the same length as (c, c2) ! * after: src[start]..src[p] is ordered ! * ! * @return the trailing combining class */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; ! if (start<current && cc!=0) { ! // search for the insertion point where cc>=prevCC ! preBack=back=current; ! ! PrevArgs prevArgs = new PrevArgs(); ! prevArgs.current = current; ! prevArgs.start = start; ! prevArgs.src = source; ! prevArgs.c1 = c1; ! prevArgs.c2 = c2; ! // get the prevCC ! prevCC=getPrevCC(prevArgs); ! preBack = prevArgs.current; ! if(cc<prevCC) { ! // this will be the last code point, so keep its cc ! trailCC=prevCC; ! back=preBack; ! while(start<preBack) { ! prevCC=getPrevCC(prevArgs); ! preBack=prevArgs.current; ! if(cc>=prevCC) { ! break; ! } ! back=preBack; } ! // this is where we are right now with all these indicies: ! // [start]..[pPreBack] 0..? code points that we can ignore ! // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc ! // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) ! // [current]..[p] 1 code point (c, c2) with cc ! // move the code units in between up ! r=p; ! do { ! source[--r]=source[--current]; ! } while (back!=current); ! } ! } ! ! // insert (c1, c2) ! source[current] = c1; ! if (c2!=0) { ! source[(current+1)] = c2; ! } ! ! // we know the cc of the last code point ! return trailCC; ! } ! ! /** ! * merge two UTF-16 string parts together ! * to canonically order (order by combining classes) their concatenation ! * ! * the two strings may already be adjacent, so that the merging is done ! * in-place if the two strings are not adjacent, then the buffer holding the ! * first one must be large enough ! * the second string may or may not be ordered in itself ! * ! * before: [start]..[current] is already ordered, and ! * [next]..[limit] may be ordered in itself, but ! * is not in relation to [start..current[ ! * after: [start..current+(limit-next)[ is ordered ! * ! * the algorithm is a simple bubble-sort that takes the characters from ! * src[next++] and inserts them in correct combining class order into the ! * preceding part of the string ! * ! * since this function is called much less often than the single-code point ! * insertOrdered(), it just uses that for easier maintenance ! * ! * @return the trailing combining class ! */ ! private static int /*unsigned byte*/ mergeOrdered(char[] source, ! int start, ! int current, ! char[] data, ! int next, ! int limit) { ! int r; ! int /*unsigned byte*/ cc, trailCC=0; ! boolean adjacent; ! ! adjacent= current==next; ! NextCCArgs ncArgs = new NextCCArgs(); ! ncArgs.source = data; ! ncArgs.next = next; ! ncArgs.limit = limit; ! ! if(start!=current) { ! ! while(ncArgs.next<ncArgs.limit) { ! cc=getNextCC(ncArgs); ! if(cc==0) { ! // does not bubble back ! trailCC=0; ! if(adjacent) { ! current=ncArgs.next; ! } else { ! data[current++]=ncArgs.c1; ! if(ncArgs.c2!=0) { ! data[current++]=ncArgs.c2; ! } ! } break; + } else { + r=current+(ncArgs.c2==0 ? 1 : 2); + trailCC=insertOrdered(source,start, current, r, + ncArgs.c1, ncArgs.c2, cc); + current=r; } } + } + + if(ncArgs.next==ncArgs.limit) { + // we know the cc of the last code point + return trailCC; + } else { + if(!adjacent) { + // copy the second string part + do { + source[current++]=data[ncArgs.next++]; + } while(ncArgs.next!=ncArgs.limit); + ncArgs.limit=current; + } + PrevArgs prevArgs = new PrevArgs(); + prevArgs.src = data; + prevArgs.start = start; + prevArgs.current = ncArgs.limit; + return getPrevCC(prevArgs); + } + + } + + private static final class PrevArgs{ + char[] src; + int start; + int current; + char c1; + char c2; + } + + private static final class NextCCArgs{ + char[] source; + int next; + int limit; + char c1; + char c2; + } + + private static int /*unsigned*/ getPrevCC(PrevArgs args) { + args.c1=args.src[--args.current]; + args.c2=0; + + if (args.c1 < MIN_CCC_LCCC_CP) { + return 0; + } else if (UTF16.isLeadSurrogate(args.c1)) { + /* unpaired first surrogate */ + return 0; + } else if (!UTF16.isTrailSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.current!=args.start && + UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { + --args.current; + return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); + } else { + /* unpaired second surrogate */ + args.c2=0; + return 0; + } + } + + private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { + args.c1=args.source[args.next++]; + args.c2=0; ! if (UTF16.isTrailSurrogate(args.c1)) { ! /* unpaired second surrogate */ ! return 0; ! } else if (!UTF16.isLeadSurrogate(args.c1)) { ! return UCharacter.getCombiningClass(args.c1); ! } else if (args.next!=args.limit && ! UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ ! ++args.next; ! return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); ! } else { ! /* unpaired first surrogate */ ! args.c2=0; ! return 0; } + } + + private VersionInfo dataVersion; + + // Code point thresholds for quick check codes. + private int minDecompNoCP; + private int minCompNoMaybeCP; + + // Norm16 value thresholds for quick check combinations and types of extra data. + private int minYesNo; + private int minYesNoMappingsOnly; + private int minNoNo; + private int limitNoNo; + private int minMaybeYes; + + private Trie2_16 normTrie; + private String maybeYesCompositions; + private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters + private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 + private int[] tccc180; // [0x180] tccc values for U+0000..U+017F + }
< prev index next >