--- old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:50.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:49.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,2453 +22,1706 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.BufferedInputStream; -import java.io.InputStream; +import java.nio.ByteBuffer; +import java.text.Normalizer; -/** - * @author Ram Viswanadha - */ +// Original filename in ICU4J: Normalizer2Impl.java public final class NormalizerImpl { - // Static block for the class to initialize its own self - static final NormalizerImpl IMPL; - - static - { - try - { - IMPL = new NormalizerImpl(); - } - catch (Exception e) - { - throw new RuntimeException(e.getMessage()); - } - } - - static final int UNSIGNED_BYTE_MASK =0xFF; - static final long UNSIGNED_INT_MASK = 0xffffffffL; - /* - * This new implementation of the normalization code loads its data from - * unorm.icu, which is generated with the gennorm tool. - * The format of that file is described at the end of this file. - */ - private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu"; - - // norm32 value constants - - // quick check flags 0..3 set mean "no" for their forms - public static final int QC_NFC=0x11; /* no|maybe */ - public static final int QC_NFKC=0x22; /* no|maybe */ - public static final int QC_NFD=4; /* no */ - public static final int QC_NFKD=8; /* no */ - - public static final int QC_ANY_NO=0xf; - - /* quick check flags 4..5 mean "maybe" for their forms; - * test flags>=QC_MAYBE - */ - public static final int QC_MAYBE=0x10; - public static final int QC_ANY_MAYBE=0x30; - - public static final int QC_MASK=0x3f; - - private static final int COMBINES_FWD=0x40; - private static final int COMBINES_BACK=0x80; - public static final int COMBINES_ANY=0xc0; - // UnicodeData.txt combining class in bits 15. - private static final int CC_SHIFT=8; - public static final int CC_MASK=0xff00; - // 16 bits for the index to UChars and other extra data - private static final int EXTRA_SHIFT=16; - - /* norm32 value constants using >16 bits */ - private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; - private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; - private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; -// private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK; - private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; - - - /* indexes[] value names */ - /* number of bytes in normalization trie */ - static final int INDEX_TRIE_SIZE = 0; - /* number of chars in extra data */ - static final int INDEX_CHAR_COUNT = 1; - /* number of uint16_t words for combining data */ - static final int INDEX_COMBINE_DATA_COUNT = 2; - /* first code point with quick check NFC NO/MAYBE */ - public static final int INDEX_MIN_NFC_NO_MAYBE = 6; - /* first code point with quick check NFKC NO/MAYBE */ - public static final int INDEX_MIN_NFKC_NO_MAYBE = 7; - /* first code point with quick check NFD NO/MAYBE */ - public static final int INDEX_MIN_NFD_NO_MAYBE = 8; - /* first code point with quick check NFKD NO/MAYBE */ - public static final int INDEX_MIN_NFKD_NO_MAYBE = 9; - /* number of bytes in FCD trie */ - static final int INDEX_FCD_TRIE_SIZE = 10; - /* number of bytes in the auxiliary trie */ - static final int INDEX_AUX_TRIE_SIZE = 11; - /* changing this requires a new formatVersion */ - static final int INDEX_TOP = 32; - - - /* AUX constants */ - /* value constants for auxTrie */ - private static final int AUX_UNSAFE_SHIFT = 11; - private static final int AUX_COMP_EX_SHIFT = 10; - private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12; - - private static final int AUX_MAX_FNC = 1<>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))& - (0x3ff<>Trie.INDEX_STAGE_1_SHIFT_; - /** Number of bits of a trail surrogate that are used in index table - * lookups. - */ - private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_; - - - // public utility - public static int getFromIndexesArr(int index){ - return indexes[index]; - } - - // protected constructor --------------------------------------------- - - /** - * Constructor - * @exception thrown when data reading fails or data corrupted - */ - private NormalizerImpl() throws IOException { - //data should be loaded only once - if(!isDataLoaded){ - - // jar access - InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME); - BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE); - NormalizerDataReader reader = new NormalizerDataReader(b); - - // read the indexes - indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP); - - byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]]; - - int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT]; - combiningTable = new char[combiningTableTop]; - - int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT]; - extraData = new char[extraDataTop]; - - byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]]; - byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]]; - - fcdTrieImpl = new FCDTrieImpl(); - normTrieImpl = new NormTrieImpl(); - auxTrieImpl = new AuxTrieImpl(); - - // load the rest of the data data and initialize the data members - reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable); - - NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl ); - FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl ); - AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl ); - - // we reached here without any exceptions so the data is fully - // loaded set the variable to true - isDataLoaded = true; - - // get the data format version - byte[] formatVersion = reader.getDataFormatVersion(); - - isFormatVersion_2_1 =( formatVersion[0]>2 - || - (formatVersion[0]==2 && formatVersion[1]>=1) - ); - isFormatVersion_2_2 =( formatVersion[0]>2 - || - (formatVersion[0]==2 && formatVersion[1]>=2) - ); - unicodeVersion = reader.getUnicodeVersion(); - b.close(); - } - } - - /* ---------------------------------------------------------------------- */ - - /* Korean Hangul and Jamo constants */ - - public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ - public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ - public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ - - public static final int HANGUL_BASE=0xac00; - - public static final int JAMO_L_COUNT=19; - public static final int JAMO_V_COUNT=21; - public static final int JAMO_T_COUNT=28; - public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; - - private static boolean isHangulWithoutJamoT(char c) { - c-=HANGUL_BASE; - return c=MIN_HANGUL; - } - - /* - * Given norm32 for Jamo V or T, - * is this a Jamo V? - */ - private static boolean isJamoVTNorm32JamoV(long norm32) { - return norm320) && isNorm32LeadSurrogate(norm32)) { - /* *p is a lead surrogate, get the real norm32 */ - norm32=getNorm32FromSurrogatePair(norm32, p[start+1]); - } - return norm32; - } - - //// for StringPrep - public static VersionInfo getUnicodeVersion(){ - return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1], - unicodeVersion[2], unicodeVersion[3]); - } - - public static char getFCD16(char c) { - return FCDTrieImpl.fcdTrie.getLeadValue(c); - } - - public static char getFCD16FromSurrogatePair(char fcd16, char c2) { - /* the surrogate index in fcd16 is an absolute offset over the - * start of stage 1 - * */ - return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2); - } - public static int getFCD16(int c) { - return FCDTrieImpl.fcdTrie.getCodePointValue(c); - } - - private static int getExtraDataIndex(long norm32) { - return (int)(norm32>>EXTRA_SHIFT); - } - - private static final class DecomposeArgs{ - int /*unsigned byte*/ cc; - int /*unsigned byte*/ trailCC; - int length; - } - /** - * - * get the canonical or compatibility decomposition for one character - * - * @return index into the extraData array - */ - private static int/*index*/ decompose(long/*unsigned*/ norm32, - int/*unsigned*/ qcMask, - DecomposeArgs args) { - int p= getExtraDataIndex(norm32); - args.length=extraData[p++]; - - if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) { - /* use compatibility decomposition, skip canonical data */ - p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK); - args.length>>=8; - } - - if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { - /* get the lead and trail cc's */ - char bothCCs=extraData[p++]; - args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); - args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; - } else { - /* lead and trail cc's are both 0 */ - args.cc=args.trailCC=0; - } - args.length&=DECOMP_LENGTH_MASK; - return p; - } + public static final class Hangul { + /* Korean Hangul and Jamo constants */ + public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ + public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ + public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ + public static final int HANGUL_BASE=0xac00; + public static final int HANGUL_END=0xd7a3; - /** - * get the canonical decomposition for one character - * @return index into the extraData array - */ - private static int decompose(long/*unsigned*/ norm32, - DecomposeArgs args) { + public static final int JAMO_L_COUNT=19; + public static final int JAMO_V_COUNT=21; + public static final int JAMO_T_COUNT=28; - int p= getExtraDataIndex(norm32); - args.length=extraData[p++]; + public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; + public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; - if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { - /* get the lead and trail cc's */ - char bothCCs=extraData[p++]; - args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); - args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; - } else { - /* lead and trail cc's are both 0 */ - args.cc=args.trailCC=0; + public static boolean isHangul(int c) { + return HANGUL_BASE<=c && c>CC_SHIFT)); + public static boolean isHangulWithoutJamoT(char c) { + c-=HANGUL_BASE; + return c>CC_SHIFT)); - } - - /* - * is this a safe boundary character for NF*D? - * (lead cc==0) - */ - public static boolean isNFDSafe(long/*unsigned*/ norm32, - int/*unsigned*/ccOrQCMask, - int/*unsigned*/ decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return true; /* cc==0 and no decomposition: this is NF*D safe */ - } - - /* inspect its decomposition - maybe a Hangul but not a surrogate here*/ - if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { - DecomposeArgs args=new DecomposeArgs(); - /* decomposes, get everything from the variable-length extra data */ - decompose(norm32, decompQCMask, args); - return args.cc==0; - } else { - /* no decomposition (or Hangul), test the cc directly */ - return (norm32&CC_MASK)==0; - } - } - - /* - * is this (or does its decomposition begin with) a "true starter"? - * (cc==0 and NF*C_YES) - */ - public static boolean isTrueStarter(long/*unsigned*/ norm32, - int/*unsigned*/ ccOrQCMask, - int/*unsigned*/ decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return true; /* this is a true starter (could be Hangul or Jamo L)*/ - } - - /* inspect its decomposition - not a Hangul or a surrogate here */ - if((norm32&decompQCMask)!=0) { - int p; /* index into extra data array */ - DecomposeArgs args=new DecomposeArgs(); - /* decomposes, get everything from the variable-length extra data */ - p=decompose(norm32, decompQCMask, args); - - if(args.cc==0) { - int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK; - - /* does it begin with NFC_YES? */ - if((getNorm32(extraData,p, qcMask)&qcMask)==0) { - /* yes, the decomposition begins with a true starter */ - return true; - } - } - } - return false; - } - - /* reorder UTF-16 in-place ---------------------------------------------- */ - - /** - * simpler, single-character version of mergeOrdered() - - * bubble-insert one single code point into the preceding string - * which is already canonically ordered - * (c, c2) may or may not yet have been inserted at src[current]..src[p] - * - * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) - * - * before: src[start]..src[current] is already ordered, and - * src[current]..src[p] may or may not hold (c, c2) but - * must be exactly the same length as (c, c2) - * after: src[start]..src[p] is ordered - * - * @return the trailing combining class - */ - private static int/*unsigned byte*/ insertOrdered(char[] source, - int start, - int current, int p, - char c, char c2, - int/*unsigned byte*/ cc) { - int back, preBack; - int r; - int prevCC, trailCC=cc; - - if(start=prevCC - preBack=back=current; - PrevArgs prevArgs = new PrevArgs(); - prevArgs.current = current; - prevArgs.start = start; - prevArgs.src = source; - // get the prevCC - prevCC=getPrevCC(prevArgs); - preBack = prevArgs.current; - - if(cc=prevCC) { - break; - } - back=preBack; + public static int decompose(int c, Appendable buffer) { + try { + c-=HANGUL_BASE; + int c2=c%JAMO_T_COUNT; + c/=JAMO_T_COUNT; + buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); + buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); + if(c2==0) { + return 2; + } else { + buffer.append((char)(JAMO_T_BASE+c2)); + return 3; } - - - // this is where we are right now with all these indicies: - // [start]..[pPreBack] 0..? code points that we can ignore - // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc - // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) - // [current]..[p] 1 code point (c, c2) with cc - - // move the code units in between up - r=p; - do { - source[--r]=source[--current]; - } while(back!=current); + } catch(IOException e) { + throw new InternalError(e); } } - - // insert (c, c2) - source[current]=c; - if(c2!=0) { - source[(current+1)]=c2; - } - - // we know the cc of the last code point - return trailCC; } /** - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation - * - * the two strings may already be adjacent, so that the merging is done - * in-place if the two strings are not adjacent, then the buffer holding the - * first one must be large enough - * the second string may or may not be ordered in itself - * - * before: [start]..[current] is already ordered, and - * [next]..[limit] may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the algorithm is a simple bubble-sort that takes the characters from - * src[next++] and inserts them in correct combining class order into the - * preceding part of the string - * - * since this function is called much less often than the single-code point - * insertOrdered(), it just uses that for easier maintenance - * - * @return the trailing combining class - */ - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - int next, - int limit, - boolean isOrdered) { - int r; - int /*unsigned byte*/ cc, trailCC=0; - boolean adjacent; - - adjacent= current==next; - NextCCArgs ncArgs = new NextCCArgs(); - ncArgs.source = data; - ncArgs.next = next; - ncArgs.limit = limit; - - if(start!=current || !isOrdered) { - - while(ncArgs.next=minNoMaybe && - (( norm32=getNorm32(c)) & ccOrQCMask)!=0) { - break; - } - prevCC=0; - } - - - // check one above-minimum, relevant code unit - if(isNorm32LeadSurrogate(norm32)) { - // c is a lead surrogate, get the real norm32 - if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) { - ++srcStart; - norm32=getNorm32FromSurrogatePair(norm32,c2); + * Writable buffer that takes care of canonical ordering. + * Its Appendable methods behave like the C++ implementation's + * appendZeroCC() methods. + *

+ * If dest is a StringBuilder, then the buffer writes directly to it. + * Otherwise, the buffer maintains a StringBuilder for intermediate text segments + * until no further changes are necessary and whole segments are appended. + * append() methods that take combining-class values always write to the StringBuilder. + * Other append() methods flush and append to the Appendable. + */ + public static final class ReorderingBuffer implements Appendable { + public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { + impl=ni; + app=dest; + if (app instanceof StringBuilder) { + appIsStringBuilder=true; + str=(StringBuilder)dest; + // In Java, the constructor subsumes public void init(int destCapacity) + str.ensureCapacity(destCapacity); + reorderStart=0; + if(str.length()==0) { + lastCC=0; } else { - norm32=0; - c2=0; - } - }else{ - c2=0; - } - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - norm32=0; - } - - // check the combining order - cc=(char)((norm32>>CC_SHIFT)&0xFF); - if(cc!=0 && cc=1) { - result= NormalizerBase.NO; - break; - } else if(qcNorm32!=0) { - // "maybe" can only occur for NFC and NFKC - if(allowMaybe){ - result=NormalizerBase.MAYBE; - }else{ - // normalize a section around here to see if it is really - // normalized or not - int prevStarter; - int/*unsigned*/ decompQCMask; - - decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask - - // find the previous starter - - // set prevStarter to the beginning of the current character - prevStarter=srcStart-1; - if(UTF16.isTrailSurrogate(src[prevStarter])) { - // safe because unpaired surrogates do not result - // in "maybe" - --prevStarter; - } - prevStarter=findPreviousStarter(src, start, prevStarter, - ccOrQCMask, decompQCMask, - (char)minNoMaybe); - - // find the next true starter in [src..limit[ - modifies - // src to point to the next starter - srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, - decompQCMask,(char) minNoMaybe); - - //set the args for compose part - args.prevCC = prevCC; - - // decompose and recompose [prevStarter..src[ - buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx); - - // compare the normalized version with the original - if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) { - result=NormalizerBase.NO; // normalization differs - break; + setIterator(); + lastCC=previousCC(); + // Set reorderStart after the last code point with cc<=1 if there is one. + if(lastCC>1) { + while(previousCC()>1) {} } - - // continue after the next starter + reorderStart=codePointLimit; } - } - } - return result; - } - - - //------------------------------------------------------ - // make NFD & NFKD - //------------------------------------------------------ - - public static int decompose(char[] src,int srcStart,int srcLimit, - char[] dest,int destStart,int destLimit, - boolean compat,int[] outTrailCC, - UnicodeSet nx) { - - char[] buffer = new char[3]; - int prevSrc; - long norm32; - int ccOrQCMask, qcMask; - int reorderStartIndex, length; - char c, c2, minNoMaybe; - int/*unsigned byte*/ cc, prevCC, trailCC; - char[] p; - int pStart; - int destIndex = destStart; - int srcIndex = srcStart; - if(!compat) { - minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; - qcMask=QC_NFD; - } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=QC_NFKD; + } else { + appIsStringBuilder=false; + str=new StringBuilder(); + reorderStart=0; + lastCC=0; + } } - /* initialize */ - ccOrQCMask=CC_MASK|qcMask; - reorderStartIndex=0; - prevCC=0; - norm32=0; - c=0; - pStart=0; + public boolean isEmpty() { return str.length()==0; } + public int length() { return str.length(); } + public int getLastCC() { return lastCC; } - cc=trailCC=-1;//initialize to bogus value + public StringBuilder getStringBuilder() { return str; } - for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check - */ - prevSrc=srcIndex; + public boolean equals(CharSequence s, int start, int limit) { + return UTF16Plus.equal(str, 0, str.length(), s, start, limit); + } - while(srcIndex!=srcLimit &&((c=src[srcIndex])0) { - buffer[2]=(char)(JAMO_T_BASE+c2); - length=3; - } else { - length=2; - } - - buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT); - buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT); + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. } + str.append(s, start, limit); + lastCC=trailCC; } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - // c is a lead surrogate, get the real norm32 - if(srcIndex!=srcLimit && - UTF16.isTrailSurrogate(c2=src[srcIndex])) { - ++srcIndex; - length=2; - norm32=getNorm32FromSurrogatePair(norm32, c2); + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(start>CC_SHIFT)); - p=null; - pStart=-1; - } else { - DecomposeArgs arg = new DecomposeArgs(); - /* c decomposes, get everything from the variable-length - * extra data - */ - pStart=decompose(norm32, qcMask, arg); - p=extraData; - length=arg.length; - cc=arg.cc; - trailCC=arg.trailCC; - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=p[pStart]; - c2=0; - p=null; - pStart=-1; - } - } + // The following append() methods work like C++ appendZeroCC(). + // They assume that the cc or trailCC of their input is 0. + // Most of them implement Appendable interface methods. + // @Override when we switch to Java 6 + public ReorderingBuffer append(char c) { + str.append(c); + lastCC=0; + reorderStart=str.length(); + return this; + } + + public void appendZeroCC(int c) { + str.appendCodePoint(c); + lastCC=0; + reorderStart=str.length(); + } + + // @Override when we switch to Java 6 + public ReorderingBuffer append(CharSequence s) { + if(s.length()!=0) { + str.append(s); + lastCC=0; + reorderStart=str.length(); } + return this; + } - /* append the decomposition to the destination buffer, assume - * length>0 - */ - if((destIndex+length)<=destLimit) { - int reorderSplit=destIndex; - if(p==null) { - /* fastpath: single code point */ - if(cc!=0 && cc0); - } - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; + // @Override when we switch to Java 6 + public ReorderingBuffer append(CharSequence s, int start, int limit) { + if(start!=limit) { + str.append(s, start, limit); + lastCC=0; + reorderStart=str.length(); } + return this; + } - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Used after recomposition. + * Must be called at the end when writing to a non-StringBuilder Appendable. + */ + public void flush() { + if(appIsStringBuilder) { + reorderStart=str.length(); + } else { + try { + app.append(str); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". + } + } + lastCC=0; + } + + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Then appends the new text to the Appendable or StringBuilder. + * Normally used after quick check loops find a non-empty sequence. + */ + public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { + if(appIsStringBuilder) { + str.append(s, start, limit); + reorderStart=str.length(); + } else { + try { + app.append(str).append(s, start, limit); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". + } + } + lastCC=0; + return this; + } + + public void remove() { + str.setLength(0); + lastCC=0; + reorderStart=0; + } + + public void removeSuffix(int suffixLength) { + int oldLength=str.length(); + str.delete(oldLength-suffixLength, oldLength); + lastCC=0; + reorderStart=str.length(); + } + + // Inserts c somewhere before the last character. + // Requires 0cc;) {} + // insert c at codePointLimit, after the character with prevCC<=cc + if(c<=0xffff) { + str.insert(codePointLimit, (char)c); + if(cc<=1) { + reorderStart=codePointLimit+1; + } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; + } } } - outTrailCC[0]=prevCC; + private final NormalizerImpl impl; + private final Appendable app; + private final StringBuilder str; + private final boolean appIsStringBuilder; + private int reorderStart; + private int lastCC; - return destIndex - destStart; - } + // private backward iterator + private void setIterator() { codePointStart=str.length(); } + private void skipPrevious() { // Requires 0=codePointStart) { + return 0; + } + int c=str.codePointBefore(codePointStart); + codePointStart-=Character.charCount(c); + if(c>EXTRA_SHIFT))); - return (int)(norm32&COMBINES_ANY); - } else { - /* c is a lead surrogate, get the real norm32 */ - if(args.start!=limit && UTF16.isTrailSurrogate(args.c2= - args.source[args.start])) { - ++args.start; - norm32=getNorm32FromSurrogatePair(norm32, args.c2); - } else { - args.c2=0; - return 0; - } + /** + * Compares two CharSequence subsequences for binary equality. + * @param s1 first sequence + * @param start1 start offset in first sequence + * @param limit1 limit offset in first sequence + * @param s2 second sequence + * @param start2 start offset in second sequence + * @param limit2 limit offset in second sequence + * @return true if s1.subSequence(start1, limit1) contains the same text + * as s2.subSequence(start2, limit2) + */ + public static boolean equal(CharSequence s1, int start1, int limit1, + CharSequence s2, int start2, int limit2) { + if((limit1-start1)!=(limit2-start2)) { + return false; } - - if(nx_contains(nx, args.c, args.c2)) { - return 0; /* excluded: norm32==0 */ + if(s1==s2 && start1==start2) { + return true; + } + while(start1>CC_SHIFT)&0xff); + public NormalizerImpl() {} - combineFlags=(int)(norm32&COMBINES_ANY); - if(combineFlags!=0) { - int index = getExtraDataIndex(norm32); - args.combiningIndex=index>0 ? extraData[(index-1)] :0; + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0]==2; + } + } + + private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); + private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" + + public NormalizerImpl load(ByteBuffer bytes) { + try { + dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); + int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 + if(indexesLength<=IX_MIN_MAYBE_YES) { + throw new IOException("Normalizer2 data: not enough indexes"); + } + int[] inIndexes=new int[indexesLength]; + inIndexes[0]=indexesLength*4; + for(int i=1; i(nextOffset-offset)) { + throw new IOException("Normalizer2 data: not enough bytes for normTrie"); + } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes + + // Read the composition and mapping data. + offset=nextOffset; + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; + int numChars=(nextOffset-offset)/2; + char[] chars; + if(numChars!=0) { + chars=new char[numChars]; + for(int i=0; i>=1) { + if((c&0xff)==0) { + bits=smallFCD[c>>8]; // one byte per 0x100 code points + } + if((bits&1)!=0) { + for(int i=0; i<0x20; ++i, ++c) { + tccc180[c]=getFCD16FromNormData(c)&0xff; + } + } else { + c+=0x20; + } } - return combineFlags; + return this; + } catch(IOException e) { + throw new InternalError(e); } } - /* - * given a composition-result starter (c, c2) - which means its cc==0, - * it combines forward, it has extra data, its norm32!=0, - * it is not a Hangul or Jamo, - * get just its combineFwdIndex - * - * norm32(c) is special if and only if c2!=0 - */ - private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){ - long/*unsigned*/ norm32; + public NormalizerImpl load(String name) { + return load(ICUBinary.getRequiredData(name)); + } - norm32=getNorm32(c); - if(c2!=0) { - norm32=getNorm32FromSurrogatePair(norm32, c2); - } - return extraData[(getExtraDataIndex(norm32)-1)]; + public int getNorm16(int c) { + return normTrie.get(c); } - /* - * Find the recomposition result for - * a forward-combining character - * (specified with a pointer to its part of the combiningTable[]) - * and a backward-combining character - * (specified with its combineBackIndex). - * - * If these two characters combine, then set (value, value2) - * with the code unit(s) of the composition character. - * - * Return value: - * 0 do not combine - * 1 combine - * >1 combine, and the composition is a forward-combining starter - * - * See unormimp.h for a description of the composition table format. - */ - private static int/*unsigned*/ combine(char[]table,int tableStart, - int/*unsinged*/ combineBackIndex, - int[] outValues) { - int/*unsigned*/ key; - int value,value2; + public boolean isDecompYes(int norm16) { return norm16=MIN_NORMAL_MAYBE_YES) { + return norm16&0xff; } - - /* search in the starter's composition table */ - for(;;) { - key=table[tableStart++]; - if(key>=combineBackIndex) { - break; - } - tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1; + if(norm16=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; + } - /* get the composition result code point from the variable-length - * result value - */ - if((value&0x8000) != 0) { - if((value&0x4000) != 0) { - /* surrogate pair composition result */ - value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800)); - value2=table[tableStart+1]; + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + public int getFCD16(int c) { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + + /** Returns the FCD data for U+0000<=c>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. */ + public int getFCD16FromNormData(int c) { + // Only loops for 1:1 algorithmic mappings. + for(;;) { + int norm16=getNorm16(c); + if(norm16<=minYesNo) { + // no decomposition or Hangul syllable, all zeros + return 0; + } else if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16&=0xff; + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + // A character that is deleted (maps to an empty string) must + // get the worst-case lccc and tccc values because arbitrary + // characters on both sides will become adjacent. + return 0x1ff; } else { - /* BMP composition result U+2000..U+ffff */ - value=table[tableStart+1]; - value2=0; + int fcd16=firstUnit>>8; // tccc + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc + } + return fcd16; } + } + } + } + + /** + * Gets the decomposition for one code point. + * @param c code point + * @return c's decomposition, if it has one; returns null if it does not have a decomposition + */ + public String getDecomposition(int c) { + int decomp=-1; + int norm16; + for(;;) { + if(c the rest of the loop body will reset starter - * to NULL; technically, a composed Hangul syllable is a - * starter, but it does not combine forward now that we have - * consumed all eligible Jamos; for Jamo V/T, combineFlags - * does not contain _NORM_COMBINES_FWD - */ + // only for quick check + int prevBoundary=src; + int prevCC=0; - } else if( - /* the starter is not a Hangul LV or Jamo V/T and */ - !((combineFwdIndex&0x8000)!=0) && - /* the combining mark is not blocked and */ - ((options&BEFORE_PRI_29)!=0 ? - (prevCC!=ncArg.cc || prevCC==0) : - (prevCC1) { - combineFwdIndex=getCombiningIndexFromStarter((char)value, - (char)value2); + if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { + src+=Character.charCount(c); } else { - starter=-1; + break; } - - /* we combined; continue with looking for compositions */ - continue; } } - - /* no combination this time */ - prevCC=ncArg.cc; - if(args.start==args.limit) { - return (char)prevCC; - } - - /* if (c, c2) did not combine, then check if it is a starter */ - if(ncArg.cc==0) { - /* found a new starter; combineFlags==0 if (c, c2) is excluded */ - if((combineFlags&COMBINES_FWD)!=0) { - /* it may combine with something, prepare for it */ - if(ncArg.c2==0) { - starterIsSupplementary=false; - starter=args.start-1; - } else { - starterIsSupplementary=false; - starter=args.start-2; - } - combineFwdIndex=combineBackIndex; + // copy these code units all at once + if(src!=prevSrc) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); } else { - /* it will not combine with anything */ - starter=-1; + prevCC=0; + prevBoundary=src; } - } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) { - /* FCC: no discontiguous compositions; any intervening character blocks */ - starter=-1; - } - } - } - - // find the last true starter between src[start]....src[current] going - // backwards and return its index - private static int findPreviousStarter(char[]src, int srcStart, int current, - int/*unsigned*/ ccOrQCMask, - int/*unsigned*/ decompQCMask, - char minNoMaybe) { - long norm32; - PrevArgs args = new PrevArgs(); - args.src = src; - args.start = srcStart; - args.current = current; - - while(args.start=2) { - RecomposeArgs rcArgs = new RecomposeArgs(); - rcArgs.source = buffer; - rcArgs.start = 0; - rcArgs.limit = recomposeLimit; - args.prevCC=recompose(rcArgs, options, nx); - recomposeLimit = rcArgs.limit; - } - - /* return with a pointer to the recomposition and its length */ - args.length=recomposeLimit; - return buffer; - } - - private static boolean composeHangul(char prev, char c, - long/*unsigned*/ norm32, - char[] src,int[] srcIndex, int limit, - boolean compat, - char[] dest,int destIndex, - UnicodeSet nx) { - int start=srcIndex[0]; - if(isJamoVTNorm32JamoV(norm32)) { - /* c is a Jamo V, compose with previous Jamo L and - * following Jamo T */ - prev=(char)(prev-JAMO_L_BASE); - if(prev=limit) { + break; + } + c=Character.codePointAt(s, src); + cc=getCC(getNorm16(c)); + }; + buffer.append(s, 0, src, firstCC, prevCC); + buffer.append(s, src, limit); } - */ - public static int compose(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - int options,UnicodeSet nx) { - - int prevSrc, prevStarter; - long/*unsigned*/ norm32; - int ccOrQCMask, qcMask; - int reorderStartIndex, length; - char c, c2, minNoMaybe; - int/*unsigned byte*/ cc, prevCC; - int[] ioIndex = new int[1]; - int destIndex = destStart; - int srcIndex = srcStart; - - if((options&OPTIONS_COMPAT)!=0) { - minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; - qcMask=QC_NFKC; - } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; - qcMask=QC_NFC; - } + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. + // doCompose: normalize + // !doCompose: isNormalized (buffer must be empty and initialized) + public boolean compose(CharSequence s, int src, int limit, + boolean onlyContiguous, + boolean doCompose, + ReorderingBuffer buffer) { + int minNoMaybeCP=minCompNoMaybeCP; /* - * prevStarter points to the last character before the current one - * that is a "true" starter with cc==0 and quick check "yes". - * - * prevStarter will be used instead of looking for a true starter - * while incrementally decomposing [prevStarter..prevSrc[ - * in _composePart(). Having a good prevStarter allows to just decompose - * the entire [prevStarter..prevSrc[. - * - * When _composePart() backs out from prevSrc back to prevStarter, - * then it also backs out destIndex by the same amount. - * Therefore, at all times, the (prevSrc-prevStarter) source units - * must correspond 1:1 to destination units counted with destIndex, - * except for reordering. - * This is true for the qc "yes" characters copied in the fast loop, - * and for pure reordering. - * prevStarter must be set forward to src when this is not true: - * In _composePart() and after composing a Hangul syllable. + * prevBoundary points to the last character before the current one + * that has a composition boundary before it with ccc==0 and quick check "yes". + * Keeping track of prevBoundary saves us looking for a composition boundary + * when we find a "no" or "maybe". * - * This mechanism relies on the assumption that the decomposition of a - * true starter also begins with a true starter. gennorm/store.c checks - * for this. + * When we back out from prevSrc back to prevBoundary, + * then we also remove those same characters (which had been simply copied + * or canonically-order-inserted) from the ReorderingBuffer. + * Therefore, at all times, the [prevBoundary..prevSrc[ source units + * must correspond 1:1 to destination units at the end of the destination buffer. */ - prevStarter=srcIndex; - - ccOrQCMask=CC_MASK|qcMask; - /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/ - prevCC=0; + int prevBoundary=src; + int prevSrc; + int c=0; + int norm16=0; - /* avoid compiler warnings */ - norm32=0; - c=0; + // only for isNormalized + int prevCC=0; for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check */ - prevSrc=srcIndex; - - while(srcIndex!=srcLimit && ((c=src[srcIndex])=minNoNo. + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) + * or has ccc!=0. + * Check for Jamo V/T, then for regular characters. + * c is not a Hangul syllable or Jamo L because those have "yes" properties. + */ + if(isJamoVT(norm16) && prevBoundary!=prevSrc) { + char prev=s.charAt(prevSrc-1); + boolean needToDecompose=false; + if(c0 && - composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex, - srcLimit, (options&OPTIONS_COMPAT)!=0, dest, - destIndex<=destLimit ? destIndex-1: 0, - nx) + if(norm16>=MIN_YES_YES_WITH_CC) { + int cc=norm16&0xff; // cc!=0 + if( onlyContiguous && // FCC + (doCompose ? buffer.getLastCC() : prevCC)==0 && + prevBoundarycc ) { - srcIndex=ioIndex[0]; - prevStarter=srcIndex; + // Fails FCD test, need to decompose and contiguously recompose. + if(!doCompose) { + return false; + } + } else if(doCompose) { + buffer.append(c, cc); + continue; + } else if(prevCC<=cc) { + prevCC=cc; continue; + } else { + return false; } + } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { + return false; + } - srcIndex = ioIndex[0]; + /* + * Find appropriate boundaries around this character, + * decompose the source text from between the boundaries, + * and recompose it. + * + * We may need to remove the last few characters from the ReorderingBuffer + * to account for source text that was copied or appended + * but needs to take part in the recomposition. + */ - /* the Jamo V/T did not compose into a Hangul syllable, just - * append to dest */ - c2=0; - length=1; - prevStarter=prevSrc; - } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(srcIndex!=srcLimit && - UTF16.isTrailSurrogate(c2=src[srcIndex])) { - ++srcIndex; - length=2; - norm32=getNorm32FromSurrogatePair(norm32, c2); - } else { - /* c is an unpaired lead surrogate, nothing to do */ - c2=0; - length=1; - norm32=0; - } + /* + * Find the last composition boundary in [prevBoundary..src[. + * It is either the decomposition of the current character (at prevSrc), + * or prevBoundary. + */ + if(hasCompBoundaryBefore(c, norm16)) { + prevBoundary=prevSrc; + } else if(doCompose) { + buffer.removeSuffix(prevSrc-prevBoundary); + } + + // Find the next composition boundary in [src..limit[ - + // modifies src to point to the next starter. + src=findNextCompBoundary(s, src, limit); + + // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. + int recomposeStartIndex=buffer.length(); + decomposeShort(s, prevBoundary, src, buffer); + recompose(buffer, recomposeStartIndex, onlyContiguous); + if(!doCompose) { + if(!buffer.equals(s, prevBoundary, src)) { + return false; } - ComposePartArgs args =new ComposePartArgs(); - - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=0; - } else if((norm32&qcMask)==0) { - cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT)); - } else { - char[] p; - - /* - * find appropriate boundaries around this character, - * decompose the source text from between the boundaries, - * and recompose it - * - * this puts the intermediate text into the side buffer because - * it might be longer than the recomposition end result, - * or the destination buffer may be too short or missing - * - * note that destIndex may be adjusted backwards to account - * for source text that passed the quick check but needed to - * take part in the recomposition - */ - int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ - /* - * find the last true starter in [prevStarter..src[ - * it is either the decomposition of the current character (at prevSrc), - * or prevStarter - */ - if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) { - prevStarter=prevSrc; - } else { - /* adjust destIndex: back out what had been copied with qc "yes" */ - destIndex-=prevSrc-prevStarter; - } + buffer.remove(); + prevCC=0; + } - /* find the next true starter in [src..limit[ */ - srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, - decompQCMask, minNoMaybe); - //args.prevStarter = prevStarter; - args.prevCC = prevCC; - //args.destIndex = destIndex; - args.length = length; - p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx); + // Move to the next starter. We never need to look back before this point again. + prevBoundary=src; + } + return true; + } - if(p==null) { - /* an error occurred (out of memory) */ - break; - } + /** + * Very similar to compose(): Make the same changes in both places if relevant. + * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) + * !doSpan: quickCheck + * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and + * bit 0: set if "maybe"; otherwise, if the span length<s.length() + * then the quick check result is "no" + */ + public int composeQuickCheck(CharSequence s, int src, int limit, + boolean onlyContiguous, boolean doSpan) { + int qcResult=0; + int minNoMaybeCP=minCompNoMaybeCP; - prevCC = args.prevCC; - length = args.length; + /* + * prevBoundary points to the last character before the current one + * that has a composition boundary before it with ccc==0 and quick check "yes". + */ + int prevBoundary=src; + int prevSrc; + int c=0; + int norm16=0; + int prevCC=0; - /* append the recomposed buffer contents to the destination - * buffer */ - if((destIndex+args.length)<=destLimit) { - int i=0; - while(i=minNoNo. + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) + * or has ccc!=0. + */ + if(isMaybeOrNonZeroCC(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if( onlyContiguous && // FCC + cc!=0 && + prevCC==0 && + prevBoundarycc + ) { + // Fails FCD test. + } else if(prevCC<=cc || cc==0) { prevCC=cc; + if(norm16>CC_SHIFT)&0xFF); - } - - public static boolean isFullCompositionExclusion(int c) { - if(isFormatVersion_2_1) { - int aux =AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux & AUX_COMP_EX_MASK)!=0; - } else { - return false; + public void composeAndAppend(CharSequence s, + boolean doCompose, + boolean onlyContiguous, + ReorderingBuffer buffer) { + int src=0, limit=s.length(); + if(!buffer.isEmpty()) { + int firstStarterInSrc=findNextCompBoundary(s, 0, limit); + if(0!=firstStarterInSrc) { + int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), + buffer.length()); + StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ + firstStarterInSrc+16); + middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); + buffer.removeSuffix(buffer.length()-lastStarterInDest); + middle.append(s, 0, firstStarterInSrc); + compose(middle, 0, middle.length(), onlyContiguous, true, buffer); + src=firstStarterInSrc; + } } - } - - public static boolean isCanonSafeStart(int c) { - if(isFormatVersion_2_1) { - int aux = AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux & AUX_UNSAFE_MASK)==0; + if(doCompose) { + compose(s, src, limit, onlyContiguous, true, buffer); } else { - return false; + buffer.append(s, src, limit); } } - /* Is c an NF-skippable code point? See unormimp.h. */ - public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) { - long /*unsigned int*/ norm32; - mask = mask & UNSIGNED_INT_MASK; - char aux; - - /* check conditions (a)..(e), see unormimp.h */ - norm32 = getNorm32(c); + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { + // Note: In this function we use buffer->appendZeroCC() because we track + // the lead and trail combining classes here, rather than leaving it to + // the ReorderingBuffer. + // The exception is the call to decomposeShort() which uses the buffer + // in the normal way. + + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. + // Similar to the prevBoundary in the compose() implementation. + int prevBoundary=src; + int prevSrc; + int c=0; + int prevFCD16=0; + int fcd16=0; - if((norm32&mask)!=0) { - return false; /* fails (a)..(e), not skippable */ - } + for(;;) { + // count code units with lccc==0 + for(prevSrc=src; src!=limit;) { + if((c=s.charAt(src))1) { + --prevBoundary; + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { + prevBoundary=p; + } + } + if(buffer!=null) { + // The last lccc==0 character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; + } - if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){ - return true; /* NF*D, passed (a)..(c), is skippable */ + src+=Character.charCount(c); + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. + // Check for proper order, and decompose locally if necessary. + if((prevFCD16&0xff)<=(fcd16>>8)) { + // proper order: prev tccc <= current lccc + if((fcd16&0xff)<=1) { + prevBoundary=src; + } + if(buffer!=null) { + buffer.appendZeroCC(c); + } + prevFCD16=fcd16; + continue; + } else if(buffer==null) { + return prevBoundary; // quick check "no" + } else { + /* + * Back out the part of the source that we copied or appended + * already but is now going to be decomposed. + * prevSrc is set to after what was copied/appended. + */ + buffer.removeSuffix(prevSrc-prevBoundary); + /* + * Find the part of the source that needs to be decomposed, + * up to the next safe boundary. + */ + src=findNextFCDBoundary(s, src, limit); + /* + * The source text does not fulfill the conditions for FCD. + * Decompose and reorder a limited piece of the text. + */ + decomposeShort(s, prevBoundary, src, buffer); + prevBoundary=src; + prevFCD16=0; + } } - /* check conditions (a)..(e), see unormimp.h */ + return src; + } - /* NF*C/FCC, passed (a)..(e) */ - if((norm32& QC_NFD)==0) { - return true; /* no canonical decomposition, is skippable */ + // Note: hasDecompBoundary() could be implemented as aliases to + // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() + // at the cost of building the FCD trie for a decomposition normalizer. + public boolean hasDecompBoundary(int c, boolean before) { + for(;;) { + if(cMIN_NORMAL_MAYBE_YES) { + return false; // ccc!=0 + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + return false; + } + if(!before) { + // decomp after-boundary: same as hasFCDBoundaryAfter(), + // fcd16<=1 || trailCC==0 + if(firstUnit>0x1ff) { + return false; // trailCC>1 + } + if(firstUnit<=0xff) { + return true; // trailCC==0 + } + // if(trailCC==1) test leadCC==0, same as checking for before-boundary + } + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; + } } + } - /* check Hangul syllables algorithmically */ - if(isNorm32HangulOrJamo(norm32)) { - /* Jamo passed (a)..(e) above, must be Hangul */ - return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */ - } + public boolean hasCompBoundaryBefore(int c) { + return c=minMaybeYes; } + private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } + private boolean isHangul(int norm16) { return norm16==minYesNo; } + private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; + // } + private int getCCFromNoNo(int norm16) { + if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + return extraData.charAt(norm16-1)&0xff; + } else { + return 0; } - - - aux = AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */ - - /* } else { FCC, test fcd<=1 instead of the above } */ } - public static UnicodeSet addPropertyStarts(UnicodeSet set) { + // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() + int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { int c; - - /* add the start code point of each same-value range of each trie */ - //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie); - RangeValueIterator.Element normResult = new RangeValueIterator.Element(); - - while(normIter.next(normResult)){ - set.add(normResult.start); + if(cpStart==(cpLimit-1)) { + c=s.charAt(cpStart); + } else { + c=Character.codePointAt(s, cpStart); } + int prevNorm16=getNorm16(c); + if(prevNorm16<=minYesNo) { + return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 + } else { + return extraData.charAt(prevNorm16)>>8; // tccc from yesNo + } + } - //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie); - RangeValueIterator.Element fcdResult = new RangeValueIterator.Element(); + // Requires algorithmic-NoNo. + private int mapAlgorithmic(int c, int norm16) { + return c+norm16-(minMaybeYes-MAX_DELTA-1); + } - while(fcdIter.next(fcdResult)){ - set.add(fcdResult.start); - } + // Requires minYesNolength2 */ { - lengthResult=1; - limit1=start1+length2; - } - - if(s1==s2) { - return lengthResult; - } - + private void decompose(int c, int norm16, + ReorderingBuffer buffer) { + // Only loops for 1:1 algorithmic mappings. for(;;) { - /* check pseudo-limit */ - if(s1Start==limit1) { - return lengthResult; - } - - c1=s1[s1Start]; - c2=s2[s2Start]; - if(c1!=c2) { - break; + // get the decomposition and the lead and trail cc's + if(isDecompYes(norm16)) { + // c does not decompose + buffer.append(c, getCCFromYesOrMaybe(norm16)); + } else if(isHangul(norm16)) { + // Hangul syllable: decompose algorithmically + Hangul.decompose(c, buffer); + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + norm16=getNorm16(c); + continue; + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + int length=firstUnit&MAPPING_LENGTH_MASK; + int leadCC, trailCC; + trailCC=firstUnit>>8; + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + leadCC=extraData.charAt(norm16-1)>>8; + } else { + leadCC=0; + } + ++norm16; // skip over the firstUnit + buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); } - ++s1Start; - ++s2Start; + return; } + } - /* setup for fix-up */ - limit1=start1+length1; - limit2=start2+length2; - - - /* if both values are in or above the surrogate range, fix them up */ - if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { - /* subtract 0x2800 from BMP code points to make them smaller than - * supplementary ones */ - if( - ( c1<=0xdbff && (s1Start+1)!=limit1 && - UTF16.isTrailSurrogate(s1[(s1Start+1)]) - ) || - ( UTF16.isTrailSurrogate(c1) && start1!=s1Start && - UTF16.isLeadSurrogate(s1[(s1Start-1)]) - ) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make If the lead and trail characters combine, then this function returns + * the following "compositeAndFwd" value: + *

+     * Bits 21..1  composite character
+     * Bit      0  set if the composite is a forward-combining starter
+     * 
+ * otherwise it returns -1. + * + *

The compositions list has (trail, compositeAndFwd) pair entries, + * encoded as either pairs or triples of 16-bit units. + * The last entry has the high bit of its first unit set. + * + *

The list is sorted by ascending trail characters (there are no duplicates). + * A linear search is used. + * + *

See normalizer2impl.h for a more detailed description + * of the compositions list format. + */ + private static int combine(String compositions, int list, int trail) { + int key1, firstUnit; + if(trail(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if((firstUnit&COMP_1_TRIPLE)!=0) { + return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + } else { + return compositions.charAt(list+1); + } } - - if( - ( c2<=0xdbff && (s2Start+1)!=limit2 && - UTF16.isTrailSurrogate(s2[(s2Start+1)]) - ) || - ( UTF16.isTrailSurrogate(c2) && start2!=s2Start && - UTF16.isLeadSurrogate(s2[(s2Start-1)]) - ) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make >COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); + int key2=(trail<(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if(key2>(secondUnit=compositions.charAt(list+1))) { + if((firstUnit&COMP_1_LAST_TUPLE)!=0) { + break; + } else { + list+=3; + } + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + } else { + break; + } + } else { + break; + } } } - - /* now c1 and c2 are in UTF-32-compatible order */ - return (int)c1-(int)c2; + return -1; } - - /* - * Status of tailored normalization - * - * This was done initially for investigation on Unicode public review issue 7 - * (http://www.unicode.org/review/). See Jitterbug 2481. - * While the UTC at meeting #94 (2003mar) did not take up the issue, this is - * a permanent feature in ICU 2.6 in support of IDNA which requires true - * Unicode 3.2 normalization. - * (NormalizationCorrections are rolled into IDNA mapping tables.) - * - * Tailored normalization as implemented here allows to "normalize less" - * than full Unicode normalization would. - * Based internally on a UnicodeSet of code points that are - * "excluded from normalization", the normalization functions leave those - * code points alone ("inert"). This means that tailored normalization - * still transforms text into a canonically equivalent form. - * It does not add decompositions to code points that do not have any or - * change decomposition results. - * - * Any function that searches for a safe boundary has not been touched, - * which means that these functions will be over-pessimistic when - * exclusions are applied. - * This should not matter because subsequent checks and normalizations - * do apply the exclusions; only a little more of the text may be processed - * than necessary under exclusions. - * - * Normalization exclusions have the following effect on excluded code points c: - * - c is not decomposed - * - c is not a composition target - * - c does not combine forward or backward for composition - * except that this is not implemented for Jamo - * - c is treated as having a combining class of 0 - */ - - /* - * Constants for the bit fields in the options bit set parameter. - * These need not be public. - * A user only needs to know the currently assigned values. - * The number and positions of reserved bits per field can remain private. - */ - private static final int OPTIONS_NX_MASK=0x1f; - private static final int OPTIONS_UNICODE_MASK=0xe0; - public static final int OPTIONS_SETS_MASK=0xff; -// private static final int OPTIONS_UNICODE_SHIFT=5; - private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1]; - - /* Constants for options flags for normalization.*/ - - /** - * Options bit 0, do not decompose Hangul syllables. - * @draft ICU 2.6 - */ - private static final int NX_HANGUL = 1; - /** - * Options bit 1, do not decompose CJK compatibility characters. - * @draft ICU 2.6 - */ - private static final int NX_CJK_COMPAT=2; - /** - * Options bit 8, use buggy recomposition described in - * Unicode Public Review Issue #29 - * at http://www.unicode.org/review/resolved-pri.html#pri29 - * - * Used in IDNA implementation according to strict interpretation - * of IDNA definition based on Unicode 3.2 which predates PRI #29. - * - * See ICU4C unormimp.h - * - * @draft ICU 3.2 - */ - public static final int BEFORE_PRI_29=0x100; - - /* - * The following options are used only in some composition functions. - * They use bits 12 and up to preserve lower bits for the available options - * space in unorm_compare() - - * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. - */ - - /** Options bit 12, for compatibility vs. canonical decomposition. */ - public static final int OPTIONS_COMPAT=0x1000; - /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ - public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000; - - /* normalization exclusion sets --------------------------------------------- */ - /* - * Normalization exclusion UnicodeSets are used for tailored normalization; - * see the comment near the beginning of this file. + * Recomposes the buffer text starting at recomposeStartIndex + * (which is in NFD - decomposed and canonically ordered), + * and truncates the buffer contents. * - * By specifying one or several sets of code points, - * those code points become inert for normalization. - */ - private static final synchronized UnicodeSet internalGetNXHangul() { - /* internal function, does not check for incoming U_FAILURE */ - - if(nxCache[NX_HANGUL]==null) { - nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3); - } - return nxCache[NX_HANGUL]; - } + * Note that recomposition never lengthens the text: + * Any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit. + */ + private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, + boolean onlyContiguous) { + StringBuilder sb=buffer.getStringBuilder(); + int p=recomposeStartIndex; + if(p==sb.length()) { + return; + } + + int starter, pRemove; + int compositionsList; + int c, compositeAndFwd; + int norm16; + int cc, prevCC; + boolean starterIsSupplementary; - private static final synchronized UnicodeSet internalGetNXCJKCompat() { - /* internal function, does not check for incoming U_FAILURE */ + // Some of the following variables are not used until we have a forward-combining starter + // and are only initialized now to avoid compiler warnings. + compositionsList=-1; // used as indicator for whether we have a forward-combining starter + starter=-1; + starterIsSupplementary=false; + prevCC=0; - if(nxCache[NX_CJK_COMPAT]==null) { + for(;;) { + c=sb.codePointAt(p); + p+=Character.charCount(c); + norm16=getNorm16(c); + cc=getCCFromYesOrMaybe(norm16); + if( // this character combines backward and + isMaybe(norm16) && + // we have seen a starter that combines forward and + compositionsList>=0 && + // the backward-combining character is not blocked + (prevCC=0) { + // The starter and the combining mark (c) do combine. + int composite=compositeAndFwd>>1; + + // Remove the combining mark. + pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark + sb.delete(pRemove, p); + p=pRemove; + // Replace the starter with the composite. + if(starterIsSupplementary) { + if(composite>0xffff) { + // both are supplementary + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); + } else { + sb.setCharAt(starter, (char)c); + sb.deleteCharAt(starter+1); + // The composite is shorter than the starter, + // move the intermediate characters forward one. + starterIsSupplementary=false; + --p; + } + } else if(composite>0xffff) { + // The composite is longer than the starter, + // move the intermediate characters back one. + starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); + } - /* build a set from [CJK Ideographs]&[has canonical decomposition] */ - UnicodeSet set, hasDecomp; + // Keep prevCC because we removed the combining mark. - set=new UnicodeSet("[:Ideographic:]"); + if(p==sb.length()) { + break; + } + // Is the composite a starter that combines forward? + if((compositeAndFwd&1)!=0) { + compositionsList= + getCompositionsListForComposite(getNorm16(composite)); + } else { + compositionsList=-1; + } - /* start with an empty set for [has canonical decomposition] */ - hasDecomp=new UnicodeSet(); + // We combined; continue with looking for compositions. + continue; + } + } - /* iterate over all ideographs and remember which canonically decompose */ - UnicodeSetIterator it = new UnicodeSetIterator(set); - int start, end; - long norm32; + // no combination this time + prevCC=cc; + if(p==sb.length()) { + break; + } - while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) { - start=it.codepoint; - end=it.codepointEnd; - while(start<=end) { - norm32 = getNorm32(start); - if((norm32 & QC_NFD)>0) { - hasDecomp.add(start); + // If c did not combine, then check if it is a starter. + if(cc==0) { + // Found a new starter. + if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { + // It may combine with something, prepare for it. + if(c<=0xffff) { + starterIsSupplementary=false; + starter=p-1; + } else { + starterIsSupplementary=true; + starter=p-2; } - ++start; } + } else if(onlyContiguous) { + // FCC: no discontiguous compositions; any intervening character blocks. + compositionsList=-1; } - - /* hasDecomp now contains all ideographs that decompose canonically */ - nxCache[NX_CJK_COMPAT]=hasDecomp; - } - - return nxCache[NX_CJK_COMPAT]; + buffer.flush(); } - private static final synchronized UnicodeSet internalGetNXUnicode(int options) { - options &= OPTIONS_UNICODE_MASK; - if(options==0) { - return null; + /** + * Does c have a composition boundary before it? + * True if its decomposition begins with a character that has + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes + * (isCompYesAndZeroCC()) so we need not decompose. + */ + private boolean hasCompBoundaryBefore(int c, int norm16) { + for(;;) { + if(isCompYesAndZeroCC(norm16)) { + return true; + } else if(isMaybeOrNonZeroCC(norm16)) { + return false; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + norm16=getNorm16(c); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + return false; + } + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { + return false; // non-zero leadCC + } + return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); + } } + } - if(nxCache[options]==null) { - /* build a set with all code points that were not designated by the specified Unicode version */ - UnicodeSet set = new UnicodeSet(); - - switch(options) { - case NormalizerBase.UNICODE_3_2: - set.applyPattern("[:^Age=3.2:]"); + private int findPreviousCompBoundary(CharSequence s, int p) { + while(p>0) { + int c=Character.codePointBefore(s, p); + p-=Character.charCount(c); + if(hasCompBoundaryBefore(c)) { break; - default: - return null; } - - nxCache[options]=set; + // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, + // but that's probably not worth the extra cost. } - - return nxCache[options]; + return p; } - /* Get a decomposition exclusion set. The data must be loaded. */ - private static final synchronized UnicodeSet internalGetNX(int options) { - options&=OPTIONS_SETS_MASK; - - if(nxCache[options]==null) { - /* return basic sets */ - if(options==NX_HANGUL) { - return internalGetNXHangul(); - } - if(options==NX_CJK_COMPAT) { - return internalGetNXCJKCompat(); - } - if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) { - return internalGetNXUnicode(options); - } - - /* build a set from multiple subsets */ - UnicodeSet set; - UnicodeSet other; - - set=new UnicodeSet(); - - - if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) { - set.addAll(other); - } - if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) { - set.addAll(other); - } - if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) { - set.addAll(other); + private int findNextCompBoundary(CharSequence s, int p, int limit) { + while(p= 0x0009 && c <= 0x000D) || @@ -2503,45 +1756,42 @@ } public static String canonicalDecomposeWithSingleQuotation(String string) { - char[] src = string.toCharArray(); - int srcIndex = 0; - int srcLimit = src.length; - char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 - int destIndex = 0; - int destLimit = dest.length; + Normalizer2 impl = Normalizer2.getNFDInstance(); + char[] src = string.toCharArray(); + int srcIndex = 0; + int srcLimit = src.length; + char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 + int destIndex = 0; + int destLimit = dest.length; - char[] buffer = new char[3]; int prevSrc; - long norm32; - int ccOrQCMask; - int qcMask = QC_NFD; + String norm; int reorderStartIndex, length; - char c, c2; - char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE]; + char c1, c2; + int cp; + int minNoMaybe = 0x00c0; int cc, prevCC, trailCC; char[] p; int pStart; - // initialize - ccOrQCMask = CC_MASK | qcMask; reorderStartIndex = 0; prevCC = 0; - norm32 = 0; - c = 0; + norm = null; + cp = 0; pStart = 0; cc = trailCC = -1; // initialize to bogus value - for(;;) { + c1 = 0; + for (;;) { prevSrc=srcIndex; //quick check (1)less than minNoMaybe (2)no decomp (3)hangual while (srcIndex != srcLimit && - (( c = src[srcIndex]) < minNoMaybe || - ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || - ( c >= '\uac00' && c <= '\ud7a3'))){ - + ((c1 = src[srcIndex]) < minNoMaybe || + (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || + (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables prevCC = 0; - ++srcIndex; + srcIndex += (cp < 0x10000) ? 1 : 2; } // copy these code units all at once @@ -2556,47 +1806,43 @@ } // end of source reached? - if(srcIndex == srcLimit) { + if (srcIndex == srcLimit) { break; } - // c already contains *src and norm32 is set for it, increment src - ++srcIndex; - if(isNorm32Regular(norm32)) { + // cp already contains *src and norm32 is set for it, increment src + srcIndex += (cp < 0x10000) ? 1 : 2; + + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { c2 = 0; length = 1; - } else { - // c is a lead surrogate, get the real norm32 - if(srcIndex != srcLimit && - Character.isLowSurrogate(c2 = src[srcIndex])) { - ++srcIndex; - length = 2; - norm32 = getNorm32FromSurrogatePair(norm32, c2); - } else { - c2 = 0; - length = 1; - norm32 = 0; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; } + } else { + length = 2; + c2 = src[srcIndex-1]; } - // get the decomposition and the lead and trail cc's - if((norm32 & qcMask) == 0) { - // c does not decompose - cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT)); - p = null; - pStart = -1; - } else { - DecomposeArgs arg = new DecomposeArgs(); - // c decomposes, get everything from the variable-length - // extra data - pStart = decompose(norm32, qcMask, arg); - p = extraData; - length = arg.length; - cc = arg.cc; - trailCC = arg.trailCC; - if(length == 1) { + // get the decomposition and the lead and trail cc's + if (norm == null) { + // cp does not decompose + cc = trailCC = UCharacter.getCombiningClass(cp); + p = null; + pStart = -1; + } else { + + pStart = 0; + p = norm.toCharArray(); + length = p.length; + int cpNum = norm.codePointCount(0, length); + cc= UCharacter.getCombiningClass(norm.codePointAt(0)); + trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); + if (length == 1) { // fastpath a single code unit from decomposition - c = p[pStart]; + c1 = p[pStart]; c2 = 0; p = null; pStart = -1; @@ -2610,27 +1856,28 @@ dest = tmpBuf; destLimit = dest.length; } + // append the decomposition to the destination buffer, assume length>0 { int reorderSplit = destIndex; - if(p == null) { + if (p == null) { // fastpath: single code point - if (needSingleQuotation(c)) { + if (needSingleQuotation(c1)) { //if we need single quotation, no need to consider "prevCC" //and it must NOT be a supplementary pair dest[destIndex++] = '\''; - dest[destIndex++] = c; + dest[destIndex++] = c1; dest[destIndex++] = '\''; trailCC = 0; } else if(cc != 0 && cc < prevCC) { - // (c, c2) is out of order with respect to the preceding + // (c1, c2) is out of order with respect to the preceding // text destIndex += length; - trailCC = insertOrdered(dest,reorderStartIndex, - reorderSplit, destIndex, c, c2, cc); + trailCC = insertOrdered(dest, reorderStartIndex, + reorderSplit, destIndex, c1, c2, cc); } else { - // just append (c, c2) - dest[destIndex++] = c; + // just append (c1, c2) + dest[destIndex++] = c1; if(c2 != 0) { dest[destIndex++] = c2; } @@ -2646,16 +1893,16 @@ do { dest[destIndex++] = p[pStart++]; } while(--length > 0); - } else - if(cc != 0 && cc < prevCC) { + } else if (cc != 0 && cc < prevCC) { destIndex += length; - trailCC = mergeOrdered(dest,reorderStartIndex, - reorderSplit,p, pStart,pStart+length); + trailCC = mergeOrdered(dest, reorderStartIndex, + reorderSplit, p, pStart, + pStart+length); } else { // just append the decomposition do { dest[destIndex++] = p[pStart++]; - } while(--length > 0); + } while (--length > 0); } } } @@ -2664,73 +1911,245 @@ reorderStartIndex = destIndex; } } + return new String(dest, 0, destIndex); } - //------------------------------------------------------ - // mapping method for IDNA/StringPrep - //------------------------------------------------------ - - /* - * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode - * 3.2 normalization with Corrigendum 4 corrections. However, normalization - * without the corrections is necessary for IDNA/StringPrep support. - * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option - * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five - * characters in Corrigendum 4 before normalization in order to avoid - * incorrect normalization. - * For the Corrigendum 4 issue, refer - * http://www.unicode.org/versions/corrigendum4.html + /** + * simpler, single-character version of mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at src[current]..src[p] + * + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) + * + * before: src[start]..src[current] is already ordered, and + * src[current]..src[p] may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: src[start]..src[p] is ordered + * + * @return the trailing combining class */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; - /* - * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL. - */ - public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000; + if (start=prevCC + preBack=back=current; - private static final char[][] corrigendum4MappingTable = { - {'\uD844', '\uDF6A'}, // 0x2F868 - {'\u5F33'}, // 0x2F874 - {'\u43AB'}, // 0x2F91F - {'\u7AAE'}, // 0x2F95F - {'\u4D57'}}; // 0x2F9BF + PrevArgs prevArgs = new PrevArgs(); + prevArgs.current = current; + prevArgs.start = start; + prevArgs.src = source; + prevArgs.c1 = c1; + prevArgs.c2 = c2; - /* - * Removing Corrigendum 4 fix - * @return normalized text - */ - public static String convert(String str) { - if (str == null) { - return null; + // get the prevCC + prevCC=getPrevCC(prevArgs); + preBack = prevArgs.current; + + if(cc=prevCC) { + break; + } + back=preBack; + } + + // this is where we are right now with all these indicies: + // [start]..[pPreBack] 0..? code points that we can ignore + // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc + // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) + // [current]..[p] 1 code point (c, c2) with cc + + // move the code units in between up + r=p; + do { + source[--r]=source[--current]; + } while (back!=current); + } } - int ch = UCharacterIterator.DONE; - StringBuffer dest = new StringBuffer(); - UCharacterIterator iter = UCharacterIterator.getInstance(str); - - while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ - switch (ch) { - case 0x2F868: - dest.append(corrigendum4MappingTable[0]); - break; - case 0x2F874: - dest.append(corrigendum4MappingTable[1]); - break; - case 0x2F91F: - dest.append(corrigendum4MappingTable[2]); - break; - case 0x2F95F: - dest.append(corrigendum4MappingTable[3]); - break; - case 0x2F9BF: - dest.append(corrigendum4MappingTable[4]); - break; - default: - UTF16.append(dest,ch); - break; + // insert (c1, c2) + source[current] = c1; + if (c2!=0) { + source[(current+1)] = c2; + } + + // we know the cc of the last code point + return trailCC; + } + + /** + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done + * in-place if the two strings are not adjacent, then the buffer holding the + * first one must be large enough + * the second string may or may not be ordered in itself + * + * before: [start]..[current] is already ordered, and + * [next]..[limit] may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from + * src[next++] and inserts them in correct combining class order into the + * preceding part of the string + * + * since this function is called much less often than the single-code point + * insertOrdered(), it just uses that for easier maintenance + * + * @return the trailing combining class + */ + private static int /*unsigned byte*/ mergeOrdered(char[] source, + int start, + int current, + char[] data, + int next, + int limit) { + int r; + int /*unsigned byte*/ cc, trailCC=0; + boolean adjacent; + + adjacent= current==next; + NextCCArgs ncArgs = new NextCCArgs(); + ncArgs.source = data; + ncArgs.next = next; + ncArgs.limit = limit; + + if(start!=current) { + + while(ncArgs.next