< prev index next >
jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java
Print this page
*** 1,7 ****
/*
! * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
--- 1,7 ----
/*
! * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
*** 20,2549 ****
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
! * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
! * *
! * The original version of this source code and documentation is copyrighted *
! * and owned by IBM, These materials are provided under terms of a License *
! * Agreement between IBM and Sun. This technology is protected by multiple *
! * US and International patents. This notice and attribution to IBM may not *
! * to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
import java.io.IOException;
! import java.io.BufferedInputStream;
! import java.io.InputStream;
! /**
! * @author Ram Viswanadha
! */
public final class NormalizerImpl {
- // Static block for the class to initialize its own self
- static final NormalizerImpl IMPL;
-
- static
- {
- try
- {
- IMPL = new NormalizerImpl();
- }
- catch (Exception e)
- {
- throw new RuntimeException(e.getMessage());
- }
- }
-
- static final int UNSIGNED_BYTE_MASK =0xFF;
- static final long UNSIGNED_INT_MASK = 0xffffffffL;
- /*
- * This new implementation of the normalization code loads its data from
- * unorm.icu, which is generated with the gennorm tool.
- * The format of that file is described at the end of this file.
- */
- private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
-
- // norm32 value constants
-
- // quick check flags 0..3 set mean "no" for their forms
- public static final int QC_NFC=0x11; /* no|maybe */
- public static final int QC_NFKC=0x22; /* no|maybe */
- public static final int QC_NFD=4; /* no */
- public static final int QC_NFKD=8; /* no */
-
- public static final int QC_ANY_NO=0xf;
-
- /* quick check flags 4..5 mean "maybe" for their forms;
- * test flags>=QC_MAYBE
- */
- public static final int QC_MAYBE=0x10;
- public static final int QC_ANY_MAYBE=0x30;
-
- public static final int QC_MASK=0x3f;
-
- private static final int COMBINES_FWD=0x40;
- private static final int COMBINES_BACK=0x80;
- public static final int COMBINES_ANY=0xc0;
- // UnicodeData.txt combining class in bits 15..8.
- private static final int CC_SHIFT=8;
- public static final int CC_MASK=0xff00;
- // 16 bits for the index to UChars and other extra data
- private static final int EXTRA_SHIFT=16;
-
- /* norm32 value constants using >16 bits */
- private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK;
- private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK;
- private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK;
- // private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK;
- private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK;
-
-
- /* indexes[] value names */
- /* number of bytes in normalization trie */
- static final int INDEX_TRIE_SIZE = 0;
- /* number of chars in extra data */
- static final int INDEX_CHAR_COUNT = 1;
- /* number of uint16_t words for combining data */
- static final int INDEX_COMBINE_DATA_COUNT = 2;
- /* first code point with quick check NFC NO/MAYBE */
- public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
- /* first code point with quick check NFKC NO/MAYBE */
- public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
- /* first code point with quick check NFD NO/MAYBE */
- public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
- /* first code point with quick check NFKD NO/MAYBE */
- public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
- /* number of bytes in FCD trie */
- static final int INDEX_FCD_TRIE_SIZE = 10;
- /* number of bytes in the auxiliary trie */
- static final int INDEX_AUX_TRIE_SIZE = 11;
- /* changing this requires a new formatVersion */
- static final int INDEX_TOP = 32;
-
-
- /* AUX constants */
- /* value constants for auxTrie */
- private static final int AUX_UNSAFE_SHIFT = 11;
- private static final int AUX_COMP_EX_SHIFT = 10;
- private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
-
- private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT;
- private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
- private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
- private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
- private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
-
- private static final int MAX_BUFFER_SIZE = 20;
-
- /*******************************/
-
- /* Wrappers for Trie implementations */
- static final class NormTrieImpl implements Trie.DataManipulate{
- static IntTrie normTrie= null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param value data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
- public int getFoldingOffset(int value){
- return BMP_INDEX_LENGTH+
- ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
- (0x3ff<<SURROGATE_BLOCK_BITS));
- }
-
- }
- static final class FCDTrieImpl implements Trie.DataManipulate{
- static CharTrie fcdTrie=null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param value data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* fcdTrie: the folding offset is the lead FCD value itself */
- public int getFoldingOffset(int value){
- return value;
- }
- }
-
- static final class AuxTrieImpl implements Trie.DataManipulate{
- static CharTrie auxTrie = null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param value data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
- public int getFoldingOffset(int value){
- return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
- }
- }
-
- /****************************************************/
-
-
- private static FCDTrieImpl fcdTrieImpl;
- private static NormTrieImpl normTrieImpl;
- private static AuxTrieImpl auxTrieImpl;
- private static int[] indexes;
- private static char[] combiningTable;
- private static char[] extraData;
-
- private static boolean isDataLoaded;
- private static boolean isFormatVersion_2_1;
- private static boolean isFormatVersion_2_2;
- private static byte[] unicodeVersion;
-
- /**
- * Default buffer size of datafile
- */
- private static final int DATA_BUFFER_SIZE = 25000;
-
- /**
- * FCD check: everything below this code point is known to have a 0
- * lead combining class
- */
- public static final int MIN_WITH_LEAD_CC=0x300;
-
-
- /**
- * Bit 7 of the length byte for a decomposition string in extra data is
- * a flag indicating whether the decomposition string is
- * preceded by a 16-bit word with the leading and trailing cc
- * of the decomposition (like for A-umlaut);
- * if not, then both cc's are zero (like for compatibility ideographs).
- */
- private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
- /**
- * Bits 6..0 of the length byte contain the actual length.
- */
- private static final int DECOMP_LENGTH_MASK=0x7f;
-
- /** Length of the BMP portion of the index (stage 1) array. */
- private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
- /** Number of bits of a trail surrogate that are used in index table
- * lookups.
- */
- private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
-
-
- // public utility
- public static int getFromIndexesArr(int index){
- return indexes[index];
- }
-
- // private constructor -----------------------------------------------
-
- /**
- * Constructor
- * @exception IOException thrown when data reading fails or the data is corrupted
- */
- private NormalizerImpl() throws IOException {
- //data should be loaded only once
- if(!isDataLoaded){
-
- // jar access
- InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
- BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
- NormalizerDataReader reader = new NormalizerDataReader(b);
-
- // read the indexes
- indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
-
- byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
-
- int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
- combiningTable = new char[combiningTableTop];
-
- int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
- extraData = new char[extraDataTop];
-
- byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
- byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
-
- fcdTrieImpl = new FCDTrieImpl();
- normTrieImpl = new NormTrieImpl();
- auxTrieImpl = new AuxTrieImpl();
-
- // load the rest of the data and initialize the data members
- reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable);
-
- NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
- FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl );
- AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
-
- // we reached here without any exceptions, so the data is fully
- // loaded; set the variable to true
- isDataLoaded = true;
-
- // get the data format version
- byte[] formatVersion = reader.getDataFormatVersion();
-
- isFormatVersion_2_1 =( formatVersion[0]>2
- ||
- (formatVersion[0]==2 && formatVersion[1]>=1)
- );
- isFormatVersion_2_2 =( formatVersion[0]>2
- ||
- (formatVersion[0]==2 && formatVersion[1]>=2)
- );
- unicodeVersion = reader.getUnicodeVersion();
- b.close();
- }
- }
-
- /* ---------------------------------------------------------------------- */
/* Korean Hangul and Jamo constants */
-
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int HANGUL_BASE=0xac00;
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
! public static final int JAMO_T_COUNT=28;
! public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
!
! private static boolean isHangulWithoutJamoT(char c) {
! c-=HANGUL_BASE;
! return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
! }
!
! /* norm32 helpers */
!
! /* is this a norm32 with a regular index? */
! private static boolean isNorm32Regular(long norm32) {
! return norm32<MIN_SPECIAL;
! }
!
! /* is this a norm32 with a special index for a lead surrogate? */
! private static boolean isNorm32LeadSurrogate(long norm32) {
! return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
! }
!
! /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
! private static boolean isNorm32HangulOrJamo(long norm32) {
! return norm32>=MIN_HANGUL;
! }
!
! /*
! * Given norm32 for Jamo V or T,
! * is this a Jamo V?
! */
! private static boolean isJamoVTNorm32JamoV(long norm32) {
! return norm32<JAMO_V_TOP;
! }
!
! /* data access primitives ----------------------------------------------- */
!
! public static long/*unsigned*/ getNorm32(char c) {
! return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
! }
!
! public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
! char c2) {
! /*
! * the surrogate index in norm32 stores only the number of the surrogate
! * index block see gennorm/store.c/getFoldedNormValue()
! */
! return ((UNSIGNED_INT_MASK) &
! NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
! }
! ///CLOVER:OFF
! private static long getNorm32(int c){
! return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
! }
!
! /*
! * get a norm32 from text with complete code points
! * (like from decompositions)
! */
! private static long/*unsigned*/ getNorm32(char[] p,int start,
! int/*unsigned*/ mask) {
! long/*unsigned*/ norm32= getNorm32(p[start]);
! if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
! /* *p is a lead surrogate, get the real norm32 */
! norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
! }
! return norm32;
! }
!
! //// for StringPrep
! public static VersionInfo getUnicodeVersion(){
! return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
! unicodeVersion[2], unicodeVersion[3]);
! }
!
! public static char getFCD16(char c) {
! return FCDTrieImpl.fcdTrie.getLeadValue(c);
! }
!
! public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
! /* the surrogate index in fcd16 is an absolute offset over the
! * start of stage 1
! * */
! return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
! }
! public static int getFCD16(int c) {
! return FCDTrieImpl.fcdTrie.getCodePointValue(c);
! }
!
! private static int getExtraDataIndex(long norm32) {
! return (int)(norm32>>EXTRA_SHIFT);
! }
!
! private static final class DecomposeArgs{
! int /*unsigned byte*/ cc;
! int /*unsigned byte*/ trailCC;
! int length;
! }
! /**
! *
! * get the canonical or compatibility decomposition for one character
! *
! * @return index into the extraData array
! */
! private static int/*index*/ decompose(long/*unsigned*/ norm32,
! int/*unsigned*/ qcMask,
! DecomposeArgs args) {
! int p= getExtraDataIndex(norm32);
! args.length=extraData[p++];
!
! if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
! /* use compatibility decomposition, skip canonical data */
! p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
! args.length>>=8;
! }
!
! if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
! /* get the lead and trail cc's */
! char bothCCs=extraData[p++];
! args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
! args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
! } else {
! /* lead and trail cc's are both 0 */
! args.cc=args.trailCC=0;
! }
!
! args.length&=DECOMP_LENGTH_MASK;
! return p;
! }
!
!
! /**
! * get the canonical decomposition for one character
! * @return index into the extraData array
! */
! private static int decompose(long/*unsigned*/ norm32,
! DecomposeArgs args) {
!
! int p= getExtraDataIndex(norm32);
! args.length=extraData[p++];
!
! if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
! /* get the lead and trail cc's */
! char bothCCs=extraData[p++];
! args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
! args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
! } else {
! /* lead and trail cc's are both 0 */
! args.cc=args.trailCC=0;
! }
!
! args.length&=DECOMP_LENGTH_MASK;
! return p;
! }
!
!
! private static final class NextCCArgs{
! char[] source;
! int next;
! int limit;
! char c;
! char c2;
! }
!
! /*
! * get the combining class of (c, c2)= args.source[args.next++]
! * before: args.next<args.limit after: args.next<=args.limit
! * if only one code unit is used, then c2==0
! */
! private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
! long /*unsigned*/ norm32;
!
! args.c=args.source[args.next++];
!
! norm32= getNorm32(args.c);
! if((norm32 & CC_MASK)==0) {
! args.c2=0;
! return 0;
! } else {
! if(!isNorm32LeadSurrogate(norm32)) {
! args.c2=0;
! } else {
! /* c is a lead surrogate, get the real norm32 */
! if(args.next!=args.limit &&
! UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
! ++args.next;
! norm32=getNorm32FromSurrogatePair(norm32, args.c2);
! } else {
! args.c2=0;
! return 0;
! }
! }
!
! return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
! }
! }
!
! private static final class PrevArgs{
! char[] src;
! int start;
! int current;
! char c;
! char c2;
! }
!
! /*
! * read backwards and get norm32
! * return 0 if the character is <minC
! * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
! * surrogate but read second!)
! */
! private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
! int/*unsigned*/ minC,
! int/*unsigned*/ mask) {
! long/*unsigned*/ norm32;
!
! args.c=args.src[--args.current];
! args.c2=0;
!
! /* check for a surrogate before getting norm32 to see if we need to
! * predecrement further
! */
! if(args.c<minC) {
! return 0;
! } else if(!UTF16.isSurrogate(args.c)) {
! return getNorm32(args.c);
! } else if(UTF16.isLeadSurrogate(args.c)) {
! /* unpaired first surrogate */
! return 0;
! } else if(args.current!=args.start &&
! UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
! --args.current;
! norm32=getNorm32(args.c2);
!
! if((norm32&mask)==0) {
! /* all surrogate pairs with this lead surrogate have
! * only irrelevant data
! */
! return 0;
! } else {
! /* norm32 must be a surrogate special */
! return getNorm32FromSurrogatePair(norm32, args.c);
! }
! } else {
! /* unpaired second surrogate */
! args.c2=0;
! return 0;
! }
! }
!
! /*
! * get the combining class of (c, c2)=*--p
! * before: start<p after: start<=p
! */
! private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
!
! return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
! CC_MASK)>>CC_SHIFT));
! }
!
! /*
! * is this a safe boundary character for NF*D?
! * (lead cc==0)
! */
! public static boolean isNFDSafe(long/*unsigned*/ norm32,
! int/*unsigned*/ccOrQCMask,
! int/*unsigned*/ decompQCMask) {
! if((norm32&ccOrQCMask)==0) {
! return true; /* cc==0 and no decomposition: this is NF*D safe */
! }
!
! /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
! if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
! DecomposeArgs args=new DecomposeArgs();
! /* decomposes, get everything from the variable-length extra data */
! decompose(norm32, decompQCMask, args);
! return args.cc==0;
! } else {
! /* no decomposition (or Hangul), test the cc directly */
! return (norm32&CC_MASK)==0;
! }
! }
!
! /*
! * is this (or does its decomposition begin with) a "true starter"?
! * (cc==0 and NF*C_YES)
! */
! public static boolean isTrueStarter(long/*unsigned*/ norm32,
! int/*unsigned*/ ccOrQCMask,
! int/*unsigned*/ decompQCMask) {
! if((norm32&ccOrQCMask)==0) {
! return true; /* this is a true starter (could be Hangul or Jamo L)*/
! }
!
! /* inspect its decomposition - not a Hangul or a surrogate here */
! if((norm32&decompQCMask)!=0) {
! int p; /* index into extra data array */
! DecomposeArgs args=new DecomposeArgs();
! /* decomposes, get everything from the variable-length extra data */
! p=decompose(norm32, decompQCMask, args);
!
! if(args.cc==0) {
! int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
!
! /* does it begin with NFC_YES? */
! if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
! /* yes, the decomposition begins with a true starter */
! return true;
! }
! }
! }
! return false;
! }
!
! /* reorder UTF-16 in-place ---------------------------------------------- */
!
! /**
! * simpler, single-character version of mergeOrdered() -
! * bubble-insert one single code point into the preceding string
! * which is already canonically ordered
! * (c, c2) may or may not yet have been inserted at src[current]..src[p]
! *
! * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
! *
! * before: src[start]..src[current] is already ordered, and
! * src[current]..src[p] may or may not hold (c, c2) but
! * must be exactly the same length as (c, c2)
! * after: src[start]..src[p] is ordered
! *
! * @return the trailing combining class
! */
! private static int/*unsigned byte*/ insertOrdered(char[] source,
! int start,
! int current, int p,
! char c, char c2,
! int/*unsigned byte*/ cc) {
! int back, preBack;
! int r;
! int prevCC, trailCC=cc;
!
! if(start<current && cc!=0) {
! // search for the insertion point where cc>=prevCC
! preBack=back=current;
! PrevArgs prevArgs = new PrevArgs();
! prevArgs.current = current;
! prevArgs.start = start;
! prevArgs.src = source;
! // get the prevCC
! prevCC=getPrevCC(prevArgs);
! preBack = prevArgs.current;
!
! if(cc<prevCC) {
! // this will be the last code point, so keep its cc
! trailCC=prevCC;
! back=preBack;
! while(start<preBack) {
! prevCC=getPrevCC(prevArgs);
! preBack=prevArgs.current;
! if(cc>=prevCC) {
! break;
! }
! back=preBack;
! }
!
!
! // this is where we are right now with all these indices:
! // [start]..[preBack] 0..? code points that we can ignore
! // [preBack]..[back] 0..1 code points with prevCC<=cc
! // [back]..[current] 0..n code points with >cc, move up to insert (c, c2)
! // [current]..[p] 1 code point (c, c2) with cc
!
! // move the code units in between up
! r=p;
! do {
! source[--r]=source[--current];
! } while(back!=current);
! }
! }
!
! // insert (c, c2)
! source[current]=c;
! if(c2!=0) {
! source[(current+1)]=c2;
! }
!
! // we know the cc of the last code point
! return trailCC;
! }
!
! /**
! * merge two UTF-16 string parts together
! * to canonically order (order by combining classes) their concatenation
! *
! * the two strings may already be adjacent, so that the merging is done
! * in-place; if the two strings are not adjacent, then the buffer holding the
! * first one must be large enough
! * the second string may or may not be ordered in itself
! *
! * before: [start]..[current] is already ordered, and
! * [next]..[limit] may be ordered in itself, but
! * is not in relation to [start..current[
! * after: [start..current+(limit-next)[ is ordered
! *
! * the algorithm is a simple bubble-sort that takes the characters from
! * src[next++] and inserts them in correct combining class order into the
! * preceding part of the string
! *
! * since this function is called much less often than the single-code point
! * insertOrdered(), it just uses that for easier maintenance
! *
! * @return the trailing combining class
! */
! private static int /*unsigned byte*/ mergeOrdered(char[] source,
! int start,
! int current,
! char[] data,
! int next,
! int limit,
! boolean isOrdered) {
! int r;
! int /*unsigned byte*/ cc, trailCC=0;
! boolean adjacent;
!
! adjacent= current==next;
! NextCCArgs ncArgs = new NextCCArgs();
! ncArgs.source = data;
! ncArgs.next = next;
! ncArgs.limit = limit;
!
! if(start!=current || !isOrdered) {
!
! while(ncArgs.next<ncArgs.limit) {
! cc=getNextCC(ncArgs);
! if(cc==0) {
! // does not bubble back
! trailCC=0;
! if(adjacent) {
! current=ncArgs.next;
! } else {
! data[current++]=ncArgs.c;
! if(ncArgs.c2!=0) {
! data[current++]=ncArgs.c2;
}
}
! if(isOrdered) {
! break;
} else {
! start=current;
}
! } else {
! r=current+(ncArgs.c2==0 ? 1 : 2);
! trailCC=insertOrdered(source,start, current, r,
! ncArgs.c, ncArgs.c2, cc);
! current=r;
}
}
}
! if(ncArgs.next==ncArgs.limit) {
! // we know the cc of the last code point
! return trailCC;
} else {
! if(!adjacent) {
! // copy the second string part
! do {
! source[current++]=data[ncArgs.next++];
! } while(ncArgs.next!=ncArgs.limit);
! ncArgs.limit=current;
}
! PrevArgs prevArgs = new PrevArgs();
! prevArgs.src = data;
! prevArgs.start = start;
! prevArgs.current = ncArgs.limit;
! return getPrevCC(prevArgs);
}
!
}
- private static int /*unsigned byte*/ mergeOrdered(char[] source,
- int start,
- int current,
- char[] data,
- final int next,
- final int limit) {
- return mergeOrdered(source,start,current,data,next,limit,true);
- }
-
- public static NormalizerBase.QuickCheckResult quickCheck(char[] src,
- int srcStart,
- int srcLimit,
- int minNoMaybe,
- int qcMask,
- int options,
- boolean allowMaybe,
- UnicodeSet nx){
-
- int ccOrQCMask;
- long norm32;
- char c, c2;
- char cc, prevCC;
- long qcNorm32;
- NormalizerBase.QuickCheckResult result;
- ComposePartArgs args = new ComposePartArgs();
- char[] buffer ;
- int start = srcStart;
-
- if(!isDataLoaded) {
- return NormalizerBase.MAYBE;
}
- // initialize
- ccOrQCMask=CC_MASK|qcMask;
- result=NormalizerBase.YES;
- prevCC=0;
! for(;;) {
! for(;;) {
! if(srcStart==srcLimit) {
! return result;
! } else if((c=src[srcStart++])>=minNoMaybe &&
! (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
! break;
! }
! prevCC=0;
! }
! // check one above-minimum, relevant code unit
! if(isNorm32LeadSurrogate(norm32)) {
! // c is a lead surrogate, get the real norm32
! if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
! ++srcStart;
! norm32=getNorm32FromSurrogatePair(norm32,c2);
! } else {
! norm32=0;
! c2=0;
! }
! }else{
! c2=0;
! }
! if(nx_contains(nx, c, c2)) {
! /* excluded: norm32==0 */
! norm32=0;
! }
!
! // check the combining order
! cc=(char)((norm32>>CC_SHIFT)&0xFF);
! if(cc!=0 && cc<prevCC) {
! return NormalizerBase.NO;
}
- prevCC=cc;
-
- // check for "no" or "maybe" quick check flags
- qcNorm32 = norm32 & qcMask;
- if((qcNorm32& QC_ANY_NO)>=1) {
- result= NormalizerBase.NO;
- break;
- } else if(qcNorm32!=0) {
- // "maybe" can only occur for NFC and NFKC
- if(allowMaybe){
- result=NormalizerBase.MAYBE;
- }else{
- // normalize a section around here to see if it is really
- // normalized or not
- int prevStarter;
- int/*unsigned*/ decompQCMask;
-
- decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
! // find the previous starter
!
! // set prevStarter to the beginning of the current character
! prevStarter=srcStart-1;
! if(UTF16.isTrailSurrogate(src[prevStarter])) {
! // safe because unpaired surrogates do not result
! // in "maybe"
! --prevStarter;
}
- prevStarter=findPreviousStarter(src, start, prevStarter,
- ccOrQCMask, decompQCMask,
- (char)minNoMaybe);
-
- // find the next true starter in [src..limit[ - modifies
- // src to point to the next starter
- srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
- decompQCMask,(char) minNoMaybe);
-
- //set the args for compose part
- args.prevCC = prevCC;
-
- // decompose and recompose [prevStarter..src[
- buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
! // compare the normalized version with the original
! if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
! result=NormalizerBase.NO; // normalization differs
! break;
}
! // continue after the next starter
}
}
}
! return result;
}
-
-
- //------------------------------------------------------
- // make NFD & NFKD
- //------------------------------------------------------
-
- public static int decompose(char[] src,int srcStart,int srcLimit,
- char[] dest,int destStart,int destLimit,
- boolean compat,int[] outTrailCC,
- UnicodeSet nx) {
-
- char[] buffer = new char[3];
- int prevSrc;
- long norm32;
- int ccOrQCMask, qcMask;
- int reorderStartIndex, length;
- char c, c2, minNoMaybe;
- int/*unsigned byte*/ cc, prevCC, trailCC;
- char[] p;
- int pStart;
- int destIndex = destStart;
- int srcIndex = srcStart;
- if(!compat) {
- minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
- qcMask=QC_NFD;
- } else {
- minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
- qcMask=QC_NFKD;
}
-
- /* initialize */
- ccOrQCMask=CC_MASK|qcMask;
- reorderStartIndex=0;
- prevCC=0;
- norm32=0;
- c=0;
- pStart=0;
-
- cc=trailCC=-1;//initialize to bogus value
-
- for(;;) {
- /* count code units below the minimum or with irrelevant data for
- * the quick check
- */
- prevSrc=srcIndex;
-
- while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
- ((norm32=getNorm32(c))&ccOrQCMask)==0)){
- prevCC=0;
- ++srcIndex;
}
! /* copy these code units all at once */
! if(srcIndex!=prevSrc) {
! length=srcIndex-prevSrc;
! if((destIndex+length)<=destLimit) {
! System.arraycopy(src,prevSrc,dest,destIndex,length);
}
! destIndex+=length;
! reorderStartIndex=destIndex;
}
! /* end of source reached? */
! if(srcIndex==srcLimit) {
! break;
}
!
! /* c already contains *src and norm32 is set for it, increment src*/
! ++srcIndex;
!
! /* check one above-minimum, relevant code unit */
! /*
! * generally, set p and length to the decomposition string;
! * in simple cases, p==NULL and (c, c2) will hold the length code
! * units to append; in all cases, set cc to the lead and trailCC to
! * the trail combining class
! *
! * the following merge-sort of the current character into the
! * preceding, canonically ordered result text will use the
! * optimized insertOrdered()
! * if there is only one single code point to process;
! * this is indicated with p==NULL, and (c, c2) is the character to
! * insert
! * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
! * for a supplementary character)
! * otherwise, p[length] is merged in with _mergeOrdered()
! */
! if(isNorm32HangulOrJamo(norm32)) {
! if(nx_contains(nx, c)) {
! c2=0;
! p=null;
! length=1;
! } else {
! // Hangul syllable: decompose algorithmically
! p=buffer;
! pStart=0;
! cc=trailCC=0;
!
! c-=HANGUL_BASE;
!
! c2=(char)(c%JAMO_T_COUNT);
! c/=JAMO_T_COUNT;
! if(c2>0) {
! buffer[2]=(char)(JAMO_T_BASE+c2);
! length=3;
! } else {
! length=2;
}
! buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
! buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
! }
! } else {
! if(isNorm32Regular(norm32)) {
! c2=0;
! length=1;
! } else {
! // c is a lead surrogate, get the real norm32
! if(srcIndex!=srcLimit &&
! UTF16.isTrailSurrogate(c2=src[srcIndex])) {
! ++srcIndex;
! length=2;
! norm32=getNorm32FromSurrogatePair(norm32, c2);
! } else {
! c2=0;
! length=1;
! norm32=0;
}
}
! /* get the decomposition and the lead and trail cc's */
! if(nx_contains(nx, c, c2)) {
! /* excluded: norm32==0 */
! cc=trailCC=0;
! p=null;
! } else if((norm32&qcMask)==0) {
! /* c does not decompose */
! cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
! p=null;
! pStart=-1;
! } else {
! DecomposeArgs arg = new DecomposeArgs();
! /* c decomposes, get everything from the variable-length
! * extra data
! */
! pStart=decompose(norm32, qcMask, arg);
! p=extraData;
! length=arg.length;
! cc=arg.cc;
! trailCC=arg.trailCC;
! if(length==1) {
! /* fastpath a single code unit from decomposition */
! c=p[pStart];
! c2=0;
! p=null;
! pStart=-1;
}
}
}
! /* append the decomposition to the destination buffer, assume
! * length>0
! */
! if((destIndex+length)<=destLimit) {
! int reorderSplit=destIndex;
! if(p==null) {
! /* fastpath: single code point */
! if(cc!=0 && cc<prevCC) {
! /* (c, c2) is out of order with respect to the preceding
! * text
*/
! destIndex+=length;
! trailCC=insertOrdered(dest,reorderStartIndex,
! reorderSplit, destIndex, c, c2, cc);
} else {
! /* just append (c, c2) */
! dest[destIndex++]=c;
! if(c2!=0) {
! dest[destIndex++]=c2;
}
}
! } else {
! /* general: multiple code points (ordered by themselves)
! * from decomposition
! */
! if(cc!=0 && cc<prevCC) {
! /* the decomposition is out of order with respect to the
! * preceding text
! */
! destIndex+=length;
! trailCC=mergeOrdered(dest,reorderStartIndex,
! reorderSplit,p, pStart,pStart+length);
! } else {
! /* just append the decomposition */
! do {
! dest[destIndex++]=p[pStart++];
! } while(--length>0);
}
}
! } else {
! /* buffer overflow */
! /* keep incrementing the destIndex for preflighting */
! destIndex+=length;
}
! prevCC=trailCC;
! if(prevCC==0) {
! reorderStartIndex=destIndex;
}
}
-
- outTrailCC[0]=prevCC;
-
- return destIndex - destStart;
}
-
- /* make NFC & NFKC ------------------------------------------------------ */
- private static final class NextCombiningArgs{
- char[] source;
- int start;
- //int limit;
- char c;
- char c2;
- int/*unsigned*/ combiningIndex;
- char /*unsigned byte*/ cc;
}
! /* get the composition properties of the next character */
! private static int /*unsigned*/ getNextCombining(NextCombiningArgs args,
! int limit,
! UnicodeSet nx) {
! long/*unsigned*/ norm32;
! int combineFlags;
! /* get properties */
! args.c=args.source[args.start++];
! norm32=getNorm32(args.c);
!
! /* preset output values for most characters */
! args.c2=0;
! args.combiningIndex=0;
! args.cc=0;
! if((norm32&(CC_MASK|COMBINES_ANY))==0) {
return 0;
! } else {
! if(isNorm32Regular(norm32)) {
! /* set cc etc. below */
! } else if(isNorm32HangulOrJamo(norm32)) {
! /* a compatibility decomposition contained Jamos */
! args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
! (norm32>>EXTRA_SHIFT)));
! return (int)(norm32&COMBINES_ANY);
! } else {
! /* c is a lead surrogate, get the real norm32 */
! if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
! args.source[args.start])) {
! ++args.start;
! norm32=getNorm32FromSurrogatePair(norm32, args.c2);
! } else {
! args.c2=0;
return 0;
}
}
! if(nx_contains(nx, args.c, args.c2)) {
! return 0; /* excluded: norm32==0 */
}
! args.cc= (char)((norm32>>CC_SHIFT)&0xff);
! combineFlags=(int)(norm32&COMBINES_ANY);
! if(combineFlags!=0) {
! int index = getExtraDataIndex(norm32);
! args.combiningIndex=index>0 ? extraData[(index-1)] :0;
}
! return combineFlags;
}
}
! /*
! * given a composition-result starter (c, c2) - which means its cc==0,
! * it combines forward, it has extra data, its norm32!=0,
! * it is not a Hangul or Jamo,
! * get just its combineFwdIndex
! *
! * norm32(c) is special if and only if c2!=0
! */
! private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
! long/*unsigned*/ norm32;
! norm32=getNorm32(c);
! if(c2!=0) {
! norm32=getNorm32FromSurrogatePair(norm32, c2);
}
! return extraData[(getExtraDataIndex(norm32)-1)];
}
! /*
! * Find the recomposition result for
! * a forward-combining character
! * (specified with a pointer to its part of the combiningTable[])
! * and a backward-combining character
! * (specified with its combineBackIndex).
! *
! * If these two characters combine, then set (value, value2)
! * with the code unit(s) of the composition character.
! *
! * Return value:
! * 0 do not combine
! * 1 combine
! * >1 combine, and the composition is a forward-combining starter
! *
! * See unormimp.h for a description of the composition table format.
! */
! private static int/*unsigned*/ combine(char[]table,int tableStart,
! int/*unsinged*/ combineBackIndex,
! int[] outValues) {
! int/*unsigned*/ key;
! int value,value2;
! if(outValues.length<2){
! throw new IllegalArgumentException();
}
! /* search in the starter's composition table */
! for(;;) {
! key=table[tableStart++];
! if(key>=combineBackIndex) {
! break;
}
! tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
}
! /* mask off bit 15, the last-entry-in-the-list flag */
! if((key&0x7fff)==combineBackIndex) {
! /* found! combine! */
! value=table[tableStart];
!
! /* is the composition a starter that combines forward? */
! key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
! /* get the composition result code point from the variable-length
! * result value
! */
! if((value&0x8000) != 0) {
! if((value&0x4000) != 0) {
! /* surrogate pair composition result */
! value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
! value2=table[tableStart+1];
! } else {
! /* BMP composition result U+2000..U+ffff */
! value=table[tableStart+1];
! value2=0;
}
! } else {
! /* BMP composition result U+0000..U+1fff */
! value&=0x1fff;
! value2=0;
}
- outValues[0]=value;
- outValues[1]=value2;
- return key;
} else {
! /* not found */
! return 0;
}
}
! private static final class RecomposeArgs{
! char[] source;
! int start;
! int limit;
}
- /*
- * recompose the characters in [p..limit[
- * (which is in NFD - decomposed and canonically ordered),
- * adjust limit, and return the trailing cc
- *
- * since for NFKC we may get Jamos in decompositions, we need to
- * recompose those too
- *
- * note that recomposition never lengthens the text:
- * any character consists of either one or two code units;
- * a composition may contain at most one more code unit than the original
- * starter, while the combining mark that is removed has at least one code
- * unit
- */
- private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
- int remove, q, r;
- int /*unsigned*/ combineFlags;
- int /*unsigned*/ combineFwdIndex, combineBackIndex;
- int /*unsigned*/ result, value=0, value2=0;
- int /*unsigned byte*/ prevCC;
- boolean starterIsSupplementary;
- int starter;
- int[] outValues = new int[2];
- starter=-1; /* no starter */
- combineFwdIndex=0; /* will not be used until starter!=NULL */
- starterIsSupplementary=false; /* will not be used until starter!=NULL */
- prevCC=0;
! NextCombiningArgs ncArg = new NextCombiningArgs();
! ncArg.source = args.source;
! ncArg.cc =0;
! ncArg.c2 =0;
! for(;;) {
! ncArg.start = args.start;
! combineFlags=getNextCombining(ncArg,args.limit,nx);
! combineBackIndex=ncArg.combiningIndex;
! args.start = ncArg.start;
!
! if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
! if((combineBackIndex&0x8000)!=0) {
! /* c is a Jamo V/T, see if we can compose it with the
! * previous character
! */
! /* for the PRI #29 fix, check that there is no intervening combining mark */
! if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
! remove=-1; /* NULL while no Hangul composition */
! combineFlags=0;
! ncArg.c2=args.source[starter];
! if(combineBackIndex==0xfff2) {
! /* Jamo V, compose with previous Jamo L and following
! * Jamo T
! */
! ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
! if(ncArg.c2<JAMO_L_COUNT) {
! remove=args.start-1;
! ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
! (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
! if(args.start!=args.limit &&
! (ncArg.c2=(char)(args.source[args.start]
! -JAMO_T_BASE))<JAMO_T_COUNT) {
! ++args.start;
! ncArg.c+=ncArg.c2;
! } else {
! /* the result is an LV syllable, which is a starter (unlike LVT) */
! combineFlags=COMBINES_FWD;
! }
! if(!nx_contains(nx, ncArg.c)) {
! args.source[starter]=ncArg.c;
! } else {
! /* excluded */
! if(!isHangulWithoutJamoT(ncArg.c)) {
! --args.start; /* undo the ++args.start from reading the Jamo T */
}
! /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
! remove=args.start;
}
}
! /*
! * Normally, the following can not occur:
! * Since the input is in NFD, there are no Hangul LV syllables that
! * a Jamo T could combine with.
! * All Jamo Ts are combined above when handling Jamo Vs.
! *
! * However, before the PRI #29 fix, this can occur due to
! * an intervening combining mark between the Hangul LV and the Jamo T.
*/
} else {
! /* Jamo T, compose with previous Hangul that does not have a Jamo T */
! if(isHangulWithoutJamoT(ncArg.c2)) {
! ncArg.c2+=ncArg.c-JAMO_T_BASE;
! if(!nx_contains(nx, ncArg.c2)) {
! remove=args.start-1;
! args.source[starter]=ncArg.c2;
}
}
}
-
- if(remove!=-1) {
- /* remove the Jamo(s) */
- q=remove;
- r=args.start;
- while(r<args.limit) {
- args.source[q++]=args.source[r++];
}
- args.start=remove;
- args.limit=q;
}
! ncArg.c2=0; /* c2 held *starter temporarily */
!
! if(combineFlags!=0) {
! /*
! * not starter=NULL because the composition is a Hangul LV syllable
! * and might combine once more (but only before the PRI #29 fix)
*/
!
! /* done? */
! if(args.start==args.limit) {
! return (char)prevCC;
! }
!
! /* the composition is a Hangul LV syllable which is a starter that combines forward */
! combineFwdIndex=0xfff0;
!
! /* we combined; continue with looking for compositions */
continue;
}
! }
!
! /*
! * now: cc==0 and the combining index does not include
! * "forward" -> the rest of the loop body will reset starter
! * to NULL; technically, a composed Hangul syllable is a
! * starter, but it does not combine forward now that we have
! * consumed all eligible Jamos; for Jamo V/T, combineFlags
! * does not contain _NORM_COMBINES_FWD
! */
!
! } else if(
! /* the starter is not a Hangul LV or Jamo V/T and */
! !((combineFwdIndex&0x8000)!=0) &&
! /* the combining mark is not blocked and */
! ((options&BEFORE_PRI_29)!=0 ?
! (prevCC!=ncArg.cc || prevCC==0) :
! (prevCC<ncArg.cc || prevCC==0)) &&
! /* the starter and the combining mark (c, c2) do combine */
! 0!=(result=combine(combiningTable,combineFwdIndex,
! combineBackIndex, outValues)) &&
! /* the composition result is not excluded */
! !nx_contains(nx, (char)value, (char)value2)
! ) {
! value=outValues[0];
! value2=outValues[1];
! /* replace the starter with the composition, remove the
! * combining mark
! */
! remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
!
! /* replace the starter with the composition */
! args.source[starter]=(char)value;
! if(starterIsSupplementary) {
! if(value2!=0) {
! /* both are supplementary */
! args.source[starter+1]=(char)value2;
} else {
! /* the composition is shorter than the starter,
! * move the intermediate characters forward one */
! starterIsSupplementary=false;
! q=starter+1;
! r=q+1;
! while(r<remove) {
! args.source[q++]=args.source[r++];
}
- --remove;
}
- } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB
- starterIsSupplementary=true;
- args.source[starter+1]=(char)value2;
- /* } else { both are on the BMP, nothing more to do */
}
! /* remove the combining mark by moving the following text
! * over it */
! if(remove<args.start) {
! q=remove;
! r=args.start;
! while(r<args.limit) {
! args.source[q++]=args.source[r++];
! }
! args.start=remove;
! args.limit=q;
! }
! /* keep prevCC because we removed the combining mark */
! /* done? */
! if(args.start==args.limit) {
! return (char)prevCC;
! }
! /* is the composition a starter that combines forward? */
! if(result>1) {
! combineFwdIndex=getCombiningIndexFromStarter((char)value,
! (char)value2);
! } else {
! starter=-1;
! }
! /* we combined; continue with looking for compositions */
! continue;
! }
! }
! /* no combination this time */
! prevCC=ncArg.cc;
! if(args.start==args.limit) {
! return (char)prevCC;
! }
! /* if (c, c2) did not combine, then check if it is a starter */
! if(ncArg.cc==0) {
! /* found a new starter; combineFlags==0 if (c, c2) is excluded */
! if((combineFlags&COMBINES_FWD)!=0) {
! /* it may combine with something, prepare for it */
! if(ncArg.c2==0) {
! starterIsSupplementary=false;
! starter=args.start-1;
} else {
! starterIsSupplementary=false;
! starter=args.start-2;
}
! combineFwdIndex=combineBackIndex;
} else {
! /* it will not combine with anything */
! starter=-1;
}
- } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
- /* FCC: no discontiguous compositions; any intervening character blocks */
- starter=-1;
}
}
}
!
! // find the last true starter between src[start]....src[current] going
! // backwards and return its index
! private static int findPreviousStarter(char[]src, int srcStart, int current,
! int/*unsigned*/ ccOrQCMask,
! int/*unsigned*/ decompQCMask,
! char minNoMaybe) {
! long norm32;
! PrevArgs args = new PrevArgs();
! args.src = src;
! args.start = srcStart;
! args.current = current;
!
! while(args.start<args.current) {
! norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
! if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
break;
}
- }
- return args.current;
- }
-
- /* find the first true starter in [src..limit[ and return the
- * pointer to it
- */
- private static int/*index*/ findNextStarter(char[] src,int start,int limit,
- int/*unsigned*/ qcMask,
- int/*unsigned*/ decompQCMask,
- char minNoMaybe) {
- int p;
- long/*unsigned*/ norm32;
- int ccOrQCMask;
- char c, c2;
-
- ccOrQCMask=CC_MASK|qcMask;
! DecomposeArgs decompArgs = new DecomposeArgs();
!
! for(;;) {
! if(start==limit) {
! break; /* end of string */
}
! c=src[start];
! if(c<minNoMaybe) {
! break; /* catches NUL terminater, too */
}
-
- norm32=getNorm32(c);
- if((norm32&ccOrQCMask)==0) {
- break; /* true starter */
}
!
! if(isNorm32LeadSurrogate(norm32)) {
! /* c is a lead surrogate, get the real norm32 */
! if((start+1)==limit ||
! !UTF16.isTrailSurrogate(c2=(src[start+1]))){
! /* unmatched first surrogate: counts as a true starter */
! break;
}
- norm32=getNorm32FromSurrogatePair(norm32, c2);
-
- if((norm32&ccOrQCMask)==0) {
- break; /* true starter */
}
! } else {
! c2=0;
}
! /* (c, c2) is not a true starter but its decomposition may be */
! if((norm32&decompQCMask)!=0) {
! /* (c, c2) decomposes, get everything from the variable-length
! * extra data */
! p=decompose(norm32, decompQCMask, decompArgs);
!
! /* get the first character's norm32 to check if it is a true
! * starter */
! if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
! break; /* true starter */
}
}
!
! start+= c2==0 ? 1 : 2; /* not a true starter, continue */
! }
!
! return start;
}
!
!
! private static final class ComposePartArgs{
! int prevCC;
! int length; /* length of decomposed part */
}
! /* decompose and recompose [prevStarter..src[ */
! private static char[] composePart(ComposePartArgs args,
! int prevStarter,
! char[] src, int start, int limit,
! int options,
! UnicodeSet nx) {
! int recomposeLimit;
! boolean compat =((options&OPTIONS_COMPAT)!=0);
! /* decompose [prevStarter..src[ */
! int[] outTrailCC = new int[1];
! char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
! for(;;){
! args.length=decompose(src,prevStarter,(start),
! buffer,0,buffer.length,
! compat,outTrailCC,nx);
! if(args.length<=buffer.length){
break;
! }else{
! buffer = new char[args.length];
! }
}
-
- /* recompose the decomposition */
- recomposeLimit=args.length;
-
- if(args.length>=2) {
- RecomposeArgs rcArgs = new RecomposeArgs();
- rcArgs.source = buffer;
- rcArgs.start = 0;
- rcArgs.limit = recomposeLimit;
- args.prevCC=recompose(rcArgs, options, nx);
- recomposeLimit = rcArgs.limit;
}
!
! /* return with a pointer to the recomposition and its length */
! args.length=recomposeLimit;
! return buffer;
}
-
- private static boolean composeHangul(char prev, char c,
- long/*unsigned*/ norm32,
- char[] src,int[] srcIndex, int limit,
- boolean compat,
- char[] dest,int destIndex,
- UnicodeSet nx) {
- int start=srcIndex[0];
- if(isJamoVTNorm32JamoV(norm32)) {
- /* c is a Jamo V, compose with previous Jamo L and
- * following Jamo T */
- prev=(char)(prev-JAMO_L_BASE);
- if(prev<JAMO_L_COUNT) {
- c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
- (c-JAMO_V_BASE))*JAMO_T_COUNT);
-
- /* check if the next character is a Jamo T (normal or
- * compatibility) */
- if(start!=limit) {
- char next, t;
-
- next=src[start];
- if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
- /* normal Jamo T */
- ++start;
- c+=t;
- } else if(compat) {
- /* if NFKC, then check for compatibility Jamo T
- * (BMP only) */
- norm32=getNorm32(next);
- if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
- int p /*index into extra data array*/;
- DecomposeArgs dcArgs = new DecomposeArgs();
- p=decompose(norm32, QC_NFKD, dcArgs);
- if(dcArgs.length==1 &&
- (t=(char)(extraData[p]-JAMO_T_BASE))
- <JAMO_T_COUNT) {
- /* compatibility Jamo T */
- ++start;
- c+=t;
}
}
}
}
! if(nx_contains(nx, c)) {
! if(!isHangulWithoutJamoT(c)) {
! --start; /* undo ++start from reading the Jamo T */
}
! return false;
}
! dest[destIndex]=c;
! srcIndex[0]=start;
! return true;
}
! } else if(isHangulWithoutJamoT(prev)) {
! /* c is a Jamo T, compose with previous Hangul LV that does not
! * contain a Jamo T */
! c=(char)(prev+(c-JAMO_T_BASE));
! if(nx_contains(nx, c)) {
return false;
}
! dest[destIndex]=c;
! srcIndex[0]=start;
! return true;
}
return false;
}
! /*
! public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
! return compose(src,0,src.length,dest,0,dest.length,compat, nx);
}
! */
!
! public static int compose(char[] src, int srcStart, int srcLimit,
! char[] dest,int destStart,int destLimit,
! int options,UnicodeSet nx) {
!
! int prevSrc, prevStarter;
! long/*unsigned*/ norm32;
! int ccOrQCMask, qcMask;
! int reorderStartIndex, length;
! char c, c2, minNoMaybe;
! int/*unsigned byte*/ cc, prevCC;
! int[] ioIndex = new int[1];
! int destIndex = destStart;
! int srcIndex = srcStart;
!
! if((options&OPTIONS_COMPAT)!=0) {
! minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
! qcMask=QC_NFKC;
} else {
- minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
- qcMask=QC_NFC;
- }
-
- /*
- * prevStarter points to the last character before the current one
- * that is a "true" starter with cc==0 and quick check "yes".
- *
- * prevStarter will be used instead of looking for a true starter
- * while incrementally decomposing [prevStarter..prevSrc[
- * in _composePart(). Having a good prevStarter allows to just decompose
- * the entire [prevStarter..prevSrc[.
- *
- * When _composePart() backs out from prevSrc back to prevStarter,
- * then it also backs out destIndex by the same amount.
- * Therefore, at all times, the (prevSrc-prevStarter) source units
- * must correspond 1:1 to destination units counted with destIndex,
- * except for reordering.
- * This is true for the qc "yes" characters copied in the fast loop,
- * and for pure reordering.
- * prevStarter must be set forward to src when this is not true:
- * In _composePart() and after composing a Hangul syllable.
- *
- * This mechanism relies on the assumption that the decomposition of a
- * true starter also begins with a true starter. gennorm/store.c checks
- * for this.
- */
- prevStarter=srcIndex;
-
- ccOrQCMask=CC_MASK|qcMask;
- /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
- prevCC=0;
-
- /* avoid compiler warnings */
- norm32=0;
- c=0;
-
- for(;;) {
- /* count code units below the minimum or with irrelevant data for
- * the quick check */
- prevSrc=srcIndex;
-
- while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
- ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
prevCC=0;
- ++srcIndex;
- }
-
-
- /* copy these code units all at once */
- if(srcIndex!=prevSrc) {
- length=srcIndex-prevSrc;
- if((destIndex+length)<=destLimit) {
- System.arraycopy(src,prevSrc,dest,destIndex,length);
- }
- destIndex+=length;
- reorderStartIndex=destIndex;
-
- /* set prevStarter to the last character in the quick check
- * loop */
- prevStarter=srcIndex-1;
- if(UTF16.isTrailSurrogate(src[prevStarter]) &&
- prevSrc<prevStarter &&
- UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
- --prevStarter;
}
!
! prevSrc=srcIndex;
}
-
- /* end of source reached? */
- if(srcIndex==srcLimit) {
- break;
}
-
- /* c already contains *src and norm32 is set for it, increment src*/
- ++srcIndex;
-
/*
! * source buffer pointers:
*
* all done quick check current char not yet
! * "yes" but (c, c2) processed
* may combine
* forward
* [-------------[-------------[-------------[-------------[
* | | | | |
! * start prevStarter prevSrc src limit
*
*
! * destination buffer pointers and indexes:
*
* all done might take not filled yet
* characters for
* reordering
* [-------------[-------------[-------------[
* | | | |
! * dest reorderStartIndex destIndex destCapacity
! */
!
! /* check one above-minimum, relevant code unit */
! /*
! * norm32 is for c=*(src-1), and the quick check flag is "no" or
! * "maybe", and/or cc!=0
! * check for Jamo V/T, then for surrogates and regular characters
! * c is not a Hangul syllable or Jamo L because
! * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
*/
! if(isNorm32HangulOrJamo(norm32)) {
! /*
! * c is a Jamo V/T:
! * try to compose with the previous character, Jamo V also with
! * a following Jamo T, and set values here right now in case we
! * just continue with the main loop
! */
! prevCC=cc=0;
! reorderStartIndex=destIndex;
! ioIndex[0]=srcIndex;
! if(
! destIndex>0 &&
! composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
! srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
! destIndex<=destLimit ? destIndex-1: 0,
! nx)
) {
! srcIndex=ioIndex[0];
! prevStarter=srcIndex;
! continue;
}
!
! srcIndex = ioIndex[0];
!
! /* the Jamo V/T did not compose into a Hangul syllable, just
! * append to dest */
! c2=0;
! length=1;
! prevStarter=prevSrc;
! } else {
! if(isNorm32Regular(norm32)) {
! c2=0;
! length=1;
! } else {
! /* c is a lead surrogate, get the real norm32 */
! if(srcIndex!=srcLimit &&
! UTF16.isTrailSurrogate(c2=src[srcIndex])) {
! ++srcIndex;
! length=2;
! norm32=getNorm32FromSurrogatePair(norm32, c2);
! } else {
! /* c is an unpaired lead surrogate, nothing to do */
! c2=0;
! length=1;
! norm32=0;
! }
! }
! ComposePartArgs args =new ComposePartArgs();
!
! /* we are looking at the character (c, c2) at [prevSrc..src[ */
! if(nx_contains(nx, c, c2)) {
! /* excluded: norm32==0 */
! cc=0;
! } else if((norm32&qcMask)==0) {
! cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
} else {
! char[] p;
/*
! * find appropriate boundaries around this character,
* decompose the source text from between the boundaries,
! * and recompose it
*
! * this puts the intermediate text into the side buffer because
! * it might be longer than the recomposition end result,
! * or the destination buffer may be too short or missing
! *
! * note that destIndex may be adjusted backwards to account
! * for source text that passed the quick check but needed to
! * take part in the recomposition
*/
! int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
/*
! * find the last true starter in [prevStarter..src[
! * it is either the decomposition of the current character (at prevSrc),
! * or prevStarter
*/
! if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
! prevStarter=prevSrc;
! } else {
! /* adjust destIndex: back out what had been copied with qc "yes" */
! destIndex-=prevSrc-prevStarter;
}
! /* find the next true starter in [src..limit[ */
! srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
! decompQCMask, minNoMaybe);
! //args.prevStarter = prevStarter;
! args.prevCC = prevCC;
! //args.destIndex = destIndex;
! args.length = length;
! p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
! if(p==null) {
! /* an error occurred (out of memory) */
! break;
}
! prevCC = args.prevCC;
! length = args.length;
!
! /* append the recomposed buffer contents to the destination
! * buffer */
! if((destIndex+args.length)<=destLimit) {
! int i=0;
! while(i<args.length) {
! dest[destIndex++]=p[i++];
! --length;
}
! } else {
! /* buffer overflow */
! /* keep incrementing the destIndex for preflighting */
! destIndex+=length;
}
! prevStarter=srcIndex;
! continue;
}
}
!
! /* append the single code point (c, c2) to the destination buffer */
! if((destIndex+length)<=destLimit) {
! if(cc!=0 && cc<prevCC) {
! /* (c, c2) is out of order with respect to the preceding
! * text */
! int reorderSplit= destIndex;
! destIndex+=length;
! prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
! destIndex, c, c2, cc);
! } else {
! /* just append (c, c2) */
! dest[destIndex++]=c;
! if(c2!=0) {
! dest[destIndex++]=c2;
}
- prevCC=cc;
}
} else {
! /* buffer overflow */
! /* keep incrementing the destIndex for preflighting */
! destIndex+=length;
! prevCC=cc;
}
}
-
- return destIndex - destStart;
}
!
! public static int getCombiningClass(int c) {
! long norm32;
! norm32=getNorm32(c);
! return (int)((norm32>>CC_SHIFT)&0xFF);
}
! public static boolean isFullCompositionExclusion(int c) {
! if(isFormatVersion_2_1) {
! int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
! return (aux & AUX_COMP_EX_MASK)!=0;
} else {
! return false;
}
}
!
! public static boolean isCanonSafeStart(int c) {
! if(isFormatVersion_2_1) {
! int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
! return (aux & AUX_UNSAFE_MASK)==0;
! } else {
! return false;
}
}
! /* Is c an NF<mode>-skippable code point? See unormimp.h. */
! public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) {
! long /*unsigned int*/ norm32;
! mask = mask & UNSIGNED_INT_MASK;
! char aux;
! /* check conditions (a)..(e), see unormimp.h */
! norm32 = getNorm32(c);
! if((norm32&mask)!=0) {
! return false; /* fails (a)..(e), not skippable */
}
!
! if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
! return true; /* NF*D, passed (a)..(c), is skippable */
}
- /* check conditions (a)..(e), see unormimp.h */
-
- /* NF*C/FCC, passed (a)..(e) */
- if((norm32& QC_NFD)==0) {
- return true; /* no canonical decomposition, is skippable */
}
-
- /* check Hangul syllables algorithmically */
- if(isNorm32HangulOrJamo(norm32)) {
- /* Jamo passed (a)..(e) above, must be Hangul */
- return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
}
!
! /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
! /* NF*C, test (f) flag */
! if(!isFormatVersion_2_2) {
! return false; /* no (f) data, say not skippable to be safe */
}
-
-
- aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
- return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
-
- /* } else { FCC, test fcd<=1 instead of the above } */
}
-
- public static UnicodeSet addPropertyStarts(UnicodeSet set) {
- int c;
-
- /* add the start code point of each same-value range of each trie */
- //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
- TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
- RangeValueIterator.Element normResult = new RangeValueIterator.Element();
-
- while(normIter.next(normResult)){
- set.add(normResult.start);
}
!
! //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
! TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
! RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
!
! while(fcdIter.next(fcdResult)){
! set.add(fcdResult.start);
}
!
! if(isFormatVersion_2_1){
! //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
! TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
! RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
! while(auxIter.next(auxResult)){
! set.add(auxResult.start);
}
}
! /* add Hangul LV syllables and LV+1 because of skippables */
! for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
! set.add(c);
! set.add(c+1);
}
- set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
- return set; // for chaining
}
!
! /**
! * Internal API, used in UCharacter.getIntPropertyValue().
! * @internal
! * @param c code point
! * @param modeValue numeric value compatible with Mode
! * @return numeric value compatible with QuickCheck
! */
! public static final int quickCheck(int c, int modeValue) {
! final int qcMask[/*UNORM_MODE_COUNT*/]={
! 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
! };
!
! int norm32=(int)getNorm32(c)&qcMask[modeValue];
!
! if(norm32==0) {
! return 1; // YES
! } else if((norm32&QC_ANY_NO)!=0) {
! return 0; // NO
! } else /* _NORM_QC_ANY_MAYBE */ {
! return 2; // MAYBE;
}
}
! private static int strCompare(char[] s1, int s1Start, int s1Limit,
! char[] s2, int s2Start, int s2Limit,
! boolean codePointOrder) {
!
! int start1, start2, limit1, limit2;
!
! char c1, c2;
!
! /* setup for fix-up */
! start1=s1Start;
! start2=s2Start;
!
! int length1, length2;
!
! length1 = s1Limit - s1Start;
! length2 = s2Limit - s2Start;
!
! int lengthResult;
!
! if(length1<length2) {
! lengthResult=-1;
! limit1=start1+length1;
! } else if(length1==length2) {
! lengthResult=0;
! limit1=start1+length1;
! } else /* length1>length2 */ {
! lengthResult=1;
! limit1=start1+length2;
}
!
! if(s1==s2) {
! return lengthResult;
}
for(;;) {
! /* check pseudo-limit */
! if(s1Start==limit1) {
! return lengthResult;
}
-
- c1=s1[s1Start];
- c2=s2[s2Start];
- if(c1!=c2) {
- break;
}
- ++s1Start;
- ++s2Start;
}
! /* setup for fix-up */
! limit1=start1+length1;
! limit2=start2+length2;
!
! /* if both values are in or above the surrogate range, fix them up */
! if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
! /* subtract 0x2800 from BMP code points to make them smaller than
! * supplementary ones */
! if(
! ( c1<=0xdbff && (s1Start+1)!=limit1 &&
! UTF16.isTrailSurrogate(s1[(s1Start+1)])
! ) ||
! ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
! UTF16.isLeadSurrogate(s1[(s1Start-1)])
! )
! ) {
! /* part of a surrogate pair, leave >=d800 */
} else {
! /* BMP code point - may be surrogate code point - make <d800 */
! c1-=0x2800;
}
! if(
! ( c2<=0xdbff && (s2Start+1)!=limit2 &&
! UTF16.isTrailSurrogate(s2[(s2Start+1)])
! ) ||
! ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
! UTF16.isLeadSurrogate(s2[(s2Start-1)])
! )
! ) {
! /* part of a surrogate pair, leave >=d800 */
} else {
! /* BMP code point - may be surrogate code point - make <d800 */
! c2-=0x2800;
}
}
! /* now c1 and c2 are in UTF-32-compatible order */
! return (int)c1-(int)c2;
}
!
! /*
! * Status of tailored normalization
! *
! * This was done initially for investigation on Unicode public review issue 7
! * (http://www.unicode.org/review/). See Jitterbug 2481.
! * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
! * a permanent feature in ICU 2.6 in support of IDNA which requires true
! * Unicode 3.2 normalization.
! * (NormalizationCorrections are rolled into IDNA mapping tables.)
! *
! * Tailored normalization as implemented here allows to "normalize less"
! * than full Unicode normalization would.
! * Based internally on a UnicodeSet of code points that are
! * "excluded from normalization", the normalization functions leave those
! * code points alone ("inert"). This means that tailored normalization
! * still transforms text into a canonically equivalent form.
! * It does not add decompositions to code points that do not have any or
! * change decomposition results.
! *
! * Any function that searches for a safe boundary has not been touched,
! * which means that these functions will be over-pessimistic when
! * exclusions are applied.
! * This should not matter because subsequent checks and normalizations
! * do apply the exclusions; only a little more of the text may be processed
! * than necessary under exclusions.
! *
! * Normalization exclusions have the following effect on excluded code points c:
! * - c is not decomposed
! * - c is not a composition target
! * - c does not combine forward or backward for composition
! * except that this is not implemented for Jamo
! * - c is treated as having a combining class of 0
! */
!
! /*
! * Constants for the bit fields in the options bit set parameter.
! * These need not be public.
! * A user only needs to know the currently assigned values.
! * The number and positions of reserved bits per field can remain private.
! */
! private static final int OPTIONS_NX_MASK=0x1f;
! private static final int OPTIONS_UNICODE_MASK=0xe0;
! public static final int OPTIONS_SETS_MASK=0xff;
! // private static final int OPTIONS_UNICODE_SHIFT=5;
! private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
!
! /* Constants for options flags for normalization.*/
/**
! * Options bit 0, do not decompose Hangul syllables.
! * @draft ICU 2.6
*/
! private static final int NX_HANGUL = 1;
! /**
! * Options bit 1, do not decompose CJK compatibility characters.
! * @draft ICU 2.6
! */
! private static final int NX_CJK_COMPAT=2;
! /**
! * Options bit 8, use buggy recomposition described in
! * Unicode Public Review Issue #29
! * at http://www.unicode.org/review/resolved-pri.html#pri29
! *
! * Used in IDNA implementation according to strict interpretation
! * of IDNA definition based on Unicode 3.2 which predates PRI #29.
! *
! * See ICU4C unormimp.h
! *
! * @draft ICU 3.2
! */
! public static final int BEFORE_PRI_29=0x100;
! /*
! * The following options are used only in some composition functions.
! * They use bits 12 and up to preserve lower bits for the available options
! * space in unorm_compare() -
! * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
*/
! /** Options bit 12, for compatibility vs. canonical decomposition. */
! public static final int OPTIONS_COMPAT=0x1000;
! /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
! public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
! /* normalization exclusion sets --------------------------------------------- */
/*
! * Normalization exclusion UnicodeSets are used for tailored normalization;
! * see the comment near the beginning of this file.
*
! * By specifying one or several sets of code points,
! * those code points become inert for normalization.
! */
! private static final synchronized UnicodeSet internalGetNXHangul() {
! /* internal function, does not check for incoming U_FAILURE */
! if(nxCache[NX_HANGUL]==null) {
! nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
}
- return nxCache[NX_HANGUL];
}
!
! private static final synchronized UnicodeSet internalGetNXCJKCompat() {
! /* internal function, does not check for incoming U_FAILURE */
!
! if(nxCache[NX_CJK_COMPAT]==null) {
!
! /* build a set from [CJK Ideographs]&[has canonical decomposition] */
! UnicodeSet set, hasDecomp;
!
! set=new UnicodeSet("[:Ideographic:]");
!
! /* start with an empty set for [has canonical decomposition] */
! hasDecomp=new UnicodeSet();
!
! /* iterate over all ideographs and remember which canonically decompose */
! UnicodeSetIterator it = new UnicodeSetIterator(set);
! int start, end;
! long norm32;
!
! while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
! start=it.codepoint;
! end=it.codepointEnd;
! while(start<=end) {
! norm32 = getNorm32(start);
! if((norm32 & QC_NFD)>0) {
! hasDecomp.add(start);
}
! ++start;
}
}
! /* hasDecomp now contains all ideographs that decompose canonically */
! nxCache[NX_CJK_COMPAT]=hasDecomp;
}
!
! return nxCache[NX_CJK_COMPAT];
}
! private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
! options &= OPTIONS_UNICODE_MASK;
! if(options==0) {
! return null;
}
! if(nxCache[options]==null) {
! /* build a set with all code points that were not designated by the specified Unicode version */
! UnicodeSet set = new UnicodeSet();
!
! switch(options) {
! case NormalizerBase.UNICODE_3_2:
! set.applyPattern("[:^Age=3.2:]");
break;
- default:
- return null;
}
! nxCache[options]=set;
}
-
- return nxCache[options];
}
!
! /* Get a decomposition exclusion set. The data must be loaded. */
! private static final synchronized UnicodeSet internalGetNX(int options) {
! options&=OPTIONS_SETS_MASK;
!
! if(nxCache[options]==null) {
! /* return basic sets */
! if(options==NX_HANGUL) {
! return internalGetNXHangul();
}
- if(options==NX_CJK_COMPAT) {
- return internalGetNXCJKCompat();
}
! if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
! return internalGetNXUnicode(options);
}
! /* build a set from multiple subsets */
! UnicodeSet set;
! UnicodeSet other;
!
! set=new UnicodeSet();
!
!
! if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
! set.addAll(other);
}
! if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
! set.addAll(other);
}
! if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
! set.addAll(other);
}
-
- nxCache[options]=set;
}
- return nxCache[options];
}
! public static final UnicodeSet getNX(int options) {
! if((options&=OPTIONS_SETS_MASK)==0) {
! /* incoming failure, or no decomposition exclusions requested */
! return null;
! } else {
! return internalGetNX(options);
}
}
!
! private static final boolean nx_contains(UnicodeSet nx, int c) {
! return nx!=null && nx.contains(c);
}
! private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
! return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
}
! /*****************************************************************************/
/**
* Get the canonical decomposition
* sherman for ComposedCharIter
*/
-
public static int getDecompose(int chars[], String decomps[]) {
! DecomposeArgs args = new DecomposeArgs();
int length=0;
! long norm32 = 0;
int ch = -1;
- int index = 0;
int i = 0;
while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
//TBD !!!! the hack code here saves us about 50ms at startup;
//need a better solution/lookup
if (ch == 0x30ff)
ch = 0xf900;
! else if (ch == 0x10000)
ch = 0x1d15e;
else if (ch == 0x1d1c1)
ch = 0x2f800;
! norm32 = NormalizerImpl.getNorm32(ch);
! if((norm32 & QC_NFD)!=0 && i < chars.length) {
chars[i] = ch;
! index = decompose(norm32, args);
! decomps[i++] = new String(extraData,index, args.length);
}
}
return i;
}
//------------------------------------------------------
! // special method for Collation
//------------------------------------------------------
private static boolean needSingleQuotation(char c) {
return (c >= 0x0009 && c <= 0x000D) ||
(c >= 0x0020 && c <= 0x002F) ||
(c >= 0x003A && c <= 0x0040) ||
(c >= 0x005B && c <= 0x0060) ||
(c >= 0x007B && c <= 0x007E);
}
public static String canonicalDecomposeWithSingleQuotation(String string) {
char[] src = string.toCharArray();
int srcIndex = 0;
int srcLimit = src.length;
char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
int destIndex = 0;
int destLimit = dest.length;
- char[] buffer = new char[3];
int prevSrc;
! long norm32;
! int ccOrQCMask;
! int qcMask = QC_NFD;
int reorderStartIndex, length;
! char c, c2;
! char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE];
int cc, prevCC, trailCC;
char[] p;
int pStart;
-
// initialize
- ccOrQCMask = CC_MASK | qcMask;
reorderStartIndex = 0;
prevCC = 0;
! norm32 = 0;
! c = 0;
pStart = 0;
cc = trailCC = -1; // initialize to bogus value
! for(;;) {
prevSrc=srcIndex;
//quick check: (1) less than minNoMaybe, (2) no decomposition, (3) Hangul syllable
while (srcIndex != srcLimit &&
! (( c = src[srcIndex]) < minNoMaybe ||
! ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 ||
! ( c >= '\uac00' && c <= '\ud7a3'))){
!
prevCC = 0;
! ++srcIndex;
}
// copy these code units all at once
if (srcIndex != prevSrc) {
length = srcIndex - prevSrc;
--- 20,1799 ----
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
! * Copyright (C) 2009-2014, International Business Machines
! * Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
! import java.nio.ByteBuffer;
! import java.text.Normalizer;
! // Original filename in ICU4J: Normalizer2Impl.java
public final class NormalizerImpl {
+ public static final class Hangul {
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int HANGUL_BASE=0xac00;
+ public static final int HANGUL_END=0xd7a3;
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
! public static final int JAMO_T_COUNT=28;
!
! public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
! public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
!
! public static boolean isHangul(int c) {
! return HANGUL_BASE<=c && c<HANGUL_LIMIT;
}
+
+ public static boolean isHangulWithoutJamoT(char c) {
+ c-=HANGUL_BASE;
+ return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
!
! /**
! * Decomposes c, which must be a Hangul syllable, into buffer
! * and returns the length of the decomposition (2 or 3).
! */
! public static int decompose(int c, Appendable buffer) {
! try {
! c-=HANGUL_BASE;
! int c2=c%JAMO_T_COUNT;
! c/=JAMO_T_COUNT;
! buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
! buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
! if(c2==0) {
! return 2;
} else {
! buffer.append((char)(JAMO_T_BASE+c2));
! return 3;
}
! } catch(IOException e) {
! throw new InternalError(e);
}
}
}
! /**
! * Writable buffer that takes care of canonical ordering.
! * Its Appendable methods behave like the C++ implementation's
! * appendZeroCC() methods.
! * <p>
! * If dest is a StringBuilder, then the buffer writes directly to it.
! * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
! * until no further changes are necessary and whole segments are appended.
! * append() methods that take combining-class values always write to the StringBuilder.
! * Other append() methods flush and append to the Appendable.
! */
! public static final class ReorderingBuffer implements Appendable {
! public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
! impl=ni;
! app=dest;
! if (app instanceof StringBuilder) {
! appIsStringBuilder=true;
! str=(StringBuilder)dest;
! // In Java, the constructor subsumes public void init(int destCapacity)
! str.ensureCapacity(destCapacity);
! reorderStart=0;
! if(str.length()==0) {
! lastCC=0;
} else {
! setIterator();
! lastCC=previousCC();
! // Set reorderStart after the last code point with cc<=1 if there is one.
! if(lastCC>1) {
! while(previousCC()>1) {}
}
! reorderStart=codePointLimit;
}
! } else {
! appIsStringBuilder=false;
! str=new StringBuilder();
! reorderStart=0;
! lastCC=0;
}
}
! public boolean isEmpty() { return str.length()==0; }
! public int length() { return str.length(); }
! public int getLastCC() { return lastCC; }
+ public StringBuilder getStringBuilder() { return str; }
! public boolean equals(CharSequence s, int start, int limit) {
! return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
}
! // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
! public void setLastChar(char c) {
! str.setCharAt(str.length()-1, c);
}
! public void append(int c, int cc) {
! if(lastCC<=cc || cc==0) {
! str.appendCodePoint(c);
! lastCC=cc;
! if(cc<=1) {
! reorderStart=str.length();
! }
! } else {
! insert(c, cc);
! }
}
! // s must be in NFD, otherwise change the implementation.
! public void append(CharSequence s, int start, int limit,
! int leadCC, int trailCC) {
! if(start==limit) {
! return;
}
+ if(lastCC<=leadCC || leadCC==0) {
+ if(trailCC<=1) {
+ reorderStart=str.length()+(limit-start);
+ } else if(leadCC<=1) {
+ reorderStart=str.length()+1; // Ok if not a code point boundary.
}
+ str.append(s, start, limit);
+ lastCC=trailCC;
+ } else {
+ int c=Character.codePointAt(s, start);
+ start+=Character.charCount(c);
+ insert(c, leadCC); // insert first code point
+ while(start<limit) {
+ c=Character.codePointAt(s, start);
+ start+=Character.charCount(c);
+ if(start<limit) {
+ // s must be in NFD, otherwise we need to use getCC().
+ leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
+ } else {
+ leadCC=trailCC;
}
! append(c, leadCC);
}
}
}
! // The following append() methods work like C++ appendZeroCC().
! // They assume that the cc or trailCC of their input is 0.
! // Most of them implement Appendable interface methods.
! // @Override when we switch to Java 6
! public ReorderingBuffer append(char c) {
! str.append(c);
! lastCC=0;
! reorderStart=str.length();
! return this;
}
! public void appendZeroCC(int c) {
! str.appendCodePoint(c);
! lastCC=0;
! reorderStart=str.length();
}
! // @Override when we switch to Java 6
! public ReorderingBuffer append(CharSequence s) {
! if(s.length()!=0) {
! str.append(s);
! lastCC=0;
! reorderStart=str.length();
}
! return this;
}
! // @Override when we switch to Java 6
! public ReorderingBuffer append(CharSequence s, int start, int limit) {
! if(start!=limit) {
! str.append(s, start, limit);
! lastCC=0;
! reorderStart=str.length();
}
+ return this;
}
! /**
! * Flushes from the intermediate StringBuilder to the Appendable,
! * if they are different objects.
! * Used after recomposition.
! * Must be called at the end when writing to a non-StringBuilder Appendable.
! */
! public void flush() {
! if(appIsStringBuilder) {
! reorderStart=str.length();
! } else {
! try {
! app.append(str);
! str.setLength(0);
! reorderStart=0;
! } catch(IOException e) {
! throw new InternalError(e); // Avoid declaring "throws IOException".
}
}
+ lastCC=0;
}
! /**
! * Flushes from the intermediate StringBuilder to the Appendable,
! * if they are different objects.
! * Then appends the new text to the Appendable or StringBuilder.
! * Normally used after quick check loops find a non-empty sequence.
*/
! public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
! if(appIsStringBuilder) {
! str.append(s, start, limit);
! reorderStart=str.length();
} else {
! try {
! app.append(str).append(s, start, limit);
! str.setLength(0);
! reorderStart=0;
! } catch(IOException e) {
! throw new InternalError(e); // Avoid declaring "throws IOException".
}
}
! lastCC=0;
! return this;
}
+
+ public void remove() {
+ str.setLength(0);
+ lastCC=0;
+ reorderStart=0;
}
!
! public void removeSuffix(int suffixLength) {
! int oldLength=str.length();
! str.delete(oldLength-suffixLength, oldLength);
! lastCC=0;
! reorderStart=str.length();
}
! // Inserts c somewhere before the last character.
! // Requires 0<cc<lastCC which implies reorderStart<limit.
! private void insert(int c, int cc) {
! for(setIterator(), skipPrevious(); previousCC()>cc;) {}
! // insert c at codePointLimit, after the character with prevCC<=cc
! if(c<=0xffff) {
! str.insert(codePointLimit, (char)c);
! if(cc<=1) {
! reorderStart=codePointLimit+1;
}
+ } else {
+ str.insert(codePointLimit, Character.toChars(c));
+ if(cc<=1) {
+ reorderStart=codePointLimit+2;
}
}
}
! private final NormalizerImpl impl;
! private final Appendable app;
! private final StringBuilder str;
! private final boolean appIsStringBuilder;
! private int reorderStart;
! private int lastCC;
! // private backward iterator
! private void setIterator() { codePointStart=str.length(); }
! private void skipPrevious() { // Requires 0<codePointStart.
! codePointLimit=codePointStart;
! codePointStart=str.offsetByCodePoints(codePointStart, -1);
! }
! private int previousCC() { // Returns 0 if there is no previous character.
! codePointLimit=codePointStart;
! if(reorderStart>=codePointStart) {
return 0;
! }
! int c=str.codePointBefore(codePointStart);
! codePointStart-=Character.charCount(c);
! if(c<MIN_CCC_LCCC_CP) {
return 0;
}
+ return getCCFromYesOrMaybe(impl.getNorm16(c));
}
! private int codePointStart, codePointLimit;
}
! // TODO: Propose as public API on the UTF16 class.
! // TODO: Propose widening UTF16 methods that take char to take int.
! // TODO: Propose widening UTF16 methods that take String to take CharSequence.
! public static final class UTF16Plus {
! /**
! * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
! * is it a lead surrogate?
! * @param c code unit or code point
! * @return true or false
! */
! public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
! /**
! * Compares two CharSequence subsequences for binary equality.
! * @param s1 first sequence
! * @param start1 start offset in first sequence
! * @param limit1 limit offset in first sequence
! * @param s2 second sequence
! * @param start2 start offset in second sequence
! * @param limit2 limit offset in second sequence
! * @return true if s1.subSequence(start1, limit1) contains the same text
! * as s2.subSequence(start2, limit2)
! */
! public static boolean equal(CharSequence s1, int start1, int limit1,
! CharSequence s2, int start2, int limit2) {
! if((limit1-start1)!=(limit2-start2)) {
! return false;
! }
! if(s1==s2 && start1==start2) {
! return true;
! }
! while(start1<limit1) {
! if(s1.charAt(start1++)!=s2.charAt(start2++)) {
! return false;
! }
! }
! return true;
! }
}
! public NormalizerImpl() {}
!
! private static final class IsAcceptable implements ICUBinary.Authenticate {
! // @Override when we switch to Java 6
! public boolean isDataVersionAcceptable(byte version[]) {
! return version[0]==2;
}
}
! private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
! private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
! public NormalizerImpl load(ByteBuffer bytes) {
! try {
! dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
! int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
! if(indexesLength<=IX_MIN_MAYBE_YES) {
! throw new IOException("Normalizer2 data: not enough indexes");
}
! int[] inIndexes=new int[indexesLength];
! inIndexes[0]=indexesLength*4;
! for(int i=1; i<indexesLength; ++i) {
! inIndexes[i]=bytes.getInt();
}
! minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
! minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
!
! minYesNo=inIndexes[IX_MIN_YES_NO];
! minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
! minNoNo=inIndexes[IX_MIN_NO_NO];
! limitNoNo=inIndexes[IX_LIMIT_NO_NO];
! minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
! // Read the normTrie.
! int offset=inIndexes[IX_NORM_TRIE_OFFSET];
! int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
! normTrie=Trie2_16.createFromSerialized(bytes);
! int trieLength=normTrie.getSerializedLength();
! if(trieLength>(nextOffset-offset)) {
! throw new IOException("Normalizer2 data: not enough bytes for normTrie");
}
+ ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
! // Read the composition and mapping data.
! offset=nextOffset;
! nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
! int numChars=(nextOffset-offset)/2;
! char[] chars;
! if(numChars!=0) {
! chars=new char[numChars];
! for(int i=0; i<numChars; ++i) {
! chars[i]=bytes.getChar();
}
! maybeYesCompositions=new String(chars);
! extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
}
! // smallFCD: new in formatVersion 2
! offset=nextOffset;
! smallFCD=new byte[0x100];
! for(int i=0; i<0x100; ++i) {
! smallFCD[i]=bytes.get();
! }
! // Build tccc180[].
! // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
! tccc180=new int[0x180];
! int bits=0;
! for(int c=0; c<0x180; bits>>=1) {
! if((c&0xff)==0) {
! bits=smallFCD[c>>8]; // one byte per 0x100 code points
}
! if((bits&1)!=0) {
! for(int i=0; i<0x20; ++i, ++c) {
! tccc180[c]=getFCD16FromNormData(c)&0xff;
}
} else {
! c+=0x20;
}
}
+ return this;
+ } catch(IOException e) {
+ throw new InternalError(e);
+ }
+ }
! public NormalizerImpl load(String name) {
! return load(ICUBinary.getRequiredData(name));
}
! public int getNorm16(int c) {
! return normTrie.get(c);
! }
! public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
! public int getCC(int norm16) {
! if(norm16>=MIN_NORMAL_MAYBE_YES) {
! return norm16&0xff;
}
! if(norm16<minNoNo || limitNoNo<=norm16) {
! return 0;
}
+ return getCCFromNoNo(norm16);
}
! public static int getCCFromYesOrMaybe(int norm16) {
! return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
! }
!
! /**
! * Returns the FCD data for code point c.
! * @param c A Unicode code point.
! * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
+ public int getFCD16(int c) {
+ if(c<0) {
+ return 0;
+ } else if(c<0x180) {
+ return tccc180[c];
+ } else if(c<=0xffff) {
+ if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
+ }
+ return getFCD16FromNormData(c);
+ }
+
+ /** Returns the FCD data for U+0000<=c<U+0180. */
+ public int getFCD16FromBelow180(int c) { return tccc180[c]; }
+ /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
+ public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
+ // 0<=lead<=0xffff
+ byte bits=smallFCD[lead>>8];
+ if(bits==0) { return false; }
+ return ((bits>>((lead>>5)&7))&1)!=0;
+ }
+
+ /** Gets the FCD value from the regular normalization data. */
+ public int getFCD16FromNormData(int c) {
+ // Only loops for 1:1 algorithmic mappings.
+ for(;;) {
+ int norm16=getNorm16(c);
+ if(norm16<=minYesNo) {
+ // no decomposition or Hangul syllable, all zeros
+ return 0;
+ } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
+ // combining mark
+ norm16&=0xff;
+ return norm16|(norm16<<8);
+ } else if(norm16>=minMaybeYes) {
+ return 0;
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ c=mapAlgorithmic(c, norm16);
} else {
! // c decomposes, get everything from the variable-length extra data
! int firstUnit=extraData.charAt(norm16);
! if((firstUnit&MAPPING_LENGTH_MASK)==0) {
! // A character that is deleted (maps to an empty string) must
! // get the worst-case lccc and tccc values because arbitrary
! // characters on both sides will become adjacent.
! return 0x1ff;
! } else {
! int fcd16=firstUnit>>8; // tccc
! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
! fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc
}
+ return fcd16;
}
}
}
}
! /**
! * Gets the decomposition for one code point.
! * @param c code point
! * @return c's decomposition, if it has one; returns null if it does not have a decomposition
*/
! public String getDecomposition(int c) {
! int decomp=-1;
! int norm16;
! for(;;) {
! if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
! // c does not decompose
! } else if(isHangul(norm16)) {
! // Hangul syllable: decompose algorithmically
! StringBuilder buffer=new StringBuilder();
! Hangul.decompose(c, buffer);
! return buffer.toString();
! } else if(isDecompNoAlgorithmic(norm16)) {
! decomp=c=mapAlgorithmic(c, norm16);
continue;
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
+ return extraData.substring(norm16, norm16+length);
}
! if(decomp<0) {
! return null;
} else {
! return UTF16.valueOf(decomp);
}
}
}
! public static final int MIN_CCC_LCCC_CP=0x300;
! public static final int MIN_YES_YES_WITH_CC=0xff01;
! public static final int JAMO_VT=0xff00;
! public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
! public static final int MAX_DELTA=0x40;
! // Byte offsets from the start of the data, after the generic header.
! public static final int IX_NORM_TRIE_OFFSET=0;
! public static final int IX_EXTRA_DATA_OFFSET=1;
! public static final int IX_SMALL_FCD_OFFSET=2;
! // Code point thresholds for quick check codes.
! public static final int IX_MIN_DECOMP_NO_CP=8;
! public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
! // Norm16 value thresholds for quick check combinations and types of extra data.
! // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
! public static final int IX_MIN_YES_NO=10;
! public static final int IX_MIN_NO_NO=11;
! public static final int IX_LIMIT_NO_NO=12;
! public static final int IX_MIN_MAYBE_YES=13;
! // Mappings only in [minYesNoMappingsOnly..minNoNo[.
! public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
! public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
! public static final int MAPPING_LENGTH_MASK=0x1f;
!
! public static final int COMP_1_LAST_TUPLE=0x8000;
! public static final int COMP_1_TRIPLE=1;
! public static final int COMP_1_TRAIL_LIMIT=0x3400;
! public static final int COMP_1_TRAIL_MASK=0x7ffe;
! public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
! public static final int COMP_2_TRAIL_SHIFT=6;
! public static final int COMP_2_TRAIL_MASK=0xffc0;
!
! // higher-level functionality ------------------------------------------ ***
!
! /**
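! * Decomposes s[src, limit[ and writes the result to dest.
! * limit is an int index here; the "limit can be NULL if src is NUL-terminated"
! * convention appears to be carried over from the C++ version and does not apply.
! * destLengthEstimate is the initial dest buffer capacity; if negative, limit-src is used.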
! */
! public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
! int destLengthEstimate) {
! if(destLengthEstimate<0) {
! destLengthEstimate=limit-src;
! }
! dest.setLength(0);
! ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
! decompose(s, src, limit, buffer);
! }
!
! // Dual functionality:
! // buffer!=NULL: normalize
! // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
! public int decompose(CharSequence s, int src, int limit,
! ReorderingBuffer buffer) {
! int minNoCP=minDecompNoCP;
!
! int prevSrc;
! int c=0;
! int norm16=0;
!
! // only for quick check
! int prevBoundary=src;
! int prevCC=0;
!
! for(;;) {
! // count code units below the minimum or with irrelevant data for the quick check
! for(prevSrc=src; src!=limit;) {
! if( (c=s.charAt(src))<minNoCP ||
! isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
! ) {
! ++src;
! } else if(!UTF16.isSurrogate((char)c)) {
! break;
} else {
! char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src;
! c=Character.toCodePoint(c2, (char)c);
! }
}
! if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
! src+=Character.charCount(c);
} else {
! break;
! }
}
}
+ // copy these code units all at once
+ if(src!=prevSrc) {
+ if(buffer!=null) {
+ buffer.flushAndAppendZeroCC(s, prevSrc, src);
+ } else {
+ prevCC=0;
+ prevBoundary=src;
}
}
! if(src==limit) {
break;
}
! // Check one above-minimum, relevant code point.
! src+=Character.charCount(c);
! if(buffer!=null) {
! decompose(c, norm16, buffer);
! } else {
! if(isDecompYes(norm16)) {
! int cc=getCCFromYesOrMaybe(norm16);
! if(prevCC<=cc || cc==0) {
! prevCC=cc;
! if(cc<=1) {
! prevBoundary=src;
}
! continue;
}
}
! return prevBoundary; // "no" or cc out of order
}
}
! return src;
}
! public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
! int limit=s.length();
! if(limit==0) {
! return;
}
+ if(doDecompose) {
+ decompose(s, 0, limit, buffer);
+ return;
}
! // Just merge the strings at the boundary.
! int c=Character.codePointAt(s, 0);
! int src=0;
! int firstCC, prevCC, cc;
! firstCC=prevCC=cc=getCC(getNorm16(c));
! while(cc!=0) {
! prevCC=cc;
! src+=Character.charCount(c);
! if(src>=limit) {
! break;
}
! c=Character.codePointAt(s, src);
! cc=getCC(getNorm16(c));
! };
! buffer.append(s, 0, src, firstCC, prevCC);
! buffer.append(s, src, limit);
}
! // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
! // doCompose: normalize
! // !doCompose: isNormalized (buffer must be empty and initialized)
! public boolean compose(CharSequence s, int src, int limit,
! boolean onlyContiguous,
! boolean doCompose,
! ReorderingBuffer buffer) {
! int minNoMaybeCP=minCompNoMaybeCP;
!
! /*
! * prevBoundary points to the last character before the current one
! * that has a composition boundary before it with ccc==0 and quick check "yes".
! * Keeping track of prevBoundary saves us looking for a composition boundary
! * when we find a "no" or "maybe".
! *
! * When we back out from prevSrc back to prevBoundary,
! * then we also remove those same characters (which had been simply copied
! * or canonically-order-inserted) from the ReorderingBuffer.
! * Therefore, at all times, the [prevBoundary..prevSrc[ source units
! * must correspond 1:1 to destination units at the end of the destination buffer.
! */
! int prevBoundary=src;
! int prevSrc;
! int c=0;
! int norm16=0;
! // only for isNormalized
! int prevCC=0;
! for(;;) {
! // count code units below the minimum or with irrelevant data for the quick check
! for(prevSrc=src; src!=limit;) {
! if( (c=s.charAt(src))<minNoMaybeCP ||
! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
! ) {
! ++src;
! } else if(!UTF16.isSurrogate((char)c)) {
break;
! } else {
! char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src;
! c=Character.toCodePoint(c2, (char)c);
}
}
! if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
! src+=Character.charCount(c);
! } else {
! break;
}
}
}
+ // copy these code units all at once
+ if(src!=prevSrc) {
+ if(src==limit) {
+ if(doCompose) {
+ buffer.flushAndAppendZeroCC(s, prevSrc, src);
}
+ break;
}
! // Set prevBoundary to the last character in the quick check loop.
! prevBoundary=src-1;
! if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
! Character.isHighSurrogate(s.charAt(prevBoundary-1))
! ) {
! --prevBoundary;
}
! if(doCompose) {
! // The last "quick check yes" character is excluded from the
! // flush-and-append call in case it needs to be modified.
! buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
! buffer.append(s, prevBoundary, src);
! } else {
! prevCC=0;
}
! // The start of the current character (c).
! prevSrc=src;
! } else if(src==limit) {
! break;
}
!
! src+=Character.charCount(c);
! /*
! * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
! * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
! * or has ccc!=0.
! * Check for Jamo V/T, then for regular characters.
! * c is not a Hangul syllable or Jamo L because those have "yes" properties.
! */
! if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
! char prev=s.charAt(prevSrc-1);
! boolean needToDecompose=false;
! if(c<Hangul.JAMO_T_BASE) {
! // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
! prev-=Hangul.JAMO_L_BASE;
! if(prev<Hangul.JAMO_L_COUNT) {
! if(!doCompose) {
return false;
}
! char syllable=(char)
! (Hangul.HANGUL_BASE+
! (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
! Hangul.JAMO_T_COUNT);
! char t;
! if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
! ++src;
! syllable+=t; // The next character was a Jamo T.
! prevBoundary=src;
! buffer.setLastChar(syllable);
! continue;
}
+ // If we see L+V+x where x!=T then we drop to the slow path,
+ // decompose and recompose.
+ // This is to deal with NFKC finding normal L and V but a
+ // compatibility variant of a T. We need to either fully compose that
+ // combination here (which would complicate the code and may not work
+ // with strange custom data) or use the slow path -- or else our replacing
+ // two input characters (L+V) with one output character (LV syllable)
+ // would violate the invariant that [prevBoundary..prevSrc[ has the same
+ // length as what we appended to the buffer since prevBoundary.
+ needToDecompose=true;
+ }
+ } else if(Hangul.isHangulWithoutJamoT(prev)) {
+ // c is a Jamo Trailing consonant,
+ // compose with previous Hangul LV that does not contain a Jamo T.
+ if(!doCompose) {
return false;
}
! buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
! prevBoundary=src;
! continue;
}
! if(!needToDecompose) {
! // The Jamo V/T did not compose into a Hangul syllable.
! if(doCompose) {
! buffer.append((char)c);
} else {
prevCC=0;
}
! continue;
}
}
/*
! * Source buffer pointers:
*
* all done quick check current char not yet
! * "yes" but (c) processed
* may combine
* forward
* [-------------[-------------[-------------[-------------[
* | | | | |
! * orig. src prevBoundary prevSrc src limit
*
*
! * Destination buffer pointers inside the ReorderingBuffer:
*
* all done might take not filled yet
* characters for
* reordering
* [-------------[-------------[-------------[
* | | | |
! * start reorderStart limit |
! * +remainingCap.+
*/
! if(norm16>=MIN_YES_YES_WITH_CC) {
! int cc=norm16&0xff; // cc!=0
! if( onlyContiguous && // FCC
! (doCompose ? buffer.getLastCC() : prevCC)==0 &&
! prevBoundary<prevSrc &&
! // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
! // passed the quick check "yes && ccc==0" test.
! // Check whether the last character was a "yesYes" or a "yesNo".
! // If a "yesNo", then we get its trailing ccc from its
! // mapping and check for canonical order.
! // All other cases are ok.
! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
) {
! // Fails FCD test, need to decompose and contiguously recompose.
! if(!doCompose) {
! return false;
}
! } else if(doCompose) {
! buffer.append(c, cc);
! continue;
! } else if(prevCC<=cc) {
! prevCC=cc;
! continue;
} else {
! return false;
! }
! } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
! return false;
! }
/*
! * Find appropriate boundaries around this character,
* decompose the source text from between the boundaries,
! * and recompose it.
*
! * We may need to remove the last few characters from the ReorderingBuffer
! * to account for source text that was copied or appended
! * but needs to take part in the recomposition.
*/
!
/*
! * Find the last composition boundary in [prevBoundary..src[.
! * It is either the decomposition of the current character (at prevSrc),
! * or prevBoundary.
*/
! if(hasCompBoundaryBefore(c, norm16)) {
! prevBoundary=prevSrc;
! } else if(doCompose) {
! buffer.removeSuffix(prevSrc-prevBoundary);
}
! // Find the next composition boundary in [src..limit[ -
! // modifies src to point to the next starter.
! src=findNextCompBoundary(s, src, limit);
! // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
! int recomposeStartIndex=buffer.length();
! decomposeShort(s, prevBoundary, src, buffer);
! recompose(buffer, recomposeStartIndex, onlyContiguous);
! if(!doCompose) {
! if(!buffer.equals(s, prevBoundary, src)) {
! return false;
! }
! buffer.remove();
! prevCC=0;
}
! // Move to the next starter. We never need to look back before this point again.
! prevBoundary=src;
}
! return true;
}
! /**
! * Very similar to compose(): Make the same changes in both places if relevant.
! * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
! * !doSpan: quickCheck
! * @param s the text to examine
! * @param src index in s at which the check starts
! * @param limit index in s at which the check stops (exclusive)
! * @param onlyContiguous if true, the FCC check on the previous character's
! *        trailing combining class is applied as well
! * @param doSpan if true, the method returns at the first "maybe" character
! *        instead of recording it in bit 0 of the result
! * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
! * bit 0: set if "maybe"; otherwise, if the span length<s.length()
! * then the quick check result is "no"
! */
! public int composeQuickCheck(CharSequence s, int src, int limit,
! boolean onlyContiguous, boolean doSpan) {
! int qcResult=0; // set to 1 below once a "maybe" character has been seen (only when !doSpan)
! int minNoMaybeCP=minCompNoMaybeCP;
!
! /*
! * prevBoundary points to the last character before the current one
! * that has a composition boundary before it with ccc==0 and quick check "yes".
! */
! int prevBoundary=src;
! int prevSrc;
! int c=0;
! int norm16=0;
! int prevCC=0;
!
! for(;;) {
! // count code units below the minimum or with irrelevant data for the quick check
! for(prevSrc=src;;) {
! if(src==limit) {
! return (src<<1)|qcResult; // "yes" or "maybe"
}
+ if( (c=s.charAt(src))<minNoMaybeCP ||
+ isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
+ ) {
+ ++src;
+ } else if(!UTF16.isSurrogate((char)c)) {
+ break;
+ } else {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
}
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src; // step back onto the lead surrogate so that src marks the start of c
! c=Character.toCodePoint(c2, (char)c);
}
}
+ if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
+ src+=Character.charCount(c);
} else {
! break;
}
}
}
! if(src!=prevSrc) {
! // Set prevBoundary to the last character in the quick check loop.
! prevBoundary=src-1;
! if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
! Character.isHighSurrogate(s.charAt(prevBoundary-1))
! ) {
! --prevBoundary;
! }
! prevCC=0;
! // The start of the current character (c).
! prevSrc=src;
}
! src+=Character.charCount(c);
! /*
! * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
! * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
! * or has ccc!=0.
! */
! if(isMaybeOrNonZeroCC(norm16)) {
! int cc=getCCFromYesOrMaybe(norm16);
! if( onlyContiguous && // FCC
! cc!=0 &&
! prevCC==0 &&
! prevBoundary<prevSrc &&
! // prevCC==0 && prevBoundary<prevSrc tell us that
! // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
! // passed the quick check "yes && ccc==0" test.
! // Check whether the last character was a "yesYes" or a "yesNo".
! // If a "yesNo", then we get its trailing ccc from its
! // mapping and check for canonical order.
! // All other cases are ok.
! getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
! ) {
! // Fails FCD test. Falls through to the "no" return at the bottom of the loop.
! } else if(prevCC<=cc || cc==0) {
! prevCC=cc;
! if(norm16<MIN_YES_YES_WITH_CC) {
! if(!doSpan) {
! qcResult=1;
} else {
! return prevBoundary<<1; // spanYes does not care to know it's "maybe"
}
}
! continue;
! }
! }
! return prevBoundary<<1; // "no"
}
}
! public void composeAndAppend(CharSequence s, // text to add after the current buffer contents
! boolean doCompose, // true: compose the remainder of s; false: append it unchanged
! boolean onlyContiguous, // handed through to compose()
! ReorderingBuffer buffer) { // destination; may already hold text
! int src=0, limit=s.length();
! if(!buffer.isEmpty()) {
// The end of the buffer and the start of s may interact: find the first
// composition boundary in s and, if it is not at index 0, recompose across the seam.
! int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
! if(0!=firstStarterInSrc) {
! int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
! buffer.length());
// middle = buffer text from its last boundary + s text up to its first boundary
! StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
! firstStarterInSrc+16);
! middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
! buffer.removeSuffix(buffer.length()-lastStarterInDest);
! middle.append(s, 0, firstStarterInSrc);
! compose(middle, 0, middle.length(), onlyContiguous, true, buffer); // the seam is always composed, regardless of doCompose
! src=firstStarterInSrc;
! }
! }
! if(doCompose) {
! compose(s, src, limit, onlyContiguous, true, buffer);
! } else {
! buffer.append(s, src, limit);
! }
! }
! // Dual functionality:
! // buffer!=null: normalize
! // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
// Processes s from src up to (not including) limit.
// Returns the index reached (==limit) when the whole range was processed;
// with buffer==null it returns prevBoundary as soon as the FCD order check fails.
! public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
! // Note: In this function we use buffer.appendZeroCC() because we track
! // the lead and trail combining classes here, rather than leaving it to
! // the ReorderingBuffer.
! // The exception is the call to decomposeShort() which uses the buffer
! // in the normal way.
!
! // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
! // Similar to the prevBoundary in the compose() implementation.
! int prevBoundary=src;
! int prevSrc;
! int c=0;
! int prevFCD16=0; // a negative value is ~c: the lookup for that code point was deferred
! int fcd16=0;
! for(;;) {
! // count code units with lccc==0
! for(prevSrc=src; src!=limit;) {
! if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
! prevFCD16=~c;
! ++src;
! } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
! prevFCD16=0;
! ++src;
! } else {
! if(UTF16.isSurrogate((char)c)) {
! char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
! c=Character.toCodePoint((char)c, c2);
}
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src;
! c=Character.toCodePoint(c2, (char)c);
}
}
}
! if((fcd16=getFCD16FromNormData(c))<=0xff) {
! prevFCD16=fcd16;
! src+=Character.charCount(c);
! } else {
! break;
}
}
}
! // copy these code units all at once
! if(src!=prevSrc) {
! if(src==limit) {
! if(buffer!=null) {
! buffer.flushAndAppendZeroCC(s, prevSrc, src);
}
! break;
}
+ prevBoundary=src;
+ // We know that the previous character's lccc==0.
+ if(prevFCD16<0) {
+ // Fetching the fcd16 value was deferred for this below-U+0300 code point.
+ int prev=~prevFCD16;
+ prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
+ if(prevFCD16>1) {
+ --prevBoundary;
+ }
+ } else {
+ int p=src-1;
+ if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
+ Character.isHighSurrogate(s.charAt(p-1))
+ ) {
+ --p;
+ // Need to fetch the previous character's FCD value because
+ // prevFCD16 was just for the trail surrogate code point.
+ prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
+ // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
}
! if(prevFCD16>1) {
! prevBoundary=p;
}
}
! if(buffer!=null) {
! // The last lccc==0 character is excluded from the
! // flush-and-append call in case it needs to be modified.
! buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
! buffer.append(s, prevBoundary, src);
}
+ // The start of the current character (c).
+ prevSrc=src;
+ } else if(src==limit) {
+ break;
}
! src+=Character.charCount(c);
! // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
! // Check for proper order, and decompose locally if necessary.
! if((prevFCD16&0xff)<=(fcd16>>8)) {
! // proper order: prev tccc <= current lccc
! if((fcd16&0xff)<=1) {
! prevBoundary=src;
}
! if(buffer!=null) {
! buffer.appendZeroCC(c);
! }
! prevFCD16=fcd16;
! continue;
! } else if(buffer==null) {
! return prevBoundary; // quick check "no"
! } else {
! /*
! * Back out the part of the source that we copied or appended
! * already but is now going to be decomposed.
! * prevSrc is set to after what was copied/appended.
! */
! buffer.removeSuffix(prevSrc-prevBoundary);
! /*
! * Find the part of the source that needs to be decomposed,
! * up to the next safe boundary.
! */
! src=findNextFCDBoundary(s, src, limit);
! /*
! * The source text does not fulfill the conditions for FCD.
! * Decompose and reorder a limited piece of the text.
! */
! decomposeShort(s, prevBoundary, src, buffer);
! prevBoundary=src;
! prevFCD16=0;
! }
! }
! return src;
}
+ // Note: hasDecompBoundary() could be implemented as aliases to
+ // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
+ // at the cost of building the FCD trie for a decomposition normalizer.
// Reports whether code point c has a decomposition boundary before it (before==true)
// or after it (before==false). The loop only repeats for algorithmic 1:1 mappings.
+ public boolean hasDecompBoundary(int c, boolean before) {
for(;;) {
! if(c<minDecompNoCP) {
! return true; // below the threshold no data lookup is needed
! }
! int norm16=getNorm16(c);
! if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
! return true;
! } else if(norm16>MIN_NORMAL_MAYBE_YES) {
! return false; // ccc!=0
! } else if(isDecompNoAlgorithmic(norm16)) {
! c=mapAlgorithmic(c, norm16); // re-test the mapped code point
! } else {
! // c decomposes, get everything from the variable-length extra data
! int firstUnit=extraData.charAt(norm16);
! if((firstUnit&MAPPING_LENGTH_MASK)==0) {
! return false; // mapping length is 0
! }
! if(!before) {
! // decomp after-boundary: same as hasFCDBoundaryAfter(),
! // fcd16<=1 || trailCC==0
! if(firstUnit>0x1ff) {
! return false; // trailCC>1
! }
! if(firstUnit<=0xff) {
! return true; // trailCC==0
! }
! // if(trailCC==1) test leadCC==0, same as checking for before-boundary
! }
! // true if leadCC==0 (hasFCDBoundaryBefore())
! return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
}
}
}
! public boolean hasCompBoundaryBefore(int c) { // convenience overload: looks up norm16 for c itself
! return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); // code points below minCompNoMaybeCP answer true without a lookup
! }
!
! private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } // true for norm16 in [minMaybeYes..JAMO_VT]; recompose() uses it to detect backward-combining characters
! private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } // like isMaybe() but without the JAMO_VT upper bound
! private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } // exact match on the JAMO_VT value
! private boolean isHangul(int norm16) { return norm16==minYesNo; } // decompose() treats this value as "Hangul syllable: decompose algorithmically"
! private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } // false means norm16>=minNoNo
!
! // UBool isCompYes(uint16_t norm16) const {
! // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
! // }
! // UBool isCompYesOrMaybe(uint16_t norm16) const {
! // return norm16<minNoNo || minMaybeYes<=norm16;
! // }
! // private boolean hasZeroCCFromDecompYes(int norm16) {
! // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
! // }
! private boolean isDecompYesAndZeroCC(int norm16) { // true if norm16 falls into any of the three ranges below
! return norm16<minYesNo || // below the yesNo range
! norm16==JAMO_VT || // the Jamo V/T value
! (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); // [minMaybeYes..MIN_NORMAL_MAYBE_YES]
! }
! /**
! * A little faster and simpler than isDecompYesAndZeroCC() but does not include
! * the MaybeYes which combine-forward and have ccc=0.
! * (Standard Unicode 5.2 normalization does not have such characters.)
! * Compared with isDecompYesAndZeroCC(), only the single value MIN_NORMAL_MAYBE_YES
! * is accepted from the range [minMaybeYes..MIN_NORMAL_MAYBE_YES].
! * @param norm16 the normalization data value to test
! * @return true if norm16 is below minYesNo, or equals MIN_NORMAL_MAYBE_YES or JAMO_VT
! */
! private boolean isMostDecompYesAndZeroCC(int norm16) {
! return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
! }
!
! private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } // callers respond to true by applying mapAlgorithmic()
!
! // For use with isCompYes().
! // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
! // static uint8_t getCCFromYes(uint16_t norm16) {
! // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
! // }
! private int getCCFromNoNo(int norm16) { // norm16 is used as an index into extraData
! if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
! return extraData.charAt(norm16-1)&0xff; // low byte of the unit stored just before the first unit (presumably the ccc, going by the flag name)
} else {
! return 0; // no such word is stored: the value is 0
! }
}
! // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
// Returns the trailing combining class (tccc) for the single character at [cpStart..cpLimit[ in s.
! int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
! int c;
! if(cpStart==(cpLimit-1)) {
! c=s.charAt(cpStart); // exactly one code unit
! } else {
! c=Character.codePointAt(s, cpStart); // more than one code unit: read a full code point
! }
! int prevNorm16=getNorm16(c);
! if(prevNorm16<=minYesNo) {
! return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
} else {
! return extraData.charAt(prevNorm16)>>8; // tccc from yesNo
}
}
! // Requires algorithmic-NoNo.
// Returns the code point that c maps to: c plus the offset norm16-(minMaybeYes-MAX_DELTA-1).
! private int mapAlgorithmic(int c, int norm16) {
! return c+norm16-(minMaybeYes-MAX_DELTA-1);
}
! // Requires minYesNo<norm16<limitNoNo.
! // private int getMapping(int norm16) { return /*extraData+*/norm16; }
/**
* Computes where the compositions list for a character with the given norm16 starts.
* Returns -1 when norm16 is 0 or at least MIN_NORMAL_MAYBE_YES.
* @param norm16 the normalization data value of the character
! * @return index into maybeYesCompositions, or -1
*/
! private int getCompositionsListForDecompYes(int norm16) {
! if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
! return -1;
! } else {
! if((norm16-=minMaybeYes)<0) {
! // norm16<minMaybeYes: index into extraData which is a substring at
! // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
! // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
! norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list
! }
! return norm16;
! }
! }
! /**
! * Computes where the compositions list of a composite starts; the list is
! * located directly after the composite's mapping.
! * @param norm16 the normalization data value of the composite, used as an index into extraData
! * @return index into maybeYesCompositions
*/
+ private int getCompositionsListForComposite(int norm16) {
+ // composite has both mapping & compositions list
+ int firstUnit=extraData.charAt(norm16);
+ return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions
+ 1+ // +1 to skip the first unit with the mapping length
+ (firstUnit&MAPPING_LENGTH_MASK); // + mapping length
+ }
+
+ // Decompose a short piece of text which is likely to contain characters that
+ // fail the quick check loop and/or where the quick check loop's overhead
+ // is unlikely to be amortized.
+ // Called by the compose() and makeFCD() implementations.
+ // Public in Java for collation implementation code.
// Decomposes s[src..limit[ one code point at a time into buffer.
+ public void decomposeShort(CharSequence s, int src, int limit,
+ ReorderingBuffer buffer) {
+ while(src<limit) {
+ int c=Character.codePointAt(s, src);
+ src+=Character.charCount(c); // advance by one or two code units
+ decompose(c, getNorm16(c), buffer);
+ }
+ }
! private void decompose(int c, int norm16, // c: code point to decompose; norm16: its normalization data value
! ReorderingBuffer buffer) { // buffer: receives c or its decomposition
! // Only loops for 1:1 algorithmic mappings.
! for(;;) {
! // get the decomposition and the lead and trail cc's
! if(isDecompYes(norm16)) {
! // c does not decompose
! buffer.append(c, getCCFromYesOrMaybe(norm16));
! } else if(isHangul(norm16)) {
! // Hangul syllable: decompose algorithmically
! Hangul.decompose(c, buffer);
! } else if(isDecompNoAlgorithmic(norm16)) {
! c=mapAlgorithmic(c, norm16);
! norm16=getNorm16(c);
! continue; // decompose the mapped code point instead
! } else {
! // c decomposes, get everything from the variable-length extra data
! int firstUnit=extraData.charAt(norm16);
! int length=firstUnit&MAPPING_LENGTH_MASK;
! int leadCC, trailCC;
! trailCC=firstUnit>>8; // trail cc is the upper part of the first unit
! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
! leadCC=extraData.charAt(norm16-1)>>8; // lead cc is the upper part of the unit before the first unit
! } else {
! leadCC=0;
! }
! ++norm16; // skip over the firstUnit
! buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
! }
! return;
! }
! }
! /**
! * Finds the recomposition result for
! * a forward-combining "lead" character,
! * specified with a pointer to its compositions list,
! * and a backward-combining "trail" character.
! *
! * <p>If the lead and trail characters combine, then this function returns
! * the following "compositeAndFwd" value:
! * <pre>
! * Bits 21..1 composite character
! * Bit 0 set if the composite is a forward-combining starter
! * </pre>
! * otherwise it returns -1.
! *
! * <p>The compositions list has (trail, compositeAndFwd) pair entries,
! * encoded as either pairs or triples of 16-bit units.
! * The last entry has the high bit of its first unit set.
! *
! * <p>The list is sorted by ascending trail characters (there are no duplicates).
! * A linear search is used.
! *
! * <p>See normalizer2impl.h for a more detailed description
! * of the compositions list format.
! *
! * @param compositions the string that holds the compositions lists
! * @param list index in compositions at which the lead character's list starts
! * @param trail the backward-combining code point to look up
! * @return the compositeAndFwd value, or -1 if lead and trail do not combine
! */
! private static int combine(String compositions, int list, int trail) {
! int key1, firstUnit;
! if(trail<COMP_1_TRAIL_LIMIT) {
! // trail character is 0..33FF
! // result entry may have 2 or 3 units
! key1=(trail<<1);
! while(key1>(firstUnit=compositions.charAt(list))) {
! list+=2+(firstUnit&COMP_1_TRIPLE); // skip a pair (2 units) or a triple (3 units)
! }
! if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
! if((firstUnit&COMP_1_TRIPLE)!=0) {
! return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
! } else {
! return compositions.charAt(list+1);
! }
! }
! } else {
! // trail character is 3400..10FFFF
! // result entry has 3 units
! key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
! int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
! int secondUnit;
! for(;;) {
! if(key1>(firstUnit=compositions.charAt(list))) {
! list+=2+(firstUnit&COMP_1_TRIPLE);
! } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
! if(key2>(secondUnit=compositions.charAt(list+1))) {
! if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
! break; // that was the last entry of the list
! } else {
! list+=3;
! }
! } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
! return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
! } else {
! break;
! }
! } else {
! break;
! }
! }
! }
! return -1; // no entry for trail in this list
! }
/*
! * Recomposes the buffer text starting at recomposeStartIndex
! * (which is in NFD - decomposed and canonically ordered),
! * and truncates the buffer contents.
*
! * Note that recomposition never lengthens the text:
! * Any character consists of either one or two code units;
! * a composition may contain at most one more code unit than the original starter,
! * while the combining mark that is removed has at least one code unit.
*
* Parameters:
*   buffer              - the ReorderingBuffer whose StringBuilder is recomposed in place
*   recomposeStartIndex - index in the buffer text at which recomposition starts
*   onlyContiguous      - if true (FCC), any intervening non-starter blocks composition
! */
! private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
! boolean onlyContiguous) {
! StringBuilder sb=buffer.getStringBuilder();
! int p=recomposeStartIndex;
! if(p==sb.length()) {
! return; // nothing to recompose
! }
!
! int starter, pRemove;
! int compositionsList;
! int c, compositeAndFwd;
! int norm16;
! int cc, prevCC;
! boolean starterIsSupplementary;
!
! // Some of the following variables are not used until we have a forward-combining starter
! // and are only initialized now to avoid compiler warnings.
! compositionsList=-1; // used as indicator for whether we have a forward-combining starter
! starter=-1;
! starterIsSupplementary=false;
! prevCC=0;
! for(;;) {
! c=sb.codePointAt(p);
! p+=Character.charCount(c);
! norm16=getNorm16(c);
! cc=getCCFromYesOrMaybe(norm16);
! if( // this character combines backward and
! isMaybe(norm16) &&
! // we have seen a starter that combines forward and
! compositionsList>=0 &&
! // the backward-combining character is not blocked
! (prevCC<cc || prevCC==0)) {
! if(isJamoVT(norm16)) {
! // c is a Jamo V/T, see if we can compose it with the previous character.
! if(c<Hangul.JAMO_T_BASE) {
! // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
! char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
! if(prev<Hangul.JAMO_L_COUNT) {
! pRemove=p-1;
! char syllable=(char)
! (Hangul.HANGUL_BASE+
! (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
! Hangul.JAMO_T_COUNT);
! char t;
! if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
! ++p;
! syllable+=t; // The next character was a Jamo T.
! }
! sb.setCharAt(starter, syllable);
! // remove the Jamo V/T
! sb.delete(pRemove, p);
! p=pRemove;
}
}
! /*
! * No "else" for Jamo T:
! * Since the input is in NFD, there are no Hangul LV syllables that
! * a Jamo T could combine with.
! * All Jamo Ts are combined above when handling Jamo Vs.
! */
! if(p==sb.length()) {
! break;
}
! compositionsList=-1;
! continue;
! } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
! // The starter and the combining mark (c) do combine.
! int composite=compositeAndFwd>>1;
!
! // Remove the combining mark.
! pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark
! sb.delete(pRemove, p);
! p=pRemove;
! // Replace the starter with the composite.
! if(starterIsSupplementary) {
! if(composite>0xffff) {
! // both are supplementary
! sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
! sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
! } else {
! sb.setCharAt(starter, (char)composite); // store the BMP composite (not the removed combining mark c) in the starter slot
! sb.deleteCharAt(starter+1);
! // The composite is shorter than the starter,
! // move the intermediate characters forward one.
! starterIsSupplementary=false;
! --p;
}
+ } else if(composite>0xffff) {
+ // The composite is longer than the starter,
+ // move the intermediate characters back one.
+ starterIsSupplementary=true;
+ sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
+ sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
+ ++p;
+ } else {
+ // both are on the BMP
+ sb.setCharAt(starter, (char)composite);
}
! // Keep prevCC because we removed the combining mark.
+ if(p==sb.length()) {
+ break;
}
! // Is the composite a starter that combines forward?
! if((compositeAndFwd&1)!=0) {
! compositionsList=
! getCompositionsListForComposite(getNorm16(composite));
! } else {
! compositionsList=-1;
}
! // We combined; continue with looking for compositions.
! continue;
! }
}
! // no combination this time
! prevCC=cc;
! if(p==sb.length()) {
break;
}
! // If c did not combine, then check if it is a starter.
! if(cc==0) {
! // Found a new starter.
! if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
! // It may combine with something, prepare for it.
! if(c<=0xffff) {
! starterIsSupplementary=false;
! starter=p-1;
! } else {
! starterIsSupplementary=true;
! starter=p-2;
}
}
! } else if(onlyContiguous) {
! // FCC: no discontiguous compositions; any intervening character blocks.
! compositionsList=-1;
}
}
! buffer.flush();
}
! /**
! * Does c have a composition boundary before it?
! * True if its decomposition begins with a character that has
! * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
! * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
! * (isCompYesAndZeroCC()) so we need not decompose.
! * The loop only repeats for algorithmic 1:1 mappings.
! * @param c the code point to test
! * @param norm16 the normalization data value of c
! * @return true if there is a composition boundary before c
! */
! private boolean hasCompBoundaryBefore(int c, int norm16) {
! for(;;) {
! if(isCompYesAndZeroCC(norm16)) {
! return true;
! } else if(isMaybeOrNonZeroCC(norm16)) {
! return false;
! } else if(isDecompNoAlgorithmic(norm16)) {
! c=mapAlgorithmic(c, norm16);
! norm16=getNorm16(c);
! } else {
! // c decomposes, get everything from the variable-length extra data
! int firstUnit=extraData.charAt(norm16);
! if((firstUnit&MAPPING_LENGTH_MASK)==0) {
! return false; // mapping length is 0
}
! if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
! return false; // non-zero leadCC
}
! return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); // test the first code point of the mapping
}
}
}
! private int findPreviousCompBoundary(CharSequence s, int p) { // scans backward from index p in s
! while(p>0) {
! int c=Character.codePointBefore(s, p);
! p-=Character.charCount(c); // p now marks the start of c
! if(hasCompBoundaryBefore(c)) {
! break;
}
+ // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
+ // but that's probably not worth the extra cost.
}
! return p; // start of the nearest preceding code point with a boundary before it, or 0
}
! private int findNextCompBoundary(CharSequence s, int p, int limit) { // scans forward in s[p..limit[
! while(p<limit) {
! int c=Character.codePointAt(s, p);
! int norm16=normTrie.get(c);
! if(hasCompBoundaryBefore(c, norm16)) {
! break;
! }
! p+=Character.charCount(c);
! }
! return p; // index of the first code point with a boundary before it; at least limit if none was found
}
! private int findNextFCDBoundary(CharSequence s, int p, int limit) { // scans forward in s[p..limit[
! while(p<limit) {
! int c=Character.codePointAt(s, p);
! if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { // stop at a code point whose fcd16 value has a zero upper byte (lccc==0)
! break;
! }
! p+=Character.charCount(c);
! }
! return p; // index of the code point that stopped the scan; at least limit if none did
! }
/**
* Get the canonical decomposition
* sherman for ComposedCharIter
*
* Walks code points below 0x2fa1e (skipping three ranges, see below) and records
* every code point for which the NFD instance returns a non-null decomposition.
* Recording stops once chars is full.
*
* @param chars receives the code points that have a decomposition
* @param decomps receives the decomposition strings, at the same indexes as chars
*        (presumably at least as long as chars; not checked here)
* @return the number of entries stored
*/
public static int getDecompose(int chars[], String decomps[]) {
! Normalizer2 impl = Normalizer2.getNFDInstance();
!
int length=0;
! int norm16 = 0;
int ch = -1;
int i = 0;
while (++ch < 0x2fa1e) { //no canonical above 0x3ffff
//TBD !!!! the hack code here saves us about 50ms for startup
//need a better solution/lookup
// Each test below jumps ch forward over a range that is not looked up.
if (ch == 0x30ff)
ch = 0xf900;
! else if (ch == 0x115bc)
ch = 0x1d15e;
else if (ch == 0x1d1c1)
ch = 0x2f800;
! String s = impl.getDecomposition(ch);
!
! if(s != null && i < chars.length) {
chars[i] = ch;
! decomps[i++] = s;
}
}
return i;
}
//------------------------------------------------------
! // special method for Collation (RBTableBuilder.build())
//------------------------------------------------------
private static boolean needSingleQuotation(char c) {
    // TAB, LF, VT, FF and CR (U+0009..U+000D) always qualify.
    if (c >= '\t' && c <= '\r') {
        return true;
    }
    // Apart from those, only the range U+0020..U+007E can qualify.
    if (c < ' ' || c > '~') {
        return false;
    }
    // Inside that range everything qualifies except ASCII digits and letters.
    final boolean asciiDigit = c >= '0' && c <= '9';
    final boolean asciiUpper = c >= 'A' && c <= 'Z';
    final boolean asciiLower = c >= 'a' && c <= 'z';
    return !(asciiDigit || asciiUpper || asciiLower);
}
public static String canonicalDecomposeWithSingleQuotation(String string) {
+ Normalizer2 impl = Normalizer2.getNFDInstance();
char[] src = string.toCharArray();
int srcIndex = 0;
int srcLimit = src.length;
char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
int destIndex = 0;
int destLimit = dest.length;
int prevSrc;
! String norm;
int reorderStartIndex, length;
! char c1, c2;
! int cp;
! int minNoMaybe = 0x00c0;
int cc, prevCC, trailCC;
char[] p;
int pStart;
// initialize
reorderStartIndex = 0;
prevCC = 0;
! norm = null;
! cp = 0;
pStart = 0;
cc = trailCC = -1; // initialize to bogus value
! c1 = 0;
! for (;;) {
prevSrc=srcIndex;
// quick check: (1) less than minNoMaybe, (2) no decomposition, (3) Hangul syllable
while (srcIndex != srcLimit &&
! ((c1 = src[srcIndex]) < minNoMaybe ||
! (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
! (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
prevCC = 0;
! srcIndex += (cp < 0x10000) ? 1 : 2;
}
// copy these code units all at once
if (srcIndex != prevSrc) {
length = srcIndex - prevSrc;
*** 2554,2604 ****
destIndex += length;
reorderStartIndex = destIndex;
}
// end of source reached?
! if(srcIndex == srcLimit) {
break;
}
- // c already contains *src and norm32 is set for it, increment src
- ++srcIndex;
! if(isNorm32Regular(norm32)) {
c2 = 0;
length = 1;
} else {
- // c is a lead surrogate, get the real norm32
- if(srcIndex != srcLimit &&
- Character.isLowSurrogate(c2 = src[srcIndex])) {
- ++srcIndex;
length = 2;
! norm32 = getNorm32FromSurrogatePair(norm32, c2);
! } else {
! c2 = 0;
! length = 1;
! norm32 = 0;
! }
}
// get the decomposition and the lead and trail cc's
! if((norm32 & qcMask) == 0) {
! // c does not decompose
! cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
p = null;
pStart = -1;
} else {
! DecomposeArgs arg = new DecomposeArgs();
! // c decomposes, get everything from the variable-length
! // extra data
! pStart = decompose(norm32, qcMask, arg);
! p = extraData;
! length = arg.length;
! cc = arg.cc;
! trailCC = arg.trailCC;
! if(length == 1) {
// fastpath a single code unit from decomposition
! c = p[pStart];
c2 = 0;
p = null;
pStart = -1;
}
}
--- 1804,1850 ----
destIndex += length;
reorderStartIndex = destIndex;
}
// end of source reached?
! if (srcIndex == srcLimit) {
break;
}
! // cp already holds the code point at src and norm is set for it; advance src past it
! srcIndex += (cp < 0x10000) ? 1 : 2;
!
! if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
c2 = 0;
length = 1;
+
+ if (Character.isHighSurrogate(c1)
+ || Character.isLowSurrogate(c1)) {
+ norm = null;
+ }
} else {
length = 2;
! c2 = src[srcIndex-1];
}
// get the decomposition and the lead and trail cc's
! if (norm == null) {
! // cp does not decompose
! cc = trailCC = UCharacter.getCombiningClass(cp);
p = null;
pStart = -1;
} else {
!
! pStart = 0;
! p = norm.toCharArray();
! length = p.length;
! int cpNum = norm.codePointCount(0, length);
! cc= UCharacter.getCombiningClass(norm.codePointAt(0));
! trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
! if (length == 1) {
// fastpath a single code unit from decomposition
! c1 = p[pStart];
c2 = 0;
p = null;
pStart = -1;
}
}
*** 2608,2638 ****
char[] tmpBuf = new char[destLimit * 2];
System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
dest = tmpBuf;
destLimit = dest.length;
}
// append the decomposition to the destination buffer, assume length>0
{
int reorderSplit = destIndex;
! if(p == null) {
// fastpath: single code point
! if (needSingleQuotation(c)) {
//if we need single quotation, no need to consider "prevCC"
//and it must NOT be a supplementary pair
dest[destIndex++] = '\'';
! dest[destIndex++] = c;
dest[destIndex++] = '\'';
trailCC = 0;
} else if(cc != 0 && cc < prevCC) {
! // (c, c2) is out of order with respect to the preceding
// text
destIndex += length;
! trailCC = insertOrdered(dest,reorderStartIndex,
! reorderSplit, destIndex, c, c2, cc);
} else {
! // just append (c, c2)
! dest[destIndex++] = c;
if(c2 != 0) {
dest[destIndex++] = c2;
}
}
} else {
--- 1854,1885 ----
char[] tmpBuf = new char[destLimit * 2];
System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
dest = tmpBuf;
destLimit = dest.length;
}
+
// append the decomposition to the destination buffer, assume length>0
{
int reorderSplit = destIndex;
! if (p == null) {
// fastpath: single code point
! if (needSingleQuotation(c1)) {
//if we need single quotation, no need to consider "prevCC"
//and it must NOT be a supplementary pair
dest[destIndex++] = '\'';
! dest[destIndex++] = c1;
dest[destIndex++] = '\'';
trailCC = 0;
} else if(cc != 0 && cc < prevCC) {
! // (c1, c2) is out of order with respect to the preceding
// text
destIndex += length;
! trailCC = insertOrdered(dest, reorderStartIndex,
! reorderSplit, destIndex, c1, c2, cc);
} else {
! // just append (c1, c2)
! dest[destIndex++] = c1;
if(c2 != 0) {
dest[destIndex++] = c2;
}
}
} else {
*** 2644,2736 ****
dest[destIndex++] = '\'';
length--;
do {
dest[destIndex++] = p[pStart++];
} while(--length > 0);
! } else
! if(cc != 0 && cc < prevCC) {
destIndex += length;
! trailCC = mergeOrdered(dest,reorderStartIndex,
! reorderSplit,p, pStart,pStart+length);
} else {
// just append the decomposition
do {
dest[destIndex++] = p[pStart++];
! } while(--length > 0);
}
}
}
prevCC = trailCC;
if(prevCC == 0) {
reorderStartIndex = destIndex;
}
}
return new String(dest, 0, destIndex);
}
! //------------------------------------------------------
! // mapping method for IDNA/StringPrep
! //------------------------------------------------------
!
! /*
! * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
! * 3.2 normalization with Corrigendum 4 corrections. However, normalization
! * without the corrections is necessary for IDNA/StringPrep support.
! * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
! * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
! * characters in Corrigendum 4 before normalization in order to avoid
! * incorrect normalization.
! * For the Corrigendum 4 issue, refer
! * http://www.unicode.org/versions/corrigendum4.html
*/
! /*
! * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
! */
! public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;
! private static final char[][] corrigendum4MappingTable = {
! {'\uD844', '\uDF6A'}, // 0x2F868
! {'\u5F33'}, // 0x2F874
! {'\u43AB'}, // 0x2F91F
! {'\u7AAE'}, // 0x2F95F
! {'\u4D57'}}; // 0x2F9BF
! /*
! * Removing Corrigendum 4 fix
! * @return normalized text
! */
! public static String convert(String str) {
! if (str == null) {
! return null;
}
! int ch = UCharacterIterator.DONE;
! StringBuffer dest = new StringBuffer();
! UCharacterIterator iter = UCharacterIterator.getInstance(str);
! while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
! switch (ch) {
! case 0x2F868:
! dest.append(corrigendum4MappingTable[0]);
! break;
! case 0x2F874:
! dest.append(corrigendum4MappingTable[1]);
! break;
! case 0x2F91F:
! dest.append(corrigendum4MappingTable[2]);
! break;
! case 0x2F95F:
! dest.append(corrigendum4MappingTable[3]);
! break;
! case 0x2F9BF:
! dest.append(corrigendum4MappingTable[4]);
! break;
! default:
! UTF16.append(dest,ch);
break;
}
}
! return dest.toString();
}
}
--- 1891,2155 ----
dest[destIndex++] = '\'';
length--;
do {
dest[destIndex++] = p[pStart++];
} while(--length > 0);
! } else if (cc != 0 && cc < prevCC) {
destIndex += length;
! trailCC = mergeOrdered(dest, reorderStartIndex,
! reorderSplit, p, pStart,
! pStart+length);
} else {
// just append the decomposition
do {
dest[destIndex++] = p[pStart++];
! } while (--length > 0);
}
}
}
prevCC = trailCC;
if(prevCC == 0) {
reorderStartIndex = destIndex;
}
}
+
return new String(dest, 0, destIndex);
}
! /**
! * Simpler, single-code-point version of mergeOrdered():
! * bubble-inserts one code point (c1, c2) with combining class cc into the
! * preceding text, which is already canonically ordered.
! * (c1, c2) may or may not yet be stored at source[current]..source[p];
! * either way it is written at its final position by this method.
! *
! * It must be p=current+lengthof(c1, c2), i.e. p=current+(c2==0 ? 1 : 2).
! *
! * before: source[start]..source[current] is already ordered, and
! *         source[current]..source[p] may or may not hold (c1, c2) but
! *         must be exactly the same length as (c1, c2)
! * after:  source[start]..source[p] is ordered
! *
! * @param source  buffer that is reordered in place
! * @param start   index where the ordered run begins; no index below it is read
! * @param current index just past the ordered run; (c1, c2) is stored here
! *                when no reordering is needed
! * @param p       index just past the slot reserved for (c1, c2)
! * @param c1      first (or only) code unit of the code point to insert
! * @param c2      second code unit of the code point, or 0 if there is none
! * @param cc      combining class of (c1, c2); when 0 the code point is
! *                stored at current without any search
! * @return the trailing combining class: cc if (c1, c2) stays last,
! *         otherwise the class of the code point that was last before
! *         the insertion
*/
+ private static int/*unsigned byte*/ insertOrdered(char[] source,
+ int start,
+ int current, int p,
+ char c1, char c2,
+ int/*unsigned byte*/ cc) {
+ int back, preBack;
+ int r;
+ int prevCC, trailCC=cc;
! if (start<current && cc!=0) {
! // search for the insertion point where cc>=prevCC
! preBack=back=current;
!
! PrevArgs prevArgs = new PrevArgs();
! prevArgs.current = current;
! prevArgs.start = start;
! prevArgs.src = source;
! prevArgs.c1 = c1; // getPrevCC() overwrites c1 and c2 on every call
! prevArgs.c2 = c2;
! // combining class of the code point that ends just before current
! prevCC=getPrevCC(prevArgs);
! preBack = prevArgs.current;
! if(cc<prevCC) {
! // this will be the last code point, so keep its cc
! trailCC=prevCC;
! back=preBack;
! while(start<preBack) {
! prevCC=getPrevCC(prevArgs);
! preBack=prevArgs.current;
! if(cc>=prevCC) {
! break;
! }
! back=preBack;
}
! // this is where we are right now with all these indices:
! // [start]..[preBack] 0..? code points that we can ignore
! // [preBack]..[back] 0..1 code points with prevCC<=cc
! // [back]..[current] 0..n code points with >cc, move up to insert (c1, c2)
! // [current]..[p] 1 code point (c1, c2) with cc
! // move the code units in between up
! r=p;
! do {
! source[--r]=source[--current];
! } while (back!=current); // ends with current==back, the insertion index
! }
! }
!
! // insert (c1, c2)
! source[current] = c1;
! if (c2!=0) {
! source[(current+1)] = c2;
! }
!
! // we know the cc of the last code point
! return trailCC;
! }
!
! /**
! * merge two UTF-16 string parts together
! * to canonically order (order by combining classes) their concatenation
! *
! * the two strings may already be adjacent, so that the merging is done
! * in-place if the two strings are not adjacent, then the buffer holding the
! * first one must be large enough
! * the second string may or may not be ordered in itself
! *
! * before: [start]..[current] is already ordered, and
! * [next]..[limit] may be ordered in itself, but
! * is not in relation to [start..current[
! * after: [start..current+(limit-next)[ is ordered
! *
! * the algorithm is a simple bubble-sort that takes the characters from
! * src[next++] and inserts them in correct combining class order into the
! * preceding part of the string
! *
! * since this function is called much less often than the single-code point
! * insertOrdered(), it just uses that for easier maintenance
! *
! * @return the trailing combining class
! */
! private static int /*unsigned byte*/ mergeOrdered(char[] source,
! int start,
! int current,
! char[] data,
! int next,
! int limit) {
! int r;
! int /*unsigned byte*/ cc, trailCC=0;
! boolean adjacent;
!
! adjacent= current==next;
! NextCCArgs ncArgs = new NextCCArgs();
! ncArgs.source = data;
! ncArgs.next = next;
! ncArgs.limit = limit;
!
! if(start!=current) {
!
! while(ncArgs.next<ncArgs.limit) {
! cc=getNextCC(ncArgs);
! if(cc==0) {
! // does not bubble back
! trailCC=0;
! if(adjacent) {
! current=ncArgs.next;
! } else {
! data[current++]=ncArgs.c1;
! if(ncArgs.c2!=0) {
! data[current++]=ncArgs.c2;
! }
! }
break;
+ } else {
+ r=current+(ncArgs.c2==0 ? 1 : 2);
+ trailCC=insertOrdered(source,start, current, r,
+ ncArgs.c1, ncArgs.c2, cc);
+ current=r;
}
}
+ }
+
+ if(ncArgs.next==ncArgs.limit) {
+ // we know the cc of the last code point
+ return trailCC;
+ } else {
+ if(!adjacent) {
+ // copy the second string part
+ do {
+ source[current++]=data[ncArgs.next++];
+ } while(ncArgs.next!=ncArgs.limit);
+ ncArgs.limit=current;
+ }
+ PrevArgs prevArgs = new PrevArgs();
+ prevArgs.src = data;
+ prevArgs.start = start;
+ prevArgs.current = ncArgs.limit;
+ return getPrevCC(prevArgs);
+ }
+
+ }
+
+ private static final class PrevArgs{ // mutable in/out arguments of getPrevCC()
+ char[] src; // text that getPrevCC() reads backwards
+ int start; // lower bound: getPrevCC() does not look for a lead surrogate when current has reached it
+ int current; // read position; getPrevCC() moves it back over the code point it returns the class of
+ char c1; // out: the code unit read first (the trail surrogate for a surrogate pair)
+ char c2; // out: the lead surrogate if a surrogate pair was read, otherwise 0
+ }
+
+ private static final class NextCCArgs{ // mutable in/out arguments of getNextCC()
+ char[] source; // text that getNextCC() reads forwards
+ int next; // read position; getNextCC() advances it past the code point it returns the class of
+ int limit; // end index: getNextCC() does not look for a trail surrogate when next has reached it
+ char c1; // out: the code unit read first (the lead surrogate for a surrogate pair)
+ char c2; // out: the trail surrogate if a surrogate pair was read, otherwise 0
+ }
+
+ private static int /*unsigned*/ getPrevCC(PrevArgs args) {
+ args.c1=args.src[--args.current];
+ args.c2=0;
+
+ if (args.c1 < MIN_CCC_LCCC_CP) {
+ return 0;
+ } else if (UTF16.isLeadSurrogate(args.c1)) {
+ /* unpaired first surrogate */
+ return 0;
+ } else if (!UTF16.isTrailSurrogate(args.c1)) {
+ return UCharacter.getCombiningClass(args.c1);
+ } else if (args.current!=args.start &&
+ UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
+ --args.current;
+ return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
+ } else {
+ /* unpaired second surrogate */
+ args.c2=0;
+ return 0;
+ }
+ }
+
+ private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
+ args.c1=args.source[args.next++];
+ args.c2=0;
! if (UTF16.isTrailSurrogate(args.c1)) {
! /* unpaired second surrogate */
! return 0;
! } else if (!UTF16.isLeadSurrogate(args.c1)) {
! return UCharacter.getCombiningClass(args.c1);
! } else if (args.next!=args.limit &&
! UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
! ++args.next;
! return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
! } else {
! /* unpaired first surrogate */
! args.c2=0;
! return 0;
}
+ }
+
+ private VersionInfo dataVersion; // NOTE(review): presumably the version of the loaded normalization data; not assigned in this part of the file - confirm
+
+ // Code point thresholds for quick check codes.
+ private int minDecompNoCP;
+ private int minCompNoMaybeCP;
+
+ // Norm16 value thresholds for quick check combinations and types of extra data.
+ private int minYesNo;
+ private int minYesNoMappingsOnly;
+ private int minNoNo;
+ private int limitNoNo;
+ private int minMaybeYes;
+
+ private Trie2_16 normTrie; // NOTE(review): presumably maps code points to norm16 values - confirm against the loading code
+ private String maybeYesCompositions; // NOTE(review): presumably composition data for "maybeYes" characters - confirm
+ private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
+ private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
+ private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
+
}
< prev index next >