1 /*
   2  * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.BufferedInputStream;
  40 import java.io.ByteArrayInputStream;
  41 import java.io.IOException;
  42 import java.io.BufferedInputStream;
  43 import java.io.InputStream;
  44 
  45 /**
  46  * @author  Ram Viswanadha
  47  */
  48 public final class NormalizerImpl {
  49     // Static block for the class to initialize its own self
  50     static final NormalizerImpl IMPL;
  51 
  52     static
  53     {
  54         try
  55         {
  56             IMPL = new NormalizerImpl();
  57         }
  58         catch (Exception e)
  59         {
  60             throw new RuntimeException(e.getMessage());
  61         }
  62     }
  63 
  64     static final int UNSIGNED_BYTE_MASK =0xFF;
  65     static final long UNSIGNED_INT_MASK = 0xffffffffL;
  66     /*
  67      * This new implementation of the normalization code loads its data from
  68      * unorm.icu, which is generated with the gennorm tool.
  69      * The format of that file is described at the end of this file.
  70      */
  71     private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
  72 
  73     // norm32 value constants
  74 
  75     // quick check flags 0..3 set mean "no" for their forms
  76     public static final int QC_NFC=0x11;          /* no|maybe */
  77     public static final int QC_NFKC=0x22;         /* no|maybe */
  78     public static final int QC_NFD=4;             /* no */
  79     public static final int QC_NFKD=8;            /* no */
  80 
  81     public static final int QC_ANY_NO=0xf;
  82 
  83     /* quick check flags 4..5 mean "maybe" for their forms;
  84      * test flags>=QC_MAYBE
  85      */
  86     public static final int QC_MAYBE=0x10;
  87     public static final int QC_ANY_MAYBE=0x30;
  88 
  89     public static final int QC_MASK=0x3f;
  90 
  91     private static final int COMBINES_FWD=0x40;
  92     private static final int COMBINES_BACK=0x80;
  93     public  static final int COMBINES_ANY=0xc0;
  94     // UnicodeData.txt combining class in bits 15.
  95     private static final int CC_SHIFT=8;
  96     public  static final int CC_MASK=0xff00;
  97     // 16 bits for the index to UChars and other extra data
  98     private static final int EXTRA_SHIFT=16;
  99 
 100     /* norm32 value constants using >16 bits */
 101     private static final long  MIN_SPECIAL    =  0xfc000000 & UNSIGNED_INT_MASK;
 102     private static final long  SURROGATES_TOP =  0xfff00000 & UNSIGNED_INT_MASK;
 103     private static final long  MIN_HANGUL     =  0xfff00000 & UNSIGNED_INT_MASK;
 104 //  private static final long  MIN_JAMO_V     =  0xfff20000 & UNSIGNED_INT_MASK;
 105     private static final long  JAMO_V_TOP     =  0xfff30000 & UNSIGNED_INT_MASK;
 106 
 107 
 108     /* indexes[] value names */
 109     /* number of bytes in normalization trie */
 110     static final int INDEX_TRIE_SIZE           = 0;
 111     /* number of chars in extra data */
 112     static final int INDEX_CHAR_COUNT           = 1;
 113     /* number of uint16_t words for combining data */
 114     static final int INDEX_COMBINE_DATA_COUNT = 2;
 115     /* first code point with quick check NFC NO/MAYBE */
 116     public static final int INDEX_MIN_NFC_NO_MAYBE   = 6;
 117     /* first code point with quick check NFKC NO/MAYBE */
 118     public static final int INDEX_MIN_NFKC_NO_MAYBE  = 7;
 119     /* first code point with quick check NFD NO/MAYBE */
 120     public static final int INDEX_MIN_NFD_NO_MAYBE   = 8;
 121     /* first code point with quick check NFKD NO/MAYBE */
 122     public static final int INDEX_MIN_NFKD_NO_MAYBE  = 9;
 123     /* number of bytes in FCD trie */
 124     static final int INDEX_FCD_TRIE_SIZE      = 10;
 125     /* number of bytes in the auxiliary trie */
 126     static final int INDEX_AUX_TRIE_SIZE      = 11;
 127     /* changing this requires a new formatVersion */
 128     static final int INDEX_TOP                = 32;
 129 
 130 
 131     /* AUX constants */
 132     /* value constants for auxTrie */
 133     private static final int AUX_UNSAFE_SHIFT           = 11;
 134     private static final int AUX_COMP_EX_SHIFT           = 10;
 135     private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
 136 
 137     private static final int AUX_MAX_FNC          =   1<<AUX_COMP_EX_SHIFT;
 138     private static final int AUX_UNSAFE_MASK      =   (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
 139     private static final int AUX_FNC_MASK         =   (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
 140     private static final int AUX_COMP_EX_MASK     =   (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
 141     private static final long AUX_NFC_SKIP_F_MASK =   ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
 142 
 143     private static final int MAX_BUFFER_SIZE                    = 20;
 144 
 145     /*******************************/
 146 
 147     /* Wrappers for Trie implementations */
 148     static final class NormTrieImpl implements Trie.DataManipulate{
 149         static IntTrie normTrie= null;
 150        /**
 151         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 152         * data the index array offset of the indexes for that lead surrogate.
 153         * @param property data value for a surrogate from the trie, including
 154         *         the folding offset
 155         * @return data offset or 0 if there is no data for the lead surrogate
 156         */
 157         /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
 158         public int getFoldingOffset(int value){
 159             return  BMP_INDEX_LENGTH+
 160                     ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
 161                     (0x3ff<<SURROGATE_BLOCK_BITS));
 162         }
 163 
 164     }
 165     static final class FCDTrieImpl implements Trie.DataManipulate{
 166         static CharTrie fcdTrie=null;
 167        /**
 168         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 169         * data the index array offset of the indexes for that lead surrogate.
 170         * @param property data value for a surrogate from the trie, including
 171         *         the folding offset
 172         * @return data offset or 0 if there is no data for the lead surrogate
 173         */
 174         /* fcdTrie: the folding offset is the lead FCD value itself */
 175         public int getFoldingOffset(int value){
 176             return value;
 177         }
 178     }
 179 
 180     static final class AuxTrieImpl implements Trie.DataManipulate{
 181         static CharTrie auxTrie = null;
 182        /**
 183         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 184         * data the index array offset of the indexes for that lead surrogate.
 185         * @param property data value for a surrogate from the trie, including
 186         *        the folding offset
 187         * @return data offset or 0 if there is no data for the lead surrogate
 188         */
 189         /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
 190         public int getFoldingOffset(int value){
 191             return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
 192         }
 193     }
 194 
 195     /****************************************************/
 196 
 197 
 198     private static FCDTrieImpl fcdTrieImpl;
 199     private static NormTrieImpl normTrieImpl;
 200     private static AuxTrieImpl auxTrieImpl;
 201     private static int[] indexes;
 202     private static char[] combiningTable;
 203     private static char[] extraData;
 204 
 205     private static boolean isDataLoaded;
 206     private static boolean isFormatVersion_2_1;
 207     private static boolean isFormatVersion_2_2;
 208     private static byte[] unicodeVersion;
 209 
 210     /**
 211      * Default buffer size of datafile
 212      */
 213     private static final int DATA_BUFFER_SIZE = 25000;
 214 
 215     /**
 216      * FCD check: everything below this code point is known to have a 0
 217      * lead combining class
 218      */
 219     public static final int MIN_WITH_LEAD_CC=0x300;
 220 
 221 
 222     /**
 223      * Bit 7 of the length byte for a decomposition string in extra data is
 224      * a flag indicating whether the decomposition string is
 225      * preceded by a 16-bit word with the leading and trailing cc
 226      * of the decomposition (like for A-umlaut);
 227      * if not, then both cc's are zero (like for compatibility ideographs).
 228      */
 229     private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
 230     /**
 231      * Bits 6..0 of the length byte contain the actual length.
 232      */
 233     private static final int DECOMP_LENGTH_MASK=0x7f;
 234 
 235     /** Length of the BMP portion of the index (stage 1) array. */
 236     private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
 237     /** Number of bits of a trail surrogate that are used in index table
 238      * lookups.
 239      */
 240     private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
 241 
 242 
 243    // public utility
 244    public static int getFromIndexesArr(int index){
 245         return indexes[index];
 246    }
 247 
 248    // protected constructor ---------------------------------------------
 249 
 250     /**
 251     * Constructor
 252     * @exception thrown when data reading fails or data corrupted
 253     */
 254     private NormalizerImpl() throws IOException {
 255         //data should be loaded only once
 256         if(!isDataLoaded){
 257 
 258             // jar access
 259             InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
 260             BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
 261             NormalizerDataReader reader = new NormalizerDataReader(b);
 262 
 263             // read the indexes
 264             indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
 265 
 266             byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
 267 
 268             int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
 269             combiningTable = new char[combiningTableTop];
 270 
 271             int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
 272             extraData = new char[extraDataTop];
 273 
 274             byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
 275             byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
 276 
 277             fcdTrieImpl = new FCDTrieImpl();
 278             normTrieImpl = new NormTrieImpl();
 279             auxTrieImpl = new AuxTrieImpl();
 280 
 281             // load the rest of the data data and initialize the data members
 282             reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable);
 283 
 284             NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
 285             FCDTrieImpl.fcdTrie   = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl  );
 286             AuxTrieImpl.auxTrie   = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl  );
 287 
 288             // we reached here without any exceptions so the data is fully
 289             // loaded set the variable to true
 290             isDataLoaded = true;
 291 
 292             // get the data format version
 293             byte[] formatVersion = reader.getDataFormatVersion();
 294 
 295             isFormatVersion_2_1 =( formatVersion[0]>2
 296                                     ||
 297                                    (formatVersion[0]==2 && formatVersion[1]>=1)
 298                                  );
 299             isFormatVersion_2_2 =( formatVersion[0]>2
 300                                     ||
 301                                    (formatVersion[0]==2 && formatVersion[1]>=2)
 302                                  );
 303             unicodeVersion = reader.getUnicodeVersion();
 304             b.close();
 305         }
 306     }
 307 
 308     /* ---------------------------------------------------------------------- */
 309 
 310     /* Korean Hangul and Jamo constants */
 311 
 312     public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
 313     public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
 314     public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
 315 
 316     public static final int HANGUL_BASE=0xac00;
 317 
 318     public static final int JAMO_L_COUNT=19;
 319     public static final int JAMO_V_COUNT=21;
 320     public static final int JAMO_T_COUNT=28;
 321     public  static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
 322 
 323     private static boolean isHangulWithoutJamoT(char c) {
 324         c-=HANGUL_BASE;
 325         return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
 326     }
 327 
 328     /* norm32 helpers */
 329 
 330     /* is this a norm32 with a regular index? */
 331     private static boolean isNorm32Regular(long norm32) {
 332         return norm32<MIN_SPECIAL;
 333     }
 334 
 335     /* is this a norm32 with a special index for a lead surrogate? */
 336     private static boolean isNorm32LeadSurrogate(long norm32) {
 337         return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
 338     }
 339 
 340     /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
 341     private static boolean isNorm32HangulOrJamo(long norm32) {
 342         return norm32>=MIN_HANGUL;
 343     }
 344 
 345     /*
 346      * Given norm32 for Jamo V or T,
 347      * is this a Jamo V?
 348      */
 349     private static boolean isJamoVTNorm32JamoV(long norm32) {
 350         return norm32<JAMO_V_TOP;
 351     }
 352 
 353     /* data access primitives ----------------------------------------------- */
 354 
 355     public static long/*unsigned*/ getNorm32(char c) {
 356         return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
 357     }
 358 
 359     public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
 360                                                                char c2) {
 361         /*
 362          * the surrogate index in norm32 stores only the number of the surrogate
 363          * index block see gennorm/store.c/getFoldedNormValue()
 364          */
 365         return ((UNSIGNED_INT_MASK) &
 366                     NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
 367     }
 368     ///CLOVER:OFF
 369     private static long getNorm32(int c){
 370         return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
 371     }
 372 
 373     /*
 374      * get a norm32 from text with complete code points
 375      * (like from decompositions)
 376      */
 377     private static long/*unsigned*/ getNorm32(char[] p,int start,
 378                                               int/*unsigned*/ mask) {
 379         long/*unsigned*/ norm32= getNorm32(p[start]);
 380         if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
 381             /* *p is a lead surrogate, get the real norm32 */
 382             norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
 383         }
 384         return norm32;
 385     }
 386 
 387     //// for StringPrep
 388     public static VersionInfo getUnicodeVersion(){
 389         return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
 390                                        unicodeVersion[2], unicodeVersion[3]);
 391     }
 392 
 393     public static char    getFCD16(char c) {
 394         return  FCDTrieImpl.fcdTrie.getLeadValue(c);
 395     }
 396 
 397     public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
 398         /* the surrogate index in fcd16 is an absolute offset over the
 399          * start of stage 1
 400          * */
 401         return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
 402     }
 403     public static int getFCD16(int c) {
 404         return  FCDTrieImpl.fcdTrie.getCodePointValue(c);
 405     }
 406 
 407     private static int getExtraDataIndex(long norm32) {
 408         return (int)(norm32>>EXTRA_SHIFT);
 409     }
 410 
 411     private static final class DecomposeArgs{
 412         int /*unsigned byte*/ cc;
 413         int /*unsigned byte*/ trailCC;
 414         int length;
 415     }
 416     /**
 417      *
 418      * get the canonical or compatibility decomposition for one character
 419      *
 420      * @return index into the extraData array
 421      */
 422     private static int/*index*/ decompose(long/*unsigned*/ norm32,
 423                                           int/*unsigned*/ qcMask,
 424                                           DecomposeArgs args) {
 425         int p= getExtraDataIndex(norm32);
 426         args.length=extraData[p++];
 427 
 428         if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
 429             /* use compatibility decomposition, skip canonical data */
 430             p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
 431             args.length>>=8;
 432         }
 433 
 434         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
 435             /* get the lead and trail cc's */
 436             char bothCCs=extraData[p++];
 437             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
 438             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
 439         } else {
 440             /* lead and trail cc's are both 0 */
 441             args.cc=args.trailCC=0;
 442         }
 443 
 444         args.length&=DECOMP_LENGTH_MASK;
 445         return p;
 446     }
 447 
 448 
 449     /**
 450      * get the canonical decomposition for one character
 451      * @return index into the extraData array
 452      */
 453     private static int decompose(long/*unsigned*/ norm32,
 454                                  DecomposeArgs args) {
 455 
 456         int p= getExtraDataIndex(norm32);
 457         args.length=extraData[p++];
 458 
 459         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
 460             /* get the lead and trail cc's */
 461             char bothCCs=extraData[p++];
 462             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
 463             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
 464         } else {
 465             /* lead and trail cc's are both 0 */
 466             args.cc=args.trailCC=0;
 467         }
 468 
 469         args.length&=DECOMP_LENGTH_MASK;
 470         return p;
 471     }
 472 
 473 
 474     private static final class NextCCArgs{
 475         char[] source;
 476         int next;
 477         int limit;
 478         char c;
 479         char c2;
 480     }
 481 
 482     /*
 483      * get the combining class of (c, c2)= args.source[args.next++]
 484      * before: args.next<args.limit  after: args.next<=args.limit
 485      * if only one code unit is used, then c2==0
 486      */
 487     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
 488         long /*unsigned*/ norm32;
 489 
 490         args.c=args.source[args.next++];
 491 
 492         norm32= getNorm32(args.c);
 493         if((norm32 & CC_MASK)==0) {
 494             args.c2=0;
 495             return 0;
 496         } else {
 497             if(!isNorm32LeadSurrogate(norm32)) {
 498                 args.c2=0;
 499             } else {
 500                 /* c is a lead surrogate, get the real norm32 */
 501                 if(args.next!=args.limit &&
 502                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
 503                     ++args.next;
 504                     norm32=getNorm32FromSurrogatePair(norm32, args.c2);
 505                 } else {
 506                     args.c2=0;
 507                     return 0;
 508                 }
 509             }
 510 
 511             return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
 512         }
 513     }
 514 
 515     private static final class PrevArgs{
 516         char[] src;
 517         int start;
 518         int current;
 519         char c;
 520         char c2;
 521     }
 522 
 523     /*
 524      * read backwards and get norm32
 525      * return 0 if the character is <minC
 526      * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
 527      * surrogate but read second!)
 528      */
 529     private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
 530                                                       int/*unsigned*/ minC,
 531                                                       int/*unsigned*/ mask) {
 532         long/*unsigned*/ norm32;
 533 
 534         args.c=args.src[--args.current];
 535         args.c2=0;
 536 
 537         /* check for a surrogate before getting norm32 to see if we need to
 538          * predecrement further
 539          */
 540         if(args.c<minC) {
 541             return 0;
 542         } else if(!UTF16.isSurrogate(args.c)) {
 543             return getNorm32(args.c);
 544         } else if(UTF16.isLeadSurrogate(args.c)) {
 545             /* unpaired first surrogate */
 546             return 0;
 547         } else if(args.current!=args.start &&
 548                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
 549             --args.current;
 550             norm32=getNorm32(args.c2);
 551 
 552             if((norm32&mask)==0) {
 553                 /* all surrogate pairs with this lead surrogate have
 554                  * only irrelevant data
 555                  */
 556                 return 0;
 557             } else {
 558                 /* norm32 must be a surrogate special */
 559                 return getNorm32FromSurrogatePair(norm32, args.c);
 560             }
 561         } else {
 562             /* unpaired second surrogate */
 563             args.c2=0;
 564             return 0;
 565         }
 566     }
 567 
 568     /*
 569      * get the combining class of (c, c2)=*--p
 570      * before: start<p  after: start<=p
 571      */
 572     private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
 573 
 574         return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
 575                                                          CC_MASK)>>CC_SHIFT));
 576     }
 577 
 578     /*
 579      * is this a safe boundary character for NF*D?
 580      * (lead cc==0)
 581      */
 582     public static boolean isNFDSafe(long/*unsigned*/ norm32,
 583                                      int/*unsigned*/ccOrQCMask,
 584                                      int/*unsigned*/ decompQCMask) {
 585         if((norm32&ccOrQCMask)==0) {
 586             return true; /* cc==0 and no decomposition: this is NF*D safe */
 587         }
 588 
 589         /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
 590         if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
 591             DecomposeArgs args=new DecomposeArgs();
 592             /* decomposes, get everything from the variable-length extra data */
 593             decompose(norm32, decompQCMask, args);
 594             return args.cc==0;
 595         } else {
 596             /* no decomposition (or Hangul), test the cc directly */
 597             return (norm32&CC_MASK)==0;
 598         }
 599     }
 600 
 601     /*
 602      * is this (or does its decomposition begin with) a "true starter"?
 603      * (cc==0 and NF*C_YES)
 604      */
 605     public static boolean isTrueStarter(long/*unsigned*/ norm32,
 606                                           int/*unsigned*/ ccOrQCMask,
 607                                           int/*unsigned*/ decompQCMask) {
 608         if((norm32&ccOrQCMask)==0) {
 609             return true; /* this is a true starter (could be Hangul or Jamo L)*/
 610         }
 611 
 612         /* inspect its decomposition - not a Hangul or a surrogate here */
 613         if((norm32&decompQCMask)!=0) {
 614             int p; /* index into extra data array */
 615             DecomposeArgs args=new DecomposeArgs();
 616             /* decomposes, get everything from the variable-length extra data */
 617             p=decompose(norm32, decompQCMask, args);
 618 
 619             if(args.cc==0) {
 620                 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
 621 
 622                 /* does it begin with NFC_YES? */
 623                 if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
 624                     /* yes, the decomposition begins with a true starter */
 625                     return true;
 626                 }
 627             }
 628         }
 629         return false;
 630     }
 631 
 632     /* reorder UTF-16 in-place ---------------------------------------------- */
 633 
 634     /**
 635      * simpler, single-character version of mergeOrdered() -
 636      * bubble-insert one single code point into the preceding string
 637      * which is already canonically ordered
 638      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
 639      *
 640      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
 641      *
 642      * before: src[start]..src[current] is already ordered, and
 643      *         src[current]..src[p]     may or may not hold (c, c2) but
 644      *                          must be exactly the same length as (c, c2)
 645      * after: src[start]..src[p] is ordered
 646      *
 647      * @return the trailing combining class
 648      */
 649     private static int/*unsigned byte*/ insertOrdered(char[] source,
 650                                                       int start,
 651                                                       int current, int p,
 652                                                          char c, char c2,
 653                                                          int/*unsigned byte*/ cc) {
 654         int back, preBack;
 655         int r;
 656         int prevCC, trailCC=cc;
 657 
 658         if(start<current && cc!=0) {
 659             // search for the insertion point where cc>=prevCC
 660             preBack=back=current;
 661             PrevArgs prevArgs = new PrevArgs();
 662             prevArgs.current  = current;
 663             prevArgs.start    = start;
 664             prevArgs.src      = source;
 665             // get the prevCC
 666             prevCC=getPrevCC(prevArgs);
 667             preBack = prevArgs.current;
 668 
 669             if(cc<prevCC) {
 670                 // this will be the last code point, so keep its cc
 671                 trailCC=prevCC;
 672                 back=preBack;
 673                 while(start<preBack) {
 674                     prevCC=getPrevCC(prevArgs);
 675                     preBack=prevArgs.current;
 676                     if(cc>=prevCC) {
 677                         break;
 678                     }
 679                     back=preBack;
 680                 }
 681 
 682 
 683                 // this is where we are right now with all these indicies:
 684                 // [start]..[pPreBack] 0..? code points that we can ignore
 685                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
 686                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
 687                 // [current]..[p]         1 code point (c, c2) with cc
 688 
 689                 // move the code units in between up
 690                 r=p;
 691                 do {
 692                     source[--r]=source[--current];
 693                 } while(back!=current);
 694             }
 695         }
 696 
 697         // insert (c, c2)
 698         source[current]=c;
 699         if(c2!=0) {
 700             source[(current+1)]=c2;
 701         }
 702 
 703         // we know the cc of the last code point
 704         return trailCC;
 705     }
 706 
 707     /**
 708      * merge two UTF-16 string parts together
 709      * to canonically order (order by combining classes) their concatenation
 710      *
 711      * the two strings may already be adjacent, so that the merging is done
 712      * in-place if the two strings are not adjacent, then the buffer holding the
 713      * first one must be large enough
 714      * the second string may or may not be ordered in itself
 715      *
 716      * before: [start]..[current] is already ordered, and
 717      *         [next]..[limit]    may be ordered in itself, but
 718      *                          is not in relation to [start..current[
 719      * after: [start..current+(limit-next)[ is ordered
 720      *
 721      * the algorithm is a simple bubble-sort that takes the characters from
 722      * src[next++] and inserts them in correct combining class order into the
 723      * preceding part of the string
 724      *
 725      * since this function is called much less often than the single-code point
 726      * insertOrdered(), it just uses that for easier maintenance
 727      *
 728      * @return the trailing combining class
 729      */
 730     private static int /*unsigned byte*/ mergeOrdered(char[] source,
 731                                                       int start,
 732                                                       int current,
 733                                                       char[] data,
 734                                                         int next,
 735                                                         int limit,
 736                                                         boolean isOrdered) {
 737             int r;
 738             int /*unsigned byte*/ cc, trailCC=0;
 739             boolean adjacent;
 740 
 741             adjacent= current==next;
 742             NextCCArgs ncArgs = new NextCCArgs();
 743             ncArgs.source = data;
 744             ncArgs.next   = next;
 745             ncArgs.limit  = limit;
 746 
 747             if(start!=current || !isOrdered) {
 748 
 749                 while(ncArgs.next<ncArgs.limit) {
 750                     cc=getNextCC(ncArgs);
 751                     if(cc==0) {
 752                         // does not bubble back
 753                         trailCC=0;
 754                         if(adjacent) {
 755                             current=ncArgs.next;
 756                         } else {
 757                             data[current++]=ncArgs.c;
 758                             if(ncArgs.c2!=0) {
 759                                 data[current++]=ncArgs.c2;
 760                             }
 761                         }
 762                         if(isOrdered) {
 763                             break;
 764                         } else {
 765                             start=current;
 766                         }
 767                     } else {
 768                         r=current+(ncArgs.c2==0 ? 1 : 2);
 769                         trailCC=insertOrdered(source,start, current, r,
 770                                               ncArgs.c, ncArgs.c2, cc);
 771                         current=r;
 772                     }
 773                 }
 774             }
 775 
 776             if(ncArgs.next==ncArgs.limit) {
 777                 // we know the cc of the last code point
 778                 return trailCC;
 779             } else {
 780                 if(!adjacent) {
 781                     // copy the second string part
 782                     do {
 783                         source[current++]=data[ncArgs.next++];
 784                     } while(ncArgs.next!=ncArgs.limit);
 785                     ncArgs.limit=current;
 786                 }
 787                 PrevArgs prevArgs = new PrevArgs();
 788                 prevArgs.src   = data;
 789                 prevArgs.start = start;
 790                 prevArgs.current =  ncArgs.limit;
 791                 return getPrevCC(prevArgs);
 792             }
 793 
 794     }
 795     private static int /*unsigned byte*/ mergeOrdered(char[] source,
 796                                                       int start,
 797                                                       int current,
 798                                                       char[] data,
 799                                                         final int next,
 800                                                         final int limit) {
 801         return mergeOrdered(source,start,current,data,next,limit,true);
 802     }
 803 
 804     public static NormalizerBase.QuickCheckResult quickCheck(char[] src,
 805                                                             int srcStart,
 806                                                             int srcLimit,
 807                                                             int minNoMaybe,
 808                                                             int qcMask,
 809                                                             int options,
 810                                                             boolean allowMaybe,
 811                                                             UnicodeSet nx){
 812 
 813         int ccOrQCMask;
 814         long norm32;
 815         char c, c2;
 816         char cc, prevCC;
 817         long qcNorm32;
 818         NormalizerBase.QuickCheckResult result;
 819         ComposePartArgs args = new ComposePartArgs();
 820         char[] buffer ;
 821         int start = srcStart;
 822 
 823         if(!isDataLoaded) {
 824             return NormalizerBase.MAYBE;
 825         }
 826         // initialize
 827         ccOrQCMask=CC_MASK|qcMask;
 828         result=NormalizerBase.YES;
 829         prevCC=0;
 830 
 831         for(;;) {
 832             for(;;) {
 833                 if(srcStart==srcLimit) {
 834                     return result;
 835                 } else if((c=src[srcStart++])>=minNoMaybe &&
 836                                   (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
 837                     break;
 838                 }
 839                 prevCC=0;
 840             }
 841 
 842 
 843             // check one above-minimum, relevant code unit
 844             if(isNorm32LeadSurrogate(norm32)) {
 845                 // c is a lead surrogate, get the real norm32
 846                 if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
 847                     ++srcStart;
 848                     norm32=getNorm32FromSurrogatePair(norm32,c2);
 849                 } else {
 850                     norm32=0;
 851                     c2=0;
 852                 }
 853             }else{
 854                 c2=0;
 855             }
 856             if(nx_contains(nx, c, c2)) {
 857                 /* excluded: norm32==0 */
 858                 norm32=0;
 859             }
 860 
 861             // check the combining order
 862             cc=(char)((norm32>>CC_SHIFT)&0xFF);
 863             if(cc!=0 && cc<prevCC) {
 864                 return NormalizerBase.NO;
 865             }
 866             prevCC=cc;
 867 
 868             // check for "no" or "maybe" quick check flags
 869             qcNorm32 = norm32 & qcMask;
 870             if((qcNorm32& QC_ANY_NO)>=1) {
 871                 result= NormalizerBase.NO;
 872                 break;
 873             } else if(qcNorm32!=0) {
 874                 // "maybe" can only occur for NFC and NFKC
 875                 if(allowMaybe){
 876                     result=NormalizerBase.MAYBE;
 877                 }else{
 878                     // normalize a section around here to see if it is really
 879                     // normalized or not
 880                     int prevStarter;
 881                     int/*unsigned*/ decompQCMask;
 882 
 883                     decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
 884 
 885                     // find the previous starter
 886 
 887                     // set prevStarter to the beginning of the current character
 888                     prevStarter=srcStart-1;
 889                     if(UTF16.isTrailSurrogate(src[prevStarter])) {
 890                         // safe because unpaired surrogates do not result
 891                         // in "maybe"
 892                         --prevStarter;
 893                     }
 894                     prevStarter=findPreviousStarter(src, start, prevStarter,
 895                                                     ccOrQCMask, decompQCMask,
 896                                                     (char)minNoMaybe);
 897 
 898                     // find the next true starter in [src..limit[ - modifies
 899                     // src to point to the next starter
 900                     srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
 901                                              decompQCMask,(char) minNoMaybe);
 902 
 903                     //set the args for compose part
 904                     args.prevCC = prevCC;
 905 
 906                     // decompose and recompose [prevStarter..src[
 907                     buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
 908 
 909                     // compare the normalized version with the original
 910                     if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
 911                         result=NormalizerBase.NO; // normalization differs
 912                         break;
 913                     }
 914 
 915                     // continue after the next starter
 916                 }
 917             }
 918         }
 919         return result;
 920     }
 921 
 922 
 923     //------------------------------------------------------
 924     // make NFD & NFKD
 925     //------------------------------------------------------
 926 
 927     public static int decompose(char[] src,int srcStart,int srcLimit,
 928                                 char[] dest,int destStart,int destLimit,
 929                                  boolean compat,int[] outTrailCC,
 930                                  UnicodeSet nx) {
 931 
 932         char[] buffer = new char[3];
 933         int prevSrc;
 934         long norm32;
 935         int ccOrQCMask, qcMask;
 936         int reorderStartIndex, length;
 937         char c, c2, minNoMaybe;
 938         int/*unsigned byte*/ cc, prevCC, trailCC;
 939         char[] p;
 940         int pStart;
 941         int destIndex = destStart;
 942         int srcIndex = srcStart;
 943         if(!compat) {
 944             minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
 945             qcMask=QC_NFD;
 946         } else {
 947             minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
 948             qcMask=QC_NFKD;
 949         }
 950 
 951         /* initialize */
 952         ccOrQCMask=CC_MASK|qcMask;
 953         reorderStartIndex=0;
 954         prevCC=0;
 955         norm32=0;
 956         c=0;
 957         pStart=0;
 958 
 959         cc=trailCC=-1;//initialize to bogus value
 960 
 961         for(;;) {
 962             /* count code units below the minimum or with irrelevant data for
 963              * the quick check
 964              */
 965             prevSrc=srcIndex;
 966 
 967             while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
 968                                         ((norm32=getNorm32(c))&ccOrQCMask)==0)){
 969                 prevCC=0;
 970                 ++srcIndex;
 971             }
 972 
 973             /* copy these code units all at once */
 974             if(srcIndex!=prevSrc) {
 975                 length=srcIndex-prevSrc;
 976                 if((destIndex+length)<=destLimit) {
 977                     System.arraycopy(src,prevSrc,dest,destIndex,length);
 978                 }
 979 
 980                 destIndex+=length;
 981                 reorderStartIndex=destIndex;
 982             }
 983 
 984             /* end of source reached? */
 985             if(srcIndex==srcLimit) {
 986                 break;
 987             }
 988 
 989             /* c already contains *src and norm32 is set for it, increment src*/
 990             ++srcIndex;
 991 
 992             /* check one above-minimum, relevant code unit */
 993             /*
 994              * generally, set p and length to the decomposition string
 995              * in simple cases, p==NULL and (c, c2) will hold the length code
 996              * units to append in all cases, set cc to the lead and trailCC to
 997              * the trail combining class
 998              *
 999              * the following merge-sort of the current character into the
1000              * preceding, canonically ordered result text will use the
1001              * optimized insertOrdered()
1002              * if there is only one single code point to process;
1003              * this is indicated with p==NULL, and (c, c2) is the character to
1004              * insert
1005              * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1006              * for a supplementary character)
1007              * otherwise, p[length] is merged in with _mergeOrdered()
1008              */
1009             if(isNorm32HangulOrJamo(norm32)) {
1010                 if(nx_contains(nx, c)) {
1011                     c2=0;
1012                     p=null;
1013                     length=1;
1014                 } else {
1015                     // Hangul syllable: decompose algorithmically
1016                     p=buffer;
1017                     pStart=0;
1018                     cc=trailCC=0;
1019 
1020                     c-=HANGUL_BASE;
1021 
1022                     c2=(char)(c%JAMO_T_COUNT);
1023                     c/=JAMO_T_COUNT;
1024                     if(c2>0) {
1025                         buffer[2]=(char)(JAMO_T_BASE+c2);
1026                         length=3;
1027                     } else {
1028                         length=2;
1029                     }
1030 
1031                     buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
1032                     buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
1033                 }
1034             } else {
1035                 if(isNorm32Regular(norm32)) {
1036                     c2=0;
1037                     length=1;
1038                 } else {
1039                     // c is a lead surrogate, get the real norm32
1040                     if(srcIndex!=srcLimit &&
1041                                     UTF16.isTrailSurrogate(c2=src[srcIndex])) {
1042                         ++srcIndex;
1043                         length=2;
1044                         norm32=getNorm32FromSurrogatePair(norm32, c2);
1045                     } else {
1046                         c2=0;
1047                         length=1;
1048                         norm32=0;
1049                     }
1050                 }
1051 
1052                 /* get the decomposition and the lead and trail cc's */
1053                 if(nx_contains(nx, c, c2)) {
1054                     /* excluded: norm32==0 */
1055                     cc=trailCC=0;
1056                     p=null;
1057                 } else if((norm32&qcMask)==0) {
1058                     /* c does not decompose */
1059                     cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
1060                     p=null;
1061                     pStart=-1;
1062                 } else {
1063                     DecomposeArgs arg = new DecomposeArgs();
1064                     /* c decomposes, get everything from the variable-length
1065                      * extra data
1066                      */
1067                     pStart=decompose(norm32, qcMask, arg);
1068                     p=extraData;
1069                     length=arg.length;
1070                     cc=arg.cc;
1071                     trailCC=arg.trailCC;
1072                     if(length==1) {
1073                         /* fastpath a single code unit from decomposition */
1074                         c=p[pStart];
1075                         c2=0;
1076                         p=null;
1077                         pStart=-1;
1078                     }
1079                 }
1080             }
1081 
1082             /* append the decomposition to the destination buffer, assume
1083              * length>0
1084              */
1085             if((destIndex+length)<=destLimit) {
1086                 int reorderSplit=destIndex;
1087                 if(p==null) {
1088                     /* fastpath: single code point */
1089                     if(cc!=0 && cc<prevCC) {
1090                         /* (c, c2) is out of order with respect to the preceding
1091                          *  text
1092                          */
1093                         destIndex+=length;
1094                         trailCC=insertOrdered(dest,reorderStartIndex,
1095                                             reorderSplit, destIndex, c, c2, cc);
1096                     } else {
1097                         /* just append (c, c2) */
1098                         dest[destIndex++]=c;
1099                         if(c2!=0) {
1100                             dest[destIndex++]=c2;
1101                         }
1102                     }
1103                 } else {
1104                     /* general: multiple code points (ordered by themselves)
1105                      * from decomposition
1106                      */
1107                     if(cc!=0 && cc<prevCC) {
1108                         /* the decomposition is out of order with respect to the
1109                          *  preceding text
1110                          */
1111                         destIndex+=length;
1112                         trailCC=mergeOrdered(dest,reorderStartIndex,
1113                                           reorderSplit,p, pStart,pStart+length);
1114                     } else {
1115                         /* just append the decomposition */
1116                         do {
1117                             dest[destIndex++]=p[pStart++];
1118                         } while(--length>0);
1119                     }
1120                 }
1121             } else {
1122                 /* buffer overflow */
1123                 /* keep incrementing the destIndex for preflighting */
1124                 destIndex+=length;
1125             }
1126 
1127             prevCC=trailCC;
1128             if(prevCC==0) {
1129                 reorderStartIndex=destIndex;
1130             }
1131         }
1132 
1133         outTrailCC[0]=prevCC;
1134 
1135         return destIndex - destStart;
1136     }
1137 
1138     /* make NFC & NFKC ------------------------------------------------------ */
1139     private static final class NextCombiningArgs{
1140         char[] source;
1141         int start;
1142         //int limit;
1143         char c;
1144         char c2;
1145         int/*unsigned*/ combiningIndex;
1146         char /*unsigned byte*/ cc;
1147     }
1148 
1149     /* get the composition properties of the next character */
1150     private static int /*unsigned*/    getNextCombining(NextCombiningArgs args,
1151                                                     int limit,
1152                                                     UnicodeSet nx) {
1153         long/*unsigned*/ norm32;
1154         int combineFlags;
1155         /* get properties */
1156         args.c=args.source[args.start++];
1157         norm32=getNorm32(args.c);
1158 
1159         /* preset output values for most characters */
1160         args.c2=0;
1161         args.combiningIndex=0;
1162         args.cc=0;
1163 
1164         if((norm32&(CC_MASK|COMBINES_ANY))==0) {
1165             return 0;
1166         } else {
1167             if(isNorm32Regular(norm32)) {
1168                 /* set cc etc. below */
1169             } else if(isNorm32HangulOrJamo(norm32)) {
1170                 /* a compatibility decomposition contained Jamos */
1171                 args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
1172                                                         (norm32>>EXTRA_SHIFT)));
1173                 return (int)(norm32&COMBINES_ANY);
1174             } else {
1175                 /* c is a lead surrogate, get the real norm32 */
1176                 if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
1177                                                      args.source[args.start])) {
1178                     ++args.start;
1179                     norm32=getNorm32FromSurrogatePair(norm32, args.c2);
1180                 } else {
1181                     args.c2=0;
1182                     return 0;
1183                 }
1184             }
1185 
1186             if(nx_contains(nx, args.c, args.c2)) {
1187                 return 0; /* excluded: norm32==0 */
1188             }
1189 
1190             args.cc= (char)((norm32>>CC_SHIFT)&0xff);
1191 
1192             combineFlags=(int)(norm32&COMBINES_ANY);
1193             if(combineFlags!=0) {
1194                 int index = getExtraDataIndex(norm32);
1195                 args.combiningIndex=index>0 ? extraData[(index-1)] :0;
1196             }
1197 
1198             return combineFlags;
1199         }
1200     }
1201 
1202     /*
1203      * given a composition-result starter (c, c2) - which means its cc==0,
1204      * it combines forward, it has extra data, its norm32!=0,
1205      * it is not a Hangul or Jamo,
1206      * get just its combineFwdIndex
1207      *
1208      * norm32(c) is special if and only if c2!=0
1209      */
1210     private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
1211         long/*unsigned*/ norm32;
1212 
1213         norm32=getNorm32(c);
1214         if(c2!=0) {
1215             norm32=getNorm32FromSurrogatePair(norm32, c2);
1216         }
1217         return extraData[(getExtraDataIndex(norm32)-1)];
1218     }
1219 
1220     /*
1221      * Find the recomposition result for
1222      * a forward-combining character
1223      * (specified with a pointer to its part of the combiningTable[])
1224      * and a backward-combining character
1225      * (specified with its combineBackIndex).
1226      *
1227      * If these two characters combine, then set (value, value2)
1228      * with the code unit(s) of the composition character.
1229      *
1230      * Return value:
1231      * 0    do not combine
1232      * 1    combine
1233      * >1   combine, and the composition is a forward-combining starter
1234      *
1235      * See unormimp.h for a description of the composition table format.
1236      */
1237     private static int/*unsigned*/ combine(char[]table,int tableStart,
1238                                    int/*unsinged*/ combineBackIndex,
1239                                     int[] outValues) {
1240         int/*unsigned*/ key;
1241         int value,value2;
1242 
1243         if(outValues.length<2){
1244             throw new IllegalArgumentException();
1245         }
1246 
1247         /* search in the starter's composition table */
1248         for(;;) {
1249             key=table[tableStart++];
1250             if(key>=combineBackIndex) {
1251                 break;
1252             }
1253             tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
1254         }
1255 
1256         /* mask off bit 15, the last-entry-in-the-list flag */
1257         if((key&0x7fff)==combineBackIndex) {
1258             /* found! combine! */
1259             value=table[tableStart];
1260 
1261             /* is the composition a starter that combines forward? */
1262             key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
1263 
1264             /* get the composition result code point from the variable-length
1265              * result value
1266              */
1267             if((value&0x8000) != 0) {
1268                 if((value&0x4000) != 0) {
1269                     /* surrogate pair composition result */
1270                     value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
1271                     value2=table[tableStart+1];
1272                 } else {
1273                     /* BMP composition result U+2000..U+ffff */
1274                     value=table[tableStart+1];
1275                     value2=0;
1276                 }
1277             } else {
1278                 /* BMP composition result U+0000..U+1fff */
1279                 value&=0x1fff;
1280                 value2=0;
1281             }
1282             outValues[0]=value;
1283             outValues[1]=value2;
1284             return key;
1285         } else {
1286             /* not found */
1287             return 0;
1288         }
1289     }
1290 
1291 
1292     private static final class RecomposeArgs{
1293         char[] source;
1294         int start;
1295         int limit;
1296     }
1297     /*
1298      * recompose the characters in [p..limit[
1299      * (which is in NFD - decomposed and canonically ordered),
1300      * adjust limit, and return the trailing cc
1301      *
1302      * since for NFKC we may get Jamos in decompositions, we need to
1303      * recompose those too
1304      *
1305      * note that recomposition never lengthens the text:
1306      * any character consists of either one or two code units;
1307      * a composition may contain at most one more code unit than the original
1308      * starter, while the combining mark that is removed has at least one code
1309      * unit
1310      */
1311     private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
1312         int  remove, q, r;
1313         int /*unsigned*/ combineFlags;
1314         int /*unsigned*/ combineFwdIndex, combineBackIndex;
1315         int /*unsigned*/ result, value=0, value2=0;
1316         int /*unsigned byte*/  prevCC;
1317         boolean starterIsSupplementary;
1318         int starter;
1319         int[] outValues = new int[2];
1320         starter=-1;                   /* no starter */
1321         combineFwdIndex=0;            /* will not be used until starter!=NULL */
1322         starterIsSupplementary=false; /* will not be used until starter!=NULL */
1323         prevCC=0;
1324 
1325         NextCombiningArgs ncArg = new NextCombiningArgs();
1326         ncArg.source  = args.source;
1327 
1328         ncArg.cc      =0;
1329         ncArg.c2      =0;
1330 
1331         for(;;) {
1332             ncArg.start = args.start;
1333             combineFlags=getNextCombining(ncArg,args.limit,nx);
1334             combineBackIndex=ncArg.combiningIndex;
1335             args.start = ncArg.start;
1336 
1337             if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
1338                 if((combineBackIndex&0x8000)!=0) {
1339                     /* c is a Jamo V/T, see if we can compose it with the
1340                      * previous character
1341                      */
1342                     /* for the PRI #29 fix, check that there is no intervening combining mark */
1343                     if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
1344                         remove=-1; /* NULL while no Hangul composition */
1345                         combineFlags=0;
1346                         ncArg.c2=args.source[starter];
1347                         if(combineBackIndex==0xfff2) {
1348                             /* Jamo V, compose with previous Jamo L and following
1349                              * Jamo T
1350                              */
1351                             ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
1352                             if(ncArg.c2<JAMO_L_COUNT) {
1353                                 remove=args.start-1;
1354                                 ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
1355                                                (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
1356                                 if(args.start!=args.limit &&
1357                                             (ncArg.c2=(char)(args.source[args.start]
1358                                              -JAMO_T_BASE))<JAMO_T_COUNT) {
1359                                     ++args.start;
1360                                     ncArg.c+=ncArg.c2;
1361                                  } else {
1362                                      /* the result is an LV syllable, which is a starter (unlike LVT) */
1363                                      combineFlags=COMBINES_FWD;
1364                                 }
1365                                 if(!nx_contains(nx, ncArg.c)) {
1366                                     args.source[starter]=ncArg.c;
1367                                    } else {
1368                                     /* excluded */
1369                                     if(!isHangulWithoutJamoT(ncArg.c)) {
1370                                         --args.start; /* undo the ++args.start from reading the Jamo T */
1371                                     }
1372                                     /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1373                                     remove=args.start;
1374                                 }
1375                             }
1376 
1377                         /*
1378                          * Normally, the following can not occur:
1379                          * Since the input is in NFD, there are no Hangul LV syllables that
1380                          * a Jamo T could combine with.
1381                          * All Jamo Ts are combined above when handling Jamo Vs.
1382                          *
1383                          * However, before the PRI #29 fix, this can occur due to
1384                          * an intervening combining mark between the Hangul LV and the Jamo T.
1385                          */
1386                         } else {
1387                             /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1388                             if(isHangulWithoutJamoT(ncArg.c2)) {
1389                                 ncArg.c2+=ncArg.c-JAMO_T_BASE;
1390                                 if(!nx_contains(nx, ncArg.c2)) {
1391                                     remove=args.start-1;
1392                                     args.source[starter]=ncArg.c2;
1393                                 }
1394                             }
1395                         }
1396 
1397                         if(remove!=-1) {
1398                             /* remove the Jamo(s) */
1399                             q=remove;
1400                             r=args.start;
1401                             while(r<args.limit) {
1402                                 args.source[q++]=args.source[r++];
1403                             }
1404                             args.start=remove;
1405                             args.limit=q;
1406                         }
1407 
1408                         ncArg.c2=0; /* c2 held *starter temporarily */
1409 
1410                         if(combineFlags!=0) {
1411                             /*
1412                              * not starter=NULL because the composition is a Hangul LV syllable
1413                              * and might combine once more (but only before the PRI #29 fix)
1414                              */
1415 
1416                             /* done? */
1417                             if(args.start==args.limit) {
1418                                 return (char)prevCC;
1419                             }
1420 
1421                             /* the composition is a Hangul LV syllable which is a starter that combines forward */
1422                             combineFwdIndex=0xfff0;
1423 
1424                             /* we combined; continue with looking for compositions */
1425                             continue;
1426                         }
1427                     }
1428 
1429                     /*
1430                      * now: cc==0 and the combining index does not include
1431                      * "forward" -> the rest of the loop body will reset starter
1432                      * to NULL; technically, a composed Hangul syllable is a
1433                      * starter, but it does not combine forward now that we have
1434                      * consumed all eligible Jamos; for Jamo V/T, combineFlags
1435                      * does not contain _NORM_COMBINES_FWD
1436                      */
1437 
1438                 } else if(
1439                     /* the starter is not a Hangul LV or Jamo V/T and */
1440                     !((combineFwdIndex&0x8000)!=0) &&
1441                     /* the combining mark is not blocked and */
1442                     ((options&BEFORE_PRI_29)!=0 ?
1443                         (prevCC!=ncArg.cc || prevCC==0) :
1444                         (prevCC<ncArg.cc || prevCC==0)) &&
1445                     /* the starter and the combining mark (c, c2) do combine */
1446                     0!=(result=combine(combiningTable,combineFwdIndex,
1447                                        combineBackIndex, outValues)) &&
1448                     /* the composition result is not excluded */
1449                     !nx_contains(nx, (char)value, (char)value2)
1450                 ) {
1451                     value=outValues[0];
1452                     value2=outValues[1];
1453                     /* replace the starter with the composition, remove the
1454                      * combining mark
1455                      */
1456                     remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
1457 
1458                     /* replace the starter with the composition */
1459                     args.source[starter]=(char)value;
1460                     if(starterIsSupplementary) {
1461                         if(value2!=0) {
1462                             /* both are supplementary */
1463                             args.source[starter+1]=(char)value2;
1464                         } else {
1465                             /* the composition is shorter than the starter,
1466                              * move the intermediate characters forward one */
1467                             starterIsSupplementary=false;
1468                             q=starter+1;
1469                             r=q+1;
1470                             while(r<remove) {
1471                                 args.source[q++]=args.source[r++];
1472                             }
1473                             --remove;
1474                         }
1475                     } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB
1476                         starterIsSupplementary=true;
1477                         args.source[starter+1]=(char)value2;
1478                     /* } else { both are on the BMP, nothing more to do */
1479                     }
1480 
1481                     /* remove the combining mark by moving the following text
1482                      * over it */
1483                     if(remove<args.start) {
1484                         q=remove;
1485                         r=args.start;
1486                         while(r<args.limit) {
1487                             args.source[q++]=args.source[r++];
1488                         }
1489                         args.start=remove;
1490                         args.limit=q;
1491                     }
1492 
1493                     /* keep prevCC because we removed the combining mark */
1494 
1495                     /* done? */
1496                     if(args.start==args.limit) {
1497                         return (char)prevCC;
1498                     }
1499 
1500                     /* is the composition a starter that combines forward? */
1501                     if(result>1) {
1502                        combineFwdIndex=getCombiningIndexFromStarter((char)value,
1503                                                                   (char)value2);
1504                     } else {
1505                        starter=-1;
1506                     }
1507 
1508                     /* we combined; continue with looking for compositions */
1509                     continue;
1510                 }
1511             }
1512 
1513             /* no combination this time */
1514             prevCC=ncArg.cc;
1515             if(args.start==args.limit) {
1516                 return (char)prevCC;
1517             }
1518 
1519             /* if (c, c2) did not combine, then check if it is a starter */
1520             if(ncArg.cc==0) {
1521                 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1522                 if((combineFlags&COMBINES_FWD)!=0) {
1523                     /* it may combine with something, prepare for it */
1524                     if(ncArg.c2==0) {
1525                         starterIsSupplementary=false;
1526                         starter=args.start-1;
1527                     } else {
1528                         starterIsSupplementary=false;
1529                         starter=args.start-2;
1530                     }
1531                     combineFwdIndex=combineBackIndex;
1532                 } else {
1533                     /* it will not combine with anything */
1534                     starter=-1;
1535                 }
1536             } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
1537                 /* FCC: no discontiguous compositions; any intervening character blocks */
1538                 starter=-1;
1539             }
1540         }
1541     }
1542 
1543     // find the last true starter between src[start]....src[current] going
1544     // backwards and return its index
1545     private static int findPreviousStarter(char[]src, int srcStart, int current,
1546                                           int/*unsigned*/ ccOrQCMask,
1547                                           int/*unsigned*/ decompQCMask,
1548                                           char minNoMaybe) {
1549        long norm32;
1550        PrevArgs args = new PrevArgs();
1551        args.src = src;
1552        args.start = srcStart;
1553        args.current = current;
1554 
1555        while(args.start<args.current) {
1556            norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
1557            if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1558                break;
1559            }
1560        }
1561        return args.current;
1562     }
1563 
1564     /* find the first true starter in [src..limit[ and return the
1565      * pointer to it
1566      */
1567     private static int/*index*/    findNextStarter(char[] src,int start,int limit,
1568                                                  int/*unsigned*/ qcMask,
1569                                                  int/*unsigned*/ decompQCMask,
1570                                                  char minNoMaybe) {
1571         int p;
1572         long/*unsigned*/ norm32;
1573         int ccOrQCMask;
1574         char c, c2;
1575 
1576         ccOrQCMask=CC_MASK|qcMask;
1577 
1578         DecomposeArgs decompArgs = new DecomposeArgs();
1579 
1580         for(;;) {
1581             if(start==limit) {
1582                 break; /* end of string */
1583             }
1584             c=src[start];
1585             if(c<minNoMaybe) {
1586                 break; /* catches NUL terminater, too */
1587             }
1588 
1589             norm32=getNorm32(c);
1590             if((norm32&ccOrQCMask)==0) {
1591                 break; /* true starter */
1592             }
1593 
1594             if(isNorm32LeadSurrogate(norm32)) {
1595                 /* c is a lead surrogate, get the real norm32 */
1596                 if((start+1)==limit ||
1597                                    !UTF16.isTrailSurrogate(c2=(src[start+1]))){
1598                     /* unmatched first surrogate: counts as a true starter */
1599                     break;
1600                 }
1601                 norm32=getNorm32FromSurrogatePair(norm32, c2);
1602 
1603                 if((norm32&ccOrQCMask)==0) {
1604                     break; /* true starter */
1605                 }
1606             } else {
1607                 c2=0;
1608             }
1609 
1610             /* (c, c2) is not a true starter but its decomposition may be */
1611             if((norm32&decompQCMask)!=0) {
1612                 /* (c, c2) decomposes, get everything from the variable-length
1613                  *  extra data */
1614                 p=decompose(norm32, decompQCMask, decompArgs);
1615 
1616                 /* get the first character's norm32 to check if it is a true
1617                  * starter */
1618                 if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
1619                     break; /* true starter */
1620                 }
1621             }
1622 
1623             start+= c2==0 ? 1 : 2; /* not a true starter, continue */
1624         }
1625 
1626         return start;
1627     }
1628 
1629 
1630     private static final class ComposePartArgs{
1631         int prevCC;
1632         int length;   /* length of decomposed part */
1633     }
1634 
1635      /* decompose and recompose [prevStarter..src[ */
1636     private static char[] composePart(ComposePartArgs args,
1637                                       int prevStarter,
1638                                          char[] src, int start, int limit,
1639                                        int options,
1640                                        UnicodeSet nx) {
1641         int recomposeLimit;
1642         boolean compat =((options&OPTIONS_COMPAT)!=0);
1643 
1644         /* decompose [prevStarter..src[ */
1645         int[] outTrailCC = new int[1];
1646         char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
1647 
1648         for(;;){
1649             args.length=decompose(src,prevStarter,(start),
1650                                       buffer,0,buffer.length,
1651                                       compat,outTrailCC,nx);
1652             if(args.length<=buffer.length){
1653                 break;
1654             }else{
1655                 buffer = new char[args.length];
1656             }
1657         }
1658 
1659         /* recompose the decomposition */
1660         recomposeLimit=args.length;
1661 
1662         if(args.length>=2) {
1663             RecomposeArgs rcArgs = new RecomposeArgs();
1664             rcArgs.source    = buffer;
1665             rcArgs.start    = 0;
1666             rcArgs.limit    = recomposeLimit;
1667             args.prevCC=recompose(rcArgs, options, nx);
1668             recomposeLimit = rcArgs.limit;
1669         }
1670 
1671         /* return with a pointer to the recomposition and its length */
1672         args.length=recomposeLimit;
1673         return buffer;
1674     }
1675 
1676     private static boolean composeHangul(char prev, char c,
1677                                          long/*unsigned*/ norm32,
1678                                          char[] src,int[] srcIndex, int limit,
1679                                             boolean compat,
1680                                          char[] dest,int destIndex,
1681                                          UnicodeSet nx) {
1682         int start=srcIndex[0];
1683         if(isJamoVTNorm32JamoV(norm32)) {
1684             /* c is a Jamo V, compose with previous Jamo L and
1685              * following Jamo T */
1686             prev=(char)(prev-JAMO_L_BASE);
1687             if(prev<JAMO_L_COUNT) {
1688                 c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
1689                                                  (c-JAMO_V_BASE))*JAMO_T_COUNT);
1690 
1691                 /* check if the next character is a Jamo T (normal or
1692                  * compatibility) */
1693                 if(start!=limit) {
1694                     char next, t;
1695 
1696                     next=src[start];
1697                     if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1698                         /* normal Jamo T */
1699                         ++start;
1700                         c+=t;
1701                     } else if(compat) {
1702                         /* if NFKC, then check for compatibility Jamo T
1703                          * (BMP only) */
1704                         norm32=getNorm32(next);
1705                         if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
1706                             int p /*index into extra data array*/;
1707                             DecomposeArgs dcArgs = new DecomposeArgs();
1708                             p=decompose(norm32, QC_NFKD, dcArgs);
1709                             if(dcArgs.length==1 &&
1710                                    (t=(char)(extraData[p]-JAMO_T_BASE))
1711                                                    <JAMO_T_COUNT) {
1712                                 /* compatibility Jamo T */
1713                                 ++start;
1714                                 c+=t;
1715                             }
1716                         }
1717                     }
1718                 }
1719                 if(nx_contains(nx, c)) {
1720                     if(!isHangulWithoutJamoT(c)) {
1721                         --start; /* undo ++start from reading the Jamo T */
1722                     }
1723                     return false;
1724                 }
1725                 dest[destIndex]=c;
1726                 srcIndex[0]=start;
1727                 return true;
1728             }
1729         } else if(isHangulWithoutJamoT(prev)) {
1730             /* c is a Jamo T, compose with previous Hangul LV that does not
1731              * contain a Jamo T */
1732             c=(char)(prev+(c-JAMO_T_BASE));
1733             if(nx_contains(nx, c)) {
1734                 return false;
1735             }
1736             dest[destIndex]=c;
1737             srcIndex[0]=start;
1738             return true;
1739         }
1740         return false;
1741     }
1742     /*
1743     public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1744         return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1745     }
1746     */
1747 
1748     public static int compose(char[] src, int srcStart, int srcLimit,
1749                               char[] dest,int destStart,int destLimit,
1750                               int options,UnicodeSet nx) {
1751 
1752         int prevSrc, prevStarter;
1753         long/*unsigned*/ norm32;
1754         int ccOrQCMask, qcMask;
1755         int  reorderStartIndex, length;
1756         char c, c2, minNoMaybe;
1757         int/*unsigned byte*/ cc, prevCC;
1758         int[] ioIndex = new int[1];
1759         int destIndex = destStart;
1760         int srcIndex = srcStart;
1761 
1762         if((options&OPTIONS_COMPAT)!=0) {
1763             minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
1764             qcMask=QC_NFKC;
1765         } else {
1766             minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
1767             qcMask=QC_NFC;
1768         }
1769 
1770         /*
1771          * prevStarter points to the last character before the current one
1772          * that is a "true" starter with cc==0 and quick check "yes".
1773          *
1774          * prevStarter will be used instead of looking for a true starter
1775          * while incrementally decomposing [prevStarter..prevSrc[
1776          * in _composePart(). Having a good prevStarter allows to just decompose
1777          * the entire [prevStarter..prevSrc[.
1778          *
1779          * When _composePart() backs out from prevSrc back to prevStarter,
1780          * then it also backs out destIndex by the same amount.
1781          * Therefore, at all times, the (prevSrc-prevStarter) source units
1782          * must correspond 1:1 to destination units counted with destIndex,
1783          * except for reordering.
1784          * This is true for the qc "yes" characters copied in the fast loop,
1785          * and for pure reordering.
1786          * prevStarter must be set forward to src when this is not true:
1787          * In _composePart() and after composing a Hangul syllable.
1788          *
1789          * This mechanism relies on the assumption that the decomposition of a
1790          * true starter also begins with a true starter. gennorm/store.c checks
1791          * for this.
1792          */
1793         prevStarter=srcIndex;
1794 
1795         ccOrQCMask=CC_MASK|qcMask;
1796         /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
1797         prevCC=0;
1798 
1799         /* avoid compiler warnings */
1800         norm32=0;
1801         c=0;
1802 
1803         for(;;) {
1804             /* count code units below the minimum or with irrelevant data for
1805              * the quick check */
1806             prevSrc=srcIndex;
1807 
1808             while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
1809                      ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
1810                 prevCC=0;
1811                 ++srcIndex;
1812             }
1813 
1814 
1815             /* copy these code units all at once */
1816             if(srcIndex!=prevSrc) {
1817                 length=srcIndex-prevSrc;
1818                 if((destIndex+length)<=destLimit) {
1819                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1820                 }
1821                 destIndex+=length;
1822                 reorderStartIndex=destIndex;
1823 
1824                 /* set prevStarter to the last character in the quick check
1825                  * loop */
1826                 prevStarter=srcIndex-1;
1827                 if(UTF16.isTrailSurrogate(src[prevStarter]) &&
1828                     prevSrc<prevStarter &&
1829                     UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
1830                     --prevStarter;
1831                 }
1832 
1833                 prevSrc=srcIndex;
1834             }
1835 
1836             /* end of source reached? */
1837             if(srcIndex==srcLimit) {
1838                 break;
1839             }
1840 
1841             /* c already contains *src and norm32 is set for it, increment src*/
1842             ++srcIndex;
1843 
1844             /*
1845              * source buffer pointers:
1846              *
1847              *  all done      quick check   current char  not yet
1848              *                "yes" but     (c, c2)       processed
1849              *                may combine
1850              *                forward
1851              * [-------------[-------------[-------------[-------------[
1852              * |             |             |             |             |
1853              * start         prevStarter   prevSrc       src           limit
1854              *
1855              *
1856              * destination buffer pointers and indexes:
1857              *
1858              *  all done      might take    not filled yet
1859              *                characters for
1860              *                reordering
1861              * [-------------[-------------[-------------[
1862              * |             |             |             |
1863              * dest      reorderStartIndex destIndex     destCapacity
1864              */
1865 
1866             /* check one above-minimum, relevant code unit */
1867             /*
1868              * norm32 is for c=*(src-1), and the quick check flag is "no" or
1869              * "maybe", and/or cc!=0
1870              * check for Jamo V/T, then for surrogates and regular characters
1871              * c is not a Hangul syllable or Jamo L because
1872              * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
1873              */
1874             if(isNorm32HangulOrJamo(norm32)) {
1875                 /*
1876                  * c is a Jamo V/T:
1877                  * try to compose with the previous character, Jamo V also with
1878                  * a following Jamo T, and set values here right now in case we
1879                  * just continue with the main loop
1880                  */
1881                 prevCC=cc=0;
1882                 reorderStartIndex=destIndex;
1883                 ioIndex[0]=srcIndex;
1884                 if(
1885                     destIndex>0 &&
1886                     composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
1887                                   srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
1888                                   destIndex<=destLimit ? destIndex-1: 0,
1889                                   nx)
1890                 ) {
1891                     srcIndex=ioIndex[0];
1892                     prevStarter=srcIndex;
1893                     continue;
1894                 }
1895 
1896                 srcIndex = ioIndex[0];
1897 
1898                 /* the Jamo V/T did not compose into a Hangul syllable, just
1899                  * append to dest */
1900                 c2=0;
1901                 length=1;
1902                 prevStarter=prevSrc;
1903             } else {
1904                 if(isNorm32Regular(norm32)) {
1905                     c2=0;
1906                     length=1;
1907                 } else {
1908                     /* c is a lead surrogate, get the real norm32 */
1909                     if(srcIndex!=srcLimit &&
1910                                      UTF16.isTrailSurrogate(c2=src[srcIndex])) {
1911                         ++srcIndex;
1912                         length=2;
1913                         norm32=getNorm32FromSurrogatePair(norm32, c2);
1914                     } else {
1915                         /* c is an unpaired lead surrogate, nothing to do */
1916                         c2=0;
1917                         length=1;
1918                         norm32=0;
1919                     }
1920                 }
1921                 ComposePartArgs args =new ComposePartArgs();
1922 
1923                 /* we are looking at the character (c, c2) at [prevSrc..src[ */
1924                 if(nx_contains(nx, c, c2)) {
1925                     /* excluded: norm32==0 */
1926                     cc=0;
1927                 } else if((norm32&qcMask)==0) {
1928                     cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
1929                 } else {
1930                     char[] p;
1931 
1932                     /*
1933                      * find appropriate boundaries around this character,
1934                      * decompose the source text from between the boundaries,
1935                      * and recompose it
1936                      *
1937                      * this puts the intermediate text into the side buffer because
1938                      * it might be longer than the recomposition end result,
1939                      * or the destination buffer may be too short or missing
1940                      *
1941                      * note that destIndex may be adjusted backwards to account
1942                      * for source text that passed the quick check but needed to
1943                      * take part in the recomposition
1944                      */
1945                     int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
1946                     /*
1947                      * find the last true starter in [prevStarter..src[
1948                      * it is either the decomposition of the current character (at prevSrc),
1949                      * or prevStarter
1950                      */
1951                     if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
1952                         prevStarter=prevSrc;
1953                     } else {
1954                         /* adjust destIndex: back out what had been copied with qc "yes" */
1955                         destIndex-=prevSrc-prevStarter;
1956                     }
1957 
1958                     /* find the next true starter in [src..limit[ */
1959                     srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
1960                                                decompQCMask, minNoMaybe);
1961                     //args.prevStarter = prevStarter;
1962                     args.prevCC    = prevCC;
1963                     //args.destIndex = destIndex;
1964                     args.length = length;
1965                     p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
1966 
1967                     if(p==null) {
1968                         /* an error occurred (out of memory) */
1969                         break;
1970                     }
1971 
1972                     prevCC      = args.prevCC;
1973                     length      = args.length;
1974 
1975                     /* append the recomposed buffer contents to the destination
1976                      * buffer */
1977                     if((destIndex+args.length)<=destLimit) {
1978                         int i=0;
1979                         while(i<args.length) {
1980                             dest[destIndex++]=p[i++];
1981                             --length;
1982                         }
1983                     } else {
1984                         /* buffer overflow */
1985                         /* keep incrementing the destIndex for preflighting */
1986                         destIndex+=length;
1987                     }
1988 
1989                     prevStarter=srcIndex;
1990                     continue;
1991                 }
1992             }
1993 
1994             /* append the single code point (c, c2) to the destination buffer */
1995             if((destIndex+length)<=destLimit) {
1996                 if(cc!=0 && cc<prevCC) {
1997                     /* (c, c2) is out of order with respect to the preceding
1998                      * text */
1999                     int reorderSplit= destIndex;
2000                     destIndex+=length;
2001                     prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
2002                                          destIndex, c, c2, cc);
2003                 } else {
2004                     /* just append (c, c2) */
2005                     dest[destIndex++]=c;
2006                     if(c2!=0) {
2007                         dest[destIndex++]=c2;
2008                     }
2009                     prevCC=cc;
2010                 }
2011             } else {
2012                 /* buffer overflow */
2013                 /* keep incrementing the destIndex for preflighting */
2014                 destIndex+=length;
2015                 prevCC=cc;
2016             }
2017         }
2018 
2019         return destIndex - destStart;
2020     }
2021 
2022     public static int getCombiningClass(int c) {
2023         long norm32;
2024         norm32=getNorm32(c);
2025         return (int)((norm32>>CC_SHIFT)&0xFF);
2026     }
2027 
2028     public static boolean isFullCompositionExclusion(int c) {
2029         if(isFormatVersion_2_1) {
2030             int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
2031             return (aux & AUX_COMP_EX_MASK)!=0;
2032         } else {
2033             return false;
2034         }
2035     }
2036 
2037     public static boolean isCanonSafeStart(int c) {
2038         if(isFormatVersion_2_1) {
2039             int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2040             return (aux & AUX_UNSAFE_MASK)==0;
2041         } else {
2042             return false;
2043         }
2044     }
2045 
2046     /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2047     public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) {
2048         long /*unsigned int*/ norm32;
2049         mask = mask & UNSIGNED_INT_MASK;
2050         char aux;
2051 
2052         /* check conditions (a)..(e), see unormimp.h */
2053         norm32 = getNorm32(c);
2054 
2055         if((norm32&mask)!=0) {
2056             return false; /* fails (a)..(e), not skippable */
2057         }
2058 
2059         if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
2060             return true; /* NF*D, passed (a)..(c), is skippable */
2061         }
2062         /* check conditions (a)..(e), see unormimp.h */
2063 
2064         /* NF*C/FCC, passed (a)..(e) */
2065         if((norm32& QC_NFD)==0) {
2066             return true; /* no canonical decomposition, is skippable */
2067         }
2068 
2069         /* check Hangul syllables algorithmically */
2070         if(isNorm32HangulOrJamo(norm32)) {
2071             /* Jamo passed (a)..(e) above, must be Hangul */
2072             return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
2073         }
2074 
2075         /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2076         /* NF*C, test (f) flag */
2077         if(!isFormatVersion_2_2) {
2078             return false; /* no (f) data, say not skippable to be safe */
2079         }
2080 
2081 
2082         aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2083         return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
2084 
2085         /* } else { FCC, test fcd<=1 instead of the above } */
2086     }
2087 
2088     public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2089         int c;
2090 
2091         /* add the start code point of each same-value range of each trie */
2092         //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2093         TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2094         RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2095 
2096         while(normIter.next(normResult)){
2097             set.add(normResult.start);
2098         }
2099 
2100         //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2101         TrieIterator fcdIter  = new TrieIterator(FCDTrieImpl.fcdTrie);
2102         RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2103 
2104         while(fcdIter.next(fcdResult)){
2105             set.add(fcdResult.start);
2106         }
2107 
2108         if(isFormatVersion_2_1){
2109             //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2110             TrieIterator auxIter  = new TrieIterator(AuxTrieImpl.auxTrie);
2111             RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2112             while(auxIter.next(auxResult)){
2113                 set.add(auxResult.start);
2114             }
2115         }
2116         /* add Hangul LV syllables and LV+1 because of skippables */
2117         for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
2118             set.add(c);
2119             set.add(c+1);
2120         }
2121         set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2122         return set; // for chaining
2123     }
2124 
2125     /**
2126      * Internal API, used in UCharacter.getIntPropertyValue().
2127      * @internal
2128      * @param c code point
2129      * @param modeValue numeric value compatible with Mode
2130      * @return numeric value compatible with QuickCheck
2131      */
2132     public static final int quickCheck(int c, int modeValue) {
2133         final int qcMask[/*UNORM_MODE_COUNT*/]={
2134             0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
2135         };
2136 
2137         int norm32=(int)getNorm32(c)&qcMask[modeValue];
2138 
2139         if(norm32==0) {
2140             return 1; // YES
2141         } else if((norm32&QC_ANY_NO)!=0) {
2142             return 0; // NO
2143         } else /* _NORM_QC_ANY_MAYBE */ {
2144             return 2; // MAYBE;
2145         }
2146     }
2147 
2148     private static int strCompare(char[] s1, int s1Start, int s1Limit,
2149                                   char[] s2, int s2Start, int s2Limit,
2150                                   boolean codePointOrder) {
2151 
2152         int start1, start2, limit1, limit2;
2153 
2154         char c1, c2;
2155 
2156         /* setup for fix-up */
2157         start1=s1Start;
2158         start2=s2Start;
2159 
2160         int length1, length2;
2161 
2162         length1 = s1Limit - s1Start;
2163         length2 = s2Limit - s2Start;
2164 
2165         int lengthResult;
2166 
2167         if(length1<length2) {
2168             lengthResult=-1;
2169             limit1=start1+length1;
2170         } else if(length1==length2) {
2171             lengthResult=0;
2172             limit1=start1+length1;
2173         } else /* length1>length2 */ {
2174             lengthResult=1;
2175             limit1=start1+length2;
2176         }
2177 
2178         if(s1==s2) {
2179             return lengthResult;
2180         }
2181 
2182         for(;;) {
2183             /* check pseudo-limit */
2184             if(s1Start==limit1) {
2185                 return lengthResult;
2186             }
2187 
2188             c1=s1[s1Start];
2189             c2=s2[s2Start];
2190             if(c1!=c2) {
2191                 break;
2192             }
2193             ++s1Start;
2194             ++s2Start;
2195         }
2196 
2197         /* setup for fix-up */
2198         limit1=start1+length1;
2199         limit2=start2+length2;
2200 
2201 
2202         /* if both values are in or above the surrogate range, fix them up */
2203         if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
2204             /* subtract 0x2800 from BMP code points to make them smaller than
2205              *  supplementary ones */
2206             if(
2207                 ( c1<=0xdbff && (s1Start+1)!=limit1 &&
2208                   UTF16.isTrailSurrogate(s1[(s1Start+1)])
2209                 ) ||
2210                 ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
2211                   UTF16.isLeadSurrogate(s1[(s1Start-1)])
2212                 )
2213             ) {
2214                 /* part of a surrogate pair, leave >=d800 */
2215             } else {
2216                 /* BMP code point - may be surrogate code point - make <d800 */
2217                 c1-=0x2800;
2218             }
2219 
2220             if(
2221                 ( c2<=0xdbff && (s2Start+1)!=limit2 &&
2222                   UTF16.isTrailSurrogate(s2[(s2Start+1)])
2223                 ) ||
2224                 ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
2225                   UTF16.isLeadSurrogate(s2[(s2Start-1)])
2226                 )
2227             ) {
2228                 /* part of a surrogate pair, leave >=d800 */
2229             } else {
2230                 /* BMP code point - may be surrogate code point - make <d800 */
2231                 c2-=0x2800;
2232             }
2233         }
2234 
2235         /* now c1 and c2 are in UTF-32-compatible order */
2236         return (int)c1-(int)c2;
2237     }
2238 
2239 
2240     /*
2241      * Status of tailored normalization
2242      *
2243      * This was done initially for investigation on Unicode public review issue 7
2244      * (http://www.unicode.org/review/). See Jitterbug 2481.
2245      * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
2246      * a permanent feature in ICU 2.6 in support of IDNA which requires true
2247      * Unicode 3.2 normalization.
2248      * (NormalizationCorrections are rolled into IDNA mapping tables.)
2249      *
2250      * Tailored normalization as implemented here allows to "normalize less"
2251      * than full Unicode normalization would.
2252      * Based internally on a UnicodeSet of code points that are
2253      * "excluded from normalization", the normalization functions leave those
2254      * code points alone ("inert"). This means that tailored normalization
2255      * still transforms text into a canonically equivalent form.
2256      * It does not add decompositions to code points that do not have any or
2257      * change decomposition results.
2258      *
2259      * Any function that searches for a safe boundary has not been touched,
2260      * which means that these functions will be over-pessimistic when
2261      * exclusions are applied.
2262      * This should not matter because subsequent checks and normalizations
2263      * do apply the exclusions; only a little more of the text may be processed
2264      * than necessary under exclusions.
2265      *
2266      * Normalization exclusions have the following effect on excluded code points c:
2267      * - c is not decomposed
2268      * - c is not a composition target
2269      * - c does not combine forward or backward for composition
2270      *   except that this is not implemented for Jamo
2271      * - c is treated as having a combining class of 0
2272      */
2273 
2274     /*
2275      * Constants for the bit fields in the options bit set parameter.
2276      * These need not be public.
2277      * A user only needs to know the currently assigned values.
2278      * The number and positions of reserved bits per field can remain private.
2279      */
2280     private static final int OPTIONS_NX_MASK=0x1f;
2281     private static final int OPTIONS_UNICODE_MASK=0xe0;
2282     public  static final int OPTIONS_SETS_MASK=0xff;
2283 //  private static final int OPTIONS_UNICODE_SHIFT=5;
2284     private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
2285 
2286     /* Constants for options flags for normalization.*/
2287 
2288     /**
2289      * Options bit 0, do not decompose Hangul syllables.
2290      * @draft ICU 2.6
2291      */
2292     private static final int NX_HANGUL = 1;
2293     /**
2294      * Options bit 1, do not decompose CJK compatibility characters.
2295      * @draft ICU 2.6
2296      */
2297     private static final int NX_CJK_COMPAT=2;
2298     /**
2299      * Options bit 8, use buggy recomposition described in
2300      * Unicode Public Review Issue #29
2301      * at http://www.unicode.org/review/resolved-pri.html#pri29
2302      *
2303      * Used in IDNA implementation according to strict interpretation
2304      * of IDNA definition based on Unicode 3.2 which predates PRI #29.
2305      *
2306      * See ICU4C unormimp.h
2307      *
2308      * @draft ICU 3.2
2309      */
2310     public static final int BEFORE_PRI_29=0x100;
2311 
2312     /*
2313      * The following options are used only in some composition functions.
2314      * They use bits 12 and up to preserve lower bits for the available options
2315      * space in unorm_compare() -
2316      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
2317      */
2318 
2319     /** Options bit 12, for compatibility vs. canonical decomposition. */
2320     public static final int OPTIONS_COMPAT=0x1000;
2321     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
2322     public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
2323 
2324     /* normalization exclusion sets --------------------------------------------- */
2325 
2326     /*
2327      * Normalization exclusion UnicodeSets are used for tailored normalization;
2328      * see the comment near the beginning of this file.
2329      *
2330      * By specifying one or several sets of code points,
2331      * those code points become inert for normalization.
2332      */
2333     private static final synchronized UnicodeSet internalGetNXHangul() {
2334         /* internal function, does not check for incoming U_FAILURE */
2335 
2336         if(nxCache[NX_HANGUL]==null) {
2337              nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
2338         }
2339         return nxCache[NX_HANGUL];
2340     }
2341 
2342     private static final synchronized UnicodeSet internalGetNXCJKCompat() {
2343         /* internal function, does not check for incoming U_FAILURE */
2344 
2345         if(nxCache[NX_CJK_COMPAT]==null) {
2346 
2347             /* build a set from [CJK Ideographs]&[has canonical decomposition] */
2348             UnicodeSet set, hasDecomp;
2349 
2350             set=new UnicodeSet("[:Ideographic:]");
2351 
2352             /* start with an empty set for [has canonical decomposition] */
2353             hasDecomp=new UnicodeSet();
2354 
2355             /* iterate over all ideographs and remember which canonically decompose */
2356             UnicodeSetIterator it = new UnicodeSetIterator(set);
2357             int start, end;
2358             long norm32;
2359 
2360             while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
2361                 start=it.codepoint;
2362                 end=it.codepointEnd;
2363                 while(start<=end) {
2364                     norm32 = getNorm32(start);
2365                     if((norm32 & QC_NFD)>0) {
2366                         hasDecomp.add(start);
2367                     }
2368                     ++start;
2369                 }
2370             }
2371 
2372             /* hasDecomp now contains all ideographs that decompose canonically */
2373              nxCache[NX_CJK_COMPAT]=hasDecomp;
2374 
2375         }
2376 
2377         return nxCache[NX_CJK_COMPAT];
2378     }
2379 
2380     private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
2381         options &= OPTIONS_UNICODE_MASK;
2382         if(options==0) {
2383             return null;
2384         }
2385 
2386         if(nxCache[options]==null) {
2387             /* build a set with all code points that were not designated by the specified Unicode version */
2388             UnicodeSet set = new UnicodeSet();
2389 
2390             switch(options) {
2391             case NormalizerBase.UNICODE_3_2:
2392                 set.applyPattern("[:^Age=3.2:]");
2393                 break;
2394             default:
2395                 return null;
2396             }
2397 
2398             nxCache[options]=set;
2399         }
2400 
2401         return nxCache[options];
2402     }
2403 
2404     /* Get a decomposition exclusion set. The data must be loaded. */
2405     private static final synchronized UnicodeSet internalGetNX(int options) {
2406         options&=OPTIONS_SETS_MASK;
2407 
2408         if(nxCache[options]==null) {
2409             /* return basic sets */
2410             if(options==NX_HANGUL) {
2411                 return internalGetNXHangul();
2412             }
2413             if(options==NX_CJK_COMPAT) {
2414                 return internalGetNXCJKCompat();
2415             }
2416             if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
2417                 return internalGetNXUnicode(options);
2418             }
2419 
2420             /* build a set from multiple subsets */
2421             UnicodeSet set;
2422             UnicodeSet other;
2423 
2424             set=new UnicodeSet();
2425 
2426 
2427             if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
2428                 set.addAll(other);
2429             }
2430             if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
2431                 set.addAll(other);
2432             }
2433             if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
2434                 set.addAll(other);
2435             }
2436 
2437                nxCache[options]=set;
2438         }
2439         return nxCache[options];
2440     }
2441 
2442     public static final UnicodeSet getNX(int options) {
2443         if((options&=OPTIONS_SETS_MASK)==0) {
2444             /* incoming failure, or no decomposition exclusions requested */
2445             return null;
2446         } else {
2447             return internalGetNX(options);
2448         }
2449     }
2450 
2451     private static final boolean nx_contains(UnicodeSet nx, int c) {
2452         return nx!=null && nx.contains(c);
2453     }
2454 
2455     private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
2456         return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
2457     }
2458 
2459 /*****************************************************************************/
2460 
2461     /**
2462      * Get the canonical decomposition
2463      * sherman  for ComposedCharIter
2464      */
2465 
2466     public static int getDecompose(int chars[], String decomps[]) {
2467         DecomposeArgs args = new DecomposeArgs();
2468         int length=0;
2469         long norm32 = 0;
2470         int ch = -1;
2471         int index = 0;
2472         int i = 0;
2473 
2474         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
2475             //TBD !!!! the hack code heres save us about 50ms for startup
2476             //need a better solution/lookup
2477             if (ch == 0x30ff)
2478                 ch = 0xf900;
2479             else if (ch == 0x10000)
2480                 ch = 0x1d15e;
2481             else if (ch == 0x1d1c1)
2482                 ch = 0x2f800;
2483 
2484             norm32 = NormalizerImpl.getNorm32(ch);
2485             if((norm32 & QC_NFD)!=0 && i < chars.length) {
2486                 chars[i] = ch;
2487                 index = decompose(norm32, args);
2488                 decomps[i++] = new String(extraData,index, args.length);
2489             }
2490         }
2491         return i;
2492     }
2493 
2494     //------------------------------------------------------
2495     // special method for Collation
2496     //------------------------------------------------------
2497     private static boolean needSingleQuotation(char c) {
2498         return (c >= 0x0009 && c <= 0x000D) ||
2499                (c >= 0x0020 && c <= 0x002F) ||
2500                (c >= 0x003A && c <= 0x0040) ||
2501                (c >= 0x005B && c <= 0x0060) ||
2502                (c >= 0x007B && c <= 0x007E);
2503     }
2504 
2505     public static String canonicalDecomposeWithSingleQuotation(String string) {
2506         char[] src = string.toCharArray();
2507         int    srcIndex = 0;
2508         int    srcLimit = src.length;
2509         char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
2510         int    destIndex = 0;
2511         int    destLimit = dest.length;
2512 
2513         char[] buffer = new char[3];
2514         int prevSrc;
2515         long norm32;
2516         int ccOrQCMask;
2517         int qcMask = QC_NFD;
2518         int reorderStartIndex, length;
2519         char c, c2;
2520         char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE];
2521         int cc, prevCC, trailCC;
2522         char[] p;
2523         int pStart;
2524 
2525 
2526         // initialize
2527         ccOrQCMask = CC_MASK | qcMask;
2528         reorderStartIndex = 0;
2529         prevCC = 0;
2530         norm32 = 0;
2531         c = 0;
2532         pStart = 0;
2533 
2534         cc = trailCC = -1; // initialize to bogus value
2535         for(;;) {
2536             prevSrc=srcIndex;
2537             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
2538             while (srcIndex != srcLimit &&
2539                    (( c = src[srcIndex]) < minNoMaybe ||
2540                     ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 ||
2541                     ( c >= '\uac00' && c <= '\ud7a3'))){
2542 
2543                 prevCC = 0;
2544                 ++srcIndex;
2545             }
2546 
2547             // copy these code units all at once
2548             if (srcIndex != prevSrc) {
2549                 length = srcIndex - prevSrc;
2550                 if ((destIndex + length) <= destLimit) {
2551                     System.arraycopy(src,prevSrc,dest,destIndex,length);
2552                 }
2553 
2554                 destIndex += length;
2555                 reorderStartIndex = destIndex;
2556             }
2557 
2558             // end of source reached?
2559             if(srcIndex == srcLimit) {
2560                 break;
2561             }
2562             // c already contains *src and norm32 is set for it, increment src
2563             ++srcIndex;
2564 
2565             if(isNorm32Regular(norm32)) {
2566                 c2 = 0;
2567                 length = 1;
2568             } else {
2569                 // c is a lead surrogate, get the real norm32
2570                 if(srcIndex != srcLimit &&
2571                     Character.isLowSurrogate(c2 = src[srcIndex])) {
2572                         ++srcIndex;
2573                         length = 2;
2574                         norm32 = getNorm32FromSurrogatePair(norm32, c2);
2575                 } else {
2576                     c2 = 0;
2577                     length = 1;
2578                     norm32 = 0;
2579                 }
2580             }
2581 
2582             // get the decomposition and the lead and trail cc's
2583             if((norm32 & qcMask) == 0) {
2584                 // c does not decompose
2585                 cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2586                 p = null;
2587                 pStart = -1;
2588             } else {
2589                 DecomposeArgs arg = new DecomposeArgs();
2590                 // c decomposes, get everything from the variable-length
2591                 // extra data
2592                 pStart = decompose(norm32, qcMask, arg);
2593                 p = extraData;
2594                 length = arg.length;
2595                 cc = arg.cc;
2596                 trailCC = arg.trailCC;
2597                 if(length == 1) {
2598                     // fastpath a single code unit from decomposition
2599                     c = p[pStart];
2600                     c2 = 0;
2601                     p = null;
2602                     pStart = -1;
2603                 }
2604             }
2605 
2606             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
2607                 // buffer overflow
2608                 char[] tmpBuf = new char[destLimit * 2];
2609                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
2610                 dest = tmpBuf;
2611                 destLimit = dest.length;
2612             }
2613             // append the decomposition to the destination buffer, assume length>0
2614             {
2615                 int reorderSplit = destIndex;
2616                 if(p == null) {
2617                     // fastpath: single code point
2618                     if (needSingleQuotation(c)) {
2619                         //if we need single quotation, no need to consider "prevCC"
2620                         //and it must NOT be a supplementary pair
2621                         dest[destIndex++] = '\'';
2622                         dest[destIndex++] = c;
2623                         dest[destIndex++] = '\'';
2624                         trailCC = 0;
2625                     } else if(cc != 0 && cc < prevCC) {
2626                         // (c, c2) is out of order with respect to the preceding
2627                         //  text
2628                         destIndex += length;
2629                         trailCC = insertOrdered(dest,reorderStartIndex,
2630                                                 reorderSplit, destIndex, c, c2, cc);
2631                     } else {
2632                         // just append (c, c2)
2633                         dest[destIndex++] = c;
2634                         if(c2 != 0) {
2635                             dest[destIndex++] = c2;
2636                         }
2637                     }
2638                 } else {
2639                     // general: multiple code points (ordered by themselves)
2640                     // from decomposition
2641                     if (needSingleQuotation(p[pStart])) {
2642                         dest[destIndex++] = '\'';
2643                         dest[destIndex++] = p[pStart++];
2644                         dest[destIndex++] = '\'';
2645                         length--;
2646                         do {
2647                             dest[destIndex++] = p[pStart++];
2648                         } while(--length > 0);
2649                     } else
2650                     if(cc != 0 && cc < prevCC) {
2651                         destIndex += length;
2652                         trailCC = mergeOrdered(dest,reorderStartIndex,
2653                                                reorderSplit,p, pStart,pStart+length);
2654                     } else {
2655                         // just append the decomposition
2656                         do {
2657                             dest[destIndex++] = p[pStart++];
2658                         } while(--length > 0);
2659                     }
2660                 }
2661             }
2662             prevCC = trailCC;
2663             if(prevCC == 0) {
2664                 reorderStartIndex = destIndex;
2665             }
2666         }
2667         return new String(dest, 0, destIndex);
2668     }
2669 
2670     //------------------------------------------------------
2671     // mapping method for IDNA/StringPrep
2672     //------------------------------------------------------
2673 
2674     /*
2675      * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
2676      * 3.2 normalization with Corrigendum 4 corrections. However, normalization
2677      * without the corrections is necessary for IDNA/StringPrep support.
2678      * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
2679      * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
2680      * characters in Corrigendum 4 before normalization in order to avoid
2681      * incorrect normalization.
2682      * For the Corrigendum 4 issue, refer
2683      *   http://www.unicode.org/versions/corrigendum4.html
2684      */
2685 
2686     /*
2687      * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
2688      */
2689     public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;
2690 
2691     private static final char[][] corrigendum4MappingTable = {
2692         {'\uD844', '\uDF6A'},  // 0x2F868
2693         {'\u5F33'},            // 0x2F874
2694         {'\u43AB'},            // 0x2F91F
2695         {'\u7AAE'},            // 0x2F95F
2696         {'\u4D57'}};           // 0x2F9BF
2697 
2698     /*
2699      * Removing Corrigendum 4 fix
2700      * @return normalized text
2701      */
2702     public static String convert(String str) {
2703         if (str == null) {
2704             return null;
2705         }
2706 
2707         int ch  = UCharacterIterator.DONE;
2708         StringBuffer dest = new StringBuffer();
2709         UCharacterIterator iter = UCharacterIterator.getInstance(str);
2710 
2711         while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
2712             switch (ch) {
2713             case 0x2F868:
2714                 dest.append(corrigendum4MappingTable[0]);
2715                 break;
2716             case 0x2F874:
2717                 dest.append(corrigendum4MappingTable[1]);
2718                 break;
2719             case 0x2F91F:
2720                 dest.append(corrigendum4MappingTable[2]);
2721                 break;
2722             case 0x2F95F:
2723                 dest.append(corrigendum4MappingTable[3]);
2724                 break;
2725             case 0x2F9BF:
2726                 dest.append(corrigendum4MappingTable[4]);
2727                 break;
2728             default:
2729                 UTF16.append(dest,ch);
2730                 break;
2731             }
2732         }
2733 
2734         return dest.toString();
2735     }
2736 }