1 /*
   2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package sun.text.normalizer;
  34 
  35 import java.io.IOException;
  36 import java.nio.ByteBuffer;
  37 import java.text.Normalizer;
  38 
  39 // Original filename in ICU4J: Normalizer2Impl.java
  40 public final class NormalizerImpl {
  41 
  42     public static final class Hangul {
  43         /* Korean Hangul and Jamo constants */
  44         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  45         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  46         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
  47 
  48         public static final int HANGUL_BASE=0xac00;
  49         public static final int HANGUL_END=0xd7a3;
  50 
  51         public static final int JAMO_L_COUNT=19;
  52         public static final int JAMO_V_COUNT=21;
  53         public static final int JAMO_T_COUNT=28;
  54 
  55         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  56         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  57 
  58         public static boolean isHangul(int c) {
  59             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  60         }
  61 
  62         public static boolean isHangulWithoutJamoT(char c) {
  63             c-=HANGUL_BASE;
  64             return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  65         }
  66 
  67         /**
  68          * Decomposes c, which must be a Hangul syllable, into buffer
  69          * and returns the length of the decomposition (2 or 3).
  70          */
  71         public static int decompose(int c, Appendable buffer) {
  72             try {
  73                 c-=HANGUL_BASE;
  74                 int c2=c%JAMO_T_COUNT;
  75                 c/=JAMO_T_COUNT;
  76                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  77                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  78                 if(c2==0) {
  79                     return 2;
  80                 } else {
  81                     buffer.append((char)(JAMO_T_BASE+c2));
  82                     return 3;
  83                 }
  84             } catch(IOException e) {
  85                 throw new InternalError(e);
  86             }
  87         }
  88     }
  89 
  90     /**
  91      * Writable buffer that takes care of canonical ordering.
  92      * Its Appendable methods behave like the C++ implementation's
  93      * appendZeroCC() methods.
  94      * <p>
  95      * If dest is a StringBuilder, then the buffer writes directly to it.
  96      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  97      * until no further changes are necessary and whole segments are appended.
  98      * append() methods that take combining-class values always write to the StringBuilder.
  99      * Other append() methods flush and append to the Appendable.
 100      */
 101     public static final class ReorderingBuffer implements Appendable {
 102         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
 103             impl=ni;
 104             app=dest;
 105             if (app instanceof StringBuilder) {
 106                 appIsStringBuilder=true;
 107                 str=(StringBuilder)dest;
 108                 // In Java, the constructor subsumes public void init(int destCapacity)
 109                 str.ensureCapacity(destCapacity);
 110                 reorderStart=0;
 111                 if(str.length()==0) {
 112                     lastCC=0;
 113                 } else {
 114                     setIterator();
 115                     lastCC=previousCC();
 116                     // Set reorderStart after the last code point with cc<=1 if there is one.
 117                     if(lastCC>1) {
 118                         while(previousCC()>1) {}
 119                     }
 120                     reorderStart=codePointLimit;
 121                 }
 122             } else {
 123                 appIsStringBuilder=false;
 124                 str=new StringBuilder();
 125                 reorderStart=0;
 126                 lastCC=0;
 127             }
 128         }
 129 
 130         public boolean isEmpty() { return str.length()==0; }
 131         public int length() { return str.length(); }
 132         public int getLastCC() { return lastCC; }
 133 
 134         public StringBuilder getStringBuilder() { return str; }
 135 
 136         public boolean equals(CharSequence s, int start, int limit) {
 137             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 138         }
 139 
 140         // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
 141         public void setLastChar(char c) {
 142             str.setCharAt(str.length()-1, c);
 143         }
 144 
 145         public void append(int c, int cc) {
 146             if(lastCC<=cc || cc==0) {
 147                 str.appendCodePoint(c);
 148                 lastCC=cc;
 149                 if(cc<=1) {
 150                     reorderStart=str.length();
 151                 }
 152             } else {
 153                 insert(c, cc);
 154             }
 155         }
 156 
 157         // s must be in NFD, otherwise change the implementation.
 158         public void append(CharSequence s, int start, int limit,
 159                            int leadCC, int trailCC) {
 160             if(start==limit) {
 161                 return;
 162             }
 163             if(lastCC<=leadCC || leadCC==0) {
 164                 if(trailCC<=1) {
 165                     reorderStart=str.length()+(limit-start);
 166                 } else if(leadCC<=1) {
 167                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 168                 }
 169                 str.append(s, start, limit);
 170                 lastCC=trailCC;
 171             } else {
 172                 int c=Character.codePointAt(s, start);
 173                 start+=Character.charCount(c);
 174                 insert(c, leadCC);  // insert first code point
 175                 while(start<limit) {
 176                     c=Character.codePointAt(s, start);
 177                     start+=Character.charCount(c);
 178                     if(start<limit) {
 179                         // s must be in NFD, otherwise we need to use getCC().
 180                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
 181                     } else {
 182                         leadCC=trailCC;
 183                     }
 184                     append(c, leadCC);
 185                 }
 186             }
 187         }
 188 
 189         // The following append() methods work like C++ appendZeroCC().
 190         // They assume that the cc or trailCC of their input is 0.
 191         // Most of them implement Appendable interface methods.
 192         // @Override when we switch to Java 6
 193         public ReorderingBuffer append(char c) {
 194             str.append(c);
 195             lastCC=0;
 196             reorderStart=str.length();
 197             return this;
 198         }
 199 
 200         public void appendZeroCC(int c) {
 201             str.appendCodePoint(c);
 202             lastCC=0;
 203             reorderStart=str.length();
 204         }
 205 
 206         // @Override when we switch to Java 6
 207         public ReorderingBuffer append(CharSequence s) {
 208             if(s.length()!=0) {
 209                 str.append(s);
 210                 lastCC=0;
 211                 reorderStart=str.length();
 212             }
 213             return this;
 214         }
 215 
 216         // @Override when we switch to Java 6
 217         public ReorderingBuffer append(CharSequence s, int start, int limit) {
 218             if(start!=limit) {
 219                 str.append(s, start, limit);
 220                 lastCC=0;
 221                 reorderStart=str.length();
 222             }
 223             return this;
 224         }
 225 
 226         /**
 227          * Flushes from the intermediate StringBuilder to the Appendable,
 228          * if they are different objects.
 229          * Used after recomposition.
 230          * Must be called at the end when writing to a non-StringBuilder Appendable.
 231          */
 232         public void flush() {
 233             if(appIsStringBuilder) {
 234                 reorderStart=str.length();
 235             } else {
 236                 try {
 237                     app.append(str);
 238                     str.setLength(0);
 239                     reorderStart=0;
 240                 } catch(IOException e) {
 241                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 242                 }
 243             }
 244             lastCC=0;
 245         }
 246 
 247         /**
 248          * Flushes from the intermediate StringBuilder to the Appendable,
 249          * if they are different objects.
 250          * Then appends the new text to the Appendable or StringBuilder.
 251          * Normally used after quick check loops find a non-empty sequence.
 252          */
 253         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 254             if(appIsStringBuilder) {
 255                 str.append(s, start, limit);
 256                 reorderStart=str.length();
 257             } else {
 258                 try {
 259                     app.append(str).append(s, start, limit);
 260                     str.setLength(0);
 261                     reorderStart=0;
 262                 } catch(IOException e) {
 263                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 264                 }
 265             }
 266             lastCC=0;
 267             return this;
 268         }
 269 
 270         public void remove() {
 271             str.setLength(0);
 272             lastCC=0;
 273             reorderStart=0;
 274         }
 275 
 276         public void removeSuffix(int suffixLength) {
 277             int oldLength=str.length();
 278             str.delete(oldLength-suffixLength, oldLength);
 279             lastCC=0;
 280             reorderStart=str.length();
 281         }
 282 
 283         // Inserts c somewhere before the last character.
 284         // Requires 0<cc<lastCC which implies reorderStart<limit.
 285         private void insert(int c, int cc) {
 286             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 287             // insert c at codePointLimit, after the character with prevCC<=cc
 288             if(c<=0xffff) {
 289                 str.insert(codePointLimit, (char)c);
 290                 if(cc<=1) {
 291                     reorderStart=codePointLimit+1;
 292                 }
 293             } else {
 294                 str.insert(codePointLimit, Character.toChars(c));
 295                 if(cc<=1) {
 296                     reorderStart=codePointLimit+2;
 297                 }
 298             }
 299         }
 300 
 301         private final NormalizerImpl impl;
 302         private final Appendable app;
 303         private final StringBuilder str;
 304         private final boolean appIsStringBuilder;
 305         private int reorderStart;
 306         private int lastCC;
 307 
 308         // private backward iterator
 309         private void setIterator() { codePointStart=str.length(); }
 310         private void skipPrevious() {  // Requires 0<codePointStart.
 311             codePointLimit=codePointStart;
 312             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 313         }
 314         private int previousCC() {  // Returns 0 if there is no previous character.
 315             codePointLimit=codePointStart;
 316             if(reorderStart>=codePointStart) {
 317                 return 0;
 318             }
 319             int c=str.codePointBefore(codePointStart);
 320             codePointStart-=Character.charCount(c);
 321             if(c<MIN_CCC_LCCC_CP) {
 322                 return 0;
 323             }
 324             return getCCFromYesOrMaybe(impl.getNorm16(c));
 325         }
 326 
 327         private int codePointStart, codePointLimit;
 328     }
 329 
 330     // TODO: Propose as public API on the UTF16 class.
 331     // TODO: Propose widening UTF16 methods that take char to take int.
 332     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 333     public static final class UTF16Plus {
 334         /**
 335          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 336          * is it a lead surrogate?
 337          * @param c code unit or code point
 338          * @return true or false
 339          */
 340         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 341 
 342         /**
 343          * Compares two CharSequence subsequences for binary equality.
 344          * @param s1 first sequence
 345          * @param start1 start offset in first sequence
 346          * @param limit1 limit offset in first sequence
 347          * @param s2 second sequence
 348          * @param start2 start offset in second sequence
 349          * @param limit2 limit offset in second sequence
 350          * @return true if s1.subSequence(start1, limit1) contains the same text
 351          *              as s2.subSequence(start2, limit2)
 352          */
 353         public static boolean equal(CharSequence s1, int start1, int limit1,
 354                                     CharSequence s2, int start2, int limit2) {
 355             if((limit1-start1)!=(limit2-start2)) {
 356                 return false;
 357             }
 358             if(s1==s2 && start1==start2) {
 359                 return true;
 360             }
 361             while(start1<limit1) {
 362                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 363                     return false;
 364                 }
 365             }
 366             return true;
 367         }
 368     }
 369 
 370     public NormalizerImpl() {}
 371 
 372     private static final class IsAcceptable implements ICUBinary.Authenticate {
 373         // @Override when we switch to Java 6
 374         public boolean isDataVersionAcceptable(byte version[]) {
 375             return version[0]==2;
 376         }
 377     }
 378 
 379     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 380     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 381 
 382     public NormalizerImpl load(ByteBuffer bytes) {
 383         try {
 384             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 385             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 386             if(indexesLength<=IX_MIN_MAYBE_YES) {
 387                 throw new IOException("Normalizer2 data: not enough indexes");
 388             }
 389             int[] inIndexes=new int[indexesLength];
 390             inIndexes[0]=indexesLength*4;
 391             for(int i=1; i<indexesLength; ++i) {
 392                 inIndexes[i]=bytes.getInt();
 393             }
 394 
 395             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 396             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 397 
 398             minYesNo=inIndexes[IX_MIN_YES_NO];
 399             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 400             minNoNo=inIndexes[IX_MIN_NO_NO];
 401             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 402             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 403 
 404             // Read the normTrie.
 405             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 406             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 407             normTrie=Trie2_16.createFromSerialized(bytes);
 408             int trieLength=normTrie.getSerializedLength();
 409             if(trieLength>(nextOffset-offset)) {
 410                 throw new IOException("Normalizer2 data: not enough bytes for normTrie");
 411             }
 412             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 413 
 414             // Read the composition and mapping data.
 415             offset=nextOffset;
 416             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
 417             int numChars=(nextOffset-offset)/2;
 418             char[] chars;
 419             if(numChars!=0) {
 420                 chars=new char[numChars];
 421                 for(int i=0; i<numChars; ++i) {
 422                     chars[i]=bytes.getChar();
 423                 }
 424                 maybeYesCompositions=new String(chars);
 425                 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
 426             }
 427 
 428             // smallFCD: new in formatVersion 2
 429             offset=nextOffset;
 430             smallFCD=new byte[0x100];
 431             for(int i=0; i<0x100; ++i) {
 432                 smallFCD[i]=bytes.get();
 433             }
 434 
 435             // Build tccc180[].
 436             // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
 437             tccc180=new int[0x180];
 438             int bits=0;
 439             for(int c=0; c<0x180; bits>>=1) {
 440                 if((c&0xff)==0) {
 441                     bits=smallFCD[c>>8];  // one byte per 0x100 code points
 442                 }
 443                 if((bits&1)!=0) {
 444                     for(int i=0; i<0x20; ++i, ++c) {
 445                         tccc180[c]=getFCD16FromNormData(c)&0xff;
 446                     }
 447                 } else {
 448                     c+=0x20;
 449                 }
 450             }
 451 
 452             return this;
 453         } catch(IOException e) {
 454             throw new InternalError(e);
 455         }
 456     }
 457 
 458     public NormalizerImpl load(String name) {
 459         return load(ICUBinary.getRequiredData(name));
 460     }
 461 
 462     public int getNorm16(int c) {
 463         return normTrie.get(c);
 464     }
 465 
 466     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
 467 
 468     public int getCC(int norm16) {
 469         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 470             return norm16&0xff;
 471         }
 472         if(norm16<minNoNo || limitNoNo<=norm16) {
 473             return 0;
 474         }
 475         return getCCFromNoNo(norm16);
 476     }
 477 
 478     public static int getCCFromYesOrMaybe(int norm16) {
 479         return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
 480     }
 481 
 482     /**
 483      * Returns the FCD data for code point c.
 484      * @param c A Unicode code point.
 485      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 486      */
 487     public int getFCD16(int c) {
 488         if(c<0) {
 489             return 0;
 490         } else if(c<0x180) {
 491             return tccc180[c];
 492         } else if(c<=0xffff) {
 493             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 494         }
 495         return getFCD16FromNormData(c);
 496     }
 497 
 498     /** Returns the FCD data for U+0000<=c<U+0180. */
 499     public int getFCD16FromBelow180(int c) { return tccc180[c]; }
 500     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 501     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 502         // 0<=lead<=0xffff
 503         byte bits=smallFCD[lead>>8];
 504         if(bits==0) { return false; }
 505         return ((bits>>((lead>>5)&7))&1)!=0;
 506     }
 507 
 508     /** Gets the FCD value from the regular normalization data. */
 509     public int getFCD16FromNormData(int c) {
 510         // Only loops for 1:1 algorithmic mappings.
 511         for(;;) {
 512             int norm16=getNorm16(c);
 513             if(norm16<=minYesNo) {
 514                 // no decomposition or Hangul syllable, all zeros
 515                 return 0;
 516             } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
 517                 // combining mark
 518                 norm16&=0xff;
 519                 return norm16|(norm16<<8);
 520             } else if(norm16>=minMaybeYes) {
 521                 return 0;
 522             } else if(isDecompNoAlgorithmic(norm16)) {
 523                 c=mapAlgorithmic(c, norm16);
 524             } else {
 525                 // c decomposes, get everything from the variable-length extra data
 526                 int firstUnit=extraData.charAt(norm16);
 527                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
 528                     // A character that is deleted (maps to an empty string) must
 529                     // get the worst-case lccc and tccc values because arbitrary
 530                     // characters on both sides will become adjacent.
 531                     return 0x1ff;
 532                 } else {
 533                     int fcd16=firstUnit>>8;  // tccc
 534                     if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 535                         fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
 536                     }
 537                     return fcd16;
 538                 }
 539             }
 540         }
 541     }
 542 
 543     /**
 544      * Gets the decomposition for one code point.
 545      * @param c code point
 546      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 547      */
 548     public String getDecomposition(int c) {
 549         int decomp=-1;
 550         int norm16;
 551         for(;;) {
 552             if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 553                 // c does not decompose
 554             } else if(isHangul(norm16)) {
 555                 // Hangul syllable: decompose algorithmically
 556                 StringBuilder buffer=new StringBuilder();
 557                 Hangul.decompose(c, buffer);
 558                 return buffer.toString();
 559             } else if(isDecompNoAlgorithmic(norm16)) {
 560                 decomp=c=mapAlgorithmic(c, norm16);
 561                 continue;
 562             } else {
 563                 // c decomposes, get everything from the variable-length extra data
 564                 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
 565                 return extraData.substring(norm16, norm16+length);
 566             }
 567             if(decomp<0) {
 568                 return null;
 569             } else {
 570                 return UTF16.valueOf(decomp);
 571             }
 572         }
 573     }
 574 
 575     public static final int MIN_CCC_LCCC_CP=0x300;
 576 
 577     public static final int MIN_YES_YES_WITH_CC=0xff01;
 578     public static final int JAMO_VT=0xff00;
 579     public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
 580     public static final int MAX_DELTA=0x40;
 581 
 582     // Byte offsets from the start of the data, after the generic header.
 583     public static final int IX_NORM_TRIE_OFFSET=0;
 584     public static final int IX_EXTRA_DATA_OFFSET=1;
 585     public static final int IX_SMALL_FCD_OFFSET=2;
 586 
 587     // Code point thresholds for quick check codes.
 588     public static final int IX_MIN_DECOMP_NO_CP=8;
 589     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
 590 
 591     // Norm16 value thresholds for quick check combinations and types of extra data.
 592     // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
 593     public static final int IX_MIN_YES_NO=10;
 594     public static final int IX_MIN_NO_NO=11;
 595     public static final int IX_LIMIT_NO_NO=12;
 596     public static final int IX_MIN_MAYBE_YES=13;
 597 
 598     // Mappings only in [minYesNoMappingsOnly..minNoNo[.
 599     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
 600 
 601     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
 602     public static final int MAPPING_LENGTH_MASK=0x1f;
 603 
 604     public static final int COMP_1_LAST_TUPLE=0x8000;
 605     public static final int COMP_1_TRIPLE=1;
 606     public static final int COMP_1_TRAIL_LIMIT=0x3400;
 607     public static final int COMP_1_TRAIL_MASK=0x7ffe;
 608     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
 609     public static final int COMP_2_TRAIL_SHIFT=6;
 610     public static final int COMP_2_TRAIL_MASK=0xffc0;
 611 
 612     // higher-level functionality ------------------------------------------ ***
 613 
 614     /**
 615      * Decomposes s[src, limit[ and writes the result to dest.
 616      * limit can be NULL if src is NUL-terminated.
 617      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 618      */
 619     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 620                    int destLengthEstimate) {
 621         if(destLengthEstimate<0) {
 622             destLengthEstimate=limit-src;
 623         }
 624         dest.setLength(0);
 625         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
 626         decompose(s, src, limit, buffer);
 627     }
 628 
 629     // Dual functionality:
 630     // buffer!=NULL: normalize
 631     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
 632     public int decompose(CharSequence s, int src, int limit,
 633                          ReorderingBuffer buffer) {
 634         int minNoCP=minDecompNoCP;
 635 
 636         int prevSrc;
 637         int c=0;
 638         int norm16=0;
 639 
 640         // only for quick check
 641         int prevBoundary=src;
 642         int prevCC=0;
 643 
 644         for(;;) {
 645             // count code units below the minimum or with irrelevant data for the quick check
 646             for(prevSrc=src; src!=limit;) {
 647                 if( (c=s.charAt(src))<minNoCP ||
 648                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 649                 ) {
 650                     ++src;
 651                 } else if(!UTF16.isSurrogate((char)c)) {
 652                     break;
 653                 } else {
 654                     char c2;
 655                     if(UTF16Plus.isSurrogateLead(c)) {
 656                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
 657                             c=Character.toCodePoint((char)c, c2);
 658                         }
 659                     } else /* trail surrogate */ {
 660                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
 661                             --src;
 662                             c=Character.toCodePoint(c2, (char)c);
 663                         }
 664                     }
 665                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
 666                         src+=Character.charCount(c);
 667                     } else {
 668                         break;
 669                     }
 670                 }
 671             }
 672             // copy these code units all at once
 673             if(src!=prevSrc) {
 674                 if(buffer!=null) {
 675                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
 676                 } else {
 677                     prevCC=0;
 678                     prevBoundary=src;
 679                 }
 680             }
 681             if(src==limit) {
 682                 break;
 683             }
 684 
 685             // Check one above-minimum, relevant code point.
 686             src+=Character.charCount(c);
 687             if(buffer!=null) {
 688                 decompose(c, norm16, buffer);
 689             } else {
 690                 if(isDecompYes(norm16)) {
 691                     int cc=getCCFromYesOrMaybe(norm16);
 692                     if(prevCC<=cc || cc==0) {
 693                         prevCC=cc;
 694                         if(cc<=1) {
 695                             prevBoundary=src;
 696                         }
 697                         continue;
 698                     }
 699                 }
 700                 return prevBoundary;  // "no" or cc out of order
 701             }
 702         }
 703         return src;
 704     }
 705 
 706     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
 707         int limit=s.length();
 708         if(limit==0) {
 709             return;
 710         }
 711         if(doDecompose) {
 712             decompose(s, 0, limit, buffer);
 713             return;
 714         }
 715         // Just merge the strings at the boundary.
 716         int c=Character.codePointAt(s, 0);
 717         int src=0;
 718         int firstCC, prevCC, cc;
 719         firstCC=prevCC=cc=getCC(getNorm16(c));
 720         while(cc!=0) {
 721             prevCC=cc;
 722             src+=Character.charCount(c);
 723             if(src>=limit) {
 724                 break;
 725             }
 726             c=Character.codePointAt(s, src);
 727             cc=getCC(getNorm16(c));
 728         };
 729         buffer.append(s, 0, src, firstCC, prevCC);
 730         buffer.append(s, src, limit);
 731     }
 732 
 733     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 734     // doCompose: normalize
 735     // !doCompose: isNormalized (buffer must be empty and initialized)
 736     public boolean compose(CharSequence s, int src, int limit,
 737                            boolean onlyContiguous,
 738                            boolean doCompose,
 739                            ReorderingBuffer buffer) {
 740         int minNoMaybeCP=minCompNoMaybeCP;
 741 
 742         /*
 743          * prevBoundary points to the last character before the current one
 744          * that has a composition boundary before it with ccc==0 and quick check "yes".
 745          * Keeping track of prevBoundary saves us looking for a composition boundary
 746          * when we find a "no" or "maybe".
 747          *
 748          * When we back out from prevSrc back to prevBoundary,
 749          * then we also remove those same characters (which had been simply copied
 750          * or canonically-order-inserted) from the ReorderingBuffer.
 751          * Therefore, at all times, the [prevBoundary..prevSrc[ source units
 752          * must correspond 1:1 to destination units at the end of the destination buffer.
 753          */
 754         int prevBoundary=src;
 755         int prevSrc;
 756         int c=0;
 757         int norm16=0;
 758 
 759         // only for isNormalized
 760         int prevCC=0;
 761 
 762         for(;;) {
 763             // count code units below the minimum or with irrelevant data for the quick check
 764             for(prevSrc=src; src!=limit;) {
 765                 if( (c=s.charAt(src))<minNoMaybeCP ||
 766                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 767                 ) {
 768                     ++src;
 769                 } else if(!UTF16.isSurrogate((char)c)) {
 770                     break;
 771                 } else {
 772                     char c2;
 773                     if(UTF16Plus.isSurrogateLead(c)) {
 774                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
 775                             c=Character.toCodePoint((char)c, c2);
 776                         }
 777                     } else /* trail surrogate */ {
 778                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
 779                             --src;
 780                             c=Character.toCodePoint(c2, (char)c);
 781                         }
 782                     }
 783                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
 784                         src+=Character.charCount(c);
 785                     } else {
 786                         break;
 787                     }
 788                 }
 789             }
 790             // copy these code units all at once
 791             if(src!=prevSrc) {
 792                 if(src==limit) {
 793                     if(doCompose) {
 794                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
 795                     }
 796                     break;
 797                 }
 798                 // Set prevBoundary to the last character in the quick check loop.
 799                 prevBoundary=src-1;
 800                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
 801                     Character.isHighSurrogate(s.charAt(prevBoundary-1))
 802                 ) {
 803                     --prevBoundary;
 804                 }
 805                 if(doCompose) {
 806                     // The last "quick check yes" character is excluded from the
 807                     // flush-and-append call in case it needs to be modified.
 808                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
 809                     buffer.append(s, prevBoundary, src);
 810                 } else {
 811                     prevCC=0;
 812                 }
 813                 // The start of the current character (c).
 814                 prevSrc=src;
 815             } else if(src==limit) {
 816                 break;
 817             }
 818 
 819             src+=Character.charCount(c);
 820             /*
 821              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 822              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
 823              * or has ccc!=0.
 824              * Check for Jamo V/T, then for regular characters.
 825              * c is not a Hangul syllable or Jamo L because those have "yes" properties.
 826              */
 827             if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
 828                 char prev=s.charAt(prevSrc-1);
 829                 boolean needToDecompose=false;
 830                 if(c<Hangul.JAMO_T_BASE) {
 831                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
 832                     prev-=Hangul.JAMO_L_BASE;
 833                     if(prev<Hangul.JAMO_L_COUNT) {
 834                         if(!doCompose) {
 835                             return false;
 836                         }
 837                         char syllable=(char)
 838                             (Hangul.HANGUL_BASE+
 839                              (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
 840                              Hangul.JAMO_T_COUNT);
 841                         char t;
 842                         if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
 843                             ++src;
 844                             syllable+=t;  // The next character was a Jamo T.
 845                             prevBoundary=src;
 846                             buffer.setLastChar(syllable);
 847                             continue;
 848                         }
 849                         // If we see L+V+x where x!=T then we drop to the slow path,
 850                         // decompose and recompose.
 851                         // This is to deal with NFKC finding normal L and V but a
 852                         // compatibility variant of a T. We need to either fully compose that
 853                         // combination here (which would complicate the code and may not work
 854                         // with strange custom data) or use the slow path -- or else our replacing
 855                         // two input characters (L+V) with one output character (LV syllable)
 856                         // would violate the invariant that [prevBoundary..prevSrc[ has the same
 857                         // length as what we appended to the buffer since prevBoundary.
 858                         needToDecompose=true;
 859                     }
 860                 } else if(Hangul.isHangulWithoutJamoT(prev)) {
 861                     // c is a Jamo Trailing consonant,
 862                     // compose with previous Hangul LV that does not contain a Jamo T.
 863                     if(!doCompose) {
 864                         return false;
 865                     }
 866                     buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
 867                     prevBoundary=src;
 868                     continue;
 869                 }
 870                 if(!needToDecompose) {
 871                     // The Jamo V/T did not compose into a Hangul syllable.
 872                     if(doCompose) {
 873                         buffer.append((char)c);
 874                     } else {
 875                         prevCC=0;
 876                     }
 877                     continue;
 878                 }
 879             }
 880             /*
 881              * Source buffer pointers:
 882              *
 883              *  all done      quick check   current char  not yet
 884              *                "yes" but     (c)           processed
 885              *                may combine
 886              *                forward
 887              * [-------------[-------------[-------------[-------------[
 888              * |             |             |             |             |
 889              * orig. src     prevBoundary  prevSrc       src           limit
 890              *
 891              *
 892              * Destination buffer pointers inside the ReorderingBuffer:
 893              *
 894              *  all done      might take    not filled yet
 895              *                characters for
 896              *                reordering
 897              * [-------------[-------------[-------------[
 898              * |             |             |             |
 899              * start         reorderStart  limit         |
 900              *                             +remainingCap.+
 901              */
 902             if(norm16>=MIN_YES_YES_WITH_CC) {
 903                 int cc=norm16&0xff;  // cc!=0
 904                 if( onlyContiguous &&  // FCC
 905                     (doCompose ? buffer.getLastCC() : prevCC)==0 &&
 906                     prevBoundary<prevSrc &&
 907                     // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
 908                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
 909                     // passed the quick check "yes && ccc==0" test.
 910                     // Check whether the last character was a "yesYes" or a "yesNo".
 911                     // If a "yesNo", then we get its trailing ccc from its
 912                     // mapping and check for canonical order.
 913                     // All other cases are ok.
 914                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
 915                 ) {
 916                     // Fails FCD test, need to decompose and contiguously recompose.
 917                     if(!doCompose) {
 918                         return false;
 919                     }
 920                 } else if(doCompose) {
 921                     buffer.append(c, cc);
 922                     continue;
 923                 } else if(prevCC<=cc) {
 924                     prevCC=cc;
 925                     continue;
 926                 } else {
 927                     return false;
 928                 }
 929             } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
 930                 return false;
 931             }
 932 
 933             /*
 934              * Find appropriate boundaries around this character,
 935              * decompose the source text from between the boundaries,
 936              * and recompose it.
 937              *
 938              * We may need to remove the last few characters from the ReorderingBuffer
 939              * to account for source text that was copied or appended
 940              * but needs to take part in the recomposition.
 941              */
 942 
 943             /*
 944              * Find the last composition boundary in [prevBoundary..src[.
 945              * It is either the decomposition of the current character (at prevSrc),
 946              * or prevBoundary.
 947              */
 948             if(hasCompBoundaryBefore(c, norm16)) {
 949                 prevBoundary=prevSrc;
 950             } else if(doCompose) {
 951                 buffer.removeSuffix(prevSrc-prevBoundary);
 952             }
 953 
 954             // Find the next composition boundary in [src..limit[ -
 955             // modifies src to point to the next starter.
 956             src=findNextCompBoundary(s, src, limit);
 957 
 958             // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
 959             int recomposeStartIndex=buffer.length();
 960             decomposeShort(s, prevBoundary, src, buffer);
 961             recompose(buffer, recomposeStartIndex, onlyContiguous);
 962             if(!doCompose) {
 963                 if(!buffer.equals(s, prevBoundary, src)) {
 964                     return false;
 965                 }
 966                 buffer.remove();
 967                 prevCC=0;
 968             }
 969 
 970             // Move to the next starter. We never need to look back before this point again.
 971             prevBoundary=src;
 972         }
 973         return true;
 974     }
 975 
 976     /**
 977      * Very similar to compose(): Make the same changes in both places if relevant.
 978      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 979      * !doSpan: quickCheck
 980      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 981      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 982      *         then the quick check result is "no"
 983      */
 984     public int composeQuickCheck(CharSequence s, int src, int limit,
 985                                  boolean onlyContiguous, boolean doSpan) {
 986         int qcResult=0;
 987         int minNoMaybeCP=minCompNoMaybeCP;
 988 
 989         /*
 990          * prevBoundary points to the last character before the current one
 991          * that has a composition boundary before it with ccc==0 and quick check "yes".
 992          */
 993         int prevBoundary=src;
 994         int prevSrc;
 995         int c=0;
 996         int norm16=0;
 997         int prevCC=0;
 998 
 999         for(;;) {
1000             // count code units below the minimum or with irrelevant data for the quick check
1001             for(prevSrc=src;;) {
1002                 if(src==limit) {
1003                     return (src<<1)|qcResult;  // "yes" or "maybe"
1004                 }
1005                 if( (c=s.charAt(src))<minNoMaybeCP ||
1006                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1007                 ) {
1008                     ++src;
1009                 } else if(!UTF16.isSurrogate((char)c)) {
1010                     break;
1011                 } else {
1012                     char c2;
1013                     if(UTF16Plus.isSurrogateLead(c)) {
1014                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1015                             c=Character.toCodePoint((char)c, c2);
1016                         }
1017                     } else /* trail surrogate */ {
1018                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1019                             --src;
1020                             c=Character.toCodePoint(c2, (char)c);
1021                         }
1022                     }
1023                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1024                         src+=Character.charCount(c);
1025                     } else {
1026                         break;
1027                     }
1028                 }
1029             }
1030             if(src!=prevSrc) {
1031                 // Set prevBoundary to the last character in the quick check loop.
1032                 prevBoundary=src-1;
1033                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1034                         Character.isHighSurrogate(s.charAt(prevBoundary-1))
1035                 ) {
1036                     --prevBoundary;
1037                 }
1038                 prevCC=0;
1039                 // The start of the current character (c).
1040                 prevSrc=src;
1041             }
1042 
1043             src+=Character.charCount(c);
1044             /*
1045              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1046              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1047              * or has ccc!=0.
1048              */
1049             if(isMaybeOrNonZeroCC(norm16)) {
1050                 int cc=getCCFromYesOrMaybe(norm16);
1051                 if( onlyContiguous &&  // FCC
1052                     cc!=0 &&
1053                     prevCC==0 &&
1054                     prevBoundary<prevSrc &&
1055                     // prevCC==0 && prevBoundary<prevSrc tell us that
1056                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1057                     // passed the quick check "yes && ccc==0" test.
1058                     // Check whether the last character was a "yesYes" or a "yesNo".
1059                     // If a "yesNo", then we get its trailing ccc from its
1060                     // mapping and check for canonical order.
1061                     // All other cases are ok.
1062                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1063                 ) {
1064                     // Fails FCD test.
1065                 } else if(prevCC<=cc || cc==0) {
1066                     prevCC=cc;
1067                     if(norm16<MIN_YES_YES_WITH_CC) {
1068                         if(!doSpan) {
1069                             qcResult=1;
1070                         } else {
1071                             return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
1072                         }
1073                     }
1074                     continue;
1075                 }
1076             }
1077             return prevBoundary<<1;  // "no"
1078         }
1079     }
1080 
1081     public void composeAndAppend(CharSequence s,
1082                                  boolean doCompose,
1083                                  boolean onlyContiguous,
1084                                  ReorderingBuffer buffer) {
1085         int src=0, limit=s.length();
1086         if(!buffer.isEmpty()) {
1087             int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1088             if(0!=firstStarterInSrc) {
1089                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1090                                                                buffer.length());
1091                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1092                                                        firstStarterInSrc+16);
1093                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1094                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1095                 middle.append(s, 0, firstStarterInSrc);
1096                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1097                 src=firstStarterInSrc;
1098             }
1099         }
1100         if(doCompose) {
1101             compose(s, src, limit, onlyContiguous, true, buffer);
1102         } else {
1103             buffer.append(s, src, limit);
1104         }
1105     }
1106 
1107     // Dual functionality:
1108     // buffer!=NULL: normalize
1109     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1110     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1111         // Note: In this function we use buffer->appendZeroCC() because we track
1112         // the lead and trail combining classes here, rather than leaving it to
1113         // the ReorderingBuffer.
1114         // The exception is the call to decomposeShort() which uses the buffer
1115         // in the normal way.
1116 
1117         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1118         // Similar to the prevBoundary in the compose() implementation.
1119         int prevBoundary=src;
1120         int prevSrc;
1121         int c=0;
1122         int prevFCD16=0;
1123         int fcd16=0;
1124 
1125         for(;;) {
1126             // count code units with lccc==0
1127             for(prevSrc=src; src!=limit;) {
1128                 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
1129                     prevFCD16=~c;
1130                     ++src;
1131                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1132                     prevFCD16=0;
1133                     ++src;
1134                 } else {
1135                     if(UTF16.isSurrogate((char)c)) {
1136                         char c2;
1137                         if(UTF16Plus.isSurrogateLead(c)) {
1138                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1139                                 c=Character.toCodePoint((char)c, c2);
1140                             }
1141                         } else /* trail surrogate */ {
1142                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1143                                 --src;
1144                                 c=Character.toCodePoint(c2, (char)c);
1145                             }
1146                         }
1147                     }
1148                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1149                         prevFCD16=fcd16;
1150                         src+=Character.charCount(c);
1151                     } else {
1152                         break;
1153                     }
1154               }
1155             }
1156             // copy these code units all at once
1157             if(src!=prevSrc) {
1158                 if(src==limit) {
1159                     if(buffer!=null) {
1160                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1161                     }
1162                     break;
1163                 }
1164                 prevBoundary=src;
1165                 // We know that the previous character's lccc==0.
1166                 if(prevFCD16<0) {
1167                     // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1168                     int prev=~prevFCD16;
1169                     prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1170                     if(prevFCD16>1) {
1171                         --prevBoundary;
1172                     }
1173                 } else {
1174                     int p=src-1;
1175                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1176                         Character.isHighSurrogate(s.charAt(p-1))
1177                     ) {
1178                         --p;
1179                         // Need to fetch the previous character's FCD value because
1180                         // prevFCD16 was just for the trail surrogate code point.
1181                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1182                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1183                     }
1184                     if(prevFCD16>1) {
1185                         prevBoundary=p;
1186                     }
1187                 }
1188                 if(buffer!=null) {
1189                     // The last lccc==0 character is excluded from the
1190                     // flush-and-append call in case it needs to be modified.
1191                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1192                     buffer.append(s, prevBoundary, src);
1193                 }
1194                 // The start of the current character (c).
1195                 prevSrc=src;
1196             } else if(src==limit) {
1197                 break;
1198             }
1199 
1200             src+=Character.charCount(c);
1201             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1202             // Check for proper order, and decompose locally if necessary.
1203             if((prevFCD16&0xff)<=(fcd16>>8)) {
1204                 // proper order: prev tccc <= current lccc
1205                 if((fcd16&0xff)<=1) {
1206                     prevBoundary=src;
1207                 }
1208                 if(buffer!=null) {
1209                     buffer.appendZeroCC(c);
1210                 }
1211                 prevFCD16=fcd16;
1212                 continue;
1213             } else if(buffer==null) {
1214                 return prevBoundary;  // quick check "no"
1215             } else {
1216                 /*
1217                  * Back out the part of the source that we copied or appended
1218                  * already but is now going to be decomposed.
1219                  * prevSrc is set to after what was copied/appended.
1220                  */
1221                 buffer.removeSuffix(prevSrc-prevBoundary);
1222                 /*
1223                  * Find the part of the source that needs to be decomposed,
1224                  * up to the next safe boundary.
1225                  */
1226                 src=findNextFCDBoundary(s, src, limit);
1227                 /*
1228                  * The source text does not fulfill the conditions for FCD.
1229                  * Decompose and reorder a limited piece of the text.
1230                  */
1231                 decomposeShort(s, prevBoundary, src, buffer);
1232                 prevBoundary=src;
1233                 prevFCD16=0;
1234             }
1235         }
1236         return src;
1237     }
1238 
1239     // Note: hasDecompBoundary() could be implemented as aliases to
1240     // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1241     // at the cost of building the FCD trie for a decomposition normalizer.
1242     public boolean hasDecompBoundary(int c, boolean before) {
1243         for(;;) {
1244             if(c<minDecompNoCP) {
1245                 return true;
1246             }
1247             int norm16=getNorm16(c);
1248             if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1249                 return true;
1250             } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1251                 return false;  // ccc!=0
1252             } else if(isDecompNoAlgorithmic(norm16)) {
1253                 c=mapAlgorithmic(c, norm16);
1254             } else {
1255                 // c decomposes, get everything from the variable-length extra data
1256                 int firstUnit=extraData.charAt(norm16);
1257                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1258                     return false;
1259                 }
1260                 if(!before) {
1261                     // decomp after-boundary: same as hasFCDBoundaryAfter(),
1262                     // fcd16<=1 || trailCC==0
1263                     if(firstUnit>0x1ff) {
1264                         return false;  // trailCC>1
1265                     }
1266                     if(firstUnit<=0xff) {
1267                         return true;  // trailCC==0
1268                     }
1269                     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1270                 }
1271                 // true if leadCC==0 (hasFCDBoundaryBefore())
1272                 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1273             }
1274         }
1275     }
1276 
1277     public boolean hasCompBoundaryBefore(int c) {
1278         return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
1279     }
1280 
1281     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1282     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1283     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1284     private boolean isHangul(int norm16) { return norm16==minYesNo; }
1285     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1286 
1287     // UBool isCompYes(uint16_t norm16) const {
1288     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1289     // }
1290     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1291     //     return norm16<minNoNo || minMaybeYes<=norm16;
1292     // }
1293     // private boolean hasZeroCCFromDecompYes(int norm16) {
1294     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1295     // }
1296     private boolean isDecompYesAndZeroCC(int norm16) {
1297         return norm16<minYesNo ||
1298                norm16==JAMO_VT ||
1299                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1300     }
1301 
1302     /**
1303      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1304      * the MaybeYes which combine-forward and have ccc=0.
1305      * (Standard Unicode 5.2 normalization does not have such characters.)
1306      */
1307     private boolean isMostDecompYesAndZeroCC(int norm16) {
1308         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1309     }
1310 
1311     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1312 
1313     // For use with isCompYes().
1314     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1315     // static uint8_t getCCFromYes(uint16_t norm16) {
1316     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1317     // }
1318     private int getCCFromNoNo(int norm16) {
1319         if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1320             return extraData.charAt(norm16-1)&0xff;
1321         } else {
1322             return 0;
1323         }
1324     }
1325 
1326     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
1327     int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1328         int c;
1329         if(cpStart==(cpLimit-1)) {
1330             c=s.charAt(cpStart);
1331         } else {
1332             c=Character.codePointAt(s, cpStart);
1333         }
1334         int prevNorm16=getNorm16(c);
1335         if(prevNorm16<=minYesNo) {
1336             return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
1337         } else {
1338             return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
1339         }
1340     }
1341 
1342     // Requires algorithmic-NoNo.
1343     private int mapAlgorithmic(int c, int norm16) {
1344         return c+norm16-(minMaybeYes-MAX_DELTA-1);
1345     }
1346 
1347     // Requires minYesNo<norm16<limitNoNo.
1348     // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1349 
1350     /**
1351      * @return index into maybeYesCompositions, or -1
1352      */
1353     private int getCompositionsListForDecompYes(int norm16) {
1354         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1355             return -1;
1356         } else {
1357             if((norm16-=minMaybeYes)<0) {
1358                 // norm16<minMaybeYes: index into extraData which is a substring at
1359                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1360                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1361                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1362             }
1363             return norm16;
1364         }
1365     }
1366 
1367     /**
1368      * @return index into maybeYesCompositions
1369      */
1370     private int getCompositionsListForComposite(int norm16) {
1371         // composite has both mapping & compositions list
1372         int firstUnit=extraData.charAt(norm16);
1373         return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
1374             1+  // +1 to skip the first unit with the mapping lenth
1375             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1376     }
1377 
1378     // Decompose a short piece of text which is likely to contain characters that
1379     // fail the quick check loop and/or where the quick check loop's overhead
1380     // is unlikely to be amortized.
1381     // Called by the compose() and makeFCD() implementations.
1382     // Public in Java for collation implementation code.
1383     public void decomposeShort(CharSequence s, int src, int limit,
1384                                ReorderingBuffer buffer) {
1385         while(src<limit) {
1386             int c=Character.codePointAt(s, src);
1387             src+=Character.charCount(c);
1388             decompose(c, getNorm16(c), buffer);
1389         }
1390     }
1391 
1392     private void decompose(int c, int norm16,
1393                            ReorderingBuffer buffer) {
1394         // Only loops for 1:1 algorithmic mappings.
1395         for(;;) {
1396             // get the decomposition and the lead and trail cc's
1397             if(isDecompYes(norm16)) {
1398                 // c does not decompose
1399                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1400             } else if(isHangul(norm16)) {
1401                 // Hangul syllable: decompose algorithmically
1402                 Hangul.decompose(c, buffer);
1403             } else if(isDecompNoAlgorithmic(norm16)) {
1404                 c=mapAlgorithmic(c, norm16);
1405                 norm16=getNorm16(c);
1406                 continue;
1407             } else {
1408                 // c decomposes, get everything from the variable-length extra data
1409                 int firstUnit=extraData.charAt(norm16);
1410                 int length=firstUnit&MAPPING_LENGTH_MASK;
1411                 int leadCC, trailCC;
1412                 trailCC=firstUnit>>8;
1413                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1414                     leadCC=extraData.charAt(norm16-1)>>8;
1415                 } else {
1416                     leadCC=0;
1417                 }
1418                 ++norm16;  // skip over the firstUnit
1419                 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1420             }
1421             return;
1422         }
1423     }
1424 
1425     /**
1426      * Finds the recomposition result for
1427      * a forward-combining "lead" character,
1428      * specified with a pointer to its compositions list,
1429      * and a backward-combining "trail" character.
1430      *
1431      * <p>If the lead and trail characters combine, then this function returns
1432      * the following "compositeAndFwd" value:
1433      * <pre>
1434      * Bits 21..1  composite character
1435      * Bit      0  set if the composite is a forward-combining starter
1436      * </pre>
1437      * otherwise it returns -1.
1438      *
1439      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1440      * encoded as either pairs or triples of 16-bit units.
1441      * The last entry has the high bit of its first unit set.
1442      *
1443      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1444      * A linear search is used.
1445      *
1446      * <p>See normalizer2impl.h for a more detailed description
1447      * of the compositions list format.
1448      */
1449     private static int combine(String compositions, int list, int trail) {
1450         int key1, firstUnit;
1451         if(trail<COMP_1_TRAIL_LIMIT) {
1452             // trail character is 0..33FF
1453             // result entry may have 2 or 3 units
1454             key1=(trail<<1);
1455             while(key1>(firstUnit=compositions.charAt(list))) {
1456                 list+=2+(firstUnit&COMP_1_TRIPLE);
1457             }
1458             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1459                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1460                     return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1461                 } else {
1462                     return compositions.charAt(list+1);
1463                 }
1464             }
1465         } else {
1466             // trail character is 3400..10FFFF
1467             // result entry has 3 units
1468             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1469             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1470             int secondUnit;
1471             for(;;) {
1472                 if(key1>(firstUnit=compositions.charAt(list))) {
1473                     list+=2+(firstUnit&COMP_1_TRIPLE);
1474                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1475                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1476                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1477                             break;
1478                         } else {
1479                             list+=3;
1480                         }
1481                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1482                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1483                     } else {
1484                         break;
1485                     }
1486                 } else {
1487                     break;
1488                 }
1489             }
1490         }
1491         return -1;
1492     }
1493 
1494     /*
1495      * Recomposes the buffer text starting at recomposeStartIndex
1496      * (which is in NFD - decomposed and canonically ordered),
1497      * and truncates the buffer contents.
1498      *
1499      * Note that recomposition never lengthens the text:
1500      * Any character consists of either one or two code units;
1501      * a composition may contain at most one more code unit than the original starter,
1502      * while the combining mark that is removed has at least one code unit.
1503      */
1504     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1505                            boolean onlyContiguous) {
1506         StringBuilder sb=buffer.getStringBuilder();
1507         int p=recomposeStartIndex;
1508         if(p==sb.length()) {
1509             return;
1510         }
1511 
1512         int starter, pRemove;
1513         int compositionsList;
1514         int c, compositeAndFwd;
1515         int norm16;
1516         int cc, prevCC;
1517         boolean starterIsSupplementary;
1518 
1519         // Some of the following variables are not used until we have a forward-combining starter
1520         // and are only initialized now to avoid compiler warnings.
1521         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1522         starter=-1;
1523         starterIsSupplementary=false;
1524         prevCC=0;
1525 
1526         for(;;) {
1527             c=sb.codePointAt(p);
1528             p+=Character.charCount(c);
1529             norm16=getNorm16(c);
1530             cc=getCCFromYesOrMaybe(norm16);
1531             if( // this character combines backward and
1532                 isMaybe(norm16) &&
1533                 // we have seen a starter that combines forward and
1534                 compositionsList>=0 &&
1535                 // the backward-combining character is not blocked
1536                 (prevCC<cc || prevCC==0)) {
1537                 if(isJamoVT(norm16)) {
1538                     // c is a Jamo V/T, see if we can compose it with the previous character.
1539                     if(c<Hangul.JAMO_T_BASE) {
1540                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1541                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1542                         if(prev<Hangul.JAMO_L_COUNT) {
1543                             pRemove=p-1;
1544                             char syllable=(char)
1545                                 (Hangul.HANGUL_BASE+
1546                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1547                                  Hangul.JAMO_T_COUNT);
1548                             char t;
1549                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1550                                 ++p;
1551                                 syllable+=t;  // The next character was a Jamo T.
1552                             }
1553                             sb.setCharAt(starter, syllable);
1554                             // remove the Jamo V/T
1555                             sb.delete(pRemove, p);
1556                             p=pRemove;
1557                         }
1558                     }
1559                     /*
1560                      * No "else" for Jamo T:
1561                      * Since the input is in NFD, there are no Hangul LV syllables that
1562                      * a Jamo T could combine with.
1563                      * All Jamo Ts are combined above when handling Jamo Vs.
1564                      */
1565                     if(p==sb.length()) {
1566                         break;
1567                     }
1568                     compositionsList=-1;
1569                     continue;
1570                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1571                     // The starter and the combining mark (c) do combine.
1572                     int composite=compositeAndFwd>>1;
1573 
1574                     // Remove the combining mark.
1575                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1576                     sb.delete(pRemove, p);
1577                     p=pRemove;
1578                     // Replace the starter with the composite.
1579                     if(starterIsSupplementary) {
1580                         if(composite>0xffff) {
1581                             // both are supplementary
1582                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1583                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1584                         } else {
1585                             sb.setCharAt(starter, (char)c);
1586                             sb.deleteCharAt(starter+1);
1587                             // The composite is shorter than the starter,
1588                             // move the intermediate characters forward one.
1589                             starterIsSupplementary=false;
1590                             --p;
1591                         }
1592                     } else if(composite>0xffff) {
1593                         // The composite is longer than the starter,
1594                         // move the intermediate characters back one.
1595                         starterIsSupplementary=true;
1596                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1597                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1598                         ++p;
1599                     } else {
1600                         // both are on the BMP
1601                         sb.setCharAt(starter, (char)composite);
1602                     }
1603 
1604                     // Keep prevCC because we removed the combining mark.
1605 
1606                     if(p==sb.length()) {
1607                         break;
1608                     }
1609                     // Is the composite a starter that combines forward?
1610                     if((compositeAndFwd&1)!=0) {
1611                         compositionsList=
1612                             getCompositionsListForComposite(getNorm16(composite));
1613                     } else {
1614                         compositionsList=-1;
1615                     }
1616 
1617                     // We combined; continue with looking for compositions.
1618                     continue;
1619                 }
1620             }
1621 
1622             // no combination this time
1623             prevCC=cc;
1624             if(p==sb.length()) {
1625                 break;
1626             }
1627 
1628             // If c did not combine, then check if it is a starter.
1629             if(cc==0) {
1630                 // Found a new starter.
1631                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1632                     // It may combine with something, prepare for it.
1633                     if(c<=0xffff) {
1634                         starterIsSupplementary=false;
1635                         starter=p-1;
1636                     } else {
1637                         starterIsSupplementary=true;
1638                         starter=p-2;
1639                     }
1640                 }
1641             } else if(onlyContiguous) {
1642                 // FCC: no discontiguous compositions; any intervening character blocks.
1643                 compositionsList=-1;
1644             }
1645         }
1646         buffer.flush();
1647     }
1648 
1649     /**
1650      * Does c have a composition boundary before it?
1651      * True if its decomposition begins with a character that has
1652      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1653      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1654      * (isCompYesAndZeroCC()) so we need not decompose.
1655      */
1656     private boolean hasCompBoundaryBefore(int c, int norm16) {
1657         for(;;) {
1658             if(isCompYesAndZeroCC(norm16)) {
1659                 return true;
1660             } else if(isMaybeOrNonZeroCC(norm16)) {
1661                 return false;
1662             } else if(isDecompNoAlgorithmic(norm16)) {
1663                 c=mapAlgorithmic(c, norm16);
1664                 norm16=getNorm16(c);
1665             } else {
1666                 // c decomposes, get everything from the variable-length extra data
1667                 int firstUnit=extraData.charAt(norm16);
1668                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1669                     return false;
1670                 }
1671                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
1672                     return false;  // non-zero leadCC
1673                 }
1674                 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
1675             }
1676         }
1677     }
1678 
1679     private int findPreviousCompBoundary(CharSequence s, int p) {
1680         while(p>0) {
1681             int c=Character.codePointBefore(s, p);
1682             p-=Character.charCount(c);
1683             if(hasCompBoundaryBefore(c)) {
1684                 break;
1685             }
1686             // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1687             // but that's probably not worth the extra cost.
1688         }
1689         return p;
1690     }
1691 
1692     private int findNextCompBoundary(CharSequence s, int p, int limit) {
1693         while(p<limit) {
1694             int c=Character.codePointAt(s, p);
1695             int norm16=normTrie.get(c);
1696             if(hasCompBoundaryBefore(c, norm16)) {
1697                 break;
1698             }
1699             p+=Character.charCount(c);
1700         }
1701         return p;
1702     }
1703 
1704     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1705         while(p<limit) {
1706             int c=Character.codePointAt(s, p);
1707             if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
1708                 break;
1709             }
1710             p+=Character.charCount(c);
1711         }
1712         return p;
1713     }
1714 
1715     /**
1716      * Get the canonical decomposition
1717      * sherman  for ComposedCharIter
1718      */
1719     public static int getDecompose(int chars[], String decomps[]) {
1720         Normalizer2 impl = Normalizer2.getNFDInstance();
1721 
1722         int length=0;
1723         int norm16 = 0;
1724         int ch = -1;
1725         int i = 0;
1726 
1727         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1728             //TBD !!!! the hack code heres save us about 50ms for startup
1729             //need a better solution/lookup
1730             if (ch == 0x30ff)
1731                 ch = 0xf900;
1732             else if (ch == 0x115bc)
1733                 ch = 0x1d15e;
1734             else if (ch == 0x1d1c1)
1735                 ch = 0x2f800;
1736 
1737             String s = impl.getDecomposition(ch);
1738 
1739             if(s != null && i < chars.length) {
1740                 chars[i] = ch;
1741                 decomps[i++] = s;
1742             }
1743         }
1744         return i;
1745     }
1746 
1747     //------------------------------------------------------
1748     // special method for Collation (RBTableBuilder.build())
1749     //------------------------------------------------------
1750     private static boolean needSingleQuotation(char c) {
1751         return (c >= 0x0009 && c <= 0x000D) ||
1752                (c >= 0x0020 && c <= 0x002F) ||
1753                (c >= 0x003A && c <= 0x0040) ||
1754                (c >= 0x005B && c <= 0x0060) ||
1755                (c >= 0x007B && c <= 0x007E);
1756     }
1757 
1758     public static String canonicalDecomposeWithSingleQuotation(String string) {
1759        Normalizer2 impl = Normalizer2.getNFDInstance();
1760        char[] src = string.toCharArray();
1761        int    srcIndex = 0;
1762        int    srcLimit = src.length;
1763        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
1764        int    destIndex = 0;
1765        int    destLimit = dest.length;
1766 
1767         int prevSrc;
1768         String norm;
1769         int reorderStartIndex, length;
1770         char c1, c2;
1771         int cp;
1772         int minNoMaybe = 0x00c0;
1773         int cc, prevCC, trailCC;
1774         char[] p;
1775         int pStart;
1776 
1777         // initialize
1778         reorderStartIndex = 0;
1779         prevCC = 0;
1780         norm = null;
1781         cp = 0;
1782         pStart = 0;
1783 
1784         cc = trailCC = -1; // initialize to bogus value
1785         c1 = 0;
1786         for (;;) {
1787             prevSrc=srcIndex;
1788             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
1789             while (srcIndex != srcLimit &&
1790                    ((c1 = src[srcIndex]) < minNoMaybe ||
1791                     (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
1792                     (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
1793                 prevCC = 0;
1794                 srcIndex += (cp < 0x10000) ? 1 : 2;
1795             }
1796 
1797             // copy these code units all at once
1798             if (srcIndex != prevSrc) {
1799                 length = srcIndex - prevSrc;
1800                 if ((destIndex + length) <= destLimit) {
1801                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1802                 }
1803 
1804                 destIndex += length;
1805                 reorderStartIndex = destIndex;
1806             }
1807 
1808             // end of source reached?
1809             if (srcIndex == srcLimit) {
1810                 break;
1811             }
1812 
1813             // cp already contains *src and norm32 is set for it, increment src
1814             srcIndex += (cp < 0x10000) ? 1 : 2;
1815 
1816             if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1817                 c2 = 0;
1818                 length = 1;
1819 
1820                 if (Character.isHighSurrogate(c1)
1821                     || Character.isLowSurrogate(c1)) {
1822                     norm = null;
1823                 }
1824             } else {
1825                 length = 2;
1826                 c2 = src[srcIndex-1];
1827             }
1828 
1829           // get the decomposition and the lead and trail cc's
1830           if (norm == null) {
1831               // cp does not decompose
1832               cc = trailCC = UCharacter.getCombiningClass(cp);
1833               p = null;
1834               pStart = -1;
1835           } else {
1836 
1837                 pStart = 0;
1838                 p = norm.toCharArray();
1839                 length = p.length;
1840                 int cpNum = norm.codePointCount(0, length);
1841                 cc= UCharacter.getCombiningClass(norm.codePointAt(0));
1842                 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
1843                 if (length == 1) {
1844                     // fastpath a single code unit from decomposition
1845                     c1 = p[pStart];
1846                     c2 = 0;
1847                     p = null;
1848                     pStart = -1;
1849                 }
1850             }
1851 
1852             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
1853                 // buffer overflow
1854                 char[] tmpBuf = new char[destLimit * 2];
1855                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
1856                 dest = tmpBuf;
1857                 destLimit = dest.length;
1858             }
1859 
1860             // append the decomposition to the destination buffer, assume length>0
1861             {
1862                 int reorderSplit = destIndex;
1863                 if (p == null) {
1864                     // fastpath: single code point
1865                     if (needSingleQuotation(c1)) {
1866                         //if we need single quotation, no need to consider "prevCC"
1867                         //and it must NOT be a supplementary pair
1868                         dest[destIndex++] = '\'';
1869                         dest[destIndex++] = c1;
1870                         dest[destIndex++] = '\'';
1871                         trailCC = 0;
1872                     } else if(cc != 0 && cc < prevCC) {
1873                         // (c1, c2) is out of order with respect to the preceding
1874                         //  text
1875                         destIndex += length;
1876                         trailCC = insertOrdered(dest, reorderStartIndex,
1877                                                 reorderSplit, destIndex, c1, c2, cc);
1878                     } else {
1879                         // just append (c1, c2)
1880                         dest[destIndex++] = c1;
1881                         if(c2 != 0) {
1882                             dest[destIndex++] = c2;
1883                         }
1884                     }
1885                 } else {
1886                     // general: multiple code points (ordered by themselves)
1887                     // from decomposition
1888                     if (needSingleQuotation(p[pStart])) {
1889                         dest[destIndex++] = '\'';
1890                         dest[destIndex++] = p[pStart++];
1891                         dest[destIndex++] = '\'';
1892                         length--;
1893                         do {
1894                             dest[destIndex++] = p[pStart++];
1895                         } while(--length > 0);
1896                     } else if (cc != 0 && cc < prevCC) {
1897                         destIndex += length;
1898                         trailCC = mergeOrdered(dest, reorderStartIndex,
1899                                                reorderSplit, p, pStart,
1900                                                pStart+length);
1901                     } else {
1902                         // just append the decomposition
1903                         do {
1904                             dest[destIndex++] = p[pStart++];
1905                         } while (--length > 0);
1906                     }
1907                 }
1908             }
1909             prevCC = trailCC;
1910             if(prevCC == 0) {
1911                 reorderStartIndex = destIndex;
1912             }
1913         }
1914 
1915         return new String(dest, 0, destIndex);
1916     }
1917 
1918     /**
1919      * simpler, single-character version of mergeOrdered() -
1920      * bubble-insert one single code point into the preceding string
1921      * which is already canonically ordered
1922      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
1923      *
1924      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1925      *
1926      * before: src[start]..src[current] is already ordered, and
1927      *         src[current]..src[p]     may or may not hold (c, c2) but
1928      *                          must be exactly the same length as (c, c2)
1929      * after: src[start]..src[p] is ordered
1930      *
1931      * @return the trailing combining class
1932      */
1933     private static int/*unsigned byte*/ insertOrdered(char[] source,
1934                                                       int start,
1935                                                       int current, int p,
1936                                                       char c1, char c2,
1937                                                       int/*unsigned byte*/ cc) {
1938         int back, preBack;
1939         int r;
1940         int prevCC, trailCC=cc;
1941 
1942         if (start<current && cc!=0) {
1943             // search for the insertion point where cc>=prevCC
1944             preBack=back=current;
1945 
1946             PrevArgs prevArgs = new PrevArgs();
1947             prevArgs.current  = current;
1948             prevArgs.start    = start;
1949             prevArgs.src      = source;
1950             prevArgs.c1       = c1;
1951             prevArgs.c2       = c2;
1952 
1953             // get the prevCC
1954             prevCC=getPrevCC(prevArgs);
1955             preBack = prevArgs.current;
1956 
1957             if(cc<prevCC) {
1958                 // this will be the last code point, so keep its cc
1959                 trailCC=prevCC;
1960                 back=preBack;
1961                 while(start<preBack) {
1962                     prevCC=getPrevCC(prevArgs);
1963                     preBack=prevArgs.current;
1964                     if(cc>=prevCC) {
1965                         break;
1966                     }
1967                     back=preBack;
1968                 }
1969 
1970                 // this is where we are right now with all these indicies:
1971                 // [start]..[pPreBack] 0..? code points that we can ignore
1972                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
1973                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
1974                 // [current]..[p]         1 code point (c, c2) with cc
1975 
1976                 // move the code units in between up
1977                 r=p;
1978                 do {
1979                     source[--r]=source[--current];
1980                 } while (back!=current);
1981             }
1982         }
1983 
1984         // insert (c1, c2)
1985         source[current] = c1;
1986         if (c2!=0) {
1987             source[(current+1)] = c2;
1988         }
1989 
1990         // we know the cc of the last code point
1991         return trailCC;
1992     }
1993 
1994     /**
1995      * merge two UTF-16 string parts together
1996      * to canonically order (order by combining classes) their concatenation
1997      *
1998      * the two strings may already be adjacent, so that the merging is done
1999      * in-place if the two strings are not adjacent, then the buffer holding the
2000      * first one must be large enough
2001      * the second string may or may not be ordered in itself
2002      *
2003      * before: [start]..[current] is already ordered, and
2004      *         [next]..[limit]    may be ordered in itself, but
2005      *                          is not in relation to [start..current[
2006      * after: [start..current+(limit-next)[ is ordered
2007      *
2008      * the algorithm is a simple bubble-sort that takes the characters from
2009      * src[next++] and inserts them in correct combining class order into the
2010      * preceding part of the string
2011      *
2012      * since this function is called much less often than the single-code point
2013      * insertOrdered(), it just uses that for easier maintenance
2014      *
2015      * @return the trailing combining class
2016      */
2017     private static int /*unsigned byte*/ mergeOrdered(char[] source,
2018                                                       int start,
2019                                                       int current,
2020                                                       char[] data,
2021                                                         int next,
2022                                                         int limit) {
2023             int r;
2024             int /*unsigned byte*/ cc, trailCC=0;
2025             boolean adjacent;
2026 
2027             adjacent= current==next;
2028             NextCCArgs ncArgs = new NextCCArgs();
2029             ncArgs.source = data;
2030             ncArgs.next   = next;
2031             ncArgs.limit  = limit;
2032 
2033             if(start!=current) {
2034 
2035                 while(ncArgs.next<ncArgs.limit) {
2036                     cc=getNextCC(ncArgs);
2037                     if(cc==0) {
2038                         // does not bubble back
2039                         trailCC=0;
2040                         if(adjacent) {
2041                             current=ncArgs.next;
2042                         } else {
2043                             data[current++]=ncArgs.c1;
2044                             if(ncArgs.c2!=0) {
2045                                 data[current++]=ncArgs.c2;
2046                             }
2047                         }
2048                         break;
2049                     } else {
2050                         r=current+(ncArgs.c2==0 ? 1 : 2);
2051                         trailCC=insertOrdered(source,start, current, r,
2052                                               ncArgs.c1, ncArgs.c2, cc);
2053                         current=r;
2054                     }
2055                 }
2056             }
2057 
2058             if(ncArgs.next==ncArgs.limit) {
2059                 // we know the cc of the last code point
2060                 return trailCC;
2061             } else {
2062                 if(!adjacent) {
2063                     // copy the second string part
2064                     do {
2065                         source[current++]=data[ncArgs.next++];
2066                     } while(ncArgs.next!=ncArgs.limit);
2067                     ncArgs.limit=current;
2068                 }
2069                 PrevArgs prevArgs = new PrevArgs();
2070                 prevArgs.src   = data;
2071                 prevArgs.start = start;
2072                 prevArgs.current =  ncArgs.limit;
2073                 return getPrevCC(prevArgs);
2074             }
2075 
2076     }
2077 
2078     private static final class PrevArgs{
2079         char[] src;
2080         int start;
2081         int current;
2082         char c1;
2083         char c2;
2084     }
2085 
2086     private static final class NextCCArgs{
2087         char[] source;
2088         int next;
2089         int limit;
2090         char c1;
2091         char c2;
2092     }
2093 
2094     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2095         args.c1=args.src[--args.current];
2096         args.c2=0;
2097 
2098         if (args.c1 < MIN_CCC_LCCC_CP) {
2099             return 0;
2100         } else if (UTF16.isLeadSurrogate(args.c1)) {
2101             /* unpaired first surrogate */
2102             return 0;
2103         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2104             return UCharacter.getCombiningClass(args.c1);
2105         } else if (args.current!=args.start &&
2106                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2107             --args.current;
2108             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2109         } else {
2110             /* unpaired second surrogate */
2111             args.c2=0;
2112             return 0;
2113         }
2114     }
2115 
2116     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2117         args.c1=args.source[args.next++];
2118         args.c2=0;
2119 
2120         if (UTF16.isTrailSurrogate(args.c1)) {
2121             /* unpaired second surrogate */
2122             return 0;
2123         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2124             return UCharacter.getCombiningClass(args.c1);
2125         } else if (args.next!=args.limit &&
2126                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2127             ++args.next;
2128             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2129         } else {
2130             /* unpaired first surrogate */
2131             args.c2=0;
2132             return 0;
2133         }
2134     }
2135 
2136     private VersionInfo dataVersion;
2137 
2138     // Code point thresholds for quick check codes.
2139     private int minDecompNoCP;
2140     private int minCompNoMaybeCP;
2141 
2142     // Norm16 value thresholds for quick check combinations and types of extra data.
2143     private int minYesNo;
2144     private int minYesNoMappingsOnly;
2145     private int minNoNo;
2146     private int limitNoNo;
2147     private int minMaybeYes;
2148 
2149     private Trie2_16 normTrie;
2150     private String maybeYesCompositions;
2151     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2152     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2153     private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F
2154 
2155 }