1 /*
   2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 package sun.text.normalizer;
  33 
  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 import java.text.Normalizer;
  37 
  38 // Original filename in ICU4J: Normalizer2Impl.java
  39 public final class NormalizerImpl {
  40     public static final class Hangul {
  41         /* Korean Hangul and Jamo constants */
  42         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  43         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  44         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
  45 
  46         public static final int HANGUL_BASE=0xac00;
  47         public static final int HANGUL_END=0xd7a3;
  48 
  49         public static final int JAMO_L_COUNT=19;
  50         public static final int JAMO_V_COUNT=21;
  51         public static final int JAMO_T_COUNT=28;
  52 
  53         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  54         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  55 
  56         public static boolean isHangul(int c) {
  57             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  58         }
  59         public static boolean isHangulLV(int c) {
  60             c-=HANGUL_BASE;
  61             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  62         }
  63 
  64                 /**
  65          * Decomposes c, which must be a Hangul syllable, into buffer
  66          * and returns the length of the decomposition (2 or 3).
  67          */
  68         public static int decompose(int c, Appendable buffer) {
  69             try {
  70                 c-=HANGUL_BASE;
  71                 int c2=c%JAMO_T_COUNT;
  72                 c/=JAMO_T_COUNT;
  73                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  74                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  75                 if(c2==0) {
  76                     return 2;
  77                 } else {
  78                     buffer.append((char)(JAMO_T_BASE+c2));
  79                     return 3;
  80                 }
  81             } catch(IOException e) {
  82                 throw new InternalError(e);
  83             }
  84         }
  85     }
  86 
  87     /**
  88      * Writable buffer that takes care of canonical ordering.
  89      * Its Appendable methods behave like the C++ implementation's
  90      * appendZeroCC() methods.
  91      * <p>
  92      * If dest is a StringBuilder, then the buffer writes directly to it.
  93      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  94      * until no further changes are necessary and whole segments are appended.
  95      * append() methods that take combining-class values always write to the StringBuilder.
  96      * Other append() methods flush and append to the Appendable.
  97      */
  98     public static final class ReorderingBuffer implements Appendable {
  99         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
 100             impl=ni;
 101             app=dest;
 102             if (app instanceof StringBuilder) {
 103                 appIsStringBuilder=true;
 104                 str=(StringBuilder)dest;
 105                 // In Java, the constructor subsumes public void init(int destCapacity)
 106                 str.ensureCapacity(destCapacity);
 107                 reorderStart=0;
 108                 if(str.length()==0) {
 109                     lastCC=0;
 110                 } else {
 111                     setIterator();
 112                     lastCC=previousCC();
 113                     // Set reorderStart after the last code point with cc<=1 if there is one.
 114                     if(lastCC>1) {
 115                         while(previousCC()>1) {}
 116                     }
 117                     reorderStart=codePointLimit;
 118                 }
 119             } else {
 120                 appIsStringBuilder=false;
 121                 str=new StringBuilder();
 122                 reorderStart=0;
 123                 lastCC=0;
 124             }
 125         }
 126 
 127         public boolean isEmpty() { return str.length()==0; }
 128         public int length() { return str.length(); }
 129         public int getLastCC() { return lastCC; }
 130 
 131         public StringBuilder getStringBuilder() { return str; }
 132 
 133         public boolean equals(CharSequence s, int start, int limit) {
 134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 135         }
 136 
 137         public void append(int c, int cc) {
 138             if(lastCC<=cc || cc==0) {
 139                 str.appendCodePoint(c);
 140                 lastCC=cc;
 141                 if(cc<=1) {
 142                     reorderStart=str.length();
 143                 }
 144             } else {
 145                 insert(c, cc);
 146             }
 147         }
 148         // s must be in NFD, otherwise change the implementation.
 149         public void append(CharSequence s, int start, int limit,
 150                            int leadCC, int trailCC) {
 151             if(start==limit) {
 152                 return;
 153             }
 154             if(lastCC<=leadCC || leadCC==0) {
 155                 if(trailCC<=1) {
 156                     reorderStart=str.length()+(limit-start);
 157                 } else if(leadCC<=1) {
 158                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 159                 }
 160                 str.append(s, start, limit);
 161                 lastCC=trailCC;
 162             } else {
 163                 int c=Character.codePointAt(s, start);
 164                 start+=Character.charCount(c);
 165                 insert(c, leadCC);  // insert first code point
 166                 while(start<limit) {
 167                     c=Character.codePointAt(s, start);
 168                     start+=Character.charCount(c);
 169                     if(start<limit) {
 170                         // s must be in NFD, otherwise we need to use getCC().
 171                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
 172                     } else {
 173                         leadCC=trailCC;
 174                     }
 175                     append(c, leadCC);
 176                 }
 177             }
 178         }
 179         // The following append() methods work like C++ appendZeroCC().
 180         // They assume that the cc or trailCC of their input is 0.
 181         // Most of them implement Appendable interface methods.
 182         @Override
 183         public ReorderingBuffer append(char c) {
 184             str.append(c);
 185             lastCC=0;
 186             reorderStart=str.length();
 187             return this;
 188         }
 189         public void appendZeroCC(int c) {
 190             str.appendCodePoint(c);
 191             lastCC=0;
 192             reorderStart=str.length();
 193         }
 194         @Override
 195         public ReorderingBuffer append(CharSequence s) {
 196             if(s.length()!=0) {
 197                 str.append(s);
 198                 lastCC=0;
 199                 reorderStart=str.length();
 200             }
 201             return this;
 202         }
 203         @Override
 204         public ReorderingBuffer append(CharSequence s, int start, int limit) {
 205             if(start!=limit) {
 206                 str.append(s, start, limit);
 207                 lastCC=0;
 208                 reorderStart=str.length();
 209             }
 210             return this;
 211         }
 212         /**
 213          * Flushes from the intermediate StringBuilder to the Appendable,
 214          * if they are different objects.
 215          * Used after recomposition.
 216          * Must be called at the end when writing to a non-StringBuilder Appendable.
 217          */
 218         public void flush() {
 219             if(appIsStringBuilder) {
 220                 reorderStart=str.length();
 221             } else {
 222                 try {
 223                     app.append(str);
 224                     str.setLength(0);
 225                     reorderStart=0;
 226                 } catch(IOException e) {
 227                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 228                 }
 229             }
 230             lastCC=0;
 231         }
 232         /**
 233          * Flushes from the intermediate StringBuilder to the Appendable,
 234          * if they are different objects.
 235          * Then appends the new text to the Appendable or StringBuilder.
 236          * Normally used after quick check loops find a non-empty sequence.
 237          */
 238         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 239             if(appIsStringBuilder) {
 240                 str.append(s, start, limit);
 241                 reorderStart=str.length();
 242             } else {
 243                 try {
 244                     app.append(str).append(s, start, limit);
 245                     str.setLength(0);
 246                     reorderStart=0;
 247                 } catch(IOException e) {
 248                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 249                 }
 250             }
 251             lastCC=0;
 252             return this;
 253         }
 254         public void remove() {
 255             str.setLength(0);
 256             lastCC=0;
 257             reorderStart=0;
 258         }
 259         public void removeSuffix(int suffixLength) {
 260             int oldLength=str.length();
 261             str.delete(oldLength-suffixLength, oldLength);
 262             lastCC=0;
 263             reorderStart=str.length();
 264         }
 265 
 266         // Inserts c somewhere before the last character.
 267         // Requires 0<cc<lastCC which implies reorderStart<limit.
 268         private void insert(int c, int cc) {
 269             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 270             // insert c at codePointLimit, after the character with prevCC<=cc
 271             if(c<=0xffff) {
 272                 str.insert(codePointLimit, (char)c);
 273                 if(cc<=1) {
 274                     reorderStart=codePointLimit+1;
 275                 }
 276             } else {
 277                 str.insert(codePointLimit, Character.toChars(c));
 278                 if(cc<=1) {
 279                     reorderStart=codePointLimit+2;
 280                 }
 281             }
 282         }
 283 
 284         private final NormalizerImpl impl;
 285         private final Appendable app;
 286         private final StringBuilder str;
 287         private final boolean appIsStringBuilder;
 288         private int reorderStart;
 289         private int lastCC;
 290 
 291         // private backward iterator
 292         private void setIterator() { codePointStart=str.length(); }
 293         private void skipPrevious() {  // Requires 0<codePointStart.
 294             codePointLimit=codePointStart;
 295             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 296         }
 297         private int previousCC() {  // Returns 0 if there is no previous character.
 298             codePointLimit=codePointStart;
 299             if(reorderStart>=codePointStart) {
 300                 return 0;
 301             }
 302             int c=str.codePointBefore(codePointStart);
 303             codePointStart-=Character.charCount(c);
 304             return impl.getCCFromYesOrMaybeCP(c);
 305         }
 306         private int codePointStart, codePointLimit;
 307     }
 308 
 309     // TODO: Propose as public API on the UTF16 class.
 310     // TODO: Propose widening UTF16 methods that take char to take int.
 311     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 312     public static final class UTF16Plus {
 313         /**
 314          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 315          * is it a lead surrogate?
 316          * @param c code unit or code point
 317          * @return true or false
 318          */
 319         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 320 
 321         /**
 322          * Compares two CharSequence subsequences for binary equality.
 323          * @param s1 first sequence
 324          * @param start1 start offset in first sequence
 325          * @param limit1 limit offset in first sequence
 326          * @param s2 second sequence
 327          * @param start2 start offset in second sequence
 328          * @param limit2 limit offset in second sequence
 329          * @return true if s1.subSequence(start1, limit1) contains the same text
 330          *              as s2.subSequence(start2, limit2)
 331          */
 332         public static boolean equal(CharSequence s1, int start1, int limit1,
 333                                     CharSequence s2, int start2, int limit2) {
 334             if((limit1-start1)!=(limit2-start2)) {
 335                 return false;
 336             }
 337             if(s1==s2 && start1==start2) {
 338                 return true;
 339             }
 340             while(start1<limit1) {
 341                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 342                     return false;
 343                 }
 344             }
 345             return true;
 346         }
 347     }
 348 
 349     public NormalizerImpl() {}
 350 
 351     private static final class IsAcceptable implements ICUBinary.Authenticate {
 352         public boolean isDataVersionAcceptable(byte version[]) {
 353             return version[0]==3;
 354         }
 355     }
 356     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 357     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 358 
 359     public NormalizerImpl load(ByteBuffer bytes) {
 360         try {
 361             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 362             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 363             if(indexesLength<=IX_MIN_LCCC_CP) {
 364                 throw new InternalError("Normalizer2 data: not enough indexes");
 365             }
 366             int[] inIndexes=new int[indexesLength];
 367             inIndexes[0]=indexesLength*4;
 368             for(int i=1; i<indexesLength; ++i) {
 369                 inIndexes[i]=bytes.getInt();
 370             }
 371 
 372             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 373             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 374             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 375 
 376             minYesNo=inIndexes[IX_MIN_YES_NO];
 377             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 378             minNoNo=inIndexes[IX_MIN_NO_NO];
 379             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 380             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 381             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 382             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 383             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 384             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 385             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 386 
 387             // Read the normTrie.
 388             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 389             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 390             normTrie=Trie2_16.createFromSerialized(bytes);
 391             int trieLength=normTrie.getSerializedLength();
 392             if(trieLength>(nextOffset-offset)) {
 393                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
 394             }
 395             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 396 
 397             // Read the composition and mapping data.
 398             offset=nextOffset;
 399             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
 400             int numChars=(nextOffset-offset)/2;
 401             char[] chars;
 402             if(numChars!=0) {
 403                 chars=new char[numChars];
 404                 for(int i=0; i<numChars; ++i) {
 405                     chars[i]=bytes.getChar();
 406                 }
 407                 maybeYesCompositions=new String(chars);
 408                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 409             }
 410 
 411             // smallFCD: new in formatVersion 2
 412             offset=nextOffset;
 413             smallFCD=new byte[0x100];
 414             bytes.get(smallFCD);
 415 
 416             return this;
 417         } catch(IOException e) {
 418             throw new InternalError(e);
 419         }
 420     }
 421     public NormalizerImpl load(String name) {
 422         return load(ICUBinary.getRequiredData(name));
 423     }
 424 
 425 
 426     public int getNorm16(int c) { return normTrie.get(c); }
 427     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
 428     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
 429     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
 430 
 431     public int getCC(int norm16) {
 432         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 433             return getCCFromNormalYesOrMaybe(norm16);
 434         }
 435         if(norm16<minNoNo || limitNoNo<=norm16) {
 436             return 0;
 437         }
 438         return getCCFromNoNo(norm16);
 439     }
 440     public static int getCCFromNormalYesOrMaybe(int norm16) {
 441         return (norm16 >> OFFSET_SHIFT) & 0xff;
 442     }
 443     public static int getCCFromYesOrMaybe(int norm16) {
 444         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 445     }
 446     public int getCCFromYesOrMaybeCP(int c) {
 447         if (c < minCompNoMaybeCP) { return 0; }
 448         return getCCFromYesOrMaybe(getNorm16(c));
 449     }
 450 
 451     /**
 452      * Returns the FCD data for code point c.
 453      * @param c A Unicode code point.
 454      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 455      */
 456     public int getFCD16(int c) {
 457         if(c<minDecompNoCP) {
 458             return 0;
 459         } else if(c<=0xffff) {
 460             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 461         }
 462         return getFCD16FromNormData(c);
 463     }
 464     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 465     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 466         // 0<=lead<=0xffff
 467         byte bits=smallFCD[lead>>8];
 468         if(bits==0) { return false; }
 469         return ((bits>>((lead>>5)&7))&1)!=0;
 470     }
 471 
 472     /** Gets the FCD value from the regular normalization data. */
 473     public int getFCD16FromNormData(int c) {
 474         int norm16=getNorm16(c);
 475         if (norm16 >= limitNoNo) {
 476             if(norm16>=MIN_NORMAL_MAYBE_YES) {
 477                 // combining mark
 478                 norm16=getCCFromNormalYesOrMaybe(norm16);
 479                 return norm16|(norm16<<8);
 480             } else if(norm16>=minMaybeYes) {
 481                 return 0;
 482             } else {  // isDecompNoAlgorithmic(norm16)
 483                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 484                 if (deltaTrailCC <= DELTA_TCCC_1) {
 485                     return deltaTrailCC >> OFFSET_SHIFT;
 486                 }
 487                 // Maps to an isCompYesAndZeroCC.
 488                 c=mapAlgorithmic(c, norm16);
 489                 norm16=getNorm16(c);
 490             }
 491         }
 492         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 493             // no decomposition or Hangul syllable, all zeros
 494             return 0;
 495         }
 496         // c decomposes, get everything from the variable-length extra data
 497         int mapping=norm16>>OFFSET_SHIFT;
 498         int firstUnit=extraData.charAt(mapping);
 499         int fcd16=firstUnit>>8;  // tccc
 500         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 501             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 502         }
 503         return fcd16;
 504     }
 505 
 506     /**
 507      * Gets the decomposition for one code point.
 508      * @param c code point
 509      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 510      */
 511     public String getDecomposition(int c) {
 512         int norm16;
 513         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 514             // c does not decompose
 515             return null;
 516         }
 517         int decomp = -1;
 518         if(isDecompNoAlgorithmic(norm16)) {
 519             // Maps to an isCompYesAndZeroCC.
 520             decomp=c=mapAlgorithmic(c, norm16);
 521             // The mapping might decompose further.
 522             norm16 = getNorm16(c);
 523         }
 524         if (norm16 < minYesNo) {
 525             if(decomp<0) {
 526                 return null;
 527             } else {
 528                 return UTF16.valueOf(decomp);
 529             }
 530         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 531             // Hangul syllable: decompose algorithmically
 532             StringBuilder buffer=new StringBuilder();
 533             Hangul.decompose(c, buffer);
 534             return buffer.toString();
 535         }
 536         // c decomposes, get everything from the variable-length extra data
 537         int mapping=norm16>>OFFSET_SHIFT;
 538         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 539         return extraData.substring(mapping, mapping+length);
 540     }
 541 
 542     // Fixed norm16 values.
 543     public static final int MIN_YES_YES_WITH_CC=0xfe02;
 544     public static final int JAMO_VT=0xfe00;
 545     public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
 546     public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
 547     public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
 548 
 549     // norm16 bit 0 is comp-boundary-after.
 550     public static final int HAS_COMP_BOUNDARY_AFTER=1;
 551     public static final int OFFSET_SHIFT=1;
 552 
 553     // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
 554     // tccc (0, 1, >1) for quick FCC boundary-after tests.
 555     public static final int DELTA_TCCC_0=0;
 556     public static final int DELTA_TCCC_1=2;
 557     public static final int DELTA_TCCC_GT_1=4;
 558     public static final int DELTA_TCCC_MASK=6;
 559     public static final int DELTA_SHIFT=3;
 560 
 561     public static final int MAX_DELTA=0x40;
 562 
 563     // Byte offsets from the start of the data, after the generic header.
 564     public static final int IX_NORM_TRIE_OFFSET=0;
 565     public static final int IX_EXTRA_DATA_OFFSET=1;
 566     public static final int IX_SMALL_FCD_OFFSET=2;
 567     public static final int IX_RESERVED3_OFFSET=3;
 568     public static final int IX_TOTAL_SIZE=7;
 569     public static final int MIN_CCC_LCCC_CP=0x300;
 570     // Code point thresholds for quick check codes.
 571     public static final int IX_MIN_DECOMP_NO_CP=8;
 572     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
 573 
 574     // Norm16 value thresholds for quick check combinations and types of extra data.
 575 
 576     /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
 577     public static final int IX_MIN_YES_NO=10;
 578     /** Mappings are comp-normalized. */
 579     public static final int IX_MIN_NO_NO=11;
 580     public static final int IX_LIMIT_NO_NO=12;
 581     public static final int IX_MIN_MAYBE_YES=13;
 582 
 583     /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
 584     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
 585     /** Mappings are not comp-normalized but have a comp boundary before. */
 586     public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
 587     /** Mappings do not have a comp boundary before. */
 588     public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
 589     /** Mappings to the empty string. */
 590     public static final int IX_MIN_NO_NO_EMPTY=17;
 591 
 592     public static final int IX_MIN_LCCC_CP=18;
 593     public static final int IX_COUNT=20;
 594 
 595     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
 596     public static final int MAPPING_HAS_RAW_MAPPING=0x40;
 597     // unused bit 0x20;
 598     public static final int MAPPING_LENGTH_MASK=0x1f;
 599 
 600     public static final int COMP_1_LAST_TUPLE=0x8000;
 601     public static final int COMP_1_TRIPLE=1;
 602     public static final int COMP_1_TRAIL_LIMIT=0x3400;
 603     public static final int COMP_1_TRAIL_MASK=0x7ffe;
 604     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
 605     public static final int COMP_2_TRAIL_SHIFT=6;
 606     public static final int COMP_2_TRAIL_MASK=0xffc0;
 607 
 608     // higher-level functionality ------------------------------------------ ***
 609 
 610     /**
 611      * Decomposes s[src, limit[ and writes the result to dest.
 612      * limit can be NULL if src is NUL-terminated.
 613      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 614      */
 615     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 616                    int destLengthEstimate) {
 617         if(destLengthEstimate<0) {
 618             destLengthEstimate=limit-src;
 619         }
 620         dest.setLength(0);
 621         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
 622         decompose(s, src, limit, buffer);
 623     }
 624 
 625     // Dual functionality:
 626     // buffer!=NULL: normalize
 627     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
 628     public int decompose(CharSequence s, int src, int limit,
 629                          ReorderingBuffer buffer) {
 630         int minNoCP=minDecompNoCP;
 631 
 632         int prevSrc;
 633         int c=0;
 634         int norm16=0;
 635 
 636         // only for quick check
 637         int prevBoundary=src;
 638         int prevCC=0;
 639 
 640         for(;;) {
 641             // count code units below the minimum or with irrelevant data for the quick check
 642             for(prevSrc=src; src!=limit;) {
 643                 if( (c=s.charAt(src))<minNoCP ||
 644                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 645                 ) {
 646                     ++src;
 647                 } else if(!UTF16.isSurrogate((char)c)) {
 648                     break;
 649                 } else {
 650                     char c2;
 651                     if(UTF16Plus.isSurrogateLead(c)) {
 652                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
 653                             c=Character.toCodePoint((char)c, c2);
 654                         }
 655                     } else /* trail surrogate */ {
 656                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
 657                             --src;
 658                             c=Character.toCodePoint(c2, (char)c);
 659                         }
 660                     }
 661                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
 662                         src+=Character.charCount(c);
 663                     } else {
 664                         break;
 665                     }
 666                 }
 667             }
 668             // copy these code units all at once
 669             if(src!=prevSrc) {
 670                 if(buffer!=null) {
 671                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
 672                 } else {
 673                     prevCC=0;
 674                     prevBoundary=src;
 675                 }
 676             }
 677             if(src==limit) {
 678                 break;
 679             }
 680 
 681             // Check one above-minimum, relevant code point.
 682             src+=Character.charCount(c);
 683             if(buffer!=null) {
 684                 decompose(c, norm16, buffer);
 685             } else {
 686                 if(isDecompYes(norm16)) {
 687                     int cc=getCCFromYesOrMaybe(norm16);
 688                     if(prevCC<=cc || cc==0) {
 689                         prevCC=cc;
 690                         if(cc<=1) {
 691                             prevBoundary=src;
 692                         }
 693                         continue;
 694                     }
 695                 }
 696                 return prevBoundary;  // "no" or cc out of order
 697             }
 698         }
 699         return src;
 700     }
 701     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
 702         int limit=s.length();
 703         if(limit==0) {
 704             return;
 705         }
 706         if(doDecompose) {
 707             decompose(s, 0, limit, buffer);
 708             return;
 709         }
 710         // Just merge the strings at the boundary.
 711         int c=Character.codePointAt(s, 0);
 712         int src=0;
 713         int firstCC, prevCC, cc;
 714         firstCC=prevCC=cc=getCC(getNorm16(c));
 715         while(cc!=0) {
 716             prevCC=cc;
 717             src+=Character.charCount(c);
 718             if(src>=limit) {
 719                 break;
 720             }
 721             c=Character.codePointAt(s, src);
 722             cc=getCC(getNorm16(c));
 723         };
 724         buffer.append(s, 0, src, firstCC, prevCC);
 725         buffer.append(s, src, limit);
 726     }
 727 
 728     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 729     // doCompose: normalize
 730     // !doCompose: isNormalized (buffer must be empty and initialized)
 731     public boolean compose(CharSequence s, int src, int limit,
 732                            boolean onlyContiguous,
 733                            boolean doCompose,
 734                            ReorderingBuffer buffer) {
 735         int prevBoundary=src;
 736         int minNoMaybeCP=minCompNoMaybeCP;
 737 
 738         for (;;) {
 739             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 740             // or with (compYes && ccc==0) properties.
 741             int prevSrc;
 742             int c = 0;
 743             int norm16 = 0;
 744             for (;;) {
 745                 if (src == limit) {
 746                     if (prevBoundary != limit && doCompose) {
 747                         buffer.append(s, prevBoundary, limit);
 748                     }
 749                     return true;
 750                 }
 751                 if( (c=s.charAt(src))<minNoMaybeCP ||
 752                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 753                 ) {
 754                     ++src;
 755                 } else {
 756                     prevSrc = src++;
 757                     if(!UTF16.isSurrogate((char)c)) {
 758                         break;
 759                     } else {
 760                         char c2;
 761                         if(UTF16Plus.isSurrogateLead(c)) {
 762                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
 763                                 ++src;
 764                                 c=Character.toCodePoint((char)c, c2);
 765                             }
 766                         } else /* trail surrogate */ {
 767                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
 768                                 --prevSrc;
 769                                 c=Character.toCodePoint(c2, (char)c);
 770                             }
 771                         }
 772                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
 773                             break;
 774                         }
 775                     }
 776                 }
 777             }
 778             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 779             // The current character is either a "noNo" (has a mapping)
 780             // or a "maybeYes" (combines backward)
 781             // or a "yesYes" with ccc!=0.
 782             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 783 
 784             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 785             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 786                 if (!doCompose) {
 787                     return false;
 788                 }
 789                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 790                 // In this case, we need not decompose around the current character.
 791                 if (isDecompNoAlgorithmic(norm16)) {
 792                     // Maps to a single isCompYesAndZeroCC character
 793                     // which also implies hasCompBoundaryBefore.
 794                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 795                             hasCompBoundaryBefore(s, src, limit)) {
 796                         if (prevBoundary != prevSrc) {
 797                             buffer.append(s, prevBoundary, prevSrc);
 798                         }
 799                         buffer.append(mapAlgorithmic(c, norm16), 0);
 800                         prevBoundary = src;
 801                         continue;
 802                     }
 803                 } else if (norm16 < minNoNoCompBoundaryBefore) {
 804                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
 805                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 806                             hasCompBoundaryBefore(s, src, limit)) {
 807                         if (prevBoundary != prevSrc) {
 808                             buffer.append(s, prevBoundary, prevSrc);
 809                         }
 810                         int mapping = norm16 >> OFFSET_SHIFT;
 811                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
 812                         buffer.append(extraData, mapping, mapping + length);
 813                         prevBoundary = src;
 814                         continue;
 815                     }
 816                 } else if (norm16 >= minNoNoEmpty) {
 817                     // The current character maps to nothing.
 818                     // Simply omit it from the output if there is a boundary before _or_ after it.
 819                     // The character itself implies no boundaries.
 820                     if (hasCompBoundaryBefore(s, src, limit) ||
 821                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
 822                         if (prevBoundary != prevSrc) {
 823                             buffer.append(s, prevBoundary, prevSrc);
 824                         }
 825                         prevBoundary = src;
 826                         continue;
 827                     }
 828                 }
 829                 // Other "noNo" type, or need to examine more text around this character:
 830                 // Fall through to the slow path.
 831             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
 832                 char prev=s.charAt(prevSrc-1);
 833                 if(c<Hangul.JAMO_T_BASE) {
 834                     // The current character is a Jamo Vowel,
 835                     // compose with previous Jamo L and following Jamo T.
 836                     char l = (char)(prev-Hangul.JAMO_L_BASE);
 837                     if(l<Hangul.JAMO_L_COUNT) {
 838                         if (!doCompose) {
 839                             return false;
 840                         }
 841                         int t;
 842                         if (src != limit &&
 843                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
 844                                 t < Hangul.JAMO_T_COUNT) {
 845                             // The next character is a Jamo T.
 846                             ++src;
 847                         } else if (hasCompBoundaryBefore(s, src, limit)) {
 848                             // No Jamo T follows, not even via decomposition.
 849                             t = 0;
 850                         } else {
 851                             t = -1;
 852                         }
 853                         if (t >= 0) {
 854                             int syllable = Hangul.HANGUL_BASE +
 855                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
 856                                 Hangul.JAMO_T_COUNT + t;
 857                             --prevSrc;  // Replace the Jamo L as well.
 858                             if (prevBoundary != prevSrc) {
 859                                 buffer.append(s, prevBoundary, prevSrc);
 860                             }
 861                             buffer.append((char)syllable);
 862                             prevBoundary = src;
 863                             continue;
 864                         }
 865                         // If we see L+V+x where x!=T then we drop to the slow path,
 866                         // decompose and recompose.
 867                         // This is to deal with NFKC finding normal L and V but a
 868                         // compatibility variant of a T.
 869                         // We need to either fully compose that combination here
 870                         // (which would complicate the code and may not work with strange custom data)
 871                         // or use the slow path.
 872                     }
 873                 } else if (Hangul.isHangulLV(prev)) {
 874                     // The current character is a Jamo Trailing consonant,
 875                     // compose with previous Hangul LV that does not contain a Jamo T.
 876                     if (!doCompose) {
 877                         return false;
 878                     }
 879                     int syllable = prev + c - Hangul.JAMO_T_BASE;
 880                     --prevSrc;  // Replace the Hangul LV as well.
 881                     if (prevBoundary != prevSrc) {
 882                         buffer.append(s, prevBoundary, prevSrc);
 883                     }
 884                     buffer.append((char)syllable);
 885                     prevBoundary = src;
 886                     continue;
 887                 }
 888                 // No matching context, or may need to decompose surrounding text first:
 889                 // Fall through to the slow path.
 890             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
 891                 // One or more combining marks that do not combine-back:
 892                 // Check for canonical order, copy unchanged if ok and
 893                 // if followed by a character with a boundary-before.
 894                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
 895                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
 896                     // Fails FCD test, need to decompose and contiguously recompose.
 897                     if (!doCompose) {
 898                         return false;
 899                     }
 900                 } else {
 901                     // If !onlyContiguous (not FCC), then we ignore the tccc of
 902                     // the previous character which passed the quick check "yes && ccc==0" test.
 903                     int n16;
 904                     for (;;) {
 905                         if (src == limit) {
 906                             if (doCompose) {
 907                                 buffer.append(s, prevBoundary, limit);
 908                             }
 909                             return true;
 910                         }
 911                         int prevCC = cc;
 912                         c = Character.codePointAt(s, src);
 913                         n16 = normTrie.get(c);
 914                         if (n16 >= MIN_YES_YES_WITH_CC) {
 915                             cc = getCCFromNormalYesOrMaybe(n16);
 916                             if (prevCC > cc) {
 917                                 if (!doCompose) {
 918                                     return false;
 919                                 }
 920                                 break;
 921                             }
 922                         } else {
 923                             break;
 924                         }
 925                         src += Character.charCount(c);
 926                     }
 927                     // p is after the last in-order combining mark.
 928                     // If there is a boundary here, then we continue with no change.
 929                     if (norm16HasCompBoundaryBefore(n16)) {
 930                         if (isCompYesAndZeroCC(n16)) {
 931                             src += Character.charCount(c);
 932                         }
 933                         continue;
 934                     }
 935                     // Use the slow path. There is no boundary in [prevSrc, src[.
 936                 }
 937             }
 938 
 939             // Slow path: Find the nearest boundaries around the current character,
 940             // decompose and recompose.
 941             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
 942                 c = Character.codePointBefore(s, prevSrc);
 943                 norm16 = normTrie.get(c);
 944                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 945                     prevSrc -= Character.charCount(c);
 946                 }
 947             }
 948             if (doCompose && prevBoundary != prevSrc) {
 949                 buffer.append(s, prevBoundary, prevSrc);
 950             }
 951             int recomposeStartIndex=buffer.length();
 952             // We know there is not a boundary here.
 953             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
 954                            buffer);
 955             // Decompose until the next boundary.
 956             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
 957                                  buffer);
 958             recompose(buffer, recomposeStartIndex, onlyContiguous);
 959             if(!doCompose) {
 960                 if(!buffer.equals(s, prevSrc, src)) {
 961                     return false;
 962                 }
 963                 buffer.remove();
 964             }
 965             prevBoundary=src;
 966         }
 967     }
 968 
 969     /**
 970      * Very similar to compose(): Make the same changes in both places if relevant.
 971      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 972      * !doSpan: quickCheck
 973      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 974      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 975      *         then the quick check result is "no"
 976      */
 977     public int composeQuickCheck(CharSequence s, int src, int limit,
 978                                  boolean onlyContiguous, boolean doSpan) {
 979         int qcResult=0;
 980         int prevBoundary=src;
 981         int minNoMaybeCP=minCompNoMaybeCP;
 982 
 983         for(;;) {
 984             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 985             // or with (compYes && ccc==0) properties.
 986             int prevSrc;
 987             int c = 0;
 988             int norm16 = 0;
 989             for (;;) {
 990                 if(src==limit) {
 991                     return (src<<1)|qcResult;  // "yes" or "maybe"
 992                 }
 993                 if( (c=s.charAt(src))<minNoMaybeCP ||
 994                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 995                 ) {
 996                     ++src;
 997                 } else {
 998                     prevSrc = src++;
 999                     if(!UTF16.isSurrogate((char)c)) {
1000                         break;
1001                     } else {
1002                         char c2;
1003                         if(UTF16Plus.isSurrogateLead(c)) {
1004                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005                                 ++src;
1006                                 c=Character.toCodePoint((char)c, c2);
1007                             }
1008                         } else /* trail surrogate */ {
1009                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010                                 --prevSrc;
1011                                 c=Character.toCodePoint(c2, (char)c);
1012                             }
1013                         }
1014                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1015                             break;
1016                         }
1017                     }
1018                 }
1019             }
1020             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021             // The current character is either a "noNo" (has a mapping)
1022             // or a "maybeYes" (combines backward)
1023             // or a "yesYes" with ccc!=0.
1024             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025 
1026             int prevNorm16 = INERT;
1027             if (prevBoundary != prevSrc) {
1028                 prevBoundary = prevSrc;
1029                 if (!norm16HasCompBoundaryBefore(norm16)) {
1030                     c = Character.codePointBefore(s, prevSrc);
1031                     int n16 = getNorm16(c);
1032                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033                         prevBoundary -= Character.charCount(c);
1034                         prevNorm16 = n16;
1035                     }
1036                 }
1037             }
1038 
1039             if(isMaybeOrNonZeroCC(norm16)) {
1040                 int cc=getCCFromYesOrMaybe(norm16);
1041                 if (onlyContiguous /* FCC */ && cc != 0 &&
1042                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1043                     // The [prevBoundary..prevSrc[ character
1044                     // passed the quick check "yes && ccc==0" test
1045                     // but is out of canonical order with the current combining mark.
1046                 } else {
1047                     // If !onlyContiguous (not FCC), then we ignore the tccc of
1048                     // the previous character which passed the quick check "yes && ccc==0" test.
1049                     for (;;) {
1050                         if (norm16 < MIN_YES_YES_WITH_CC) {
1051                             if (!doSpan) {
1052                                 qcResult = 1;
1053                             } else {
1054                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1055                             }
1056                         }
1057                         if (src == limit) {
1058                             return (src<<1) | qcResult;  // "yes" or "maybe"
1059                         }
1060                         int prevCC = cc;
1061                         c = Character.codePointAt(s, src);
1062                         norm16 = getNorm16(c);
1063                         if (isMaybeOrNonZeroCC(norm16)) {
1064                             cc = getCCFromYesOrMaybe(norm16);
1065                             if (!(prevCC <= cc || cc == 0)) {
1066                                 break;
1067                             }
1068                         } else {
1069                             break;
1070                         }
1071                         src += Character.charCount(c);
1072                     }
1073                     // src is after the last in-order combining mark.
1074                     if (isCompYesAndZeroCC(norm16)) {
1075                         prevBoundary = src;
1076                         src += Character.charCount(c);
1077                         continue;
1078                     }
1079                 }
1080             }
1081             return prevBoundary<<1;  // "no"
1082         }
1083     }
1084     public void composeAndAppend(CharSequence s,
1085                                  boolean doCompose,
1086                                  boolean onlyContiguous,
1087                                  ReorderingBuffer buffer) {
1088         int src=0, limit=s.length();
1089         if(!buffer.isEmpty()) {
1090             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1091             if(0!=firstStarterInSrc) {
1092                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1093                                                                buffer.length(), onlyContiguous);
1094                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1095                                                        firstStarterInSrc+16);
1096                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1097                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1098                 middle.append(s, 0, firstStarterInSrc);
1099                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1100                 src=firstStarterInSrc;
1101             }
1102         }
1103         if(doCompose) {
1104             compose(s, src, limit, onlyContiguous, true, buffer);
1105         } else {
1106             buffer.append(s, src, limit);
1107         }
1108     }
1109     // Dual functionality:
1110     // buffer!=NULL: normalize
1111     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1112     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1113         // Note: In this function we use buffer->appendZeroCC() because we track
1114         // the lead and trail combining classes here, rather than leaving it to
1115         // the ReorderingBuffer.
1116         // The exception is the call to decomposeShort() which uses the buffer
1117         // in the normal way.
1118 
1119         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120         // Similar to the prevBoundary in the compose() implementation.
1121         int prevBoundary=src;
1122         int prevSrc;
1123         int c=0;
1124         int prevFCD16=0;
1125         int fcd16=0;
1126 
1127         for(;;) {
1128             // count code units with lccc==0
1129             for(prevSrc=src; src!=limit;) {
1130                 if((c=s.charAt(src))<minLcccCP) {
1131                     prevFCD16=~c;
1132                     ++src;
1133                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134                     prevFCD16=0;
1135                     ++src;
1136                 } else {
1137                     if(UTF16.isSurrogate((char)c)) {
1138                         char c2;
1139                         if(UTF16Plus.isSurrogateLead(c)) {
1140                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141                                 c=Character.toCodePoint((char)c, c2);
1142                             }
1143                         } else /* trail surrogate */ {
1144                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145                                 --src;
1146                                 c=Character.toCodePoint(c2, (char)c);
1147                             }
1148                         }
1149                     }
1150                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151                         prevFCD16=fcd16;
1152                         src+=Character.charCount(c);
1153                     } else {
1154                         break;
1155                     }
1156                 }
1157             }
1158             // copy these code units all at once
1159             if(src!=prevSrc) {
1160                 if(src==limit) {
1161                     if(buffer!=null) {
1162                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163                     }
1164                     break;
1165                 }
1166                 prevBoundary=src;
1167                 // We know that the previous character's lccc==0.
1168                 if(prevFCD16<0) {
1169                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1170                     int prev=~prevFCD16;
1171                     if(prev<minDecompNoCP) {
1172                         prevFCD16=0;
1173                     } else {
1174                         prevFCD16=getFCD16FromNormData(prev);
1175                         if(prevFCD16>1) {
1176                             --prevBoundary;
1177                         }
1178                     }
1179                 } else {
1180                     int p=src-1;
1181                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1182                         Character.isHighSurrogate(s.charAt(p-1))
1183                     ) {
1184                         --p;
1185                         // Need to fetch the previous character's FCD value because
1186                         // prevFCD16 was just for the trail surrogate code point.
1187                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1188                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1189                     }
1190                     if(prevFCD16>1) {
1191                         prevBoundary=p;
1192                     }
1193                 }
1194                 if(buffer!=null) {
1195                     // The last lccc==0 character is excluded from the
1196                     // flush-and-append call in case it needs to be modified.
1197                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1198                     buffer.append(s, prevBoundary, src);
1199                 }
1200                 // The start of the current character (c).
1201                 prevSrc=src;
1202             } else if(src==limit) {
1203                 break;
1204             }
1205 
1206             src+=Character.charCount(c);
1207             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1208             // Check for proper order, and decompose locally if necessary.
1209             if((prevFCD16&0xff)<=(fcd16>>8)) {
1210                 // proper order: prev tccc <= current lccc
1211                 if((fcd16&0xff)<=1) {
1212                     prevBoundary=src;
1213                 }
1214                 if(buffer!=null) {
1215                     buffer.appendZeroCC(c);
1216                 }
1217                 prevFCD16=fcd16;
1218                 continue;
1219             } else if(buffer==null) {
1220                 return prevBoundary;  // quick check "no"
1221             } else {
1222                 /*
1223                  * Back out the part of the source that we copied or appended
1224                  * already but is now going to be decomposed.
1225                  * prevSrc is set to after what was copied/appended.
1226                  */
1227                 buffer.removeSuffix(prevSrc-prevBoundary);
1228                 /*
1229                  * Find the part of the source that needs to be decomposed,
1230                  * up to the next safe boundary.
1231                  */
1232                 src=findNextFCDBoundary(s, src, limit);
1233                 /*
1234                  * The source text does not fulfill the conditions for FCD.
1235                  * Decompose and reorder a limited piece of the text.
1236                  */
1237                 decomposeShort(s, prevBoundary, src, false, false, buffer);
1238                 prevBoundary=src;
1239                 prevFCD16=0;
1240             }
1241         }
1242         return src;
1243     }
1244 
1245     public boolean hasDecompBoundaryBefore(int c) {
1246         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1247             norm16HasDecompBoundaryBefore(getNorm16(c));
1248     }
1249     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1250         if (norm16 < minNoNoCompNoMaybeCC) {
1251             return true;
1252         }
1253         if (norm16 >= limitNoNo) {
1254             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1255         }
1256         // c decomposes, get everything from the variable-length extra data
1257         int mapping=norm16>>OFFSET_SHIFT;
1258         int firstUnit=extraData.charAt(mapping);
1259         // true if leadCC==0 (hasFCDBoundaryBefore())
1260         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1261     }
1262     public boolean hasDecompBoundaryAfter(int c) {
1263         if (c < minDecompNoCP) {
1264             return true;
1265         }
1266         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1267             return true;
1268         }
1269         return norm16HasDecompBoundaryAfter(getNorm16(c));
1270     }
1271     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1272         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1273             return true;
1274         }
1275         if (norm16 >= limitNoNo) {
1276             if (isMaybeOrNonZeroCC(norm16)) {
1277                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1278             }
1279             // Maps to an isCompYesAndZeroCC.
1280             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1281         }
1282         // c decomposes, get everything from the variable-length extra data
1283         int mapping=norm16>>OFFSET_SHIFT;
1284         int firstUnit=extraData.charAt(mapping);
1285         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1286         // fcd16<=1 || trailCC==0
1287         if(firstUnit>0x1ff) {
1288             return false;  // trailCC>1
1289         }
1290         if(firstUnit<=0xff) {
1291             return true;  // trailCC==0
1292         }
1293         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1294         // true if leadCC==0 (hasFCDBoundaryBefore())
1295         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1296     }
1297     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1298 
1299     public boolean hasCompBoundaryBefore(int c) {
1300         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1301     }
1302     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1303         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1304     }
1305 
1306     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1307     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1308     private static boolean isInert(int norm16) { return norm16==INERT; }
1309     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1310     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
1311     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
1312     private boolean isHangulLVT(int norm16) {
1313         return norm16==hangulLVT();
1314     }
1315     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1316     // UBool isCompYes(uint16_t norm16) const {
1317     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1318     // }
1319     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1320     //     return norm16<minNoNo || minMaybeYes<=norm16;
1321     // }
1322     // private boolean hasZeroCCFromDecompYes(int norm16) {
1323     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1324     // }
1325     private boolean isDecompYesAndZeroCC(int norm16) {
1326         return norm16<minYesNo ||
1327                norm16==JAMO_VT ||
1328                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1329     }
1330     /**
1331      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1332      * the MaybeYes which combine-forward and have ccc=0.
1333      * (Standard Unicode 10 normalization does not have such characters.)
1334      */
1335     private boolean isMostDecompYesAndZeroCC(int norm16) {
1336         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1337     }
1338     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1339 
1340     // For use with isCompYes().
1341     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1342     // static uint8_t getCCFromYes(uint16_t norm16) {
1343     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1344     // }
1345     private int getCCFromNoNo(int norm16) {
1346         int mapping=norm16>>OFFSET_SHIFT;
1347         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1348             return extraData.charAt(mapping-1)&0xff;
1349         } else {
1350             return 0;
1351         }
1352     }
1353     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1354         if(norm16<=minYesNo) {
1355             return 0;  // yesYes and Hangul LV have ccc=tccc=0
1356         } else {
1357             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1358             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1359         }
1360     }
1361 
1362     // Requires algorithmic-NoNo.
1363     private int mapAlgorithmic(int c, int norm16) {
1364         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1365     }
1366 
1367     // Requires minYesNo<norm16<limitNoNo.
1368     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1369 
1370     /**
1371      * @return index into maybeYesCompositions, or -1
1372      */
1373     private int getCompositionsListForDecompYes(int norm16) {
1374         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1375             return -1;
1376         } else {
1377             if((norm16-=minMaybeYes)<0) {
1378                 // norm16<minMaybeYes: index into extraData which is a substring at
1379                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1380                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1381                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1382             }
1383             return norm16>>OFFSET_SHIFT;
1384         }
1385     }
1386     /**
1387      * @return index into maybeYesCompositions
1388      */
1389     private int getCompositionsListForComposite(int norm16) {
1390         // A composite has both mapping & compositions list.
1391         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1392         int firstUnit=maybeYesCompositions.charAt(list);
1393         return list+  // mapping in maybeYesCompositions
1394             1+  // +1 to skip the first unit with the mapping length
1395             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1396     }
1397 
1398     // Decompose a short piece of text which is likely to contain characters that
1399     // fail the quick check loop and/or where the quick check loop's overhead
1400     // is unlikely to be amortized.
1401     // Called by the compose() and makeFCD() implementations.
1402     // Public in Java for collation implementation code.
1403     private int decomposeShort(
1404             CharSequence s, int src, int limit,
1405             boolean stopAtCompBoundary, boolean onlyContiguous,
1406             ReorderingBuffer buffer) {
1407         while(src<limit) {
1408             int c=Character.codePointAt(s, src);
1409             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1410                 return src;
1411             }
1412             int norm16 = getNorm16(c);
1413             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414                 return src;
1415             }
1416             src+=Character.charCount(c);
1417             decompose(c, norm16, buffer);
1418             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419                 return src;
1420             }
1421         }
1422         return src;
1423     }
1424     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1425         // get the decomposition and the lead and trail cc's
1426         if (norm16 >= limitNoNo) {
1427             if (isMaybeOrNonZeroCC(norm16)) {
1428                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429                 return;
1430             }
1431             // Maps to an isCompYesAndZeroCC.
1432             c=mapAlgorithmic(c, norm16);
1433             norm16=getNorm16(c);
1434         }
1435         if (norm16 < minYesNo) {
1436             // c does not decompose
1437             buffer.append(c, 0);
1438         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439             // Hangul syllable: decompose algorithmically
1440             Hangul.decompose(c, buffer);
1441         } else {
1442             // c decomposes, get everything from the variable-length extra data
1443             int mapping=norm16>>OFFSET_SHIFT;
1444             int firstUnit=extraData.charAt(mapping);
1445             int length=firstUnit&MAPPING_LENGTH_MASK;
1446             int leadCC, trailCC;
1447             trailCC=firstUnit>>8;
1448             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449                 leadCC=extraData.charAt(mapping-1)>>8;
1450             } else {
1451                 leadCC=0;
1452             }
1453             ++mapping;  // skip over the firstUnit
1454             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1455         }
1456     }
1457 
1458     /**
1459      * Finds the recomposition result for
1460      * a forward-combining "lead" character,
1461      * specified with a pointer to its compositions list,
1462      * and a backward-combining "trail" character.
1463      *
1464      * <p>If the lead and trail characters combine, then this function returns
1465      * the following "compositeAndFwd" value:
1466      * <pre>
1467      * Bits 21..1  composite character
1468      * Bit      0  set if the composite is a forward-combining starter
1469      * </pre>
1470      * otherwise it returns -1.
1471      *
1472      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473      * encoded as either pairs or triples of 16-bit units.
1474      * The last entry has the high bit of its first unit set.
1475      *
1476      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1477      * A linear search is used.
1478      *
1479      * <p>See normalizer2impl.h for a more detailed description
1480      * of the compositions list format.
1481      */
1482     private static int combine(String compositions, int list, int trail) {
1483         int key1, firstUnit;
1484         if(trail<COMP_1_TRAIL_LIMIT) {
1485             // trail character is 0..33FF
1486             // result entry may have 2 or 3 units
1487             key1=(trail<<1);
1488             while(key1>(firstUnit=compositions.charAt(list))) {
1489                 list+=2+(firstUnit&COMP_1_TRIPLE);
1490             }
1491             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1492                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1493                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1494                 } else {
1495                     return compositions.charAt(list+1);
1496                 }
1497             }
1498         } else {
1499             // trail character is 3400..10FFFF
1500             // result entry has 3 units
1501             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1502             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1503             int secondUnit;
1504             for(;;) {
1505                 if(key1>(firstUnit=compositions.charAt(list))) {
1506                     list+=2+(firstUnit&COMP_1_TRIPLE);
1507                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1508                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1509                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1510                             break;
1511                         } else {
1512                             list+=3;
1513                         }
1514                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1515                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1516                     } else {
1517                         break;
1518                     }
1519                 } else {
1520                     break;
1521                 }
1522             }
1523         }
1524         return -1;
1525     }
1526 
1527     /*
1528      * Recomposes the buffer text starting at recomposeStartIndex
1529      * (which is in NFD - decomposed and canonically ordered),
1530      * and truncates the buffer contents.
1531      *
1532      * Note that recomposition never lengthens the text:
1533      * Any character consists of either one or two code units;
1534      * a composition may contain at most one more code unit than the original starter,
1535      * while the combining mark that is removed has at least one code unit.
1536      */
1537     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1538                            boolean onlyContiguous) {
1539         StringBuilder sb=buffer.getStringBuilder();
1540         int p=recomposeStartIndex;
1541         if(p==sb.length()) {
1542             return;
1543         }
1544 
1545         int starter, pRemove;
1546         int compositionsList;
1547         int c, compositeAndFwd;
1548         int norm16;
1549         int cc, prevCC;
1550         boolean starterIsSupplementary;
1551 
1552         // Some of the following variables are not used until we have a forward-combining starter
1553         // and are only initialized now to avoid compiler warnings.
1554         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1555         starter=-1;
1556         starterIsSupplementary=false;
1557         prevCC=0;
1558 
1559         for(;;) {
1560             c=sb.codePointAt(p);
1561             p+=Character.charCount(c);
1562             norm16=getNorm16(c);
1563             cc=getCCFromYesOrMaybe(norm16);
1564             if( // this character combines backward and
1565                 isMaybe(norm16) &&
1566                 // we have seen a starter that combines forward and
1567                 compositionsList>=0 &&
1568                 // the backward-combining character is not blocked
1569                 (prevCC<cc || prevCC==0)
1570             ) {
1571                 if(isJamoVT(norm16)) {
1572                     // c is a Jamo V/T, see if we can compose it with the previous character.
1573                     if(c<Hangul.JAMO_T_BASE) {
1574                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1575                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1576                         if(prev<Hangul.JAMO_L_COUNT) {
1577                             pRemove=p-1;
1578                             char syllable=(char)
1579                                 (Hangul.HANGUL_BASE+
1580                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1581                                  Hangul.JAMO_T_COUNT);
1582                             char t;
1583                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1584                                 ++p;
1585                                 syllable+=t;  // The next character was a Jamo T.
1586                             }
1587                             sb.setCharAt(starter, syllable);
1588                             // remove the Jamo V/T
1589                             sb.delete(pRemove, p);
1590                             p=pRemove;
1591                         }
1592                     }
1593                     /*
1594                      * No "else" for Jamo T:
1595                      * Since the input is in NFD, there are no Hangul LV syllables that
1596                      * a Jamo T could combine with.
1597                      * All Jamo Ts are combined above when handling Jamo Vs.
1598                      */
1599                     if(p==sb.length()) {
1600                         break;
1601                     }
1602                     compositionsList=-1;
1603                     continue;
1604                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1605                     // The starter and the combining mark (c) do combine.
1606                     int composite=compositeAndFwd>>1;
1607 
1608                     // Remove the combining mark.
1609                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1610                     sb.delete(pRemove, p);
1611                     p=pRemove;
1612                     // Replace the starter with the composite.
1613                     if(starterIsSupplementary) {
1614                         if(composite>0xffff) {
1615                             // both are supplementary
1616                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1617                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1618                         } else {
1619                             sb.setCharAt(starter, (char)c);
1620                             sb.deleteCharAt(starter+1);
1621                             // The composite is shorter than the starter,
1622                             // move the intermediate characters forward one.
1623                             starterIsSupplementary=false;
1624                             --p;
1625                         }
1626                     } else if(composite>0xffff) {
1627                         // The composite is longer than the starter,
1628                         // move the intermediate characters back one.
1629                         starterIsSupplementary=true;
1630                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1631                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1632                         ++p;
1633                     } else {
1634                         // both are on the BMP
1635                         sb.setCharAt(starter, (char)composite);
1636                     }
1637 
1638                     // Keep prevCC because we removed the combining mark.
1639 
1640                     if(p==sb.length()) {
1641                         break;
1642                     }
1643                     // Is the composite a starter that combines forward?
1644                     if((compositeAndFwd&1)!=0) {
1645                         compositionsList=
1646                             getCompositionsListForComposite(getNorm16(composite));
1647                     } else {
1648                         compositionsList=-1;
1649                     }
1650 
1651                     // We combined; continue with looking for compositions.
1652                     continue;
1653                 }
1654             }
1655 
1656             // no combination this time
1657             prevCC=cc;
1658             if(p==sb.length()) {
1659                 break;
1660             }
1661 
1662             // If c did not combine, then check if it is a starter.
1663             if(cc==0) {
1664                 // Found a new starter.
1665                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1666                     // It may combine with something, prepare for it.
1667                     if(c<=0xffff) {
1668                         starterIsSupplementary=false;
1669                         starter=p-1;
1670                     } else {
1671                         starterIsSupplementary=true;
1672                         starter=p-2;
1673                     }
1674                 }
1675             } else if(onlyContiguous) {
1676                 // FCC: no discontiguous compositions; any intervening character blocks.
1677                 compositionsList=-1;
1678             }
1679         }
1680         buffer.flush();
1681     }
1682 
1683     /**
1684      * Does c have a composition boundary before it?
1685      * True if its decomposition begins with a character that has
1686      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1687      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1688      * (isCompYesAndZeroCC()) so we need not decompose.
1689      */
1690     private boolean hasCompBoundaryBefore(int c, int norm16) {
1691         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
1692     }
1693     private boolean norm16HasCompBoundaryBefore(int norm16) {
1694         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1695     }
1696     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1697         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1698     }
1699     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1700         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1701             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1702     }
1703     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1704         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1705     }
1706     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
1707     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1708         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1709             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1710     }
1711 
1712     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1713         while(p>0) {
1714             int c=Character.codePointBefore(s, p);
1715             int norm16 = getNorm16(c);
1716             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1717                 break;
1718             }
1719             p-=Character.charCount(c);
1720             if(hasCompBoundaryBefore(c, norm16)) {
1721                 break;
1722             }
1723         }
1724         return p;
1725     }
1726     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
1727         while(p<limit) {
1728             int c=Character.codePointAt(s, p);
1729             int norm16=normTrie.get(c);
1730             if(hasCompBoundaryBefore(c, norm16)) {
1731                 break;
1732             }
1733             p+=Character.charCount(c);
1734             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1735                 break;
1736             }
1737         }
1738         return p;
1739     }
1740 
1741 
1742     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1743         while(p<limit) {
1744             int c=Character.codePointAt(s, p);
1745             int norm16;
1746             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1747                 break;
1748             }
1749             p+=Character.charCount(c);
1750             if (norm16HasDecompBoundaryAfter(norm16)) {
1751                 break;
1752             }
1753         }
1754         return p;
1755     }
1756 
1757     /**
1758      * Get the canonical decomposition
1759      * sherman  for ComposedCharIter
1760      */
1761     public static int getDecompose(int chars[], String decomps[]) {
1762         Normalizer2 impl = Normalizer2.getNFDInstance();
1763 
1764         int length=0;
1765         int norm16 = 0;
1766         int ch = -1;
1767         int i = 0;
1768 
1769         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1770             //TBD !!!! the hack code heres save us about 50ms for startup
1771             //need a better solution/lookup
1772             if (ch == 0x30ff)
1773                 ch = 0xf900;
1774             else if (ch == 0x115bc)
1775                 ch = 0x1d15e;
1776             else if (ch == 0x1d1c1)
1777                 ch = 0x2f800;
1778 
1779             String s = impl.getDecomposition(ch);
1780 
1781             if(s != null && i < chars.length) {
1782                 chars[i] = ch;
1783                 decomps[i++] = s;
1784             }
1785         }
1786         return i;
1787     }
1788 
1789     //------------------------------------------------------
1790     // special method for Collation (RBTableBuilder.build())
1791     //------------------------------------------------------
1792     private static boolean needSingleQuotation(char c) {
1793         return (c >= 0x0009 && c <= 0x000D) ||
1794                (c >= 0x0020 && c <= 0x002F) ||
1795                (c >= 0x003A && c <= 0x0040) ||
1796                (c >= 0x005B && c <= 0x0060) ||
1797                (c >= 0x007B && c <= 0x007E);
1798     }
1799 
1800     public static String canonicalDecomposeWithSingleQuotation(String string) {
1801        Normalizer2 impl = Normalizer2.getNFDInstance();
1802        char[] src = string.toCharArray();
1803        int    srcIndex = 0;
1804        int    srcLimit = src.length;
1805        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
1806        int    destIndex = 0;
1807        int    destLimit = dest.length;
1808 
1809         int prevSrc;
1810         String norm;
1811         int reorderStartIndex, length;
1812         char c1, c2;
1813         int cp;
1814         int minNoMaybe = 0x00c0;
1815         int cc, prevCC, trailCC;
1816         char[] p;
1817         int pStart;
1818 
1819         // initialize
1820         reorderStartIndex = 0;
1821         prevCC = 0;
1822         norm = null;
1823         cp = 0;
1824         pStart = 0;
1825 
1826         cc = trailCC = -1; // initialize to bogus value
1827         c1 = 0;
1828         for (;;) {
1829             prevSrc=srcIndex;
1830             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
1831             while (srcIndex != srcLimit &&
1832                    ((c1 = src[srcIndex]) < minNoMaybe ||
1833                     (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
1834                     (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
1835                 prevCC = 0;
1836                 srcIndex += (cp < 0x10000) ? 1 : 2;
1837             }
1838 
1839             // copy these code units all at once
1840             if (srcIndex != prevSrc) {
1841                 length = srcIndex - prevSrc;
1842                 if ((destIndex + length) <= destLimit) {
1843                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1844                 }
1845 
1846                 destIndex += length;
1847                 reorderStartIndex = destIndex;
1848             }
1849 
1850             // end of source reached?
1851             if (srcIndex == srcLimit) {
1852                 break;
1853             }
1854 
1855             // cp already contains *src and norm32 is set for it, increment src
1856             srcIndex += (cp < 0x10000) ? 1 : 2;
1857 
1858             if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1859                 c2 = 0;
1860                 length = 1;
1861 
1862                 if (Character.isHighSurrogate(c1)
1863                     || Character.isLowSurrogate(c1)) {
1864                     norm = null;
1865                 }
1866             } else {
1867                 length = 2;
1868                 c2 = src[srcIndex-1];
1869             }
1870 
1871           // get the decomposition and the lead and trail cc's
1872           if (norm == null) {
1873               // cp does not decompose
1874               cc = trailCC = UCharacter.getCombiningClass(cp);
1875               p = null;
1876               pStart = -1;
1877           } else {
1878 
1879                 pStart = 0;
1880                 p = norm.toCharArray();
1881                 length = p.length;
1882                 int cpNum = norm.codePointCount(0, length);
1883                 cc= UCharacter.getCombiningClass(norm.codePointAt(0));
1884                 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
1885                 if (length == 1) {
1886                     // fastpath a single code unit from decomposition
1887                     c1 = p[pStart];
1888                     c2 = 0;
1889                     p = null;
1890                     pStart = -1;
1891                 }
1892             }
1893 
1894             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
1895                 // buffer overflow
1896                 char[] tmpBuf = new char[destLimit * 2];
1897                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
1898                 dest = tmpBuf;
1899                 destLimit = dest.length;
1900             }
1901 
1902             // append the decomposition to the destination buffer, assume length>0
1903             {
1904                 int reorderSplit = destIndex;
1905                 if (p == null) {
1906                     // fastpath: single code point
1907                     if (needSingleQuotation(c1)) {
1908                         //if we need single quotation, no need to consider "prevCC"
1909                         //and it must NOT be a supplementary pair
1910                         dest[destIndex++] = '\'';
1911                         dest[destIndex++] = c1;
1912                         dest[destIndex++] = '\'';
1913                         trailCC = 0;
1914                     } else if(cc != 0 && cc < prevCC) {
1915                         // (c1, c2) is out of order with respect to the preceding
1916                         //  text
1917                         destIndex += length;
1918                         trailCC = insertOrdered(dest, reorderStartIndex,
1919                                                 reorderSplit, destIndex, c1, c2, cc);
1920                     } else {
1921                         // just append (c1, c2)
1922                         dest[destIndex++] = c1;
1923                         if(c2 != 0) {
1924                             dest[destIndex++] = c2;
1925                         }
1926                     }
1927                 } else {
1928                     // general: multiple code points (ordered by themselves)
1929                     // from decomposition
1930                     if (needSingleQuotation(p[pStart])) {
1931                         dest[destIndex++] = '\'';
1932                         dest[destIndex++] = p[pStart++];
1933                         dest[destIndex++] = '\'';
1934                         length--;
1935                         do {
1936                             dest[destIndex++] = p[pStart++];
1937                         } while(--length > 0);
1938                     } else if (cc != 0 && cc < prevCC) {
1939                         destIndex += length;
1940                         trailCC = mergeOrdered(dest, reorderStartIndex,
1941                                                reorderSplit, p, pStart,
1942                                                pStart+length);
1943                     } else {
1944                         // just append the decomposition
1945                         do {
1946                             dest[destIndex++] = p[pStart++];
1947                         } while (--length > 0);
1948                     }
1949                 }
1950             }
1951             prevCC = trailCC;
1952             if(prevCC == 0) {
1953                 reorderStartIndex = destIndex;
1954             }
1955         }
1956 
1957         return new String(dest, 0, destIndex);
1958     }
1959 
1960     /**
1961      * simpler, single-character version of mergeOrdered() -
1962      * bubble-insert one single code point into the preceding string
1963      * which is already canonically ordered
1964      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
1965      *
1966      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1967      *
1968      * before: src[start]..src[current] is already ordered, and
1969      *         src[current]..src[p]     may or may not hold (c, c2) but
1970      *                          must be exactly the same length as (c, c2)
1971      * after: src[start]..src[p] is ordered
1972      *
1973      * @return the trailing combining class
1974      */
1975     private static int/*unsigned byte*/ insertOrdered(char[] source,
1976                                                       int start,
1977                                                       int current, int p,
1978                                                       char c1, char c2,
1979                                                       int/*unsigned byte*/ cc) {
1980         int back, preBack;
1981         int r;
1982         int prevCC, trailCC=cc;
1983 
1984         if (start<current && cc!=0) {
1985             // search for the insertion point where cc>=prevCC
1986             preBack=back=current;
1987 
1988             PrevArgs prevArgs = new PrevArgs();
1989             prevArgs.current  = current;
1990             prevArgs.start    = start;
1991             prevArgs.src      = source;
1992             prevArgs.c1       = c1;
1993             prevArgs.c2       = c2;
1994 
1995             // get the prevCC
1996             prevCC=getPrevCC(prevArgs);
1997             preBack = prevArgs.current;
1998 
1999             if(cc<prevCC) {
2000                 // this will be the last code point, so keep its cc
2001                 trailCC=prevCC;
2002                 back=preBack;
2003                 while(start<preBack) {
2004                     prevCC=getPrevCC(prevArgs);
2005                     preBack=prevArgs.current;
2006                     if(cc>=prevCC) {
2007                         break;
2008                     }
2009                     back=preBack;
2010                 }
2011 
2012                 // this is where we are right now with all these indicies:
2013                 // [start]..[pPreBack] 0..? code points that we can ignore
2014                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
2015                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2016                 // [current]..[p]         1 code point (c, c2) with cc
2017 
2018                 // move the code units in between up
2019                 r=p;
2020                 do {
2021                     source[--r]=source[--current];
2022                 } while (back!=current);
2023             }
2024         }
2025 
2026         // insert (c1, c2)
2027         source[current] = c1;
2028         if (c2!=0) {
2029             source[(current+1)] = c2;
2030         }
2031 
2032         // we know the cc of the last code point
2033         return trailCC;
2034     }
2035     /**
2036      * merge two UTF-16 string parts together
2037      * to canonically order (order by combining classes) their concatenation
2038      *
2039      * the two strings may already be adjacent, so that the merging is done
2040      * in-place if the two strings are not adjacent, then the buffer holding the
2041      * first one must be large enough
2042      * the second string may or may not be ordered in itself
2043      *
2044      * before: [start]..[current] is already ordered, and
2045      *         [next]..[limit]    may be ordered in itself, but
2046      *                          is not in relation to [start..current[
2047      * after: [start..current+(limit-next)[ is ordered
2048      *
2049      * the algorithm is a simple bubble-sort that takes the characters from
2050      * src[next++] and inserts them in correct combining class order into the
2051      * preceding part of the string
2052      *
2053      * since this function is called much less often than the single-code point
2054      * insertOrdered(), it just uses that for easier maintenance
2055      *
2056      * @return the trailing combining class
2057      */
2058     private static int /*unsigned byte*/ mergeOrdered(char[] source,
2059                                                       int start,
2060                                                       int current,
2061                                                       char[] data,
2062                                                         int next,
2063                                                         int limit) {
2064             int r;
2065             int /*unsigned byte*/ cc, trailCC=0;
2066             boolean adjacent;
2067 
2068             adjacent= current==next;
2069             NextCCArgs ncArgs = new NextCCArgs();
2070             ncArgs.source = data;
2071             ncArgs.next   = next;
2072             ncArgs.limit  = limit;
2073 
2074             if(start!=current) {
2075 
2076                 while(ncArgs.next<ncArgs.limit) {
2077                     cc=getNextCC(ncArgs);
2078                     if(cc==0) {
2079                         // does not bubble back
2080                         trailCC=0;
2081                         if(adjacent) {
2082                             current=ncArgs.next;
2083                         } else {
2084                             data[current++]=ncArgs.c1;
2085                             if(ncArgs.c2!=0) {
2086                                 data[current++]=ncArgs.c2;
2087                             }
2088                         }
2089                         break;
2090                     } else {
2091                         r=current+(ncArgs.c2==0 ? 1 : 2);
2092                         trailCC=insertOrdered(source,start, current, r,
2093                                               ncArgs.c1, ncArgs.c2, cc);
2094                         current=r;
2095                     }
2096                 }
2097             }
2098 
2099             if(ncArgs.next==ncArgs.limit) {
2100                 // we know the cc of the last code point
2101                 return trailCC;
2102             } else {
2103                 if(!adjacent) {
2104                     // copy the second string part
2105                     do {
2106                         source[current++]=data[ncArgs.next++];
2107                     } while(ncArgs.next!=ncArgs.limit);
2108                     ncArgs.limit=current;
2109                 }
2110                 PrevArgs prevArgs = new PrevArgs();
2111                 prevArgs.src   = data;
2112                 prevArgs.start = start;
2113                 prevArgs.current =  ncArgs.limit;
2114                 return getPrevCC(prevArgs);
2115             }
2116 
2117     }
2118     private static final class PrevArgs{
2119         char[] src;
2120         int start;
2121         int current;
2122         char c1;
2123         char c2;
2124     }
2125 
2126     private static final class NextCCArgs{
2127         char[] source;
2128         int next;
2129         int limit;
2130         char c1;
2131         char c2;
2132     }
2133     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2134         args.c1=args.source[args.next++];
2135         args.c2=0;
2136 
2137         if (UTF16.isTrailSurrogate(args.c1)) {
2138             /* unpaired second surrogate */
2139             return 0;
2140         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2141             return UCharacter.getCombiningClass(args.c1);
2142         } else if (args.next!=args.limit &&
2143                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2144             ++args.next;
2145             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2146         } else {
2147             /* unpaired first surrogate */
2148             args.c2=0;
2149             return 0;
2150         }
2151     }
2152     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2153         args.c1=args.src[--args.current];
2154         args.c2=0;
2155 
2156         if (args.c1 < MIN_CCC_LCCC_CP) {
2157             return 0;
2158         } else if (UTF16.isLeadSurrogate(args.c1)) {
2159             /* unpaired first surrogate */
2160             return 0;
2161         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2162             return UCharacter.getCombiningClass(args.c1);
2163         } else if (args.current!=args.start &&
2164                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2165             --args.current;
2166             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2167         } else {
2168             /* unpaired second surrogate */
2169             args.c2=0;
2170             return 0;
2171         }
2172     }
2173 
2174     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2175         if (start == p) {
2176             return 0;
2177         }
2178         return getFCD16(Character.codePointBefore(s, p));
2179     }
2180 
2181     private VersionInfo dataVersion;
2182 
2183     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2184     private int minDecompNoCP;
2185     private int minCompNoMaybeCP;
2186     private int minLcccCP;
2187 
2188     // Norm16 value thresholds for quick check combinations and types of extra data.
2189     private int minYesNo;
2190     private int minYesNoMappingsOnly;
2191     private int minNoNo;
2192     private int minNoNoCompBoundaryBefore;
2193     private int minNoNoCompNoMaybeCC;
2194     private int minNoNoEmpty;
2195     private int limitNoNo;
2196     private int centerNoNoDelta;
2197     private int minMaybeYes;
2198 
2199     private Trie2_16 normTrie;
2200     private String maybeYesCompositions;
2201     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2202     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2203 
2204    }