1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 package sun.text.normalizer;
  33 
  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 import java.text.Normalizer;
  37 
  38 // Original filename in ICU4J: Normalizer2Impl.java
  39 public final class NormalizerImpl {
  40     public static final class Hangul {
  41         /* Korean Hangul and Jamo constants */
  42         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  43         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  44         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
  45 
  46         public static final int HANGUL_BASE=0xac00;
  47         public static final int HANGUL_END=0xd7a3;
  48 
  49         public static final int JAMO_L_COUNT=19;
  50         public static final int JAMO_V_COUNT=21;
  51         public static final int JAMO_T_COUNT=28;
  52 
  53         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  54         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  55 
  56         public static boolean isHangul(int c) {
  57             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  58         }
  59         public static boolean isHangulLV(int c) {
  60             c-=HANGUL_BASE;
  61             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  62         }
  63 
  64         /**
  65          * Decomposes c, which must be a Hangul syllable, into buffer
  66          * and returns the length of the decomposition (2 or 3).
  67          */
  68         public static int decompose(int c, Appendable buffer) {
  69             try {
  70                 c-=HANGUL_BASE;
  71                 int c2=c%JAMO_T_COUNT;
  72                 c/=JAMO_T_COUNT;
  73                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  74                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  75                 if(c2==0) {
  76                     return 2;
  77                 } else {
  78                     buffer.append((char)(JAMO_T_BASE+c2));
  79                     return 3;
  80                 }
  81             } catch(IOException e) {
  82                 throw new InternalError(e);
  83             }
  84         }
  85     }
  86 
  87     /**
  88      * Writable buffer that takes care of canonical ordering.
  89      * Its Appendable methods behave like the C++ implementation's
  90      * appendZeroCC() methods.
  91      * <p>
  92      * If dest is a StringBuilder, then the buffer writes directly to it.
  93      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  94      * until no further changes are necessary and whole segments are appended.
  95      * append() methods that take combining-class values always write to the StringBuilder.
  96      * Other append() methods flush and append to the Appendable.
  97      */
  98     public static final class ReorderingBuffer implements Appendable {
  99         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
 100             impl=ni;
 101             app=dest;
 102             if (app instanceof StringBuilder) {
 103                 appIsStringBuilder=true;
 104                 str=(StringBuilder)dest;
 105                 // In Java, the constructor subsumes public void init(int destCapacity)
 106                 str.ensureCapacity(destCapacity);
 107                 reorderStart=0;
 108                 if(str.length()==0) {
 109                     lastCC=0;
 110                 } else {
 111                     setIterator();
 112                     lastCC=previousCC();
 113                     // Set reorderStart after the last code point with cc<=1 if there is one.
 114                     if(lastCC>1) {
 115                         while(previousCC()>1) {}
 116                     }
 117                     reorderStart=codePointLimit;
 118                 }
 119             } else {
 120                 appIsStringBuilder=false;
 121                 str=new StringBuilder();
 122                 reorderStart=0;
 123                 lastCC=0;
 124             }
 125         }
 126 
 127         public boolean isEmpty() { return str.length()==0; }
 128         public int length() { return str.length(); }
 129         public int getLastCC() { return lastCC; }
 130 
 131         public StringBuilder getStringBuilder() { return str; }
 132 
 133         public boolean equals(CharSequence s, int start, int limit) {
 134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 135         }
 136 
 137         public void append(int c, int cc) {
 138             if(lastCC<=cc || cc==0) {
 139                 str.appendCodePoint(c);
 140                 lastCC=cc;
 141                 if(cc<=1) {
 142                     reorderStart=str.length();
 143                 }
 144             } else {
 145                 insert(c, cc);
 146             }
 147         }
 148         public void append(CharSequence s, int start, int limit, boolean isNFD,
 149                            int leadCC, int trailCC) {
 150             if(start==limit) {
 151                 return;
 152             }
 153             if(lastCC<=leadCC || leadCC==0) {
 154                 if(trailCC<=1) {
 155                     reorderStart=str.length()+(limit-start);
 156                 } else if(leadCC<=1) {
 157                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 158                 }
 159                 str.append(s, start, limit);
 160                 lastCC=trailCC;
 161             } else {
 162                 int c=Character.codePointAt(s, start);
 163                 start+=Character.charCount(c);
 164                 insert(c, leadCC);  // insert first code point
 165                 while(start<limit) {
 166                     c=Character.codePointAt(s, start);
 167                     start+=Character.charCount(c);
 168                     if(start<limit) {
 169                         if (isNFD) {
 170                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
 171                         } else {
 172                             leadCC = impl.getCC(impl.getNorm16(c));
 173                         }
 174                     } else {
 175                         leadCC=trailCC;
 176                     }
 177                     append(c, leadCC);
 178                 }
 179             }
 180         }
 181         // The following append() methods work like C++ appendZeroCC().
 182         // They assume that the cc or trailCC of their input is 0.
 183         // Most of them implement Appendable interface methods.
 184         @Override
 185         public ReorderingBuffer append(char c) {
 186             str.append(c);
 187             lastCC=0;
 188             reorderStart=str.length();
 189             return this;
 190         }
 191         public void appendZeroCC(int c) {
 192             str.appendCodePoint(c);
 193             lastCC=0;
 194             reorderStart=str.length();
 195         }
 196         @Override
 197         public ReorderingBuffer append(CharSequence s) {
 198             if(s.length()!=0) {
 199                 str.append(s);
 200                 lastCC=0;
 201                 reorderStart=str.length();
 202             }
 203             return this;
 204         }
 205         @Override
 206         public ReorderingBuffer append(CharSequence s, int start, int limit) {
 207             if(start!=limit) {
 208                 str.append(s, start, limit);
 209                 lastCC=0;
 210                 reorderStart=str.length();
 211             }
 212             return this;
 213         }
 214         /**
 215          * Flushes from the intermediate StringBuilder to the Appendable,
 216          * if they are different objects.
 217          * Used after recomposition.
 218          * Must be called at the end when writing to a non-StringBuilder Appendable.
 219          */
 220         public void flush() {
 221             if(appIsStringBuilder) {
 222                 reorderStart=str.length();
 223             } else {
 224                 try {
 225                     app.append(str);
 226                     str.setLength(0);
 227                     reorderStart=0;
 228                 } catch(IOException e) {
 229                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 230                 }
 231             }
 232             lastCC=0;
 233         }
 234         /**
 235          * Flushes from the intermediate StringBuilder to the Appendable,
 236          * if they are different objects.
 237          * Then appends the new text to the Appendable or StringBuilder.
 238          * Normally used after quick check loops find a non-empty sequence.
 239          */
 240         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 241             if(appIsStringBuilder) {
 242                 str.append(s, start, limit);
 243                 reorderStart=str.length();
 244             } else {
 245                 try {
 246                     app.append(str).append(s, start, limit);
 247                     str.setLength(0);
 248                     reorderStart=0;
 249                 } catch(IOException e) {
 250                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 251                 }
 252             }
 253             lastCC=0;
 254             return this;
 255         }
 256         public void remove() {
 257             str.setLength(0);
 258             lastCC=0;
 259             reorderStart=0;
 260         }
 261         public void removeSuffix(int suffixLength) {
 262             int oldLength=str.length();
 263             str.delete(oldLength-suffixLength, oldLength);
 264             lastCC=0;
 265             reorderStart=str.length();
 266         }
 267 
 268         // Inserts c somewhere before the last character.
 269         // Requires 0<cc<lastCC which implies reorderStart<limit.
 270         private void insert(int c, int cc) {
 271             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 272             // insert c at codePointLimit, after the character with prevCC<=cc
 273             if(c<=0xffff) {
 274                 str.insert(codePointLimit, (char)c);
 275                 if(cc<=1) {
 276                     reorderStart=codePointLimit+1;
 277                 }
 278             } else {
 279                 str.insert(codePointLimit, Character.toChars(c));
 280                 if(cc<=1) {
 281                     reorderStart=codePointLimit+2;
 282                 }
 283             }
 284         }
 285 
 286         private final NormalizerImpl impl;
 287         private final Appendable app;
 288         private final StringBuilder str;
 289         private final boolean appIsStringBuilder;
 290         private int reorderStart;
 291         private int lastCC;
 292 
 293         // private backward iterator
 294         private void setIterator() { codePointStart=str.length(); }
 295         private void skipPrevious() {  // Requires 0<codePointStart.
 296             codePointLimit=codePointStart;
 297             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 298         }
 299         private int previousCC() {  // Returns 0 if there is no previous character.
 300             codePointLimit=codePointStart;
 301             if(reorderStart>=codePointStart) {
 302                 return 0;
 303             }
 304             int c=str.codePointBefore(codePointStart);
 305             codePointStart-=Character.charCount(c);
 306             return impl.getCCFromYesOrMaybeCP(c);
 307         }
 308         private int codePointStart, codePointLimit;
 309     }
 310 
 311     // TODO: Propose as public API on the UTF16 class.
 312     // TODO: Propose widening UTF16 methods that take char to take int.
 313     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 314     public static final class UTF16Plus {
 315         /**
 316          * Is this code point a lead surrogate (U+d800..U+dbff)?
 317          * @param c code unit or code point
 318          * @return true or false
 319          */
 320         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
 321         /**
 322          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 323          * is it a lead surrogate?
 324          * @param c code unit or code point
 325          * @return true or false
 326          */
 327         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 328 
 329         /**
 330          * Compares two CharSequence subsequences for binary equality.
 331          * @param s1 first sequence
 332          * @param start1 start offset in first sequence
 333          * @param limit1 limit offset in first sequence
 334          * @param s2 second sequence
 335          * @param start2 start offset in second sequence
 336          * @param limit2 limit offset in second sequence
 337          * @return true if s1.subSequence(start1, limit1) contains the same text
 338          *              as s2.subSequence(start2, limit2)
 339          */
 340         public static boolean equal(CharSequence s1, int start1, int limit1,
 341                                     CharSequence s2, int start2, int limit2) {
 342             if((limit1-start1)!=(limit2-start2)) {
 343                 return false;
 344             }
 345             if(s1==s2 && start1==start2) {
 346                 return true;
 347             }
 348             while(start1<limit1) {
 349                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 350                     return false;
 351                 }
 352             }
 353             return true;
 354         }
 355     }
 356 
 357     public NormalizerImpl() {}
 358 
 359     private static final class IsAcceptable implements ICUBinary.Authenticate {
 360         public boolean isDataVersionAcceptable(byte version[]) {
 361             return version[0]==4;
 362         }
 363     }
 364     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 365     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 366 
 367     public NormalizerImpl load(ByteBuffer bytes) {
 368         try {
 369             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 370             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 371             if(indexesLength<=IX_MIN_LCCC_CP) {
 372                 throw new InternalError("Normalizer2 data: not enough indexes");
 373             }
 374             int[] inIndexes=new int[indexesLength];
 375             inIndexes[0]=indexesLength*4;
 376             for(int i=1; i<indexesLength; ++i) {
 377                 inIndexes[i]=bytes.getInt();
 378             }
 379 
 380             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 381             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 382             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 383 
 384             minYesNo=inIndexes[IX_MIN_YES_NO];
 385             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 386             minNoNo=inIndexes[IX_MIN_NO_NO];
 387             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 388             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 389             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 390             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 391             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 392             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 393             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 394 
 395             // Read the normTrie.
 396             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 397             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 398             int triePosition = bytes.position();
 399             normTrie = CodePointTrie.Fast16.fromBinary(bytes);
 400             int trieLength = bytes.position() - triePosition;
 401             if(trieLength>(nextOffset-offset)) {
 402                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
 403             }
 404             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 405 
 406             // Read the composition and mapping data.
 407             offset=nextOffset;
 408             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
 409             int numChars=(nextOffset-offset)/2;
 410             if(numChars!=0) {
 411                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
 412                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 413             }
 414 
 415             // smallFCD: new in formatVersion 2
 416             offset=nextOffset;
 417             smallFCD=new byte[0x100];
 418             bytes.get(smallFCD);
 419 
 420             return this;
 421         } catch(IOException e) {
 422             throw new InternalError(e);
 423         }
 424     }
 425     public NormalizerImpl load(String name) {
 426         return load(ICUBinary.getRequiredData(name));
 427     }
 428 
 429     // The trie stores values for lead surrogate code *units*.
 430     // Surrogate code *points* are inert.
 431     public int getNorm16(int c) {
 432         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
 433     }
 434     public int getRawNorm16(int c) { return normTrie.get(c); }
 435     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
 436     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
 437     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
 438 
 439     public int getCC(int norm16) {
 440         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 441             return getCCFromNormalYesOrMaybe(norm16);
 442         }
 443         if(norm16<minNoNo || limitNoNo<=norm16) {
 444             return 0;
 445         }
 446         return getCCFromNoNo(norm16);
 447     }
 448     public static int getCCFromNormalYesOrMaybe(int norm16) {
 449         return (norm16 >> OFFSET_SHIFT) & 0xff;
 450     }
 451     public static int getCCFromYesOrMaybe(int norm16) {
 452         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 453     }
 454     public int getCCFromYesOrMaybeCP(int c) {
 455         if (c < minCompNoMaybeCP) { return 0; }
 456         return getCCFromYesOrMaybe(getNorm16(c));
 457     }
 458 
 459     /**
 460      * Returns the FCD data for code point c.
 461      * @param c A Unicode code point.
 462      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 463      */
 464     public int getFCD16(int c) {
 465         if(c<minDecompNoCP) {
 466             return 0;
 467         } else if(c<=0xffff) {
 468             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 469         }
 470         return getFCD16FromNormData(c);
 471     }
 472     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 473     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 474         // 0<=lead<=0xffff
 475         byte bits=smallFCD[lead>>8];
 476         if(bits==0) { return false; }
 477         return ((bits>>((lead>>5)&7))&1)!=0;
 478     }
 479 
 480     /** Gets the FCD value from the regular normalization data. */
 481     public int getFCD16FromNormData(int c) {
 482         int norm16=getNorm16(c);
 483         if (norm16 >= limitNoNo) {
 484             if(norm16>=MIN_NORMAL_MAYBE_YES) {
 485                 // combining mark
 486                 norm16=getCCFromNormalYesOrMaybe(norm16);
 487                 return norm16|(norm16<<8);
 488             } else if(norm16>=minMaybeYes) {
 489                 return 0;
 490             } else {  // isDecompNoAlgorithmic(norm16)
 491                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 492                 if (deltaTrailCC <= DELTA_TCCC_1) {
 493                     return deltaTrailCC >> OFFSET_SHIFT;
 494                 }
 495                 // Maps to an isCompYesAndZeroCC.
 496                 c=mapAlgorithmic(c, norm16);
 497                 norm16=getRawNorm16(c);
 498             }
 499         }
 500         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 501             // no decomposition or Hangul syllable, all zeros
 502             return 0;
 503         }
 504         // c decomposes, get everything from the variable-length extra data
 505         int mapping=norm16>>OFFSET_SHIFT;
 506         int firstUnit=extraData.charAt(mapping);
 507         int fcd16=firstUnit>>8;  // tccc
 508         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 509             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 510         }
 511         return fcd16;
 512     }
 513 
 514     /**
 515      * Gets the decomposition for one code point.
 516      * @param c code point
 517      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 518      */
 519     public String getDecomposition(int c) {
 520         int norm16;
 521         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 522             // c does not decompose
 523             return null;
 524         }
 525         int decomp = -1;
 526         if(isDecompNoAlgorithmic(norm16)) {
 527             // Maps to an isCompYesAndZeroCC.
 528             decomp=c=mapAlgorithmic(c, norm16);
 529             // The mapping might decompose further.
 530             norm16 = getRawNorm16(c);
 531         }
 532         if (norm16 < minYesNo) {
 533             if(decomp<0) {
 534                 return null;
 535             } else {
 536                 return UTF16.valueOf(decomp);
 537             }
 538         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 539             // Hangul syllable: decompose algorithmically
 540             StringBuilder buffer=new StringBuilder();
 541             Hangul.decompose(c, buffer);
 542             return buffer.toString();
 543         }
 544         // c decomposes, get everything from the variable-length extra data
 545         int mapping=norm16>>OFFSET_SHIFT;
 546         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 547         return extraData.substring(mapping, mapping+length);
 548     }
 549 
 550     // Fixed norm16 values.
 551     public static final int MIN_YES_YES_WITH_CC=0xfe02;
 552     public static final int JAMO_VT=0xfe00;
 553     public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
 554     public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
 555     public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
 556 
 557     // norm16 bit 0 is comp-boundary-after.
 558     public static final int HAS_COMP_BOUNDARY_AFTER=1;
 559     public static final int OFFSET_SHIFT=1;
 560 
 561     // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
 562     // tccc (0, 1, >1) for quick FCC boundary-after tests.
 563     public static final int DELTA_TCCC_0=0;
 564     public static final int DELTA_TCCC_1=2;
 565     public static final int DELTA_TCCC_GT_1=4;
 566     public static final int DELTA_TCCC_MASK=6;
 567     public static final int DELTA_SHIFT=3;
 568 
 569     public static final int MAX_DELTA=0x40;
 570 
 571     // Byte offsets from the start of the data, after the generic header.
 572     public static final int IX_NORM_TRIE_OFFSET=0;
 573     public static final int IX_EXTRA_DATA_OFFSET=1;
 574     public static final int IX_SMALL_FCD_OFFSET=2;
 575     public static final int IX_RESERVED3_OFFSET=3;
 576     public static final int IX_TOTAL_SIZE=7;
 577     public static final int MIN_CCC_LCCC_CP=0x300;
 578     // Code point thresholds for quick check codes.
 579     public static final int IX_MIN_DECOMP_NO_CP=8;
 580     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
 581 
 582     // Norm16 value thresholds for quick check combinations and types of extra data.
 583 
 584     /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
 585     public static final int IX_MIN_YES_NO=10;
 586     /** Mappings are comp-normalized. */
 587     public static final int IX_MIN_NO_NO=11;
 588     public static final int IX_LIMIT_NO_NO=12;
 589     public static final int IX_MIN_MAYBE_YES=13;
 590 
 591     /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
 592     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
 593     /** Mappings are not comp-normalized but have a comp boundary before. */
 594     public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
 595     /** Mappings do not have a comp boundary before. */
 596     public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
 597     /** Mappings to the empty string. */
 598     public static final int IX_MIN_NO_NO_EMPTY=17;
 599 
 600     public static final int IX_MIN_LCCC_CP=18;
 601     public static final int IX_COUNT=20;
 602 
 603     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
 604     public static final int MAPPING_HAS_RAW_MAPPING=0x40;
 605     // unused bit 0x20;
 606     public static final int MAPPING_LENGTH_MASK=0x1f;
 607 
 608     public static final int COMP_1_LAST_TUPLE=0x8000;
 609     public static final int COMP_1_TRIPLE=1;
 610     public static final int COMP_1_TRAIL_LIMIT=0x3400;
 611     public static final int COMP_1_TRAIL_MASK=0x7ffe;
 612     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
 613     public static final int COMP_2_TRAIL_SHIFT=6;
 614     public static final int COMP_2_TRAIL_MASK=0xffc0;
 615 
 616     // higher-level functionality ------------------------------------------ ***
 617 
 618     /**
 619      * Decomposes s[src, limit[ and writes the result to dest.
 620      * limit can be NULL if src is NUL-terminated.
 621      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 622      */
 623     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 624                    int destLengthEstimate) {
 625         if(destLengthEstimate<0) {
 626             destLengthEstimate=limit-src;
 627         }
 628         dest.setLength(0);
 629         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
 630         decompose(s, src, limit, buffer);
 631     }
 632 
 633     // Dual functionality:
 634     // buffer!=NULL: normalize
 635     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
 636     public int decompose(CharSequence s, int src, int limit,
 637                          ReorderingBuffer buffer) {
 638         int minNoCP=minDecompNoCP;
 639 
 640         int prevSrc;
 641         int c=0;
 642         int norm16=0;
 643 
 644         // only for quick check
 645         int prevBoundary=src;
 646         int prevCC=0;
 647 
 648         for(;;) {
 649             // count code units below the minimum or with irrelevant data for the quick check
 650             for(prevSrc=src; src!=limit;) {
 651                 if( (c=s.charAt(src))<minNoCP ||
 652                     isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 653                 ) {
 654                     ++src;
 655                 } else if(!UTF16Plus.isLeadSurrogate(c)) {
 656                     break;
 657                 } else {
 658                     char c2;
 659                     if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
 660                         c = Character.toCodePoint((char)c, c2);
 661                         norm16 = normTrie.suppGet(c);
 662                         if (isMostDecompYesAndZeroCC(norm16)) {
 663                             src += 2;
 664                         } else {
 665                             break;
 666                         }
 667                     } else {
 668                         ++src;  // unpaired lead surrogate: inert
 669                     }
 670                 }
 671             }
 672             // copy these code units all at once
 673             if(src!=prevSrc) {
 674                 if(buffer!=null) {
 675                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
 676                 } else {
 677                     prevCC=0;
 678                     prevBoundary=src;
 679                 }
 680             }
 681             if(src==limit) {
 682                 break;
 683             }
 684 
 685             // Check one above-minimum, relevant code point.
 686             src+=Character.charCount(c);
 687             if(buffer!=null) {
 688                 decompose(c, norm16, buffer);
 689             } else {
 690                 if(isDecompYes(norm16)) {
 691                     int cc=getCCFromYesOrMaybe(norm16);
 692                     if(prevCC<=cc || cc==0) {
 693                         prevCC=cc;
 694                         if(cc<=1) {
 695                             prevBoundary=src;
 696                         }
 697                         continue;
 698                     }
 699                 }
 700                 return prevBoundary;  // "no" or cc out of order
 701             }
 702         }
 703         return src;
 704     }
 705     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
 706         int limit=s.length();
 707         if(limit==0) {
 708             return;
 709         }
 710         if(doDecompose) {
 711             decompose(s, 0, limit, buffer);
 712             return;
 713         }
 714         // Just merge the strings at the boundary.
 715         int c=Character.codePointAt(s, 0);
 716         int src=0;
 717         int firstCC, prevCC, cc;
 718         firstCC=prevCC=cc=getCC(getNorm16(c));
 719         while(cc!=0) {
 720             prevCC=cc;
 721             src+=Character.charCount(c);
 722             if(src>=limit) {
 723                 break;
 724             }
 725             c=Character.codePointAt(s, src);
 726             cc=getCC(getNorm16(c));
 727         };
 728         buffer.append(s, 0, src, false, firstCC, prevCC);
 729         buffer.append(s, src, limit);
 730     }
 731 
 732     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 733     // doCompose: normalize
 734     // !doCompose: isNormalized (buffer must be empty and initialized)
 735     public boolean compose(CharSequence s, int src, int limit,
 736                            boolean onlyContiguous,
 737                            boolean doCompose,
 738                            ReorderingBuffer buffer) {
 739         int prevBoundary=src;
 740         int minNoMaybeCP=minCompNoMaybeCP;
 741 
 742         for (;;) {
 743             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 744             // or with (compYes && ccc==0) properties.
 745             int prevSrc;
 746             int c = 0;
 747             int norm16 = 0;
 748             for (;;) {
 749                 if (src == limit) {
 750                     if (prevBoundary != limit && doCompose) {
 751                         buffer.append(s, prevBoundary, limit);
 752                     }
 753                     return true;
 754                 }
 755                 if( (c=s.charAt(src))<minNoMaybeCP ||
 756                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 757                 ) {
 758                     ++src;
 759                 } else {
 760                     prevSrc = src++;
 761                     if (!UTF16Plus.isLeadSurrogate(c)) {
 762                         break;
 763                     } else {
 764                         char c2;
 765                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
 766                             ++src;
 767                             c = Character.toCodePoint((char)c, c2);
 768                             norm16 = normTrie.suppGet(c);
 769                             if (!isCompYesAndZeroCC(norm16)) {
 770                                 break;
 771                             }
 772                         }
 773                     }
 774                 }
 775             }
 776             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 777             // The current character is either a "noNo" (has a mapping)
 778             // or a "maybeYes" (combines backward)
 779             // or a "yesYes" with ccc!=0.
 780             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 781 
 782             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 783             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 784                 if (!doCompose) {
 785                     return false;
 786                 }
 787                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 788                 // In this case, we need not decompose around the current character.
 789                 if (isDecompNoAlgorithmic(norm16)) {
 790                     // Maps to a single isCompYesAndZeroCC character
 791                     // which also implies hasCompBoundaryBefore.
 792                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 793                             hasCompBoundaryBefore(s, src, limit)) {
 794                         if (prevBoundary != prevSrc) {
 795                             buffer.append(s, prevBoundary, prevSrc);
 796                         }
 797                         buffer.append(mapAlgorithmic(c, norm16), 0);
 798                         prevBoundary = src;
 799                         continue;
 800                     }
 801                 } else if (norm16 < minNoNoCompBoundaryBefore) {
 802                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
 803                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 804                             hasCompBoundaryBefore(s, src, limit)) {
 805                         if (prevBoundary != prevSrc) {
 806                             buffer.append(s, prevBoundary, prevSrc);
 807                         }
 808                         int mapping = norm16 >> OFFSET_SHIFT;
 809                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
 810                         buffer.append(extraData, mapping, mapping + length);
 811                         prevBoundary = src;
 812                         continue;
 813                     }
 814                 } else if (norm16 >= minNoNoEmpty) {
 815                     // The current character maps to nothing.
 816                     // Simply omit it from the output if there is a boundary before _or_ after it.
 817                     // The character itself implies no boundaries.
 818                     if (hasCompBoundaryBefore(s, src, limit) ||
 819                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
 820                         if (prevBoundary != prevSrc) {
 821                             buffer.append(s, prevBoundary, prevSrc);
 822                         }
 823                         prevBoundary = src;
 824                         continue;
 825                     }
 826                 }
 827                 // Other "noNo" type, or need to examine more text around this character:
 828                 // Fall through to the slow path.
 829             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
 830                 char prev=s.charAt(prevSrc-1);
 831                 if(c<Hangul.JAMO_T_BASE) {
 832                     // The current character is a Jamo Vowel,
 833                     // compose with previous Jamo L and following Jamo T.
 834                     char l = (char)(prev-Hangul.JAMO_L_BASE);
 835                     if(l<Hangul.JAMO_L_COUNT) {
 836                         if (!doCompose) {
 837                             return false;
 838                         }
 839                         int t;
 840                         if (src != limit &&
 841                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
 842                                 t < Hangul.JAMO_T_COUNT) {
 843                             // The next character is a Jamo T.
 844                             ++src;
 845                         } else if (hasCompBoundaryBefore(s, src, limit)) {
 846                             // No Jamo T follows, not even via decomposition.
 847                             t = 0;
 848                         } else {
 849                             t = -1;
 850                         }
 851                         if (t >= 0) {
 852                             int syllable = Hangul.HANGUL_BASE +
 853                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
 854                                 Hangul.JAMO_T_COUNT + t;
 855                             --prevSrc;  // Replace the Jamo L as well.
 856                             if (prevBoundary != prevSrc) {
 857                                 buffer.append(s, prevBoundary, prevSrc);
 858                             }
 859                             buffer.append((char)syllable);
 860                             prevBoundary = src;
 861                             continue;
 862                         }
 863                         // If we see L+V+x where x!=T then we drop to the slow path,
 864                         // decompose and recompose.
 865                         // This is to deal with NFKC finding normal L and V but a
 866                         // compatibility variant of a T.
 867                         // We need to either fully compose that combination here
 868                         // (which would complicate the code and may not work with strange custom data)
 869                         // or use the slow path.
 870                     }
 871                 } else if (Hangul.isHangulLV(prev)) {
 872                     // The current character is a Jamo Trailing consonant,
 873                     // compose with previous Hangul LV that does not contain a Jamo T.
 874                     if (!doCompose) {
 875                         return false;
 876                     }
 877                     int syllable = prev + c - Hangul.JAMO_T_BASE;
 878                     --prevSrc;  // Replace the Hangul LV as well.
 879                     if (prevBoundary != prevSrc) {
 880                         buffer.append(s, prevBoundary, prevSrc);
 881                     }
 882                     buffer.append((char)syllable);
 883                     prevBoundary = src;
 884                     continue;
 885                 }
 886                 // No matching context, or may need to decompose surrounding text first:
 887                 // Fall through to the slow path.
 888             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
 889                 // One or more combining marks that do not combine-back:
 890                 // Check for canonical order, copy unchanged if ok and
 891                 // if followed by a character with a boundary-before.
 892                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
 893                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
 894                     // Fails FCD test, need to decompose and contiguously recompose.
 895                     if (!doCompose) {
 896                         return false;
 897                     }
 898                 } else {
 899                     // If !onlyContiguous (not FCC), then we ignore the tccc of
 900                     // the previous character which passed the quick check "yes && ccc==0" test.
 901                     int n16;
 902                     for (;;) {
 903                         if (src == limit) {
 904                             if (doCompose) {
 905                                 buffer.append(s, prevBoundary, limit);
 906                             }
 907                             return true;
 908                         }
 909                         int prevCC = cc;
 910                         c = Character.codePointAt(s, src);
 911                         n16 = normTrie.get(c);
 912                         if (n16 >= MIN_YES_YES_WITH_CC) {
 913                             cc = getCCFromNormalYesOrMaybe(n16);
 914                             if (prevCC > cc) {
 915                                 if (!doCompose) {
 916                                     return false;
 917                                 }
 918                                 break;
 919                             }
 920                         } else {
 921                             break;
 922                         }
 923                         src += Character.charCount(c);
 924                     }
 925                     // p is after the last in-order combining mark.
 926                     // If there is a boundary here, then we continue with no change.
 927                     if (norm16HasCompBoundaryBefore(n16)) {
 928                         if (isCompYesAndZeroCC(n16)) {
 929                             src += Character.charCount(c);
 930                         }
 931                         continue;
 932                     }
 933                     // Use the slow path. There is no boundary in [prevSrc, src[.
 934                 }
 935             }
 936 
 937             // Slow path: Find the nearest boundaries around the current character,
 938             // decompose and recompose.
 939             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
 940                 c = Character.codePointBefore(s, prevSrc);
 941                 norm16 = normTrie.get(c);
 942                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 943                     prevSrc -= Character.charCount(c);
 944                 }
 945             }
 946             if (doCompose && prevBoundary != prevSrc) {
 947                 buffer.append(s, prevBoundary, prevSrc);
 948             }
 949             int recomposeStartIndex=buffer.length();
 950             // We know there is not a boundary here.
 951             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
 952                            buffer);
 953             // Decompose until the next boundary.
 954             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
 955                                  buffer);
 956             recompose(buffer, recomposeStartIndex, onlyContiguous);
 957             if(!doCompose) {
 958                 if(!buffer.equals(s, prevSrc, src)) {
 959                     return false;
 960                 }
 961                 buffer.remove();
 962             }
 963             prevBoundary=src;
 964         }
 965     }
 966 
 967     /**
 968      * Very similar to compose(): Make the same changes in both places if relevant.
 969      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 970      * !doSpan: quickCheck
 971      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 972      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 973      *         then the quick check result is "no"
 974      */
 975     public int composeQuickCheck(CharSequence s, int src, int limit,
 976                                  boolean onlyContiguous, boolean doSpan) {
 977         int qcResult=0;
 978         int prevBoundary=src;
 979         int minNoMaybeCP=minCompNoMaybeCP;
 980 
 981         for(;;) {
 982             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 983             // or with (compYes && ccc==0) properties.
 984             int prevSrc;
 985             int c = 0;
 986             int norm16 = 0;
 987             for (;;) {
 988                 if(src==limit) {
 989                     return (src<<1)|qcResult;  // "yes" or "maybe"
 990                 }
 991                 if( (c=s.charAt(src))<minNoMaybeCP ||
 992                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 993                 ) {
 994                     ++src;
 995                 } else {
 996                     prevSrc = src++;
 997                     if (!UTF16Plus.isLeadSurrogate(c)) {
 998                         break;
 999                     } else {
1000                         char c2;
1001                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
1002                             ++src;
1003                             c = Character.toCodePoint((char)c, c2);
1004                             norm16 = normTrie.suppGet(c);
1005                             if (!isCompYesAndZeroCC(norm16)) {
1006                                 break;
1007                             }
1008                         }
1009                     }
1010                 }
1011             }
1012             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1013             // The current character is either a "noNo" (has a mapping)
1014             // or a "maybeYes" (combines backward)
1015             // or a "yesYes" with ccc!=0.
1016             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1017 
1018             int prevNorm16 = INERT;
1019             if (prevBoundary != prevSrc) {
1020                 prevBoundary = prevSrc;
1021                 if (!norm16HasCompBoundaryBefore(norm16)) {
1022                     c = Character.codePointBefore(s, prevSrc);
1023                     int n16 = getNorm16(c);
1024                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1025                         prevBoundary -= Character.charCount(c);
1026                         prevNorm16 = n16;
1027                     }
1028                 }
1029             }
1030 
1031             if(isMaybeOrNonZeroCC(norm16)) {
1032                 int cc=getCCFromYesOrMaybe(norm16);
1033                 if (onlyContiguous /* FCC */ && cc != 0 &&
1034                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1035                     // The [prevBoundary..prevSrc[ character
1036                     // passed the quick check "yes && ccc==0" test
1037                     // but is out of canonical order with the current combining mark.
1038                 } else {
1039                     // If !onlyContiguous (not FCC), then we ignore the tccc of
1040                     // the previous character which passed the quick check "yes && ccc==0" test.
1041                     for (;;) {
1042                         if (norm16 < MIN_YES_YES_WITH_CC) {
1043                             if (!doSpan) {
1044                                 qcResult = 1;
1045                             } else {
1046                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1047                             }
1048                         }
1049                         if (src == limit) {
1050                             return (src<<1) | qcResult;  // "yes" or "maybe"
1051                         }
1052                         int prevCC = cc;
1053                         c = Character.codePointAt(s, src);
1054                         norm16 = getNorm16(c);
1055                         if (isMaybeOrNonZeroCC(norm16)) {
1056                             cc = getCCFromYesOrMaybe(norm16);
1057                             if (!(prevCC <= cc || cc == 0)) {
1058                                 break;
1059                             }
1060                         } else {
1061                             break;
1062                         }
1063                         src += Character.charCount(c);
1064                     }
1065                     // src is after the last in-order combining mark.
1066                     if (isCompYesAndZeroCC(norm16)) {
1067                         prevBoundary = src;
1068                         src += Character.charCount(c);
1069                         continue;
1070                     }
1071                 }
1072             }
1073             return prevBoundary<<1;  // "no"
1074         }
1075     }
1076     public void composeAndAppend(CharSequence s,
1077                                  boolean doCompose,
1078                                  boolean onlyContiguous,
1079                                  ReorderingBuffer buffer) {
1080         int src=0, limit=s.length();
1081         if(!buffer.isEmpty()) {
1082             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1083             if(0!=firstStarterInSrc) {
1084                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1085                                                                buffer.length(), onlyContiguous);
1086                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1087                                                        firstStarterInSrc+16);
1088                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1089                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1090                 middle.append(s, 0, firstStarterInSrc);
1091                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1092                 src=firstStarterInSrc;
1093             }
1094         }
1095         if(doCompose) {
1096             compose(s, src, limit, onlyContiguous, true, buffer);
1097         } else {
1098             buffer.append(s, src, limit);
1099         }
1100     }
1101     // Dual functionality:
1102     // buffer!=NULL: normalize
1103     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1104     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1105         // Note: In this function we use buffer->appendZeroCC() because we track
1106         // the lead and trail combining classes here, rather than leaving it to
1107         // the ReorderingBuffer.
1108         // The exception is the call to decomposeShort() which uses the buffer
1109         // in the normal way.
1110 
1111         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1112         // Similar to the prevBoundary in the compose() implementation.
1113         int prevBoundary=src;
1114         int prevSrc;
1115         int c=0;
1116         int prevFCD16=0;
1117         int fcd16=0;
1118 
1119         for(;;) {
1120             // count code units with lccc==0
1121             for(prevSrc=src; src!=limit;) {
1122                 if((c=s.charAt(src))<minLcccCP) {
1123                     prevFCD16=~c;
1124                     ++src;
1125                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1126                     prevFCD16=0;
1127                     ++src;
1128                 } else {
1129                     if (UTF16Plus.isLeadSurrogate(c)) {
1130                         char c2;
1131                         if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1132                             c = Character.toCodePoint((char)c, c2);
1133                         }
1134                     }
1135                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1136                         prevFCD16=fcd16;
1137                         src+=Character.charCount(c);
1138                     } else {
1139                         break;
1140                     }
1141                 }
1142             }
1143             // copy these code units all at once
1144             if(src!=prevSrc) {
1145                 if(src==limit) {
1146                     if(buffer!=null) {
1147                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1148                     }
1149                     break;
1150                 }
1151                 prevBoundary=src;
1152                 // We know that the previous character's lccc==0.
1153                 if(prevFCD16<0) {
1154                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1155                     int prev=~prevFCD16;
1156                     if(prev<minDecompNoCP) {
1157                         prevFCD16=0;
1158                     } else {
1159                         prevFCD16=getFCD16FromNormData(prev);
1160                         if(prevFCD16>1) {
1161                             --prevBoundary;
1162                         }
1163                     }
1164                 } else {
1165                     int p=src-1;
1166                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1167                         Character.isHighSurrogate(s.charAt(p-1))
1168                     ) {
1169                         --p;
1170                         // Need to fetch the previous character's FCD value because
1171                         // prevFCD16 was just for the trail surrogate code point.
1172                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1173                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1174                     }
1175                     if(prevFCD16>1) {
1176                         prevBoundary=p;
1177                     }
1178                 }
1179                 if(buffer!=null) {
1180                     // The last lccc==0 character is excluded from the
1181                     // flush-and-append call in case it needs to be modified.
1182                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1183                     buffer.append(s, prevBoundary, src);
1184                 }
1185                 // The start of the current character (c).
1186                 prevSrc=src;
1187             } else if(src==limit) {
1188                 break;
1189             }
1190 
1191             src+=Character.charCount(c);
1192             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1193             // Check for proper order, and decompose locally if necessary.
1194             if((prevFCD16&0xff)<=(fcd16>>8)) {
1195                 // proper order: prev tccc <= current lccc
1196                 if((fcd16&0xff)<=1) {
1197                     prevBoundary=src;
1198                 }
1199                 if(buffer!=null) {
1200                     buffer.appendZeroCC(c);
1201                 }
1202                 prevFCD16=fcd16;
1203                 continue;
1204             } else if(buffer==null) {
1205                 return prevBoundary;  // quick check "no"
1206             } else {
1207                 /*
1208                  * Back out the part of the source that we copied or appended
1209                  * already but is now going to be decomposed.
1210                  * prevSrc is set to after what was copied/appended.
1211                  */
1212                 buffer.removeSuffix(prevSrc-prevBoundary);
1213                 /*
1214                  * Find the part of the source that needs to be decomposed,
1215                  * up to the next safe boundary.
1216                  */
1217                 src=findNextFCDBoundary(s, src, limit);
1218                 /*
1219                  * The source text does not fulfill the conditions for FCD.
1220                  * Decompose and reorder a limited piece of the text.
1221                  */
1222                 decomposeShort(s, prevBoundary, src, false, false, buffer);
1223                 prevBoundary=src;
1224                 prevFCD16=0;
1225             }
1226         }
1227         return src;
1228     }
1229 
1230     public boolean hasDecompBoundaryBefore(int c) {
1231         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1232             norm16HasDecompBoundaryBefore(getNorm16(c));
1233     }
1234     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1235         if (norm16 < minNoNoCompNoMaybeCC) {
1236             return true;
1237         }
1238         if (norm16 >= limitNoNo) {
1239             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1240         }
1241         // c decomposes, get everything from the variable-length extra data
1242         int mapping=norm16>>OFFSET_SHIFT;
1243         int firstUnit=extraData.charAt(mapping);
1244         // true if leadCC==0 (hasFCDBoundaryBefore())
1245         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1246     }
1247     public boolean hasDecompBoundaryAfter(int c) {
1248         if (c < minDecompNoCP) {
1249             return true;
1250         }
1251         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1252             return true;
1253         }
1254         return norm16HasDecompBoundaryAfter(getNorm16(c));
1255     }
1256     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1257         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1258             return true;
1259         }
1260         if (norm16 >= limitNoNo) {
1261             if (isMaybeOrNonZeroCC(norm16)) {
1262                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1263             }
1264             // Maps to an isCompYesAndZeroCC.
1265             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1266         }
1267         // c decomposes, get everything from the variable-length extra data
1268         int mapping=norm16>>OFFSET_SHIFT;
1269         int firstUnit=extraData.charAt(mapping);
1270         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1271         // fcd16<=1 || trailCC==0
1272         if(firstUnit>0x1ff) {
1273             return false;  // trailCC>1
1274         }
1275         if(firstUnit<=0xff) {
1276             return true;  // trailCC==0
1277         }
1278         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1279         // true if leadCC==0 (hasFCDBoundaryBefore())
1280         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1281     }
1282     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1283 
1284     public boolean hasCompBoundaryBefore(int c) {
1285         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1286     }
1287     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1288         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1289     }
1290 
1291     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1292     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1293     private static boolean isInert(int norm16) { return norm16==INERT; }
1294     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1295     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
1296     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
1297     private boolean isHangulLVT(int norm16) {
1298         return norm16==hangulLVT();
1299     }
1300     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1301     // UBool isCompYes(uint16_t norm16) const {
1302     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1303     // }
1304     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1305     //     return norm16<minNoNo || minMaybeYes<=norm16;
1306     // }
1307     // private boolean hasZeroCCFromDecompYes(int norm16) {
1308     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1309     // }
1310     private boolean isDecompYesAndZeroCC(int norm16) {
1311         return norm16<minYesNo ||
1312                norm16==JAMO_VT ||
1313                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1314     }
1315     /**
1316      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1317      * the MaybeYes which combine-forward and have ccc=0.
1318      * (Standard Unicode 10 normalization does not have such characters.)
1319      */
1320     private boolean isMostDecompYesAndZeroCC(int norm16) {
1321         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1322     }
1323     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1324 
1325     // For use with isCompYes().
1326     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1327     // static uint8_t getCCFromYes(uint16_t norm16) {
1328     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1329     // }
1330     private int getCCFromNoNo(int norm16) {
1331         int mapping=norm16>>OFFSET_SHIFT;
1332         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1333             return extraData.charAt(mapping-1)&0xff;
1334         } else {
1335             return 0;
1336         }
1337     }
1338     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1339         if(norm16<=minYesNo) {
1340             return 0;  // yesYes and Hangul LV have ccc=tccc=0
1341         } else {
1342             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1343             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1344         }
1345     }
1346 
1347     // Requires algorithmic-NoNo.
1348     private int mapAlgorithmic(int c, int norm16) {
1349         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1350     }
1351 
1352     // Requires minYesNo<norm16<limitNoNo.
1353     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1354 
1355     /**
1356      * @return index into maybeYesCompositions, or -1
1357      */
1358     private int getCompositionsListForDecompYes(int norm16) {
1359         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1360             return -1;
1361         } else {
1362             if((norm16-=minMaybeYes)<0) {
1363                 // norm16<minMaybeYes: index into extraData which is a substring at
1364                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1365                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1366                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1367             }
1368             return norm16>>OFFSET_SHIFT;
1369         }
1370     }
1371     /**
1372      * @return index into maybeYesCompositions
1373      */
1374     private int getCompositionsListForComposite(int norm16) {
1375         // A composite has both mapping & compositions list.
1376         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1377         int firstUnit=maybeYesCompositions.charAt(list);
1378         return list+  // mapping in maybeYesCompositions
1379             1+  // +1 to skip the first unit with the mapping length
1380             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1381     }
1382 
1383     // Decompose a short piece of text which is likely to contain characters that
1384     // fail the quick check loop and/or where the quick check loop's overhead
1385     // is unlikely to be amortized.
1386     // Called by the compose() and makeFCD() implementations.
1387     // Public in Java for collation implementation code.
1388     private int decomposeShort(
1389             CharSequence s, int src, int limit,
1390             boolean stopAtCompBoundary, boolean onlyContiguous,
1391             ReorderingBuffer buffer) {
1392         while(src<limit) {
1393             int c=Character.codePointAt(s, src);
1394             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1395                 return src;
1396             }
1397             int norm16 = getNorm16(c);
1398             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1399                 return src;
1400             }
1401             src+=Character.charCount(c);
1402             decompose(c, norm16, buffer);
1403             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1404                 return src;
1405             }
1406         }
1407         return src;
1408     }
1409     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1410         // get the decomposition and the lead and trail cc's
1411         if (norm16 >= limitNoNo) {
1412             if (isMaybeOrNonZeroCC(norm16)) {
1413                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1414                 return;
1415             }
1416             // Maps to an isCompYesAndZeroCC.
1417             c=mapAlgorithmic(c, norm16);
1418             norm16=getRawNorm16(c);
1419         }
1420         if (norm16 < minYesNo) {
1421             // c does not decompose
1422             buffer.append(c, 0);
1423         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1424             // Hangul syllable: decompose algorithmically
1425             Hangul.decompose(c, buffer);
1426         } else {
1427             // c decomposes, get everything from the variable-length extra data
1428             int mapping=norm16>>OFFSET_SHIFT;
1429             int firstUnit=extraData.charAt(mapping);
1430             int length=firstUnit&MAPPING_LENGTH_MASK;
1431             int leadCC, trailCC;
1432             trailCC=firstUnit>>8;
1433             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1434                 leadCC=extraData.charAt(mapping-1)>>8;
1435             } else {
1436                 leadCC=0;
1437             }
1438             ++mapping;  // skip over the firstUnit
1439             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1440         }
1441     }
1442 
1443     /**
1444      * Finds the recomposition result for
1445      * a forward-combining "lead" character,
1446      * specified with a pointer to its compositions list,
1447      * and a backward-combining "trail" character.
1448      *
1449      * <p>If the lead and trail characters combine, then this function returns
1450      * the following "compositeAndFwd" value:
1451      * <pre>
1452      * Bits 21..1  composite character
1453      * Bit      0  set if the composite is a forward-combining starter
1454      * </pre>
1455      * otherwise it returns -1.
1456      *
1457      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1458      * encoded as either pairs or triples of 16-bit units.
1459      * The last entry has the high bit of its first unit set.
1460      *
1461      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1462      * A linear search is used.
1463      *
1464      * <p>See normalizer2impl.h for a more detailed description
1465      * of the compositions list format.
1466      */
1467     private static int combine(String compositions, int list, int trail) {
1468         int key1, firstUnit;
1469         if(trail<COMP_1_TRAIL_LIMIT) {
1470             // trail character is 0..33FF
1471             // result entry may have 2 or 3 units
1472             key1=(trail<<1);
1473             while(key1>(firstUnit=compositions.charAt(list))) {
1474                 list+=2+(firstUnit&COMP_1_TRIPLE);
1475             }
1476             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1477                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1478                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1479                 } else {
1480                     return compositions.charAt(list+1);
1481                 }
1482             }
1483         } else {
1484             // trail character is 3400..10FFFF
1485             // result entry has 3 units
1486             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1487             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1488             int secondUnit;
1489             for(;;) {
1490                 if(key1>(firstUnit=compositions.charAt(list))) {
1491                     list+=2+(firstUnit&COMP_1_TRIPLE);
1492                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1493                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1494                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1495                             break;
1496                         } else {
1497                             list+=3;
1498                         }
1499                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1500                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1501                     } else {
1502                         break;
1503                     }
1504                 } else {
1505                     break;
1506                 }
1507             }
1508         }
1509         return -1;
1510     }
1511 
1512     /*
1513      * Recomposes the buffer text starting at recomposeStartIndex
1514      * (which is in NFD - decomposed and canonically ordered),
1515      * and truncates the buffer contents.
1516      *
1517      * Note that recomposition never lengthens the text:
1518      * Any character consists of either one or two code units;
1519      * a composition may contain at most one more code unit than the original starter,
1520      * while the combining mark that is removed has at least one code unit.
1521      */
1522     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1523                            boolean onlyContiguous) {
1524         StringBuilder sb=buffer.getStringBuilder();
1525         int p=recomposeStartIndex;
1526         if(p==sb.length()) {
1527             return;
1528         }
1529 
1530         int starter, pRemove;
1531         int compositionsList;
1532         int c, compositeAndFwd;
1533         int norm16;
1534         int cc, prevCC;
1535         boolean starterIsSupplementary;
1536 
1537         // Some of the following variables are not used until we have a forward-combining starter
1538         // and are only initialized now to avoid compiler warnings.
1539         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1540         starter=-1;
1541         starterIsSupplementary=false;
1542         prevCC=0;
1543 
1544         for(;;) {
1545             c=sb.codePointAt(p);
1546             p+=Character.charCount(c);
1547             norm16=getNorm16(c);
1548             cc=getCCFromYesOrMaybe(norm16);
1549             if( // this character combines backward and
1550                 isMaybe(norm16) &&
1551                 // we have seen a starter that combines forward and
1552                 compositionsList>=0 &&
1553                 // the backward-combining character is not blocked
1554                 (prevCC<cc || prevCC==0)
1555             ) {
1556                 if(isJamoVT(norm16)) {
1557                     // c is a Jamo V/T, see if we can compose it with the previous character.
1558                     if(c<Hangul.JAMO_T_BASE) {
1559                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1560                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1561                         if(prev<Hangul.JAMO_L_COUNT) {
1562                             pRemove=p-1;
1563                             char syllable=(char)
1564                                 (Hangul.HANGUL_BASE+
1565                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1566                                  Hangul.JAMO_T_COUNT);
1567                             char t;
1568                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1569                                 ++p;
1570                                 syllable+=t;  // The next character was a Jamo T.
1571                             }
1572                             sb.setCharAt(starter, syllable);
1573                             // remove the Jamo V/T
1574                             sb.delete(pRemove, p);
1575                             p=pRemove;
1576                         }
1577                     }
1578                     /*
1579                      * No "else" for Jamo T:
1580                      * Since the input is in NFD, there are no Hangul LV syllables that
1581                      * a Jamo T could combine with.
1582                      * All Jamo Ts are combined above when handling Jamo Vs.
1583                      */
1584                     if(p==sb.length()) {
1585                         break;
1586                     }
1587                     compositionsList=-1;
1588                     continue;
1589                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1590                     // The starter and the combining mark (c) do combine.
1591                     int composite=compositeAndFwd>>1;
1592 
1593                     // Remove the combining mark.
1594                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1595                     sb.delete(pRemove, p);
1596                     p=pRemove;
1597                     // Replace the starter with the composite.
1598                     if(starterIsSupplementary) {
1599                         if(composite>0xffff) {
1600                             // both are supplementary
1601                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1602                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1603                         } else {
1604                             sb.setCharAt(starter, (char)c);
1605                             sb.deleteCharAt(starter+1);
1606                             // The composite is shorter than the starter,
1607                             // move the intermediate characters forward one.
1608                             starterIsSupplementary=false;
1609                             --p;
1610                         }
1611                     } else if(composite>0xffff) {
1612                         // The composite is longer than the starter,
1613                         // move the intermediate characters back one.
1614                         starterIsSupplementary=true;
1615                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1616                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1617                         ++p;
1618                     } else {
1619                         // both are on the BMP
1620                         sb.setCharAt(starter, (char)composite);
1621                     }
1622 
1623                     // Keep prevCC because we removed the combining mark.
1624 
1625                     if(p==sb.length()) {
1626                         break;
1627                     }
1628                     // Is the composite a starter that combines forward?
1629                     if((compositeAndFwd&1)!=0) {
1630                         compositionsList=
1631                             getCompositionsListForComposite(getRawNorm16(composite));
1632                     } else {
1633                         compositionsList=-1;
1634                     }
1635 
1636                     // We combined; continue with looking for compositions.
1637                     continue;
1638                 }
1639             }
1640 
1641             // no combination this time
1642             prevCC=cc;
1643             if(p==sb.length()) {
1644                 break;
1645             }
1646 
1647             // If c did not combine, then check if it is a starter.
1648             if(cc==0) {
1649                 // Found a new starter.
1650                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1651                     // It may combine with something, prepare for it.
1652                     if(c<=0xffff) {
1653                         starterIsSupplementary=false;
1654                         starter=p-1;
1655                     } else {
1656                         starterIsSupplementary=true;
1657                         starter=p-2;
1658                     }
1659                 }
1660             } else if(onlyContiguous) {
1661                 // FCC: no discontiguous compositions; any intervening character blocks.
1662                 compositionsList=-1;
1663             }
1664         }
1665         buffer.flush();
1666     }
1667 
1668     /**
1669      * Does c have a composition boundary before it?
1670      * True if its decomposition begins with a character that has
1671      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1672      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1673      * (isCompYesAndZeroCC()) so we need not decompose.
1674      */
1675     private boolean hasCompBoundaryBefore(int c, int norm16) {
1676         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
1677     }
1678     private boolean norm16HasCompBoundaryBefore(int norm16) {
1679         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1680     }
1681     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1682         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1683     }
1684     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1685         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1686             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1687     }
1688     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1689         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1690     }
1691     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
1692     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1693         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1694             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1695     }
1696 
1697     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1698         while(p>0) {
1699             int c=Character.codePointBefore(s, p);
1700             int norm16 = getNorm16(c);
1701             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1702                 break;
1703             }
1704             p-=Character.charCount(c);
1705             if(hasCompBoundaryBefore(c, norm16)) {
1706                 break;
1707             }
1708         }
1709         return p;
1710     }
1711     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
1712         while(p<limit) {
1713             int c=Character.codePointAt(s, p);
1714             int norm16=normTrie.get(c);
1715             if(hasCompBoundaryBefore(c, norm16)) {
1716                 break;
1717             }
1718             p+=Character.charCount(c);
1719             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1720                 break;
1721             }
1722         }
1723         return p;
1724     }
1725 
1726 
1727     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1728         while(p<limit) {
1729             int c=Character.codePointAt(s, p);
1730             int norm16;
1731             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1732                 break;
1733             }
1734             p+=Character.charCount(c);
1735             if (norm16HasDecompBoundaryAfter(norm16)) {
1736                 break;
1737             }
1738         }
1739         return p;
1740     }
1741 
1742     /**
1743      * Get the canonical decomposition
1744      * sherman  for ComposedCharIter
1745      */
1746     public static int getDecompose(int chars[], String decomps[]) {
1747         Normalizer2 impl = Normalizer2.getNFDInstance();
1748 
1749         int length=0;
1750         int norm16 = 0;
1751         int ch = -1;
1752         int i = 0;
1753 
1754         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1755             //TBD !!!! the hack code heres save us about 50ms for startup
1756             //need a better solution/lookup
1757             if (ch == 0x30ff)
1758                 ch = 0xf900;
1759             else if (ch == 0x115bc)
1760                 ch = 0x1d15e;
1761             else if (ch == 0x1d1c1)
1762                 ch = 0x2f800;
1763 
1764             String s = impl.getDecomposition(ch);
1765 
1766             if(s != null && i < chars.length) {
1767                 chars[i] = ch;
1768                 decomps[i++] = s;
1769             }
1770         }
1771         return i;
1772     }
1773 
1774     //------------------------------------------------------
1775     // special method for Collation (RBTableBuilder.build())
1776     //------------------------------------------------------
1777     private static boolean needSingleQuotation(char c) {
1778         return (c >= 0x0009 && c <= 0x000D) ||
1779                (c >= 0x0020 && c <= 0x002F) ||
1780                (c >= 0x003A && c <= 0x0040) ||
1781                (c >= 0x005B && c <= 0x0060) ||
1782                (c >= 0x007B && c <= 0x007E);
1783     }
1784 
1785     public static String canonicalDecomposeWithSingleQuotation(String string) {
1786        Normalizer2 impl = Normalizer2.getNFDInstance();
1787        char[] src = string.toCharArray();
1788        int    srcIndex = 0;
1789        int    srcLimit = src.length;
1790        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
1791        int    destIndex = 0;
1792        int    destLimit = dest.length;
1793 
1794         int prevSrc;
1795         String norm;
1796         int reorderStartIndex, length;
1797         char c1, c2;
1798         int cp;
1799         int minNoMaybe = 0x00c0;
1800         int cc, prevCC, trailCC;
1801         char[] p;
1802         int pStart;
1803 
1804         // initialize
1805         reorderStartIndex = 0;
1806         prevCC = 0;
1807         norm = null;
1808         cp = 0;
1809         pStart = 0;
1810 
1811         cc = trailCC = -1; // initialize to bogus value
1812         c1 = 0;
1813         for (;;) {
1814             prevSrc=srcIndex;
1815             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
1816             while (srcIndex != srcLimit &&
1817                    ((c1 = src[srcIndex]) < minNoMaybe ||
1818                     (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
1819                     (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
1820                 prevCC = 0;
1821                 srcIndex += (cp < 0x10000) ? 1 : 2;
1822             }
1823 
1824             // copy these code units all at once
1825             if (srcIndex != prevSrc) {
1826                 length = srcIndex - prevSrc;
1827                 if ((destIndex + length) <= destLimit) {
1828                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1829                 }
1830 
1831                 destIndex += length;
1832                 reorderStartIndex = destIndex;
1833             }
1834 
1835             // end of source reached?
1836             if (srcIndex == srcLimit) {
1837                 break;
1838             }
1839 
1840             // cp already contains *src and norm32 is set for it, increment src
1841             srcIndex += (cp < 0x10000) ? 1 : 2;
1842 
1843             if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1844                 c2 = 0;
1845                 length = 1;
1846 
1847                 if (Character.isHighSurrogate(c1)
1848                     || Character.isLowSurrogate(c1)) {
1849                     norm = null;
1850                 }
1851             } else {
1852                 length = 2;
1853                 c2 = src[srcIndex-1];
1854             }
1855 
1856           // get the decomposition and the lead and trail cc's
1857           if (norm == null) {
1858               // cp does not decompose
1859               cc = trailCC = UCharacter.getCombiningClass(cp);
1860               p = null;
1861               pStart = -1;
1862           } else {
1863 
1864                 pStart = 0;
1865                 p = norm.toCharArray();
1866                 length = p.length;
1867                 int cpNum = norm.codePointCount(0, length);
1868                 cc= UCharacter.getCombiningClass(norm.codePointAt(0));
1869                 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
1870                 if (length == 1) {
1871                     // fastpath a single code unit from decomposition
1872                     c1 = p[pStart];
1873                     c2 = 0;
1874                     p = null;
1875                     pStart = -1;
1876                 }
1877             }
1878 
1879             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
1880                 // buffer overflow
1881                 char[] tmpBuf = new char[destLimit * 2];
1882                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
1883                 dest = tmpBuf;
1884                 destLimit = dest.length;
1885             }
1886 
1887             // append the decomposition to the destination buffer, assume length>0
1888             {
1889                 int reorderSplit = destIndex;
1890                 if (p == null) {
1891                     // fastpath: single code point
1892                     if (needSingleQuotation(c1)) {
1893                         //if we need single quotation, no need to consider "prevCC"
1894                         //and it must NOT be a supplementary pair
1895                         dest[destIndex++] = '\'';
1896                         dest[destIndex++] = c1;
1897                         dest[destIndex++] = '\'';
1898                         trailCC = 0;
1899                     } else if(cc != 0 && cc < prevCC) {
1900                         // (c1, c2) is out of order with respect to the preceding
1901                         //  text
1902                         destIndex += length;
1903                         trailCC = insertOrdered(dest, reorderStartIndex,
1904                                                 reorderSplit, destIndex, c1, c2, cc);
1905                     } else {
1906                         // just append (c1, c2)
1907                         dest[destIndex++] = c1;
1908                         if(c2 != 0) {
1909                             dest[destIndex++] = c2;
1910                         }
1911                     }
1912                 } else {
1913                     // general: multiple code points (ordered by themselves)
1914                     // from decomposition
1915                     if (needSingleQuotation(p[pStart])) {
1916                         dest[destIndex++] = '\'';
1917                         dest[destIndex++] = p[pStart++];
1918                         dest[destIndex++] = '\'';
1919                         length--;
1920                         do {
1921                             dest[destIndex++] = p[pStart++];
1922                         } while(--length > 0);
1923                     } else if (cc != 0 && cc < prevCC) {
1924                         destIndex += length;
1925                         trailCC = mergeOrdered(dest, reorderStartIndex,
1926                                                reorderSplit, p, pStart,
1927                                                pStart+length);
1928                     } else {
1929                         // just append the decomposition
1930                         do {
1931                             dest[destIndex++] = p[pStart++];
1932                         } while (--length > 0);
1933                     }
1934                 }
1935             }
1936             prevCC = trailCC;
1937             if(prevCC == 0) {
1938                 reorderStartIndex = destIndex;
1939             }
1940         }
1941 
1942         return new String(dest, 0, destIndex);
1943     }
1944 
1945     /**
1946      * simpler, single-character version of mergeOrdered() -
1947      * bubble-insert one single code point into the preceding string
1948      * which is already canonically ordered
1949      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
1950      *
1951      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1952      *
1953      * before: src[start]..src[current] is already ordered, and
1954      *         src[current]..src[p]     may or may not hold (c, c2) but
1955      *                          must be exactly the same length as (c, c2)
1956      * after: src[start]..src[p] is ordered
1957      *
1958      * @return the trailing combining class
1959      */
1960     private static int/*unsigned byte*/ insertOrdered(char[] source,
1961                                                       int start,
1962                                                       int current, int p,
1963                                                       char c1, char c2,
1964                                                       int/*unsigned byte*/ cc) {
1965         int back, preBack;
1966         int r;
1967         int prevCC, trailCC=cc;
1968 
1969         if (start<current && cc!=0) {
1970             // search for the insertion point where cc>=prevCC
1971             preBack=back=current;
1972 
1973             PrevArgs prevArgs = new PrevArgs();
1974             prevArgs.current  = current;
1975             prevArgs.start    = start;
1976             prevArgs.src      = source;
1977             prevArgs.c1       = c1;
1978             prevArgs.c2       = c2;
1979 
1980             // get the prevCC
1981             prevCC=getPrevCC(prevArgs);
1982             preBack = prevArgs.current;
1983 
1984             if(cc<prevCC) {
1985                 // this will be the last code point, so keep its cc
1986                 trailCC=prevCC;
1987                 back=preBack;
1988                 while(start<preBack) {
1989                     prevCC=getPrevCC(prevArgs);
1990                     preBack=prevArgs.current;
1991                     if(cc>=prevCC) {
1992                         break;
1993                     }
1994                     back=preBack;
1995                 }
1996 
1997                 // this is where we are right now with all these indicies:
1998                 // [start]..[pPreBack] 0..? code points that we can ignore
1999                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
2000                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2001                 // [current]..[p]         1 code point (c, c2) with cc
2002 
2003                 // move the code units in between up
2004                 r=p;
2005                 do {
2006                     source[--r]=source[--current];
2007                 } while (back!=current);
2008             }
2009         }
2010 
2011         // insert (c1, c2)
2012         source[current] = c1;
2013         if (c2!=0) {
2014             source[(current+1)] = c2;
2015         }
2016 
2017         // we know the cc of the last code point
2018         return trailCC;
2019     }
2020     /**
2021      * merge two UTF-16 string parts together
2022      * to canonically order (order by combining classes) their concatenation
2023      *
2024      * the two strings may already be adjacent, so that the merging is done
2025      * in-place if the two strings are not adjacent, then the buffer holding the
2026      * first one must be large enough
2027      * the second string may or may not be ordered in itself
2028      *
2029      * before: [start]..[current] is already ordered, and
2030      *         [next]..[limit]    may be ordered in itself, but
2031      *                          is not in relation to [start..current[
2032      * after: [start..current+(limit-next)[ is ordered
2033      *
2034      * the algorithm is a simple bubble-sort that takes the characters from
2035      * src[next++] and inserts them in correct combining class order into the
2036      * preceding part of the string
2037      *
2038      * since this function is called much less often than the single-code point
2039      * insertOrdered(), it just uses that for easier maintenance
2040      *
2041      * @return the trailing combining class
2042      */
2043     private static int /*unsigned byte*/ mergeOrdered(char[] source,
2044                                                       int start,
2045                                                       int current,
2046                                                       char[] data,
2047                                                         int next,
2048                                                         int limit) {
2049             int r;
2050             int /*unsigned byte*/ cc, trailCC=0;
2051             boolean adjacent;
2052 
2053             adjacent= current==next;
2054             NextCCArgs ncArgs = new NextCCArgs();
2055             ncArgs.source = data;
2056             ncArgs.next   = next;
2057             ncArgs.limit  = limit;
2058 
2059             if(start!=current) {
2060 
2061                 while(ncArgs.next<ncArgs.limit) {
2062                     cc=getNextCC(ncArgs);
2063                     if(cc==0) {
2064                         // does not bubble back
2065                         trailCC=0;
2066                         if(adjacent) {
2067                             current=ncArgs.next;
2068                         } else {
2069                             data[current++]=ncArgs.c1;
2070                             if(ncArgs.c2!=0) {
2071                                 data[current++]=ncArgs.c2;
2072                             }
2073                         }
2074                         break;
2075                     } else {
2076                         r=current+(ncArgs.c2==0 ? 1 : 2);
2077                         trailCC=insertOrdered(source,start, current, r,
2078                                               ncArgs.c1, ncArgs.c2, cc);
2079                         current=r;
2080                     }
2081                 }
2082             }
2083 
2084             if(ncArgs.next==ncArgs.limit) {
2085                 // we know the cc of the last code point
2086                 return trailCC;
2087             } else {
2088                 if(!adjacent) {
2089                     // copy the second string part
2090                     do {
2091                         source[current++]=data[ncArgs.next++];
2092                     } while(ncArgs.next!=ncArgs.limit);
2093                     ncArgs.limit=current;
2094                 }
2095                 PrevArgs prevArgs = new PrevArgs();
2096                 prevArgs.src   = data;
2097                 prevArgs.start = start;
2098                 prevArgs.current =  ncArgs.limit;
2099                 return getPrevCC(prevArgs);
2100             }
2101 
2102     }
2103     private static final class PrevArgs{
2104         char[] src;
2105         int start;
2106         int current;
2107         char c1;
2108         char c2;
2109     }
2110 
2111     private static final class NextCCArgs{
2112         char[] source;
2113         int next;
2114         int limit;
2115         char c1;
2116         char c2;
2117     }
2118     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2119         args.c1=args.source[args.next++];
2120         args.c2=0;
2121 
2122         if (UTF16.isTrailSurrogate(args.c1)) {
2123             /* unpaired second surrogate */
2124             return 0;
2125         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2126             return UCharacter.getCombiningClass(args.c1);
2127         } else if (args.next!=args.limit &&
2128                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2129             ++args.next;
2130             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2131         } else {
2132             /* unpaired first surrogate */
2133             args.c2=0;
2134             return 0;
2135         }
2136     }
2137     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2138         args.c1=args.src[--args.current];
2139         args.c2=0;
2140 
2141         if (args.c1 < MIN_CCC_LCCC_CP) {
2142             return 0;
2143         } else if (UTF16.isLeadSurrogate(args.c1)) {
2144             /* unpaired first surrogate */
2145             return 0;
2146         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2147             return UCharacter.getCombiningClass(args.c1);
2148         } else if (args.current!=args.start &&
2149                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2150             --args.current;
2151             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2152         } else {
2153             /* unpaired second surrogate */
2154             args.c2=0;
2155             return 0;
2156         }
2157     }
2158 
2159     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2160         if (start == p) {
2161             return 0;
2162         }
2163         return getFCD16(Character.codePointBefore(s, p));
2164     }
2165 
2166     private VersionInfo dataVersion;
2167 
2168     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2169     private int minDecompNoCP;
2170     private int minCompNoMaybeCP;
2171     private int minLcccCP;
2172 
2173     // Norm16 value thresholds for quick check combinations and types of extra data.
2174     private int minYesNo;
2175     private int minYesNoMappingsOnly;
2176     private int minNoNo;
2177     private int minNoNoCompBoundaryBefore;
2178     private int minNoNoCompNoMaybeCC;
2179     private int minNoNoEmpty;
2180     private int limitNoNo;
2181     private int centerNoNoDelta;
2182     private int minMaybeYes;
2183 
2184     private CodePointTrie.Fast16 normTrie;
2185     private String maybeYesCompositions;
2186     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2187     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2188 }