New src/java.base/share/classes/jdk/internal/icu/impl/NormalizerImpl.java

   1 /*
   2  * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 package jdk.internal.icu.impl;
  33 
  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 
  37 import jdk.internal.icu.lang.UCharacter;
  38 import jdk.internal.icu.text.Normalizer2;
  39 import jdk.internal.icu.text.UTF16;
  40 import jdk.internal.icu.util.CodePointTrie;
  41 import jdk.internal.icu.util.VersionInfo;
  42 
  43 // Original filename in ICU4J: Normalizer2Impl.java
  44 public final class NormalizerImpl {
  45     public static final class Hangul {
  46         /* Korean Hangul and Jamo constants */
  47         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  48         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  49         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
  50 
  51         public static final int HANGUL_BASE=0xac00;
  52         public static final int HANGUL_END=0xd7a3;
  53 
  54         public static final int JAMO_L_COUNT=19;
  55         public static final int JAMO_V_COUNT=21;
  56         public static final int JAMO_T_COUNT=28;
  57 
  58         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  59         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  60 
  61         public static boolean isHangul(int c) {
  62             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  63         }
  64         public static boolean isHangulLV(int c) {
  65             c-=HANGUL_BASE;
  66             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  67         }
  68 
  69         /**
  70          * Decomposes c, which must be a Hangul syllable, into buffer
  71          * and returns the length of the decomposition (2 or 3).
  72          */
  73         public static int decompose(int c, Appendable buffer) {
  74             try {
  75                 c-=HANGUL_BASE;
  76                 int c2=c%JAMO_T_COUNT;
  77                 c/=JAMO_T_COUNT;
  78                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  79                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  80                 if(c2==0) {
  81                     return 2;
  82                 } else {
  83                     buffer.append((char)(JAMO_T_BASE+c2));
  84                     return 3;
  85                 }
  86             } catch(IOException e) {
  87                 throw new InternalError(e);
  88             }
  89         }
  90     }
  91 
  92     /**
  93      * Writable buffer that takes care of canonical ordering.
  94      * Its Appendable methods behave like the C++ implementation's
  95      * appendZeroCC() methods.
  96      * <p>
  97      * If dest is a StringBuilder, then the buffer writes directly to it.
  98      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  99      * until no further changes are necessary and whole segments are appended.
 100      * append() methods that take combining-class values always write to the StringBuilder.
 101      * Other append() methods flush and append to the Appendable.
 102      */
 103     public static final class ReorderingBuffer implements Appendable {
 104         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
 105             impl=ni;
 106             app=dest;
 107             if (app instanceof StringBuilder) {
 108                 appIsStringBuilder=true;
 109                 str=(StringBuilder)dest;
 110                 // In Java, the constructor subsumes public void init(int destCapacity)
 111                 str.ensureCapacity(destCapacity);
 112                 reorderStart=0;
 113                 if(str.length()==0) {
 114                     lastCC=0;
 115                 } else {
 116                     setIterator();
 117                     lastCC=previousCC();
 118                     // Set reorderStart after the last code point with cc<=1 if there is one.
 119                     if(lastCC>1) {
 120                         while(previousCC()>1) {}
 121                     }
 122                     reorderStart=codePointLimit;
 123                 }
 124             } else {
 125                 appIsStringBuilder=false;
 126                 str=new StringBuilder();
 127                 reorderStart=0;
 128                 lastCC=0;
 129             }
 130         }
 131 
 132         public boolean isEmpty() { return str.length()==0; }
 133         public int length() { return str.length(); }
 134         public int getLastCC() { return lastCC; }
 135 
 136         public StringBuilder getStringBuilder() { return str; }
 137 
 138         public boolean equals(CharSequence s, int start, int limit) {
 139             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 140         }
 141 
 142         public void append(int c, int cc) {
 143             if(lastCC<=cc || cc==0) {
 144                 str.appendCodePoint(c);
 145                 lastCC=cc;
 146                 if(cc<=1) {
 147                     reorderStart=str.length();
 148                 }
 149             } else {
 150                 insert(c, cc);
 151             }
 152         }
 153         public void append(CharSequence s, int start, int limit, boolean isNFD,
 154                            int leadCC, int trailCC) {
 155             if(start==limit) {
 156                 return;
 157             }
 158             if(lastCC<=leadCC || leadCC==0) {
 159                 if(trailCC<=1) {
 160                     reorderStart=str.length()+(limit-start);
 161                 } else if(leadCC<=1) {
 162                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 163                 }
 164                 str.append(s, start, limit);
 165                 lastCC=trailCC;
 166             } else {
 167                 int c=Character.codePointAt(s, start);
 168                 start+=Character.charCount(c);
 169                 insert(c, leadCC);  // insert first code point
 170                 while(start<limit) {
 171                     c=Character.codePointAt(s, start);
 172                     start+=Character.charCount(c);
 173                     if(start<limit) {
 174                         if (isNFD) {
 175                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
 176                         } else {
 177                             leadCC = impl.getCC(impl.getNorm16(c));
 178                         }
 179                     } else {
 180                         leadCC=trailCC;
 181                     }
 182                     append(c, leadCC);
 183                 }
 184             }
 185         }
 186         // The following append() methods work like C++ appendZeroCC().
 187         // They assume that the cc or trailCC of their input is 0.
 188         // Most of them implement Appendable interface methods.
 189         @Override
 190         public ReorderingBuffer append(char c) {
 191             str.append(c);
 192             lastCC=0;
 193             reorderStart=str.length();
 194             return this;
 195         }
 196         public void appendZeroCC(int c) {
 197             str.appendCodePoint(c);
 198             lastCC=0;
 199             reorderStart=str.length();
 200         }
 201         @Override
 202         public ReorderingBuffer append(CharSequence s) {
 203             if(s.length()!=0) {
 204                 str.append(s);
 205                 lastCC=0;
 206                 reorderStart=str.length();
 207             }
 208             return this;
 209         }
 210         @Override
 211         public ReorderingBuffer append(CharSequence s, int start, int limit) {
 212             if(start!=limit) {
 213                 str.append(s, start, limit);
 214                 lastCC=0;
 215                 reorderStart=str.length();
 216             }
 217             return this;
 218         }
 219         /**
 220          * Flushes from the intermediate StringBuilder to the Appendable,
 221          * if they are different objects.
 222          * Used after recomposition.
 223          * Must be called at the end when writing to a non-StringBuilder Appendable.
 224          */
 225         public void flush() {
 226             if(appIsStringBuilder) {
 227                 reorderStart=str.length();
 228             } else {
 229                 try {
 230                     app.append(str);
 231                     str.setLength(0);
 232                     reorderStart=0;
 233                 } catch(IOException e) {
 234                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 235                 }
 236             }
 237             lastCC=0;
 238         }
 239         /**
 240          * Flushes from the intermediate StringBuilder to the Appendable,
 241          * if they are different objects.
 242          * Then appends the new text to the Appendable or StringBuilder.
 243          * Normally used after quick check loops find a non-empty sequence.
 244          */
 245         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 246             if(appIsStringBuilder) {
 247                 str.append(s, start, limit);
 248                 reorderStart=str.length();
 249             } else {
 250                 try {
 251                     app.append(str).append(s, start, limit);
 252                     str.setLength(0);
 253                     reorderStart=0;
 254                 } catch(IOException e) {
 255                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 256                 }
 257             }
 258             lastCC=0;
 259             return this;
 260         }
 261         public void remove() {
 262             str.setLength(0);
 263             lastCC=0;
 264             reorderStart=0;
 265         }
 266         public void removeSuffix(int suffixLength) {
 267             int oldLength=str.length();
 268             str.delete(oldLength-suffixLength, oldLength);
 269             lastCC=0;
 270             reorderStart=str.length();
 271         }
 272 
 273         // Inserts c somewhere before the last character.
 274         // Requires 0<cc<lastCC which implies reorderStart<limit.
 275         private void insert(int c, int cc) {
 276             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 277             // insert c at codePointLimit, after the character with prevCC<=cc
 278             if(c<=0xffff) {
 279                 str.insert(codePointLimit, (char)c);
 280                 if(cc<=1) {
 281                     reorderStart=codePointLimit+1;
 282                 }
 283             } else {
 284                 str.insert(codePointLimit, Character.toChars(c));
 285                 if(cc<=1) {
 286                     reorderStart=codePointLimit+2;
 287                 }
 288             }
 289         }
 290 
 291         private final NormalizerImpl impl;
 292         private final Appendable app;
 293         private final StringBuilder str;
 294         private final boolean appIsStringBuilder;
 295         private int reorderStart;
 296         private int lastCC;
 297 
 298         // private backward iterator
 299         private void setIterator() { codePointStart=str.length(); }
 300         private void skipPrevious() {  // Requires 0<codePointStart.
 301             codePointLimit=codePointStart;
 302             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 303         }
 304         private int previousCC() {  // Returns 0 if there is no previous character.
 305             codePointLimit=codePointStart;
 306             if(reorderStart>=codePointStart) {
 307                 return 0;
 308             }
 309             int c=str.codePointBefore(codePointStart);
 310             codePointStart-=Character.charCount(c);
 311             return impl.getCCFromYesOrMaybeCP(c);
 312         }
 313         private int codePointStart, codePointLimit;
 314     }
 315 
 316     // TODO: Propose as public API on the UTF16 class.
 317     // TODO: Propose widening UTF16 methods that take char to take int.
 318     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 319     public static final class UTF16Plus {
 320         /**
 321          * Is this code point a lead surrogate (U+d800..U+dbff)?
 322          * @param c code unit or code point
 323          * @return true or false
 324          */
 325         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
 326         /**
 327          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 328          * is it a lead surrogate?
 329          * @param c code unit or code point
 330          * @return true or false
 331          */
 332         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 333 
 334         /**
 335          * Compares two CharSequence subsequences for binary equality.
 336          * @param s1 first sequence
 337          * @param start1 start offset in first sequence
 338          * @param limit1 limit offset in first sequence
 339          * @param s2 second sequence
 340          * @param start2 start offset in second sequence
 341          * @param limit2 limit offset in second sequence
 342          * @return true if s1.subSequence(start1, limit1) contains the same text
 343          *              as s2.subSequence(start2, limit2)
 344          */
 345         public static boolean equal(CharSequence s1, int start1, int limit1,
 346                                     CharSequence s2, int start2, int limit2) {
 347             if((limit1-start1)!=(limit2-start2)) {
 348                 return false;
 349             }
 350             if(s1==s2 && start1==start2) {
 351                 return true;
 352             }
 353             while(start1<limit1) {
 354                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 355                     return false;
 356                 }
 357             }
 358             return true;
 359         }
 360     }
 361 
    /**
     * Creates an instance without any normalization data.
     * The load() methods of this class read the data into the instance and return it.
     */
    public NormalizerImpl() {}
 363 
 364     private static final class IsAcceptable implements ICUBinary.Authenticate {
 365         public boolean isDataVersionAcceptable(byte version[]) {
 366             return version[0]==4;
 367         }
 368     }
    // Shared version checker; passed to ICUBinary.readHeaderAndDataVersion() in load(ByteBuffer).
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    // Data format identifier passed to ICUBinary.readHeaderAndDataVersion(); the four bytes spell the ASCII text below.
    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 371 
    /**
     * Reads Normalizer2 binary data from {@code bytes} and initializes this instance.
     * <p>
     * The buffer is consumed strictly in sequence: the header, the int indexes,
     * the normTrie (plus padding), the composition/mapping string data,
     * and finally 0x100 bytes of smallFCD data.
     *
     * @param bytes buffer from which the header and the data are read
     * @return this instance
     * @throws InternalError if there are too few indexes, if the trie occupies more
     *         bytes than its index range allows, or if an IOException occurs
     */
    public NormalizerImpl load(ByteBuffer bytes) {
        try {
            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
            // The first int is a byte offset that doubles as the byte length of the indexes array.
            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
            if(indexesLength<=IX_MIN_LCCC_CP) {
                throw new InternalError("Normalizer2 data: not enough indexes");
            }
            int[] inIndexes=new int[indexesLength];
            // Index 0 was already consumed above; store its original (byte) value back.
            inIndexes[0]=indexesLength*4;
            for(int i=1; i<indexesLength; ++i) {
                inIndexes[i]=bytes.getInt();
            }

            // Code point thresholds.
            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
            minLcccCP=inIndexes[IX_MIN_LCCC_CP];

            // norm16 value thresholds.
            minYesNo=inIndexes[IX_MIN_YES_NO];
            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
            minNoNo=inIndexes[IX_MIN_NO_NO];
            minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
            minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
            minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
            assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
            centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;

            // Read the normTrie.
            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
            int triePosition = bytes.position();
            normTrie = CodePointTrie.Fast16.fromBinary(bytes);
            // Number of bytes the trie reader actually consumed.
            int trieLength = bytes.position() - triePosition;
            if(trieLength>(nextOffset-offset)) {
                throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
            }
            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes

            // Read the composition and mapping data.
            offset=nextOffset;
            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
            // The byte range is read as 16-bit units, hence the division by 2.
            int numChars=(nextOffset-offset)/2;
            if(numChars!=0) {
                maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
                // extraData is the tail of maybeYesCompositions starting at this offset.
                extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
            }

            // smallFCD: new in formatVersion 2
            // NOTE(review): offset is not read again after this assignment.
            offset=nextOffset;
            smallFCD=new byte[0x100];
            bytes.get(smallFCD);

            return this;
        } catch(IOException e) {
            throw new InternalError(e);
        }
    }
 430     public NormalizerImpl load(String name) {
 431         return load(ICUBinary.getRequiredData(name));
 432     }
 433 
 434     // The trie stores values for lead surrogate code *units*.
 435     // Surrogate code *points* are inert.
 436     public int getNorm16(int c) {
 437         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
 438     }
 439     public int getRawNorm16(int c) { return normTrie.get(c); }
 440     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
 441     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
 442     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
 443 
 444     public int getCC(int norm16) {
 445         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 446             return getCCFromNormalYesOrMaybe(norm16);
 447         }
 448         if(norm16<minNoNo || limitNoNo<=norm16) {
 449             return 0;
 450         }
 451         return getCCFromNoNo(norm16);
 452     }
 453     public static int getCCFromNormalYesOrMaybe(int norm16) {
 454         return (norm16 >> OFFSET_SHIFT) & 0xff;
 455     }
 456     public static int getCCFromYesOrMaybe(int norm16) {
 457         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 458     }
 459     public int getCCFromYesOrMaybeCP(int c) {
 460         if (c < minCompNoMaybeCP) { return 0; }
 461         return getCCFromYesOrMaybe(getNorm16(c));
 462     }
 463 
 464     /**
 465      * Returns the FCD data for code point c.
 466      * @param c A Unicode code point.
 467      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 468      */
 469     public int getFCD16(int c) {
 470         if(c<minDecompNoCP) {
 471             return 0;
 472         } else if(c<=0xffff) {
 473             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 474         }
 475         return getFCD16FromNormData(c);
 476     }
 477     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 478     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 479         // 0<=lead<=0xffff
 480         byte bits=smallFCD[lead>>8];
 481         if(bits==0) { return false; }
 482         return ((bits>>((lead>>5)&7))&1)!=0;
 483     }
 484 
 485     /** Gets the FCD value from the regular normalization data. */
 486     public int getFCD16FromNormData(int c) {
 487         int norm16=getNorm16(c);
 488         if (norm16 >= limitNoNo) {
 489             if(norm16>=MIN_NORMAL_MAYBE_YES) {
 490                 // combining mark
 491                 norm16=getCCFromNormalYesOrMaybe(norm16);
 492                 return norm16|(norm16<<8);
 493             } else if(norm16>=minMaybeYes) {
 494                 return 0;
 495             } else {  // isDecompNoAlgorithmic(norm16)
 496                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 497                 if (deltaTrailCC <= DELTA_TCCC_1) {
 498                     return deltaTrailCC >> OFFSET_SHIFT;
 499                 }
 500                 // Maps to an isCompYesAndZeroCC.
 501                 c=mapAlgorithmic(c, norm16);
 502                 norm16=getRawNorm16(c);
 503             }
 504         }
 505         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 506             // no decomposition or Hangul syllable, all zeros
 507             return 0;
 508         }
 509         // c decomposes, get everything from the variable-length extra data
 510         int mapping=norm16>>OFFSET_SHIFT;
 511         int firstUnit=extraData.charAt(mapping);
 512         int fcd16=firstUnit>>8;  // tccc
 513         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 514             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 515         }
 516         return fcd16;
 517     }
 518 
 519     /**
 520      * Gets the decomposition for one code point.
 521      * @param c code point
 522      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 523      */
 524     public String getDecomposition(int c) {
 525         int norm16;
 526         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 527             // c does not decompose
 528             return null;
 529         }
 530         int decomp = -1;
 531         if(isDecompNoAlgorithmic(norm16)) {
 532             // Maps to an isCompYesAndZeroCC.
 533             decomp=c=mapAlgorithmic(c, norm16);
 534             // The mapping might decompose further.
 535             norm16 = getRawNorm16(c);
 536         }
 537         if (norm16 < minYesNo) {
 538             if(decomp<0) {
 539                 return null;
 540             } else {
 541                 return UTF16.valueOf(decomp);
 542             }
 543         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 544             // Hangul syllable: decompose algorithmically
 545             StringBuilder buffer=new StringBuilder();
 546             Hangul.decompose(c, buffer);
 547             return buffer.toString();
 548         }
 549         // c decomposes, get everything from the variable-length extra data
 550         int mapping=norm16>>OFFSET_SHIFT;
 551         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 552         return extraData.substring(mapping, mapping+length);
 553     }
 554 
    // Fixed norm16 values.
    public static final int MIN_YES_YES_WITH_CC=0xfe02;
    public static final int JAMO_VT=0xfe00;
    // norm16 values at or above this carry a combining class; see getCC() and getCCFromYesOrMaybe().
    public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
    public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
    // Value returned by getNorm16() for lead surrogates.
    public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE

    // norm16 bit 0 is comp-boundary-after.
    public static final int HAS_COMP_BOUNDARY_AFTER=1;
    // Right-shift that drops bit 0 from a norm16 value.
    public static final int OFFSET_SHIFT=1;

    // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
    // tccc (0, 1, >1) for quick FCC boundary-after tests.
    public static final int DELTA_TCCC_0=0;
    public static final int DELTA_TCCC_1=2;
    public static final int DELTA_TCCC_GT_1=4;
    public static final int DELTA_TCCC_MASK=6;
    public static final int DELTA_SHIFT=3;

    // Used together with DELTA_SHIFT in load() to compute centerNoNoDelta.
    public static final int MAX_DELTA=0x40;

    // The IX_ constants are positions in the int indexes array read by load().

    // Byte offsets from the start of the data, after the generic header.
    // Index 0 also gives the byte length of the indexes array: load() divides it by 4.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;
    // NOTE(review): unlike its neighbours this is not an IX_ index; the name and
    // value suggest a code point threshold. It is not referenced in the code shown here.
    public static final int MIN_CCC_LCCC_CP=0x300;
    // Code point thresholds for quick check codes.
    public static final int IX_MIN_DECOMP_NO_CP=8;
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.

    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    public static final int IX_MIN_YES_NO=10;
    /** Mappings are comp-normalized. */
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
    /** Mappings are not comp-normalized but have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
    /** Mappings do not have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
    /** Mappings to the empty string. */
    public static final int IX_MIN_NO_NO_EMPTY=17;

    // load() rejects data that has no more than IX_MIN_LCCC_CP indexes.
    public static final int IX_MIN_LCCC_CP=18;
    public static final int IX_COUNT=20;

    // Flags and length mask for the first unit of a mapping in extraData;
    // see getFCD16FromNormData() and getDecomposition().
    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    // unused bit 0x20;
    public static final int MAPPING_LENGTH_MASK=0x1f;

    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;
 620 
 621     // higher-level functionality ------------------------------------------ ***
 622 
 623     /**
 624      * Decomposes s[src, limit[ and writes the result to dest.
 625      * limit can be NULL if src is NUL-terminated.
 626      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 627      */
 628     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 629                    int destLengthEstimate) {
 630         if(destLengthEstimate<0) {
 631             destLengthEstimate=limit-src;
 632         }
 633         dest.setLength(0);
 634         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
 635         decompose(s, src, limit, buffer);
 636     }
 637 
    /**
     * Decomposes s[src, limit[ into buffer, or checks how far the text is already
     * decomposed and in order.
     * <p>
     * Dual functionality:
     * buffer!=null: normalize
     * buffer==null: isNormalized/quickCheck/spanQuickCheckYes
     *
     * @param s      input text
     * @param src    start index in s
     * @param limit  limit (exclusive end) index in s
     * @param buffer destination for the decomposed text, or null for check-only mode
     * @return with a buffer: limit. Without a buffer: limit if the whole range passes
     *         the check, otherwise the index of the last boundary recorded before
     *         the first failing code point.
     */
    public int decompose(CharSequence s, int src, int limit,
                         ReorderingBuffer buffer) {
        int minNoCP=minDecompNoCP;

        int prevSrc;
        int c=0;
        int norm16=0;

        // only for quick check
        int prevBoundary=src;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))<minNoCP ||
                    isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                ) {
                    ++src;
                } else if(!UTF16Plus.isLeadSurrogate(c)) {
                    // BMP code unit that needs a closer look: leave the fast loop.
                    break;
                } else {
                    // Lead surrogate: combine with a following low surrogate and
                    // look up the supplementary code point instead.
                    char c2;
                    if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
                        c = Character.toCodePoint((char)c, c2);
                        norm16 = normTrie.suppGet(c);
                        if (isMostDecompYesAndZeroCC(norm16)) {
                            src += 2;
                        } else {
                            break;
                        }
                    } else {
                        ++src;  // unpaired lead surrogate: inert
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(buffer!=null) {
                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
                } else {
                    // Check-only mode: the skipped run resets the ordering state.
                    prevCC=0;
                    prevBoundary=src;
                }
            }
            if(src==limit) {
                break;
            }

            // Check one above-minimum, relevant code point.
            // c and norm16 still hold the code point that ended the fast loop.
            src+=Character.charCount(c);
            if(buffer!=null) {
                decompose(c, norm16, buffer);
            } else {
                if(isDecompYes(norm16)) {
                    int cc=getCCFromYesOrMaybe(norm16);
                    // Accept if combining classes are non-decreasing, or cc is 0.
                    if(prevCC<=cc || cc==0) {
                        prevCC=cc;
                        if(cc<=1) {
                            prevBoundary=src;
                        }
                        continue;
                    }
                }
                return prevBoundary;  // "no" or cc out of order
            }
        }
        // The outer loop is only left via break when src==limit.
        return src;
    }
 710     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
             // Appends s to buffer. With doDecompose the text is decomposed on the way in;
             // otherwise s is appended unchanged, with its leading run of ccc!=0 characters
             // handed over separately together with the run's first and last combining class.
 711         final int limit = s.length();
 712         if (limit == 0) {
 713             return;
 714         }
 715         if (doDecompose) {
 716             decompose(s, 0, limit, buffer);
 717             return;
 718         }
             // Just merge the strings at the boundary:
             // measure the leading run of characters whose combining class is non-zero.
 719         int runEnd = 0;
 720         int cp = Character.codePointAt(s, 0);
 721         final int leadCC = getCC(getNorm16(cp));
 722         int trailCC = leadCC;
 723         for (int curCC = leadCC; curCC != 0; ) {
 724             trailCC = curCC;
 725             runEnd += Character.charCount(cp);
 726             if (runEnd >= limit) {
 727                 break;
 728             }
 729             cp = Character.codePointAt(s, runEnd);
 730             curCC = getCC(getNorm16(cp));
 731         }
 732         buffer.append(s, 0, runEnd, false, leadCC, trailCC);
 733         buffer.append(s, runEnd, limit);
 734     }
 736 
 737     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 738     // doCompose: normalize
 739     // !doCompose: isNormalized (buffer must be empty and initialized)
         // Works on s[src, limit[. onlyContiguous selects the contiguous (FCC) variant,
         // see the /* FCC */ note below.
         // Returns true when the end of the range is reached; returns false only with !doCompose,
         // as soon as the text is found to differ from its composed form.
 740     public boolean compose(CharSequence s, int src, int limit,
 741                            boolean onlyContiguous,
 742                            boolean doCompose,
 743                            ReorderingBuffer buffer) {
             // Start of the source text that has been scanned but not yet appended to buffer.
 744         int prevBoundary=src;
 745         int minNoMaybeCP=minCompNoMaybeCP;
 746 
 747         for (;;) {
 748             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 749             // or with (compYes && ccc==0) properties.
 750             int prevSrc;
 751             int c = 0;
 752             int norm16 = 0;
 753             for (;;) {
 754                 if (src == limit) {
 755                     if (prevBoundary != limit && doCompose) {
 756                         buffer.append(s, prevBoundary, limit);
 757                     }
 758                     return true;
 759                 }
 760                 if( (c=s.charAt(src))<minNoMaybeCP ||
 761                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 762                 ) {
 763                     ++src;
 764                 } else {
                         // prevSrc marks the start of the character that ends the fast path.
 765                     prevSrc = src++;
 766                     if (!UTF16Plus.isLeadSurrogate(c)) {
 767                         break;
 768                     } else {
 769                         char c2;
 770                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
 771                             ++src;
 772                             c = Character.toCodePoint((char)c, c2);
 773                             norm16 = normTrie.suppGet(c);
 774                             if (!isCompYesAndZeroCC(norm16)) {
 775                                 break;
 776                             }
 777                         }
 778                     }
 779                 }
 780             }
 781             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 782             // The current character is either a "noNo" (has a mapping)
 783             // or a "maybeYes" (combines backward)
 784             // or a "yesYes" with ccc!=0.
 785             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 786 
 787             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 788             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 789                 if (!doCompose) {
 790                     return false;
 791                 }
 792                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 793                 // In this case, we need not decompose around the current character.
 794                 if (isDecompNoAlgorithmic(norm16)) {
 795                     // Maps to a single isCompYesAndZeroCC character
 796                     // which also implies hasCompBoundaryBefore.
 797                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 798                             hasCompBoundaryBefore(s, src, limit)) {
 799                         if (prevBoundary != prevSrc) {
 800                             buffer.append(s, prevBoundary, prevSrc);
 801                         }
 802                         buffer.append(mapAlgorithmic(c, norm16), 0);
 803                         prevBoundary = src;
 804                         continue;
 805                     }
 806                 } else if (norm16 < minNoNoCompBoundaryBefore) {
 807                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
 808                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 809                             hasCompBoundaryBefore(s, src, limit)) {
 810                         if (prevBoundary != prevSrc) {
 811                             buffer.append(s, prevBoundary, prevSrc);
 812                         }
                             // The first unit at the mapping offset carries the mapping length;
                             // the mapping text itself follows it.
 813                         int mapping = norm16 >> OFFSET_SHIFT;
 814                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
 815                         buffer.append(extraData, mapping, mapping + length);
 816                         prevBoundary = src;
 817                         continue;
 818                     }
 819                 } else if (norm16 >= minNoNoEmpty) {
 820                     // The current character maps to nothing.
 821                     // Simply omit it from the output if there is a boundary before _or_ after it.
 822                     // The character itself implies no boundaries.
 823                     if (hasCompBoundaryBefore(s, src, limit) ||
 824                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
 825                         if (prevBoundary != prevSrc) {
 826                             buffer.append(s, prevBoundary, prevSrc);
 827                         }
 828                         prevBoundary = src;
 829                         continue;
 830                     }
 831                 }
 832                 // Other "noNo" type, or need to examine more text around this character:
 833                 // Fall through to the slow path.
 834             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
 835                 char prev=s.charAt(prevSrc-1);
 836                 if(c<Hangul.JAMO_T_BASE) {
 837                     // The current character is a Jamo Vowel,
 838                     // compose with previous Jamo L and following Jamo T.
                         // The subtraction is narrowed to char, so a prev below JAMO_L_BASE
                         // wraps to a large value and fails the range test.
 839                     char l = (char)(prev-Hangul.JAMO_L_BASE);
 840                     if(l<Hangul.JAMO_L_COUNT) {
 841                         if (!doCompose) {
 842                             return false;
 843                         }
 844                         int t;
 845                         if (src != limit &&
 846                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
 847                                 t < Hangul.JAMO_T_COUNT) {
 848                             // The next character is a Jamo T.
 849                             ++src;
 850                         } else if (hasCompBoundaryBefore(s, src, limit)) {
 851                             // No Jamo T follows, not even via decomposition.
 852                             t = 0;
 853                         } else {
 854                             t = -1;
 855                         }
 856                         if (t >= 0) {
 857                             int syllable = Hangul.HANGUL_BASE +
 858                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
 859                                 Hangul.JAMO_T_COUNT + t;
 860                             --prevSrc;  // Replace the Jamo L as well.
 861                             if (prevBoundary != prevSrc) {
 862                                 buffer.append(s, prevBoundary, prevSrc);
 863                             }
 864                             buffer.append((char)syllable);
 865                             prevBoundary = src;
 866                             continue;
 867                         }
 868                         // If we see L+V+x where x!=T then we drop to the slow path,
 869                         // decompose and recompose.
 870                         // This is to deal with NFKC finding normal L and V but a
 871                         // compatibility variant of a T.
 872                         // We need to either fully compose that combination here
 873                         // (which would complicate the code and may not work with strange custom data)
 874                         // or use the slow path.
 875                     }
 876                 } else if (Hangul.isHangulLV(prev)) {
 877                     // The current character is a Jamo Trailing consonant,
 878                     // compose with previous Hangul LV that does not contain a Jamo T.
 879                     if (!doCompose) {
 880                         return false;
 881                     }
 882                     int syllable = prev + c - Hangul.JAMO_T_BASE;
 883                     --prevSrc;  // Replace the Hangul LV as well.
 884                     if (prevBoundary != prevSrc) {
 885                         buffer.append(s, prevBoundary, prevSrc);
 886                     }
 887                     buffer.append((char)syllable);
 888                     prevBoundary = src;
 889                     continue;
 890                 }
 891                 // No matching context, or may need to decompose surrounding text first:
 892                 // Fall through to the slow path.
 893             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
 894                 // One or more combining marks that do not combine-back:
 895                 // Check for canonical order, copy unchanged if ok and
 896                 // if followed by a character with a boundary-before.
 897                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
 898                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
 899                     // Fails FCD test, need to decompose and contiguously recompose.
 900                     if (!doCompose) {
 901                         return false;
 902                     }
 903                 } else {
 904                     // If !onlyContiguous (not FCC), then we ignore the tccc of
 905                     // the previous character which passed the quick check "yes && ccc==0" test.
 906                     int n16;
 907                     for (;;) {
 908                         if (src == limit) {
 909                             if (doCompose) {
 910                                 buffer.append(s, prevBoundary, limit);
 911                             }
 912                             return true;
 913                         }
 914                         int prevCC = cc;
 915                         c = Character.codePointAt(s, src);
 916                         n16 = normTrie.get(c);
 917                         if (n16 >= MIN_YES_YES_WITH_CC) {
 918                             cc = getCCFromNormalYesOrMaybe(n16);
 919                             if (prevCC > cc) {
 920                                 if (!doCompose) {
 921                                     return false;
 922                                 }
 923                                 break;
 924                             }
 925                         } else {
 926                             break;
 927                         }
 928                         src += Character.charCount(c);
 929                     }
 930                     // src is after the last in-order combining mark.
 931                     // If there is a boundary here, then we continue with no change.
 932                     if (norm16HasCompBoundaryBefore(n16)) {
 933                         if (isCompYesAndZeroCC(n16)) {
 934                             src += Character.charCount(c);
 935                         }
 936                         continue;
 937                     }
 938                     // Use the slow path. There is no boundary in [prevSrc, src[.
 939                 }
 940             }
 941 
 942             // Slow path: Find the nearest boundaries around the current character,
 943             // decompose and recompose.
 944             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                     // Pull the preceding character into the piece to recompose
                     // unless it has a composition boundary after it.
 945                 c = Character.codePointBefore(s, prevSrc);
 946                 norm16 = normTrie.get(c);
 947                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 948                     prevSrc -= Character.charCount(c);
 949                 }
 950             }
 951             if (doCompose && prevBoundary != prevSrc) {
 952                 buffer.append(s, prevBoundary, prevSrc);
 953             }
 954             int recomposeStartIndex=buffer.length();
 955             // We know there is not a boundary here.
 956             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
 957                            buffer);
 958             // Decompose until the next boundary.
 959             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
 960                                  buffer);
 961             recompose(buffer, recomposeStartIndex, onlyContiguous);
 962             if(!doCompose) {
                     // isNormalized mode: the recomposed piece must equal the source piece;
                     // the buffer is then cleared for the next piece.
 963                 if(!buffer.equals(s, prevSrc, src)) {
 964                     return false;
 965                 }
 966                 buffer.remove();
 967             }
 968             prevBoundary=src;
 969         }
 970     }
 971 
 972     /**
 973      * Very similar to compose(): Make the same changes in both places if relevant.
 974      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 975      * !doSpan: quickCheck
          * @param s the text; the code units in [src, limit[ are checked
          * @param src start index (inclusive)
          * @param limit end index (exclusive)
          * @param onlyContiguous true for the FCC variant, which also compares the previous
          *        character's tccc with the current combining class
          * @param doSpan true to stop at the first "maybe" instead of recording it in bit 0
 976      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 977      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 978      *         then the quick check result is "no"
 979      */
 980     public int composeQuickCheck(CharSequence s, int src, int limit,
 981                                  boolean onlyContiguous, boolean doSpan) {
             // Becomes 1 once a "maybe" character has been seen (only when !doSpan).
 982         int qcResult=0;
 983         int prevBoundary=src;
 984         int minNoMaybeCP=minCompNoMaybeCP;
 985 
 986         for(;;) {
 987             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 988             // or with (compYes && ccc==0) properties.
 989             int prevSrc;
 990             int c = 0;
 991             int norm16 = 0;
 992             for (;;) {
 993                 if(src==limit) {
 994                     return (src<<1)|qcResult;  // "yes" or "maybe"
 995                 }
 996                 if( (c=s.charAt(src))<minNoMaybeCP ||
 997                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 998                 ) {
 999                     ++src;
1000                 } else {
1001                     prevSrc = src++;
1002                     if (!UTF16Plus.isLeadSurrogate(c)) {
1003                         break;
1004                     } else {
1005                         char c2;
1006                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
1007                             ++src;
1008                             c = Character.toCodePoint((char)c, c2);
1009                             norm16 = normTrie.suppGet(c);
1010                             if (!isCompYesAndZeroCC(norm16)) {
1011                                 break;
1012                             }
1013                         }
1014                     }
1015                 }
1016             }
1017             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1018             // The current character is either a "noNo" (has a mapping)
1019             // or a "maybeYes" (combines backward)
1020             // or a "yesYes" with ccc!=0.
1021             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1022 
                 // norm16 of the character just before prevSrc when that character has no
                 // composition boundary after it; INERT otherwise.
1023             int prevNorm16 = INERT;
1024             if (prevBoundary != prevSrc) {
1025                 prevBoundary = prevSrc;
1026                 if (!norm16HasCompBoundaryBefore(norm16)) {
1027                     c = Character.codePointBefore(s, prevSrc);
1028                     int n16 = getNorm16(c);
1029                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1030                         prevBoundary -= Character.charCount(c);
1031                         prevNorm16 = n16;
1032                     }
1033                 }
1034             }
1035 
1036             if(isMaybeOrNonZeroCC(norm16)) {
1037                 int cc=getCCFromYesOrMaybe(norm16);
1038                 if (onlyContiguous /* FCC */ && cc != 0 &&
1039                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1040                     // The [prevBoundary..prevSrc[ character
1041                     // passed the quick check "yes && ccc==0" test
1042                     // but is out of canonical order with the current combining mark.
                         // (Intentionally empty: falls out to the "no" return below.)
1043                 } else {
1044                     // If !onlyContiguous (not FCC), then we ignore the tccc of
1045                     // the previous character which passed the quick check "yes && ccc==0" test.
1046                     for (;;) {
1047                         if (norm16 < MIN_YES_YES_WITH_CC) {
1048                             if (!doSpan) {
1049                                 qcResult = 1;
1050                             } else {
1051                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1052                             }
1053                         }
1054                         if (src == limit) {
1055                             return (src<<1) | qcResult;  // "yes" or "maybe"
1056                         }
1057                         int prevCC = cc;
1058                         c = Character.codePointAt(s, src);
1059                         norm16 = getNorm16(c);
1060                         if (isMaybeOrNonZeroCC(norm16)) {
1061                             cc = getCCFromYesOrMaybe(norm16);
1062                             if (!(prevCC <= cc || cc == 0)) {
1063                                 break;
1064                             }
1065                         } else {
1066                             break;
1067                         }
1068                         src += Character.charCount(c);
1069                     }
1070                     // src is after the last in-order combining mark.
1071                     if (isCompYesAndZeroCC(norm16)) {
1072                         prevBoundary = src;
1073                         src += Character.charCount(c);
1074                         continue;
1075                     }
1076                 }
1077             }
1078             return prevBoundary<<1;  // "no"
1079         }
1080     }
1081     public void composeAndAppend(CharSequence s,
1082                                  boolean doCompose,
1083                                  boolean onlyContiguous,
1084                                  ReorderingBuffer buffer) {
             // Appends s to buffer, composing it when doCompose is set. If buffer already has
             // content and findNextCompBoundary() reports that s does not start at a boundary,
             // the end of the buffer (from its previous boundary) and the start of s (up to its
             // next boundary) are joined and composed together first.
1085         final int limit = s.length();
1086         int start = 0;
1087         if (!buffer.isEmpty()) {
1088             final int srcBoundary = findNextCompBoundary(s, 0, limit, onlyContiguous);
1089             if (srcBoundary != 0) {
1090                 final int destBoundary = findPreviousCompBoundary(buffer.getStringBuilder(), buffer.length(), onlyContiguous);
1091                 final int tailLength = buffer.length() - destBoundary;
1092                 final StringBuilder middle = new StringBuilder(tailLength + srcBoundary + 16);
1093                 middle.append(buffer.getStringBuilder(), destBoundary, buffer.length());
1094                 buffer.removeSuffix(tailLength);
1095                 middle.append(s, 0, srcBoundary);
                     // The joined piece is always composed, independent of doCompose.
1096                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1097                 start = srcBoundary;
1098             }
1099         }
1100         if (!doCompose) {
1101             buffer.append(s, start, limit);
1102             return;
1103         }
1104         compose(s, start, limit, onlyContiguous, true, buffer);
1105     }
1106     // Dual functionality:
1107     // buffer!=null: normalize
1108     // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
         // Works on s[src, limit[.
         // Returns limit when the end of the range is reached. With buffer==null, returns the
         // last FCD-safe boundary (prevBoundary) at the first character found out of order.
1109     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1110         // Note: In this function we use buffer.appendZeroCC() because we track
1111         // the lead and trail combining classes here, rather than leaving it to
1112         // the ReorderingBuffer.
1113         // The exception is the call to decomposeShort() which uses the buffer
1114         // in the normal way.
1115 
1116         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1117         // Similar to the prevBoundary in the compose() implementation.
1118         int prevBoundary=src;
1119         int prevSrc;
1120         int c=0;
             // fcd16 values are used as lccc in bits 15..8 and tccc in bits 7..0 (see the order test below).
             // A negative prevFCD16 is a deferred lookup: the bit-inverted code point itself.
1121         int prevFCD16=0;
1122         int fcd16=0;
1123 
1124         for(;;) {
1125             // count code units with lccc==0
1126             for(prevSrc=src; src!=limit;) {
1127                 if((c=s.charAt(src))<minLcccCP) {
                         // Defer fetching the FCD value; remember the code point instead.
1128                     prevFCD16=~c;
1129                     ++src;
1130                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1131                     prevFCD16=0;
1132                     ++src;
1133                 } else {
1134                     if (UTF16Plus.isLeadSurrogate(c)) {
1135                         char c2;
1136                         if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1137                             c = Character.toCodePoint((char)c, c2);
1138                         }
1139                     }
1140                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1141                         prevFCD16=fcd16;
1142                         src+=Character.charCount(c);
1143                     } else {
1144                         break;
1145                     }
1146                 }
1147             }
1148             // copy these code units all at once
1149             if(src!=prevSrc) {
1150                 if(src==limit) {
1151                     if(buffer!=null) {
1152                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1153                     }
1154                     break;
1155                 }
1156                 prevBoundary=src;
1157                 // We know that the previous character's lccc==0.
1158                 if(prevFCD16<0) {
1159                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1160                     int prev=~prevFCD16;
1161                     if(prev<minDecompNoCP) {
1162                         prevFCD16=0;
1163                     } else {
1164                         prevFCD16=getFCD16FromNormData(prev);
1165                         if(prevFCD16>1) {
1166                             --prevBoundary;
1167                         }
1168                     }
1169                 } else {
1170                     int p=src-1;
1171                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1172                         Character.isHighSurrogate(s.charAt(p-1))
1173                     ) {
1174                         --p;
1175                         // Need to fetch the previous character's FCD value because
1176                         // prevFCD16 was just for the trail surrogate code point.
1177                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1178                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1179                     }
1180                     if(prevFCD16>1) {
1181                         prevBoundary=p;
1182                     }
1183                 }
1184                 if(buffer!=null) {
1185                     // The last lccc==0 character is excluded from the
1186                     // flush-and-append call in case it needs to be modified.
1187                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1188                     buffer.append(s, prevBoundary, src);
1189                 }
1190                 // The start of the current character (c).
1191                 prevSrc=src;
1192             } else if(src==limit) {
1193                 break;
1194             }
1195 
1196             src+=Character.charCount(c);
1197             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1198             // Check for proper order, and decompose locally if necessary.
1199             if((prevFCD16&0xff)<=(fcd16>>8)) {
1200                 // proper order: prev tccc <= current lccc
1201                 if((fcd16&0xff)<=1) {
1202                     prevBoundary=src;
1203                 }
1204                 if(buffer!=null) {
1205                     buffer.appendZeroCC(c);
1206                 }
1207                 prevFCD16=fcd16;
1208                 continue;
1209             } else if(buffer==null) {
1210                 return prevBoundary;  // quick check "no"
1211             } else {
1212                 /*
1213                  * Back out the part of the source that we copied or appended
1214                  * already but is now going to be decomposed.
1215                  * prevSrc is set to after what was copied/appended.
1216                  */
1217                 buffer.removeSuffix(prevSrc-prevBoundary);
1218                 /*
1219                  * Find the part of the source that needs to be decomposed,
1220                  * up to the next safe boundary.
1221                  */
1222                 src=findNextFCDBoundary(s, src, limit);
1223                 /*
1224                  * The source text does not fulfill the conditions for FCD.
1225                  * Decompose and reorder a limited piece of the text.
1226                  */
1227                 decomposeShort(s, prevBoundary, src, false, false, buffer);
1228                 prevBoundary=src;
1229                 prevFCD16=0;
1230             }
1231         }
1232         return src;
1233     }
1234 
1235     public boolean hasDecompBoundaryBefore(int c) {
             // Two cheap code point tests first; the norm16 lookup is only the fallback.
1236         final boolean cheapYes = c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c));
1237         return cheapYes || norm16HasDecompBoundaryBefore(getNorm16(c));
1238     }
1239     public boolean norm16HasDecompBoundaryBefore(int norm16) {
             // norm16 flavor of hasDecompBoundaryBefore(): decided from the norm16 value alone,
             // reading extraData only for values in [minNoNoCompNoMaybeCC, limitNoNo[.
1240         if (norm16 < minNoNoCompNoMaybeCC) {
1241             return true;
1242         } else if (norm16 >= limitNoNo) {
1243             return norm16 == JAMO_VT || norm16 <= MIN_NORMAL_MAYBE_YES;
1244         }
             // c decomposes: look at the variable-length extra data.
1245         final int offset = norm16 >> OFFSET_SHIFT;
1246         final boolean hasCccLcccWord = (extraData.charAt(offset) & MAPPING_HAS_CCC_LCCC_WORD) != 0;
             // Boundary if leadCC==0 (hasFCDBoundaryBefore()).
1247         return !hasCccLcccWord || (extraData.charAt(offset - 1) & 0xff00) == 0;
1248     }
1252     public boolean hasDecompBoundaryAfter(int c) {
             // Code point flavor of norm16HasDecompBoundaryAfter(); the two leading tests
             // answer "yes" without a norm16 lookup.
1253         return c < minDecompNoCP
1254             || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c))
1255             || norm16HasDecompBoundaryAfter(getNorm16(c));
1256     }
1261     public boolean norm16HasDecompBoundaryAfter(int norm16) {
             // norm16 flavor of hasDecompBoundaryAfter().
1262         if (norm16 <= minYesNo || isHangulLVT(norm16)) {
1263             return true;
1264         }
1265         if (norm16 >= limitNoNo) {
1266             return isMaybeOrNonZeroCC(norm16)
1267                 ? (norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT)
                     // Otherwise: maps to an isCompYesAndZeroCC.
1268                 : (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1269         }
             // c decomposes, get everything from the variable-length extra data.
             // decomp after-boundary: same as hasFCDBoundaryAfter(), fcd16<=1 || trailCC==0
1270         final int offset = norm16 >> OFFSET_SHIFT;
1271         final int head = extraData.charAt(offset);
1272         final int trailCC = head >> 8;
1273         if (trailCC != 1) {
1274             return trailCC == 0;  // trailCC==0: boundary; trailCC>1: no boundary
1275         }
             // trailCC==1: test leadCC==0, same as checking for before-boundary
1276         return (head & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (extraData.charAt(offset - 1) & 0xff00) == 0;
1277     }
1287     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }  // looks up c's norm16 and applies isDecompYesAndZeroCC()
1288 
1289     public boolean hasCompBoundaryBefore(int c) {
             // Code points below minCompNoMaybeCP are answered without a norm16 lookup.
1290         return (c >= minCompNoMaybeCP) ? norm16HasCompBoundaryBefore(getNorm16(c)) : true;
1291     }
1292     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
             // Code point flavor of norm16HasCompBoundaryAfter(): looks up c's norm16 and delegates.
1293         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1294     }
1295 
1296     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }  // range minMaybeYes..JAMO_VT, both inclusive
1297     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }  // everything from minMaybeYes upward
1298     private static boolean isInert(int norm16) { return norm16==INERT; }  // exact match with the INERT constant
1299     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }  // exact match with the JAMO_VT constant
1300     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }  // the single norm16 value that isHangulLVT() accepts
1301     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }  // exact match with minYesNo
1302     private boolean isHangulLVT(int norm16) {
             // Exact match with the value computed by hangulLVT().
1303         return norm16==hangulLVT();
1304     }
1305     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }  // everything below minNoNo
1306     // UBool isCompYes(uint16_t norm16) const {
1307     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1308     // }
1309     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1310     //     return norm16<minNoNo || minMaybeYes<=norm16;
1311     // }
1312     // private boolean hasZeroCCFromDecompYes(int norm16) {
1313     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1314     // }
1315     private boolean isDecompYesAndZeroCC(int norm16) {
1316         return norm16<minYesNo ||
1317                norm16==JAMO_VT ||
1318                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1319     }
1320     /**
1321      * Cheaper, simpler variant of isDecompYesAndZeroCC(): of the MaybeYes values it accepts
1322      * only MIN_NORMAL_MAYBE_YES, so MaybeYes which combine-forward and have ccc=0 are left out.
1323      * (Standard Unicode 10 normalization does not have such characters.)
1324      */
1325     private boolean isMostDecompYesAndZeroCC(int norm16) {
1326         return norm16 == JAMO_VT || norm16 == MIN_NORMAL_MAYBE_YES || norm16 < minYesNo;
1327     }
1328     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }  // at or above limitNoNo; compose() guards mapAlgorithmic() with this
1329 
1330     // For use with isCompYes().
1331     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1332     // static uint8_t getCCFromYes(uint16_t norm16) {
1333     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1334     // }
1335     private int getCCFromNoNo(int norm16) {
1336         int mapping=norm16>>OFFSET_SHIFT;
1337         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1338             return extraData.charAt(mapping-1)&0xff;
1339         } else {
1340             return 0;
1341         }
1342     }
1343     int getTrailCCFromCompYesAndZeroCC(int norm16) {
             // yesYes and Hangul LV (norm16<=minYesNo) have ccc=tccc=0.
             // Otherwise the tccc is the high byte of the yesNo mapping's first unit;
             // for Hangul LVT this harmlessly fetches a firstUnit with tccc=0.
1344         return (norm16 <= minYesNo) ? 0 : (extraData.charAt(norm16 >> OFFSET_SHIFT) >> 8);
1345     }
1351 
1352     // Requires algorithmic-NoNo.
1353     private int mapAlgorithmic(int c, int norm16) {
1354         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1355     }
1356 
1357     // Requires minYesNo<norm16<limitNoNo.
1358     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1359 
1360     /**
1361      * @return index into maybeYesCompositions, or -1
1362      */
1363     private int getCompositionsListForDecompYes(int norm16) {
1364         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1365             return -1;
1366         } else {
1367             if((norm16-=minMaybeYes)<0) {
1368                 // norm16<minMaybeYes: index into extraData which is a substring at
1369                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1370                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1371                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1372             }
1373             return norm16>>OFFSET_SHIFT;
1374         }
1375     }
1376     /**
1377      * @return index into maybeYesCompositions
1378      */
1379     private int getCompositionsListForComposite(int norm16) {
1380         // A composite has both mapping & compositions list.
1381         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1382         int firstUnit=maybeYesCompositions.charAt(list);
1383         return list+  // mapping in maybeYesCompositions
1384             1+  // +1 to skip the first unit with the mapping length
1385             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1386     }
1387 
1388     // Decompose a short piece of text which is likely to contain characters that
1389     // fail the quick check loop and/or where the quick check loop's overhead
1390     // is unlikely to be amortized.
1391     // Called by the compose() and makeFCD() implementations.
1392     // Public in Java for collation implementation code.
1393     private int decomposeShort(
1394             CharSequence s, int src, int limit,
1395             boolean stopAtCompBoundary, boolean onlyContiguous,
1396             ReorderingBuffer buffer) {
1397         while(src<limit) {
1398             int c=Character.codePointAt(s, src);
1399             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1400                 return src;
1401             }
1402             int norm16 = getNorm16(c);
1403             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1404                 return src;
1405             }
1406             src+=Character.charCount(c);
1407             decompose(c, norm16, buffer);
1408             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1409                 return src;
1410             }
1411         }
1412         return src;
1413     }
1414     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1415         // get the decomposition and the lead and trail cc's
1416         if (norm16 >= limitNoNo) {
1417             if (isMaybeOrNonZeroCC(norm16)) {
1418                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1419                 return;
1420             }
1421             // Maps to an isCompYesAndZeroCC.
1422             c=mapAlgorithmic(c, norm16);
1423             norm16=getRawNorm16(c);
1424         }
1425         if (norm16 < minYesNo) {
1426             // c does not decompose
1427             buffer.append(c, 0);
1428         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1429             // Hangul syllable: decompose algorithmically
1430             Hangul.decompose(c, buffer);
1431         } else {
1432             // c decomposes, get everything from the variable-length extra data
1433             int mapping=norm16>>OFFSET_SHIFT;
1434             int firstUnit=extraData.charAt(mapping);
1435             int length=firstUnit&MAPPING_LENGTH_MASK;
1436             int leadCC, trailCC;
1437             trailCC=firstUnit>>8;
1438             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1439                 leadCC=extraData.charAt(mapping-1)>>8;
1440             } else {
1441                 leadCC=0;
1442             }
1443             ++mapping;  // skip over the firstUnit
1444             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1445         }
1446     }
1447 
1448     /**
1449      * Finds the recomposition result for
1450      * a forward-combining "lead" character,
1451      * specified with a pointer to its compositions list,
1452      * and a backward-combining "trail" character.
1453      *
1454      * <p>If the lead and trail characters combine, then this function returns
1455      * the following "compositeAndFwd" value:
1456      * <pre>
1457      * Bits 21..1  composite character
1458      * Bit      0  set if the composite is a forward-combining starter
1459      * </pre>
1460      * otherwise it returns -1.
1461      *
1462      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1463      * encoded as either pairs or triples of 16-bit units.
1464      * The last entry has the high bit of its first unit set.
1465      *
1466      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1467      * A linear search is used.
1468      *
1469      * <p>See normalizer2impl.h for a more detailed description
1470      * of the compositions list format.
1471      */
1472     private static int combine(String compositions, int list, int trail) {
1473         int key1, firstUnit;
1474         if(trail<COMP_1_TRAIL_LIMIT) {
1475             // trail character is 0..33FF
1476             // result entry may have 2 or 3 units
1477             key1=(trail<<1);
1478             while(key1>(firstUnit=compositions.charAt(list))) {
1479                 list+=2+(firstUnit&COMP_1_TRIPLE);
1480             }
1481             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1482                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1483                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1484                 } else {
1485                     return compositions.charAt(list+1);
1486                 }
1487             }
1488         } else {
1489             // trail character is 3400..10FFFF
1490             // result entry has 3 units
1491             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1492             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1493             int secondUnit;
1494             for(;;) {
1495                 if(key1>(firstUnit=compositions.charAt(list))) {
1496                     list+=2+(firstUnit&COMP_1_TRIPLE);
1497                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1498                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1499                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1500                             break;
1501                         } else {
1502                             list+=3;
1503                         }
1504                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1505                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1506                     } else {
1507                         break;
1508                     }
1509                 } else {
1510                     break;
1511                 }
1512             }
1513         }
1514         return -1;
1515     }
1516 
1517     /*
1518      * Recomposes the buffer text starting at recomposeStartIndex
1519      * (which is in NFD - decomposed and canonically ordered),
1520      * and truncates the buffer contents.
1521      *
1522      * Note that recomposition never lengthens the text:
1523      * Any character consists of either one or two code units;
1524      * a composition may contain at most one more code unit than the original starter,
1525      * while the combining mark that is removed has at least one code unit.
1526      */
1527     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1528                            boolean onlyContiguous) {
1529         StringBuilder sb=buffer.getStringBuilder();
1530         int p=recomposeStartIndex;
1531         if(p==sb.length()) {
1532             return;
1533         }
1534 
1535         int starter, pRemove;
1536         int compositionsList;
1537         int c, compositeAndFwd;
1538         int norm16;
1539         int cc, prevCC;
1540         boolean starterIsSupplementary;
1541 
1542         // Some of the following variables are not used until we have a forward-combining starter
1543         // and are only initialized now to avoid compiler warnings.
1544         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1545         starter=-1;
1546         starterIsSupplementary=false;
1547         prevCC=0;
1548 
1549         for(;;) {
1550             c=sb.codePointAt(p);
1551             p+=Character.charCount(c);
1552             norm16=getNorm16(c);
1553             cc=getCCFromYesOrMaybe(norm16);
1554             if( // this character combines backward and
1555                 isMaybe(norm16) &&
1556                 // we have seen a starter that combines forward and
1557                 compositionsList>=0 &&
1558                 // the backward-combining character is not blocked
1559                 (prevCC<cc || prevCC==0)
1560             ) {
1561                 if(isJamoVT(norm16)) {
1562                     // c is a Jamo V/T, see if we can compose it with the previous character.
1563                     if(c<Hangul.JAMO_T_BASE) {
1564                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1565                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1566                         if(prev<Hangul.JAMO_L_COUNT) {
1567                             pRemove=p-1;
1568                             char syllable=(char)
1569                                 (Hangul.HANGUL_BASE+
1570                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1571                                  Hangul.JAMO_T_COUNT);
1572                             char t;
1573                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1574                                 ++p;
1575                                 syllable+=t;  // The next character was a Jamo T.
1576                             }
1577                             sb.setCharAt(starter, syllable);
1578                             // remove the Jamo V/T
1579                             sb.delete(pRemove, p);
1580                             p=pRemove;
1581                         }
1582                     }
1583                     /*
1584                      * No "else" for Jamo T:
1585                      * Since the input is in NFD, there are no Hangul LV syllables that
1586                      * a Jamo T could combine with.
1587                      * All Jamo Ts are combined above when handling Jamo Vs.
1588                      */
1589                     if(p==sb.length()) {
1590                         break;
1591                     }
1592                     compositionsList=-1;
1593                     continue;
1594                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1595                     // The starter and the combining mark (c) do combine.
1596                     int composite=compositeAndFwd>>1;
1597 
1598                     // Remove the combining mark.
1599                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1600                     sb.delete(pRemove, p);
1601                     p=pRemove;
1602                     // Replace the starter with the composite.
1603                     if(starterIsSupplementary) {
1604                         if(composite>0xffff) {
1605                             // both are supplementary
1606                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1607                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1608                         } else {
1609                             sb.setCharAt(starter, (char)c);
1610                             sb.deleteCharAt(starter+1);
1611                             // The composite is shorter than the starter,
1612                             // move the intermediate characters forward one.
1613                             starterIsSupplementary=false;
1614                             --p;
1615                         }
1616                     } else if(composite>0xffff) {
1617                         // The composite is longer than the starter,
1618                         // move the intermediate characters back one.
1619                         starterIsSupplementary=true;
1620                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1621                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1622                         ++p;
1623                     } else {
1624                         // both are on the BMP
1625                         sb.setCharAt(starter, (char)composite);
1626                     }
1627 
1628                     // Keep prevCC because we removed the combining mark.
1629 
1630                     if(p==sb.length()) {
1631                         break;
1632                     }
1633                     // Is the composite a starter that combines forward?
1634                     if((compositeAndFwd&1)!=0) {
1635                         compositionsList=
1636                             getCompositionsListForComposite(getRawNorm16(composite));
1637                     } else {
1638                         compositionsList=-1;
1639                     }
1640 
1641                     // We combined; continue with looking for compositions.
1642                     continue;
1643                 }
1644             }
1645 
1646             // no combination this time
1647             prevCC=cc;
1648             if(p==sb.length()) {
1649                 break;
1650             }
1651 
1652             // If c did not combine, then check if it is a starter.
1653             if(cc==0) {
1654                 // Found a new starter.
1655                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1656                     // It may combine with something, prepare for it.
1657                     if(c<=0xffff) {
1658                         starterIsSupplementary=false;
1659                         starter=p-1;
1660                     } else {
1661                         starterIsSupplementary=true;
1662                         starter=p-2;
1663                     }
1664                 }
1665             } else if(onlyContiguous) {
1666                 // FCC: no discontiguous compositions; any intervening character blocks.
1667                 compositionsList=-1;
1668             }
1669         }
1670         buffer.flush();
1671     }
1672 
1673     /**
1674      * Does c have a composition boundary before it?
1675      * True if its decomposition begins with a character that has
1676      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1677      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1678      * (isCompYesAndZeroCC()) so we need not decompose.
1679      */
1680     private boolean hasCompBoundaryBefore(int c, int norm16) {
1681         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
1682     }
1683     private boolean norm16HasCompBoundaryBefore(int norm16) {
1684         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1685     }
1686     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1687         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1688     }
1689     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1690         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1691             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1692     }
1693     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1694         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1695     }
1696     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
1697     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1698         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1699             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1700     }
1701 
1702     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1703         while(p>0) {
1704             int c=Character.codePointBefore(s, p);
1705             int norm16 = getNorm16(c);
1706             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1707                 break;
1708             }
1709             p-=Character.charCount(c);
1710             if(hasCompBoundaryBefore(c, norm16)) {
1711                 break;
1712             }
1713         }
1714         return p;
1715     }
1716     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
1717         while(p<limit) {
1718             int c=Character.codePointAt(s, p);
1719             int norm16=normTrie.get(c);
1720             if(hasCompBoundaryBefore(c, norm16)) {
1721                 break;
1722             }
1723             p+=Character.charCount(c);
1724             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1725                 break;
1726             }
1727         }
1728         return p;
1729     }
1730 
1731 
1732     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1733         while(p<limit) {
1734             int c=Character.codePointAt(s, p);
1735             int norm16;
1736             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1737                 break;
1738             }
1739             p+=Character.charCount(c);
1740             if (norm16HasDecompBoundaryAfter(norm16)) {
1741                 break;
1742             }
1743         }
1744         return p;
1745     }
1746 
1747     /**
1748      * Get the canonical decomposition
1749      * sherman  for ComposedCharIter
1750      */
1751     public static int getDecompose(int chars[], String decomps[]) {
1752         Normalizer2 impl = Normalizer2.getNFDInstance();
1753 
1754         int length=0;
1755         int norm16 = 0;
1756         int ch = -1;
1757         int i = 0;
1758 
1759         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1760             //TBD !!!! the hack code heres save us about 50ms for startup
1761             //need a better solution/lookup
1762             if (ch == 0x30ff)
1763                 ch = 0xf900;
1764             else if (ch == 0x115bc)
1765                 ch = 0x1d15e;
1766             else if (ch == 0x1d1c1)
1767                 ch = 0x2f800;
1768 
1769             String s = impl.getDecomposition(ch);
1770 
1771             if(s != null && i < chars.length) {
1772                 chars[i] = ch;
1773                 decomps[i++] = s;
1774             }
1775         }
1776         return i;
1777     }
1778 
1779     //------------------------------------------------------
1780     // special method for Collation (RBTableBuilder.build())
1781     //------------------------------------------------------
1782     private static boolean needSingleQuotation(char c) {
1783         return (c >= 0x0009 && c <= 0x000D) ||
1784                (c >= 0x0020 && c <= 0x002F) ||
1785                (c >= 0x003A && c <= 0x0040) ||
1786                (c >= 0x005B && c <= 0x0060) ||
1787                (c >= 0x007B && c <= 0x007E);
1788     }
1789 
1790     public static String canonicalDecomposeWithSingleQuotation(String string) {
1791        Normalizer2 impl = Normalizer2.getNFDInstance();
1792        char[] src = string.toCharArray();
1793        int    srcIndex = 0;
1794        int    srcLimit = src.length;
1795        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
1796        int    destIndex = 0;
1797        int    destLimit = dest.length;
1798 
1799         int prevSrc;
1800         String norm;
1801         int reorderStartIndex, length;
1802         char c1, c2;
1803         int cp;
1804         int minNoMaybe = 0x00c0;
1805         int cc, prevCC, trailCC;
1806         char[] p;
1807         int pStart;
1808 
1809         // initialize
1810         reorderStartIndex = 0;
1811         prevCC = 0;
1812         norm = null;
1813         cp = 0;
1814         pStart = 0;
1815 
1816         cc = trailCC = -1; // initialize to bogus value
1817         c1 = 0;
1818         for (;;) {
1819             prevSrc=srcIndex;
1820             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
1821             while (srcIndex != srcLimit &&
1822                    ((c1 = src[srcIndex]) < minNoMaybe ||
1823                     (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
1824                     (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
1825                 prevCC = 0;
1826                 srcIndex += (cp < 0x10000) ? 1 : 2;
1827             }
1828 
1829             // copy these code units all at once
1830             if (srcIndex != prevSrc) {
1831                 length = srcIndex - prevSrc;
1832                 if ((destIndex + length) <= destLimit) {
1833                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1834                 }
1835 
1836                 destIndex += length;
1837                 reorderStartIndex = destIndex;
1838             }
1839 
1840             // end of source reached?
1841             if (srcIndex == srcLimit) {
1842                 break;
1843             }
1844 
1845             // cp already contains *src and norm32 is set for it, increment src
1846             srcIndex += (cp < 0x10000) ? 1 : 2;
1847 
1848             if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1849                 c2 = 0;
1850                 length = 1;
1851 
1852                 if (Character.isHighSurrogate(c1)
1853                     || Character.isLowSurrogate(c1)) {
1854                     norm = null;
1855                 }
1856             } else {
1857                 length = 2;
1858                 c2 = src[srcIndex-1];
1859             }
1860 
1861           // get the decomposition and the lead and trail cc's
1862           if (norm == null) {
1863               // cp does not decompose
1864               cc = trailCC = UCharacter.getCombiningClass(cp);
1865               p = null;
1866               pStart = -1;
1867           } else {
1868 
1869                 pStart = 0;
1870                 p = norm.toCharArray();
1871                 length = p.length;
1872                 int cpNum = norm.codePointCount(0, length);
1873                 cc= UCharacter.getCombiningClass(norm.codePointAt(0));
1874                 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
1875                 if (length == 1) {
1876                     // fastpath a single code unit from decomposition
1877                     c1 = p[pStart];
1878                     c2 = 0;
1879                     p = null;
1880                     pStart = -1;
1881                 }
1882             }
1883 
1884             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
1885                 // buffer overflow
1886                 char[] tmpBuf = new char[destLimit * 2];
1887                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
1888                 dest = tmpBuf;
1889                 destLimit = dest.length;
1890             }
1891 
1892             // append the decomposition to the destination buffer, assume length>0
1893             {
1894                 int reorderSplit = destIndex;
1895                 if (p == null) {
1896                     // fastpath: single code point
1897                     if (needSingleQuotation(c1)) {
1898                         //if we need single quotation, no need to consider "prevCC"
1899                         //and it must NOT be a supplementary pair
1900                         dest[destIndex++] = '\'';
1901                         dest[destIndex++] = c1;
1902                         dest[destIndex++] = '\'';
1903                         trailCC = 0;
1904                     } else if(cc != 0 && cc < prevCC) {
1905                         // (c1, c2) is out of order with respect to the preceding
1906                         //  text
1907                         destIndex += length;
1908                         trailCC = insertOrdered(dest, reorderStartIndex,
1909                                                 reorderSplit, destIndex, c1, c2, cc);
1910                     } else {
1911                         // just append (c1, c2)
1912                         dest[destIndex++] = c1;
1913                         if(c2 != 0) {
1914                             dest[destIndex++] = c2;
1915                         }
1916                     }
1917                 } else {
1918                     // general: multiple code points (ordered by themselves)
1919                     // from decomposition
1920                     if (needSingleQuotation(p[pStart])) {
1921                         dest[destIndex++] = '\'';
1922                         dest[destIndex++] = p[pStart++];
1923                         dest[destIndex++] = '\'';
1924                         length--;
1925                         do {
1926                             dest[destIndex++] = p[pStart++];
1927                         } while(--length > 0);
1928                     } else if (cc != 0 && cc < prevCC) {
1929                         destIndex += length;
1930                         trailCC = mergeOrdered(dest, reorderStartIndex,
1931                                                reorderSplit, p, pStart,
1932                                                pStart+length);
1933                     } else {
1934                         // just append the decomposition
1935                         do {
1936                             dest[destIndex++] = p[pStart++];
1937                         } while (--length > 0);
1938                     }
1939                 }
1940             }
1941             prevCC = trailCC;
1942             if(prevCC == 0) {
1943                 reorderStartIndex = destIndex;
1944             }
1945         }
1946 
1947         return new String(dest, 0, destIndex);
1948     }
1949 
1950     /**
1951      * simpler, single-character version of mergeOrdered() -
1952      * bubble-insert one single code point into the preceding string
1953      * which is already canonically ordered
1954      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
1955      *
1956      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1957      *
1958      * before: src[start]..src[current] is already ordered, and
1959      *         src[current]..src[p]     may or may not hold (c, c2) but
1960      *                          must be exactly the same length as (c, c2)
1961      * after: src[start]..src[p] is ordered
1962      *
1963      * @return the trailing combining class
1964      */
1965     private static int/*unsigned byte*/ insertOrdered(char[] source,
1966                                                       int start,
1967                                                       int current, int p,
1968                                                       char c1, char c2,
1969                                                       int/*unsigned byte*/ cc) {
1970         int back, preBack;
1971         int r;
1972         int prevCC, trailCC=cc;
1973 
1974         if (start<current && cc!=0) {
1975             // search for the insertion point where cc>=prevCC
1976             preBack=back=current;
1977 
1978             PrevArgs prevArgs = new PrevArgs();
1979             prevArgs.current  = current;
1980             prevArgs.start    = start;
1981             prevArgs.src      = source;
1982             prevArgs.c1       = c1;
1983             prevArgs.c2       = c2;
1984 
1985             // get the prevCC
1986             prevCC=getPrevCC(prevArgs);
1987             preBack = prevArgs.current;
1988 
1989             if(cc<prevCC) {
1990                 // this will be the last code point, so keep its cc
1991                 trailCC=prevCC;
1992                 back=preBack;
1993                 while(start<preBack) {
1994                     prevCC=getPrevCC(prevArgs);
1995                     preBack=prevArgs.current;
1996                     if(cc>=prevCC) {
1997                         break;
1998                     }
1999                     back=preBack;
2000                 }
2001 
2002                 // this is where we are right now with all these indicies:
2003                 // [start]..[pPreBack] 0..? code points that we can ignore
2004                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
2005                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2006                 // [current]..[p]         1 code point (c, c2) with cc
2007 
2008                 // move the code units in between up
2009                 r=p;
2010                 do {
2011                     source[--r]=source[--current];
2012                 } while (back!=current);
2013             }
2014         }
2015 
2016         // insert (c1, c2)
2017         source[current] = c1;
2018         if (c2!=0) {
2019             source[(current+1)] = c2;
2020         }
2021 
2022         // we know the cc of the last code point
2023         return trailCC;
2024     }
2025     /**
2026      * merge two UTF-16 string parts together
2027      * to canonically order (order by combining classes) their concatenation
2028      *
2029      * the two strings may already be adjacent, so that the merging is done
2030      * in-place if the two strings are not adjacent, then the buffer holding the
2031      * first one must be large enough
2032      * the second string may or may not be ordered in itself
2033      *
2034      * before: [start]..[current] is already ordered, and
2035      *         [next]..[limit]    may be ordered in itself, but
2036      *                          is not in relation to [start..current[
2037      * after: [start..current+(limit-next)[ is ordered
2038      *
2039      * the algorithm is a simple bubble-sort that takes the characters from
2040      * src[next++] and inserts them in correct combining class order into the
2041      * preceding part of the string
2042      *
2043      * since this function is called much less often than the single-code point
2044      * insertOrdered(), it just uses that for easier maintenance
2045      *
2046      * @return the trailing combining class
2047      */
2048     private static int /*unsigned byte*/ mergeOrdered(char[] source,
2049                                                       int start,
2050                                                       int current,
2051                                                       char[] data,
2052                                                         int next,
2053                                                         int limit) {
2054             int r;
2055             int /*unsigned byte*/ cc, trailCC=0;
2056             boolean adjacent;
2057 
2058             adjacent= current==next;
2059             NextCCArgs ncArgs = new NextCCArgs();
2060             ncArgs.source = data;
2061             ncArgs.next   = next;
2062             ncArgs.limit  = limit;
2063 
2064             if(start!=current) {
2065 
2066                 while(ncArgs.next<ncArgs.limit) {
2067                     cc=getNextCC(ncArgs);
2068                     if(cc==0) {
2069                         // does not bubble back
2070                         trailCC=0;
2071                         if(adjacent) {
2072                             current=ncArgs.next;
2073                         } else {
2074                             data[current++]=ncArgs.c1;
2075                             if(ncArgs.c2!=0) {
2076                                 data[current++]=ncArgs.c2;
2077                             }
2078                         }
2079                         break;
2080                     } else {
2081                         r=current+(ncArgs.c2==0 ? 1 : 2);
2082                         trailCC=insertOrdered(source,start, current, r,
2083                                               ncArgs.c1, ncArgs.c2, cc);
2084                         current=r;
2085                     }
2086                 }
2087             }
2088 
2089             if(ncArgs.next==ncArgs.limit) {
2090                 // we know the cc of the last code point
2091                 return trailCC;
2092             } else {
2093                 if(!adjacent) {
2094                     // copy the second string part
2095                     do {
2096                         source[current++]=data[ncArgs.next++];
2097                     } while(ncArgs.next!=ncArgs.limit);
2098                     ncArgs.limit=current;
2099                 }
2100                 PrevArgs prevArgs = new PrevArgs();
2101                 prevArgs.src   = data;
2102                 prevArgs.start = start;
2103                 prevArgs.current =  ncArgs.limit;
2104                 return getPrevCC(prevArgs);
2105             }
2106 
2107     }
2108     /** Mutable in/out state for getPrevCC(): the text range and the code unit(s) last read. */
2109     private static final class PrevArgs {
2110         char[] src;    // text that getPrevCC() reads backwards
2111         int start;     // lower bound; getPrevCC() does not look for a lead surrogate before it
2112         int current;   // index just past the next code point to read; moved back by getPrevCC()
2113         char c1, c2;   // code units of the code point last read; c2 == 0 unless it was a pair
2114     }
2115 
2116     /** Mutable in/out state for getNextCC(): the text range and the code unit(s) last read. */
2117     private static final class NextCCArgs {
2118         char[] source; // text that getNextCC() reads forwards
2119         int next;      // index of the next code unit to read; advanced by getNextCC()
2120         int limit;     // upper bound; getNextCC() does not look for a trail surrogate at it
2121         char c1, c2;   // code units of the code point last read; c2 == 0 unless it was a pair
2122     }
2123     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2124         args.c1=args.source[args.next++];
2125         args.c2=0;
2126 
2127         if (UTF16.isTrailSurrogate(args.c1)) {
2128             /* unpaired second surrogate */
2129             return 0;
2130         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2131             return UCharacter.getCombiningClass(args.c1);
2132         } else if (args.next!=args.limit &&
2133                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2134             ++args.next;
2135             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2136         } else {
2137             /* unpaired first surrogate */
2138             args.c2=0;
2139             return 0;
2140         }
2141     }
2142     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2143         args.c1=args.src[--args.current];
2144         args.c2=0;
2145 
2146         if (args.c1 < MIN_CCC_LCCC_CP) {
2147             return 0;
2148         } else if (UTF16.isLeadSurrogate(args.c1)) {
2149             /* unpaired first surrogate */
2150             return 0;
2151         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2152             return UCharacter.getCombiningClass(args.c1);
2153         } else if (args.current!=args.start &&
2154                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2155             --args.current;
2156             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2157         } else {
2158             /* unpaired second surrogate */
2159             args.c2=0;
2160             return 0;
2161         }
2162     }
2163 
2164     // Returns getFCD16() of the code point that ends at index p in s, or 0 when p == start.
2165     // NOTE(review): the getFCD16() result is passed through unmasked although the name
2166     // suggests a single combining class - confirm that callers expect this.
2167     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2168         return (start == p) ? 0 : getFCD16(Character.codePointBefore(s, p));
2169     }
2170 
2171     private VersionInfo dataVersion;  // NOTE(review): presumably the version of the loaded normalization data - confirm
2172 
2173     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2174     private int minDecompNoCP;
2175     private int minCompNoMaybeCP;
2176     private int minLcccCP;
2177 
2178     // Norm16 value thresholds for quick check combinations and types of extra data.
2179     private int minYesNo;
2180     private int minYesNoMappingsOnly;
2181     private int minNoNo;
2182     private int minNoNoCompBoundaryBefore;
2183     private int minNoNoCompNoMaybeCC;
2184     private int minNoNoEmpty;
2185     private int limitNoNo;
2186     private int centerNoNoDelta;
2187     private int minMaybeYes;
2188 
2189     private CodePointTrie.Fast16 normTrie;  // NOTE(review): presumably maps code points to norm16 values - confirm
2190     private String maybeYesCompositions;  // NOTE(review): presumably composition data for maybeYes characters - confirm
2191     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2192     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2193 }