< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

Print this page


   1 /*
   2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package sun.text.normalizer;
  34 
  35 import java.io.IOException;
  36 import java.nio.ByteBuffer;
  37 import java.text.Normalizer;
  38 
  39 // Original filename in ICU4J: Normalizer2Impl.java
  40 public final class NormalizerImpl {
  41 
  42     public static final class Hangul {
             // Constants and static helpers for algorithmic handling of precomposed
             // Hangul syllables and the conjoining Jamo they decompose into.
  43         /* Korean Hangul and Jamo constants */
  44         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  45         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  46         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
             // decompose() only emits JAMO_T_BASE+t for t!=0, so trail index 0
             // means "no trail jamo" rather than a character at JAMO_T_BASE itself.
  47 
  48         public static final int HANGUL_BASE=0xac00;
  49         public static final int HANGUL_END=0xd7a3;
  50 
  51         public static final int JAMO_L_COUNT=19;
  52         public static final int JAMO_V_COUNT=21;
  53         public static final int JAMO_T_COUNT=28;
  54 
             // 19*21*28 = 11172 syllables; HANGUL_LIMIT is therefore 0xd7a4, i.e. HANGUL_END+1 (exclusive).
  55         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  56         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  57 
             /** Returns true if c lies in the half-open range [HANGUL_BASE, HANGUL_LIMIT). */
  58         public static boolean isHangul(int c) {
  59             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  60         }
  61 
             /**
              * Returns true if c is a Hangul syllable whose trail-jamo index is 0,
              * i.e. one for which decompose() would produce only two jamo.
              */
  62         public static boolean isHangulWithoutJamoT(char c) {
                 // The compound assignment narrows back to char, so values below
                 // HANGUL_BASE wrap to large positive numbers and fail the range test.
  63             c-=HANGUL_BASE;
  64             return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  65         }
  66 
  67         /**
  68          * Decomposes c, which must be a Hangul syllable, into buffer
  69          * and returns the length of the decomposition (2 or 3).
              * The range of c is not checked here.
              * An IOException thrown by buffer is rethrown wrapped in an InternalError.
  70          */
  71         public static int decompose(int c, Appendable buffer) {
  72             try {
  73                 c-=HANGUL_BASE;
                     // c2 is the trail-jamo index; after the division c is the combined lead/vowel index.
  74                 int c2=c%JAMO_T_COUNT;
  75                 c/=JAMO_T_COUNT;
  76                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  77                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  78                 if(c2==0) {
  79                     return 2;
  80                 } else {
  81                     buffer.append((char)(JAMO_T_BASE+c2));
  82                     return 3;
  83                 }
  84             } catch(IOException e) {
  85                 throw new InternalError(e);
  86             }
  87         }
  88     }
  89 
  90     /**
  91      * Writable buffer that takes care of canonical ordering.
  92      * Its Appendable methods behave like the C++ implementation's
  93      * appendZeroCC() methods.
  94      * <p>
  95      * If dest is a StringBuilder, then the buffer writes directly to it.
  96      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  97      * until no further changes are necessary and whole segments are appended.
  98      * append() methods that take combining-class values always write to the StringBuilder.
  99      * Other append() methods flush and append to the Appendable.
 100      */
 101     public static final class ReorderingBuffer implements Appendable {
 102         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
                 // Binds the buffer to ni (used for norm16 lookups) and to the destination dest.
                 // destCapacity is only applied when dest is a StringBuilder.
 103             impl=ni;
 104             app=dest;
 105             if (app instanceof StringBuilder) {
                     // Write straight into the caller's StringBuilder.
 106                 appIsStringBuilder=true;
 107                 str=(StringBuilder)dest;
 108                 // In Java, the constructor subsumes public void init(int destCapacity)
 109                 str.ensureCapacity(destCapacity);
 110                 reorderStart=0;
 111                 if(str.length()==0) {
 112                     lastCC=0;
 113                 } else {
                         // dest already has text: derive lastCC from its last code point,
                         // scanning backwards from the end of str.
 114                     setIterator();
 115                     lastCC=previousCC();
 116                     // Set reorderStart after the last code point with cc<=1 if there is one.
 117                     if(lastCC>1) {
 118                         while(previousCC()>1) {}
 119                     }
 120                     reorderStart=codePointLimit;
 121                 }
 122             } else {
                     // Any other Appendable: stage text in a private StringBuilder until flushed.
 123                 appIsStringBuilder=false;
 124                 str=new StringBuilder();
 125                 reorderStart=0;
 126                 lastCC=0;
 127             }
 128         }
 129 
 130         public boolean isEmpty() { return str.length()==0; }  // true when str currently holds no code units
 131         public int length() { return str.length(); }  // length of str in UTF-16 code units
 132         public int getLastCC() { return lastCC; }  // combining class tracked for the most recently appended text
 133 
 134         public StringBuilder getStringBuilder() { return str; }  // the destination itself if it is a StringBuilder, else the staging buffer
 135 
 136         public boolean equals(CharSequence s, int start, int limit) {
                 // An overload, not Object.equals(Object): compares the entire contents
                 // of str with the code units s[start, limit) for exact equality.
 137             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 138         }
 139 
 140         // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
             // Overwrites the final code unit of str in place; lastCC and reorderStart are not changed.
             // str must be non-empty, otherwise the index passed to setCharAt() is -1.
 141         public void setLastChar(char c) {
 142             str.setCharAt(str.length()-1, c);
 143         }
 144 
 145         public void append(int c, int cc) {
                 // Adds code point c, whose combining class is cc, keeping str in canonical order.
 146             if(lastCC<=cc || cc==0) {
                     // c is a starter or already in order after the last character: append at the end.
 147                 str.appendCodePoint(c);
 148                 lastCC=cc;
 149                 if(cc<=1) {
                         // Later backward scans (previousCC) stop at reorderStart.
 150                     reorderStart=str.length();
 151                 }
 152             } else {
                     // 0<cc<lastCC: c has to go in front of one or more trailing characters.
 153                 insert(c, cc);
 154             }
 155         }
 156 
 157         // s must be in NFD, otherwise change the implementation.
             // Appends s[start, limit). leadCC and trailCC are the caller-supplied combining
             // classes of the first and last code point of that range; they are not verified here.
 158         public void append(CharSequence s, int start, int limit,
 159                            int leadCC, int trailCC) {
 160             if(start==limit) {
                     // Empty range: nothing to do, state is left unchanged.
 161                 return;
 162             }
 163             if(lastCC<=leadCC || leadCC==0) {
                     // The range can be appended as a whole without reordering.
                     // reorderStart is computed from the length before the append below.
 164                 if(trailCC<=1) {
 165                     reorderStart=str.length()+(limit-start);
 166                 } else if(leadCC<=1) {
 167                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 168                 }
 169                 str.append(s, start, limit);
 170                 lastCC=trailCC;
 171             } else {
                     // The first code point must be inserted before existing text;
                     // the remaining ones go through append(c, cc) one at a time.
 172                 int c=Character.codePointAt(s, start);
 173                 start+=Character.charCount(c);
 174                 insert(c, leadCC);  // insert first code point
 175                 while(start<limit) {
 176                     c=Character.codePointAt(s, start);
 177                     start+=Character.charCount(c);
 178                     if(start<limit) {
 179                         // s must be in NFD, otherwise we need to use getCC().
 180                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
 181                     } else {
                             // Last code point of the range: use the caller-supplied trailCC.
 182                         leadCC=trailCC;
 183                     }
 184                     append(c, leadCC);
 185                 }
 186             }
 187         }
 188 
 189         // The following append() methods work like C++ appendZeroCC().
 190         // They assume that the cc or trailCC of their input is 0.
 191         // Most of them implement Appendable interface methods.
 192         // @Override when we switch to Java 6
             // Appends one code unit and resets the reordering state:
             // lastCC becomes 0 and reorderStart moves to the end of str.
 193         public ReorderingBuffer append(char c) {
 194             str.append(c);
 195             lastCC=0;
 196             reorderStart=str.length();
 197             return this;
 198         }
 199 
 200         public void appendZeroCC(int c) {
                 // Code point variant of append(char): appends c (one or two code units),
                 // then sets lastCC to 0 and moves reorderStart to the end of str.
 201             str.appendCodePoint(c);
 202             lastCC=0;
 203             reorderStart=str.length();
 204         }
 205 
 206         // @Override when we switch to Java 6
             // Appends all of s to str. For an empty s nothing changes, including lastCC and reorderStart.
 207         public ReorderingBuffer append(CharSequence s) {
 208             if(s.length()!=0) {
 209                 str.append(s);
 210                 lastCC=0;
 211                 reorderStart=str.length();
 212             }
 213             return this;
 214         }
 215 
 216         // @Override when we switch to Java 6
             // Appends s[start, limit) to str. For an empty range nothing changes,
             // including lastCC and reorderStart.
 217         public ReorderingBuffer append(CharSequence s, int start, int limit) {
 218             if(start!=limit) {
 219                 str.append(s, start, limit);
 220                 lastCC=0;
 221                 reorderStart=str.length();
 222             }
 223             return this;
 224         }
 225 
 226         /**
 227          * Flushes from the intermediate StringBuilder to the Appendable,
 228          * if they are different objects.
 229          * Used after recomposition.
 230          * Must be called at the end when writing to a non-StringBuilder Appendable.
              * <p>
              * In both cases lastCC is reset to 0. When writing directly to a StringBuilder
              * nothing is copied; only reorderStart is moved to the end of the text.
              * An IOException from the Appendable is rethrown as an InternalError.
 231          */
 232         public void flush() {
 233             if(appIsStringBuilder) {
 234                 reorderStart=str.length();
 235             } else {
 236                 try {
                         // Hand the staged text to the real destination, then empty the staging buffer.
 237                     app.append(str);
 238                     str.setLength(0);
 239                     reorderStart=0;
 240                 } catch(IOException e) {
 241                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 242                 }
 243             }
 244             lastCC=0;
 245         }
 246 
 247         /**
 248          * Flushes from the intermediate StringBuilder to the Appendable,
 249          * if they are different objects.
 250          * Then appends the new text to the Appendable or StringBuilder.
 251          * Normally used after quick check loops find a non-empty sequence.
              * <p>
              * s[start, limit) is appended as-is, without any reordering; afterwards
              * lastCC is 0. An IOException from the Appendable is rethrown as an InternalError.
              * @return this buffer
 252          */
 253         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 254             if(appIsStringBuilder) {
 255                 str.append(s, start, limit);
 256                 reorderStart=str.length();
 257             } else {
 258                 try {
                         // Staged text first, then the new range, both directly to the destination.
 259                     app.append(str).append(s, start, limit);
 260                     str.setLength(0);
 261                     reorderStart=0;
 262                 } catch(IOException e) {
 263                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 264                 }
 265             }
 266             lastCC=0;
 267             return this;
 268         }
 269 
 270         public void remove() {
                 // Discards everything currently in str and resets the reordering state.
                 // Note that when str is the caller's StringBuilder, its contents are cleared.
 271             str.setLength(0);
 272             lastCC=0;
 273             reorderStart=0;
 274         }
 275 
 276         public void removeSuffix(int suffixLength) {
                 // Deletes the last suffixLength code units of str, then resets lastCC to 0
                 // and moves reorderStart to the new end of str.
 277             int oldLength=str.length();
 278             str.delete(oldLength-suffixLength, oldLength);
 279             lastCC=0;
 280             reorderStart=str.length();
 281         }
 282 
 283         // Inserts c somewhere before the last character.
 284         // Requires 0<cc<lastCC which implies reorderStart<limit.
 285         private void insert(int c, int cc) {
 286             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 287             // insert c at codePointLimit, after the character with prevCC<=cc
 288             if(c<=0xffff) {
 289                 str.insert(codePointLimit, (char)c);
 290                 if(cc<=1) {
 291                     reorderStart=codePointLimit+1;
 292                 }
 293             } else {
 294                 str.insert(codePointLimit, Character.toChars(c));
 295                 if(cc<=1) {


 301         private final NormalizerImpl impl;  // supplies norm16 values for combining-class lookups
 302         private final Appendable app;  // the destination passed to the constructor
 303         private final StringBuilder str;  // app itself when it is a StringBuilder, otherwise a private staging buffer
 304         private final boolean appIsStringBuilder;  // true when str and app are the same object
 305         private int reorderStart;  // index into str at which previousCC() stops scanning backwards
 306         private int lastCC;  // combining class tracked for the end of str; 0 after flush/remove
 307 
 308         // private backward iterator
             // setIterator() positions the iterator at the end of str; codePointLimit is not touched.
 309         private void setIterator() { codePointStart=str.length(); }
 310         private void skipPrevious() {  // Requires 0<codePointStart.
                 // Steps back over one code point without looking up its combining class.
                 // Afterwards [codePointStart, codePointLimit) spans the skipped code point.
 311             codePointLimit=codePointStart;
 312             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 313         }
 314         private int previousCC() {  // Returns 0 if there is no previous character.
                 // Steps back one code point and returns its combining class.
 315             codePointLimit=codePointStart;
 316             if(reorderStart>=codePointStart) {
                     // Reached the reordering boundary: report 0 and do not move codePointStart.
 317                 return 0;
 318             }
 319             int c=str.codePointBefore(codePointStart);
 320             codePointStart-=Character.charCount(c);
 321             if(c<MIN_CCC_LCCC_CP) {
                     // Code points below MIN_CCC_LCCC_CP are treated as cc==0 without a data lookup.
 322                 return 0;
 323             }
 324             return getCCFromYesOrMaybe(impl.getNorm16(c));
 325         }
 326 
 327         private int codePointStart, codePointLimit;
 328     }
 329 
 330     // TODO: Propose as public API on the UTF16 class.
 331     // TODO: Propose widening UTF16 methods that take char to take int.
 332     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 333     public static final class UTF16Plus {
 334         /**
 335          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 336          * is it a lead surrogate?
              * Only bit 0x400 is tested: it is clear for lead surrogates (U+D800..U+DBFF)
              * and set for trail surrogates (U+DC00..U+DFFF). The result is meaningless
              * for non-surrogate input.
 337          * @param c code unit or code point
 338          * @return true or false
 339          */
 340         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 341 
 342         /**
 343          * Compares two CharSequence subsequences for binary equality.
 344          * @param s1 first sequence
 345          * @param start1 start offset in first sequence
 346          * @param limit1 limit offset in first sequence


 353         public static boolean equal(CharSequence s1, int start1, int limit1,
 354                                     CharSequence s2, int start2, int limit2) {
                 // Ranges of different length can never be equal.
 355             if((limit1-start1)!=(limit2-start2)) {
 356                 return false;
 357             }
                 // Same object and same start (lengths already match): identical range, no scan needed.
 358             if(s1==s2 && start1==start2) {
 359                 return true;
 360             }
                 // Compare code unit by code unit; limit2 is implied by the equal lengths.
 361             while(start1<limit1) {
 362                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 363                     return false;
 364                 }
 365             }
 366             return true;
 367         }
 368     }
 369 
 370     public NormalizerImpl() {}  // does no initialization itself; the data fields are assigned by load()
 371 
 372     private static final class IsAcceptable implements ICUBinary.Authenticate {
             // Version check handed to ICUBinary.readHeaderAndDataVersion() in load():
             // only data whose first version byte is 2 is accepted.
 373         // @Override when we switch to Java 6
 374         public boolean isDataVersionAcceptable(byte version[]) {
 375             return version[0]==2;
 376         }
 377     }
 378 
 379     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 380     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 381 
 382     public NormalizerImpl load(ByteBuffer bytes) {
             /*
              * Reads serialized Normalizer2 data from bytes and initializes this instance.
              * Consumes, in order: the ICU data header, the int[] indexes, the normTrie
              * (plus padding), the composition/mapping data, and the 0x100-byte smallFCD
              * table. Finally derives tccc180[] for code points below U+0180.
              *
              * bytes: buffer positioned at the start of the data header.
              * Returns this instance, so calls can be chained.
              * Throws InternalError (wrapping the IOException) when an IOException is
              * raised while reading, including the explicit checks below.
              */
 383         try {
 384             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 385             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
                 // The highest index read below is IX_MIN_YES_NO_MAPPINGS_ONLY, so the array
                 // must extend past it; otherwise that read would fail with an
                 // ArrayIndexOutOfBoundsException instead of this data error.
 386             if(indexesLength<=IX_MIN_YES_NO_MAPPINGS_ONLY) {
 387                 throw new IOException("Normalizer2 data: not enough indexes");
 388             }
 389             int[] inIndexes=new int[indexesLength];
                 // Slot 0 was already consumed from the buffer above; restore its byte value.
 390             inIndexes[0]=indexesLength*4;
 391             for(int i=1; i<indexesLength; ++i) {
 392                 inIndexes[i]=bytes.getInt();
 393             }
 394 
 395             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 396             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

 397 
 398             minYesNo=inIndexes[IX_MIN_YES_NO];
 399             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 400             minNoNo=inIndexes[IX_MIN_NO_NO];



 401             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 402             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];


 403 
 404             // Read the normTrie.
 405             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 406             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 407             normTrie=Trie2_16.createFromSerialized(bytes);
 408             int trieLength=normTrie.getSerializedLength();
 409             if(trieLength>(nextOffset-offset)) {
 410                 throw new IOException("Normalizer2 data: not enough bytes for normTrie");
 411             }
 412             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 413 
 414             // Read the composition and mapping data.
 415             offset=nextOffset;
 416             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
                 // The section is a sequence of 16-bit units, hence the division by 2.
 417             int numChars=(nextOffset-offset)/2;
 418             char[] chars;
 419             if(numChars!=0) {
 420                 chars=new char[numChars];
 421                 for(int i=0; i<numChars; ++i) {
 422                     chars[i]=bytes.getChar();
 423                 }
 424                 maybeYesCompositions=new String(chars);
                     // extraData is the part of the same string starting at unit MIN_NORMAL_MAYBE_YES-minMaybeYes.
 425                 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
 426             }
 427 
 428             // smallFCD: new in formatVersion 2
 429             offset=nextOffset;
 430             smallFCD=new byte[0x100];
 431             for(int i=0; i<0x100; ++i) {
 432                 smallFCD[i]=bytes.get();
 433             }
 434 
 435             // Build tccc180[].
 436             // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
 437             tccc180=new int[0x180];
 438             int bits=0;
                 // Each smallFCD bit covers a run of 0x20 code points. A clear bit leaves
                 // the run's tccc180 entries at 0; a set bit fills them from the norm data.
 439             for(int c=0; c<0x180; bits>>=1) {
 440                 if((c&0xff)==0) {
 441                     bits=smallFCD[c>>8];  // one byte per 0x100 code points
 442                 }
 443                 if((bits&1)!=0) {
 444                     for(int i=0; i<0x20; ++i, ++c) {
 445                         tccc180[c]=getFCD16FromNormData(c)&0xff;
 446                     }
 447                 } else {
 448                     c+=0x20;
 449                 }
 450             }
 451 
 452             return this;
 453         } catch(IOException e) {
 454             throw new InternalError(e);
 455         }
 456     }
 457 
 458     public NormalizerImpl load(String name) {
             // Obtains the data for name via ICUBinary.getRequiredData() and delegates to load(ByteBuffer).
 459         return load(ICUBinary.getRequiredData(name));
 460     }
 461 
 462     public int getNorm16(int c) {
             // Returns the value stored for code point c in normTrie (the trie read by load()).
 463         return normTrie.get(c);
 464     }
 465 



 466     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }  // true unless norm16 lies in [minYesNo, minMaybeYes)
 467 
 468     public int getCC(int norm16) {
             // Returns the combining class encoded by norm16.
 469         if(norm16>=MIN_NORMAL_MAYBE_YES) {
                 // In this range the low byte of norm16 is the combining class itself.
 470             return norm16&0xff;
 471         }
 472         if(norm16<minNoNo || limitNoNo<=norm16) {
 473             return 0;
 474         }
             // norm16 is in [minNoNo, limitNoNo): delegate to getCCFromNoNo().
 475         return getCCFromNoNo(norm16);
 476     }
 477 


 478     public static int getCCFromYesOrMaybe(int norm16) {
             // Cheaper variant of getCC(): the low byte for norm16>=MIN_NORMAL_MAYBE_YES, else 0.
             // It never consults the [minNoNo, limitNoNo) range that getCC() handles.
 479         return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;




 480     }
 481 
 482     /**
 483      * Returns the FCD data for code point c.
          * Negative c yields 0. For c below U+0180 the value comes from tccc180[],
          * which load() fills with the low byte (tccc) only. For other BMP code points
          * the smallFCD bit set is checked first, so the norm data lookup can be skipped.
 484      * @param c A Unicode code point.
 485      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 486      */
 487     public int getFCD16(int c) {
 488         if(c<0) {
 489             return 0;
 490         } else if(c<0x180) {
 491             return tccc180[c];
 492         } else if(c<=0xffff) {
 493             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 494         }
             // Reached for supplementary code points and for BMP code points whose smallFCD bit is set.
 495         return getFCD16FromNormData(c);
 496     }
 497 
 498     /** Returns the FCD data for U+0000<=c<U+0180. No range check is made; c indexes tccc180[] directly. */
 499     public int getFCD16FromBelow180(int c) { return tccc180[c]; }
 500     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 501     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 502         // 0<=lead<=0xffff
             // smallFCD holds one byte per 0x100 code units; within that byte,
             // bit (lead>>5)&7 covers the run of 0x20 code units that contains lead.
 503         byte bits=smallFCD[lead>>8];
 504         if(bits==0) { return false; }
 505         return ((bits>>((lead>>5)&7))&1)!=0;
 506     }
 507 
 508     /** Gets the FCD value from the regular normalization data. */
 509     public int getFCD16FromNormData(int c) {
 510         // Only loops for 1:1 algorithmic mappings.
             // Every other branch returns; the algorithmic branch replaces c and retries.
 511         for(;;) {
 512             int norm16=getNorm16(c);
 513             if(norm16<=minYesNo) {
 514                 // no decomposition or Hangul syllable, all zeros
 515                 return 0;
 516             } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
 517                 // combining mark
                     // lccc and tccc are both the character's own cc, so the low byte is duplicated.
 518                 norm16&=0xff;
 519                 return norm16|(norm16<<8);
 520             } else if(norm16>=minMaybeYes) {
 521                 return 0;
 522             } else if(isDecompNoAlgorithmic(norm16)) {





 523                 c=mapAlgorithmic(c, norm16);
 524             } else {






 525                 // c decomposes, get everything from the variable-length extra data
                     // norm16 is used as an index into extraData; the unit there is the mapping's first unit.
 526                 int firstUnit=extraData.charAt(norm16);
 527                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
 528                     // A character that is deleted (maps to an empty string) must
 529                     // get the worst-case lccc and tccc values because arbitrary
 530                     // characters on both sides will become adjacent.
 531                     return 0x1ff;
 532                 } else {
 533                     int fcd16=firstUnit>>8;  // tccc
 534                     if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                             // The unit just before the first unit holds lccc in its high byte.
 535                         fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
 536                     }
 537                     return fcd16;
 538                 }
 539             }
 540         }
 541     }
 542 
 543     /**
 544      * Gets the decomposition for one code point.
          * A 1:1 algorithmic mapping is followed repeatedly: the mapped code point is
          * looked up again, and if it does not decompose further it is returned as a string.
 545      * @param c code point
 546      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 547      */
 548     public String getDecomposition(int c) {
             // decomp stays negative until an algorithmic mapping has been applied.
 549         int decomp=-1;
 550         int norm16;
 551         for(;;) {
 552             if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
 553                 // c does not decompose
 554             } else if(isHangul(norm16)) {
 555                 // Hangul syllable: decompose algorithmically
 556                 StringBuilder buffer=new StringBuilder();
 557                 Hangul.decompose(c, buffer);
 558                 return buffer.toString();
 559             } else if(isDecompNoAlgorithmic(norm16)) {
                     // Remember the mapped code point and look it up again.
 560                 decomp=c=mapAlgorithmic(c, norm16);
 561                 continue;
 562             } else {
 563                 // c decomposes, get everything from the variable-length extra data
                     // The low bits of the first unit give the mapping length; the mapping text follows it.
 564                 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
 565                 return extraData.substring(norm16, norm16+length);
 566             }

                 // Only reached from the "does not decompose" branch above.
 567             if(decomp<0) {
 568                 return null;
 569             } else {
 570                 return UTF16.valueOf(decomp);
 571             }





 572         }
 573     }
 574 
 575     public static final int MIN_CCC_LCCC_CP=0x300;





















 576 
 577     public static final int MIN_YES_YES_WITH_CC=0xff01;
 578     public static final int JAMO_VT=0xff00;
 579     public static final int MIN_NORMAL_MAYBE_YES=0xfe00;  // norm16 values at or above this carry the combining class in their low byte (see getCC)
 580     public static final int MAX_DELTA=0x40;
 581 
         // The IX_ constants are positions in the int[] of indexes that load() reads from the data.
 582     // Byte offsets from the start of the data, after the generic header.
 583     public static final int IX_NORM_TRIE_OFFSET=0;
 584     public static final int IX_EXTRA_DATA_OFFSET=1;
 585     public static final int IX_SMALL_FCD_OFFSET=2;
 586 


 587     // Code point thresholds for quick check codes.
 588     public static final int IX_MIN_DECOMP_NO_CP=8;
 589     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
 590 
 591     // Norm16 value thresholds for quick check combinations and types of extra data.
 592     // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.

 593     public static final int IX_MIN_YES_NO=10;

 594     public static final int IX_MIN_NO_NO=11;
 595     public static final int IX_LIMIT_NO_NO=12;
 596     public static final int IX_MIN_MAYBE_YES=13;
 597 
 598     // Mappings only in [minYesNoMappingsOnly..minNoNo[.
         // This is the highest index that load() reads.
 599     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;









 600 
         // Bit fields of the first 16-bit unit of a mapping in extraData.
 601     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;  // set when the preceding unit holds lccc in its high byte (see getFCD16FromNormData)


 602     public static final int MAPPING_LENGTH_MASK=0x1f;  // low 5 bits: length of the mapping in code units
 603 
 604     public static final int COMP_1_LAST_TUPLE=0x8000;
 605     public static final int COMP_1_TRIPLE=1;
 606     public static final int COMP_1_TRAIL_LIMIT=0x3400;
 607     public static final int COMP_1_TRAIL_MASK=0x7ffe;
 608     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
 609     public static final int COMP_2_TRAIL_SHIFT=6;
 610     public static final int COMP_2_TRAIL_MASK=0xffc0;
 611 
 612     // higher-level functionality ------------------------------------------ ***
 613 
 614     /**
 615      * Decomposes s[src, limit[ and writes the result to dest.
 616      * limit can be NULL if src is NUL-terminated.
 617      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 618      */
 619     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 620                    int destLengthEstimate) {
 621         if(destLengthEstimate<0) {


 685             // Check one above-minimum, relevant code point.
 686             src+=Character.charCount(c);
 687             if(buffer!=null) {
 688                 decompose(c, norm16, buffer);
 689             } else {
 690                 if(isDecompYes(norm16)) {
 691                     int cc=getCCFromYesOrMaybe(norm16);
 692                     if(prevCC<=cc || cc==0) {
 693                         prevCC=cc;
 694                         if(cc<=1) {
 695                             prevBoundary=src;
 696                         }
 697                         continue;
 698                     }
 699                 }
 700                 return prevBoundary;  // "no" or cc out of order
 701             }
 702         }
 703         return src;
 704     }
 705 
 706     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
             // Appends s to buffer. With doDecompose the whole of s is decomposed into buffer.
             // Otherwise s is taken as-is and only its leading run of characters with
             // non-zero combining class is merged (reordered) with the text already in buffer.
 707         int limit=s.length();
 708         if(limit==0) {
 709             return;
 710         }
 711         if(doDecompose) {
 712             decompose(s, 0, limit, buffer);
 713             return;
 714         }
 715         // Just merge the strings at the boundary.
 716         int c=Character.codePointAt(s, 0);
 717         int src=0;
 718         int firstCC, prevCC, cc;
 719         firstCC=prevCC=cc=getCC(getNorm16(c));
             // Advance src past the leading code points with cc!=0.
             // prevCC ends up as the cc of the last code point in that run.
 720         while(cc!=0) {
 721             prevCC=cc;
 722             src+=Character.charCount(c);
 723             if(src>=limit) {
 724                 break;
 725             }
 726             c=Character.codePointAt(s, src);
 727             cc=getCC(getNorm16(c));
 728         };
             // Merge the leading run with reordering (a no-op when src==0),
             // then append the remainder without reordering.
 729         buffer.append(s, 0, src, firstCC, prevCC);
 730         buffer.append(s, src, limit);
 731     }
 732 
 733     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 734     // doCompose: normalize
 735     // !doCompose: isNormalized (buffer must be empty and initialized)
 736     public boolean compose(CharSequence s, int src, int limit,
 737                            boolean onlyContiguous,
 738                            boolean doCompose,
 739                            ReorderingBuffer buffer) {

 740         int minNoMaybeCP=minCompNoMaybeCP;
 741 
 742         /*
 743          * prevBoundary points to the last character before the current one
 744          * that has a composition boundary before it with ccc==0 and quick check "yes".
 745          * Keeping track of prevBoundary saves us looking for a composition boundary
 746          * when we find a "no" or "maybe".
 747          *
 748          * When we back out from prevSrc back to prevBoundary,
 749          * then we also remove those same characters (which had been simply copied
 750          * or canonically-order-inserted) from the ReorderingBuffer.
 751          * Therefore, at all times, the [prevBoundary..prevSrc[ source units
 752          * must correspond 1:1 to destination units at the end of the destination buffer.
 753          */
 754         int prevBoundary=src;
 755         int prevSrc;
 756         int c=0;
 757         int norm16=0;
 758 
 759         // only for isNormalized
 760         int prevCC=0;
 761 
 762         for(;;) {
 763             // count code units below the minimum or with irrelevant data for the quick check
 764             for(prevSrc=src; src!=limit;) {
 765                 if( (c=s.charAt(src))<minNoMaybeCP ||
 766                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 767                 ) {
 768                     ++src;
 769                 } else if(!UTF16.isSurrogate((char)c)) {


 770                     break;
 771                 } else {
 772                     char c2;
 773                     if(UTF16Plus.isSurrogateLead(c)) {
 774                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {

 775                             c=Character.toCodePoint((char)c, c2);
 776                         }
 777                     } else /* trail surrogate */ {
 778                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
 779                             --src;
 780                             c=Character.toCodePoint(c2, (char)c);
 781                         }
 782                     }
 783                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
 784                         src+=Character.charCount(c);
 785                     } else {
 786                         break;
 787                     }
 788                 }
 789             }
 790             // copy these code units all at once
 791             if(src!=prevSrc) {
 792                 if(src==limit) {
 793                     if(doCompose) {
 794                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
 795                     }
 796                     break;









 797                 }
 798                 // Set prevBoundary to the last character in the quick check loop.
 799                 prevBoundary=src-1;
 800                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
 801                     Character.isHighSurrogate(s.charAt(prevBoundary-1))
 802                 ) {
 803                     --prevBoundary;



 804                 }
 805                 if(doCompose) {
 806                     // The last "quick check yes" character is excluded from the
 807                     // flush-and-append call in case it needs to be modified.
 808                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
 809                     buffer.append(s, prevBoundary, src);
 810                 } else {
 811                     prevCC=0;
 812                 }
 813                 // The start of the current character (c).
 814                 prevSrc=src;
 815             } else if(src==limit) {
 816                 break;








 817             }
 818 
 819             src+=Character.charCount(c);
 820             /*
 821              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 822              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
 823              * or has ccc!=0.
 824              * Check for Jamo V/T, then for regular characters.
 825              * c is not a Hangul syllable or Jamo L because those have "yes" properties.
 826              */
 827             if(isJamoVT(norm16) && prevBoundary!=prevSrc) {






 828                 char prev=s.charAt(prevSrc-1);
 829                 boolean needToDecompose=false;
 830                 if(c<Hangul.JAMO_T_BASE) {
 831                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
 832                     prev-=Hangul.JAMO_L_BASE;
 833                     if(prev<Hangul.JAMO_L_COUNT) {
 834                         if(!doCompose) {

 835                             return false;
 836                         }
 837                         char syllable=(char)
 838                             (Hangul.HANGUL_BASE+
 839                              (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
 840                              Hangul.JAMO_T_COUNT);
 841                         char t;
 842                         if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
 843                             ++src;
 844                             syllable+=t;  // The next character was a Jamo T.
 845                             prevBoundary=src;
 846                             buffer.setLastChar(syllable);













 847                             continue;
 848                         }
 849                         // If we see L+V+x where x!=T then we drop to the slow path,
 850                         // decompose and recompose.
 851                         // This is to deal with NFKC finding normal L and V but a
 852                         // compatibility variant of a T. We need to either fully compose that
 853                         // combination here (which would complicate the code and may not work
 854                         // with strange custom data) or use the slow path -- or else our replacing
 855                         // two input characters (L+V) with one output character (LV syllable)
 856                         // would violate the invariant that [prevBoundary..prevSrc[ has the same
 857                         // length as what we appended to the buffer since prevBoundary.
 858                         needToDecompose=true;
 859                     }
 860                 } else if(Hangul.isHangulWithoutJamoT(prev)) {
 861                     // c is a Jamo Trailing consonant,
 862                     // compose with previous Hangul LV that does not contain a Jamo T.
 863                     if(!doCompose) {
 864                         return false;
 865                     }
 866                     buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
 867                     prevBoundary=src;
 868                     continue;
 869                 }
 870                 if(!needToDecompose) {
 871                     // The Jamo V/T did not compose into a Hangul syllable.
 872                     if(doCompose) {
 873                         buffer.append((char)c);
 874                     } else {
 875                         prevCC=0;
 876                     }


 877                     continue;
 878                 }
 879             }
 880             /*
 881              * Source buffer pointers:
 882              *
 883              *  all done      quick check   current char  not yet
 884              *                "yes" but     (c)           processed
 885              *                may combine
 886              *                forward
 887              * [-------------[-------------[-------------[-------------[
 888              * |             |             |             |             |
 889              * orig. src     prevBoundary  prevSrc       src           limit
 890              *
 891              *
 892              * Destination buffer pointers inside the ReorderingBuffer:
 893              *
 894              *  all done      might take    not filled yet
 895              *                characters for
 896              *                reordering
 897              * [-------------[-------------[-------------[
 898              * |             |             |             |
 899              * start         reorderStart  limit         |
 900              *                             +remainingCap.+
 901              */
 902             if(norm16>=MIN_YES_YES_WITH_CC) {
 903                 int cc=norm16&0xff;  // cc!=0
 904                 if( onlyContiguous &&  // FCC
 905                     (doCompose ? buffer.getLastCC() : prevCC)==0 &&
 906                     prevBoundary<prevSrc &&
 907                     // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
 908                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
 909                     // passed the quick check "yes && ccc==0" test.
 910                     // Check whether the last character was a "yesYes" or a "yesNo".
 911                     // If a "yesNo", then we get its trailing ccc from its
 912                     // mapping and check for canonical order.
 913                     // All other cases are ok.
 914                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
 915                 ) {
 916                     // Fails FCD test, need to decompose and contiguously recompose.
 917                     if(!doCompose) {
 918                         return false;
 919                     }
 920                 } else if(doCompose) {
 921                     buffer.append(c, cc);
 922                     continue;
 923                 } else if(prevCC<=cc) {
 924                     prevCC=cc;
 925                     continue;
 926                 } else {
 927                     return false;






 928                 }
 929             } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {








 930                 return false;
 931             }
 932 
 933             /*
 934              * Find appropriate boundaries around this character,
 935              * decompose the source text from between the boundaries,
 936              * and recompose it.
 937              *
 938              * We may need to remove the last few characters from the ReorderingBuffer
 939              * to account for source text that was copied or appended
 940              * but needs to take part in the recomposition.
 941              */
 942 
 943             /*
 944              * Find the last composition boundary in [prevBoundary..src[.
 945              * It is either the decomposition of the current character (at prevSrc),
 946              * or prevBoundary.
 947              */
 948             if(hasCompBoundaryBefore(c, norm16)) {
 949                 prevBoundary=prevSrc;
 950             } else if(doCompose) {
 951                 buffer.removeSuffix(prevSrc-prevBoundary);
 952             }
 953 
 954             // Find the next composition boundary in [src..limit[ -
 955             // modifies src to point to the next starter.
 956             src=findNextCompBoundary(s, src, limit);
 957 
 958             // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.







 959             int recomposeStartIndex=buffer.length();
 960             decomposeShort(s, prevBoundary, src, buffer);





 961             recompose(buffer, recomposeStartIndex, onlyContiguous);
 962             if(!doCompose) {
 963                 if(!buffer.equals(s, prevBoundary, src)) {
 964                     return false;
 965                 }
 966                 buffer.remove();
 967                 prevCC=0;
 968             }
 969 
 970             // Move to the next starter. We never need to look back before this point again.
 971             prevBoundary=src;
 972         }
 973         return true;
 974     }
 975 
 976     /**
 977      * Very similar to compose(): Make the same changes in both places if relevant.
          * Read-only scan of s[src..limit[ that reports how far the text passes the
          * composition quick check; this method writes to no buffer.
 978      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 979      * !doSpan: quickCheck
          *
          * @param s the text to check
          * @param src start index of the range to check
          * @param limit end index (exclusive) of the range to check
          * @param onlyContiguous true to also apply the FCC test: a character with ccc!=0
          *        fails if the trailing ccc of the preceding quick-check-"yes" character
          *        is greater than its own ccc
          * @param doSpan true to return at the first "maybe" character (with the boundary
          *        before it); false to record "maybe" in bit 0 and keep scanning
 980      * @return bits 31..1: end index of the quick-check-"yes" span (==limit if "yes"
 981      *         or "maybe") and bit 0: set if "maybe"; otherwise, if the span end&lt;limit
 982      *         then the quick check result is "no"
 983      */
 984     public int composeQuickCheck(CharSequence s, int src, int limit,
 985                                  boolean onlyContiguous, boolean doSpan) {
 986         int qcResult=0;
 987         int minNoMaybeCP=minCompNoMaybeCP;
 988 
 989         /*
 990          * prevBoundary points to the last character before the current one
 991          * that has a composition boundary before it with ccc==0 and quick check "yes".
 992          */
 993         int prevBoundary=src;
 994         int prevSrc;
 995         int c=0;
 996         int norm16=0;
 997         int prevCC=0;
 998 
 999         for(;;) {
1000             // count code units below the minimum or with irrelevant data for the quick check
1001             for(prevSrc=src;;) {




1002                 if(src==limit) {
1003                     return (src<<1)|qcResult;  // "yes" or "maybe"
1004                 }
1005                 if( (c=s.charAt(src))<minNoMaybeCP ||
1006                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1007                 ) {
1008                     ++src;
1009                 } else if(!UTF16.isSurrogate((char)c)) {


1010                     break;
1011                 } else {
                          // c is a surrogate code unit: assemble the full code point if it is
                          // paired (stepping src back for a trail unit) and look up its norm16.
1012                     char c2;
1013                     if(UTF16Plus.isSurrogateLead(c)) {
1014                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {

1015                             c=Character.toCodePoint((char)c, c2);
1016                         }
1017                     } else /* trail surrogate */ {
1018                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1019                             --src;
1020                             c=Character.toCodePoint(c2, (char)c);
1021                         }
1022                     }
1023                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1024                         src+=Character.charCount(c);
1025                     } else {
1026                         break;
1027                     }
1028                 }
1029             }
1030             if(src!=prevSrc) {
1031                 // Set prevBoundary to the last character in the quick check loop.
1032                 prevBoundary=src-1;
1033                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1034                         Character.isHighSurrogate(s.charAt(prevBoundary-1))
1035                 ) {
1036                     --prevBoundary;
1037                 }
1038                 prevCC=0;
1039                 // The start of the current character (c).
1040                 prevSrc=src;














1041             }
1042 
1043             src+=Character.charCount(c);
1044             /*
1045              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1046              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1047              * or has ccc!=0.
1048              */
1049             if(isMaybeOrNonZeroCC(norm16)) {
1050                 int cc=getCCFromYesOrMaybe(norm16);
1051                 if( onlyContiguous &&  // FCC
1052                     cc!=0 &&
1053                     prevCC==0 &&
1054                     prevBoundary<prevSrc &&
1055                     // prevCC==0 && prevBoundary<prevSrc tell us that
1056                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1057                     // passed the quick check "yes && ccc==0" test.
1058                     // Check whether the last character was a "yesYes" or a "yesNo".
1059                     // If a "yesNo", then we get its trailing ccc from its
1060                     // mapping and check for canonical order.
1061                     // All other cases are ok.
1062                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1063                 ) {
1064                     // Fails FCD test. Falls through to the "no" return below.
1065                 } else if(prevCC<=cc || cc==0) {
1066                     prevCC=cc;
1067                     if(norm16<MIN_YES_YES_WITH_CC) {
                              // Treated as "maybe": remember it (quickCheck) or stop here (span).
1068                         if(!doSpan) {
1069                             qcResult=1;








1070                         } else {
1071                             return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
1072                         }

1073                     }




1074                     continue;
1075                 }
1076             }

1077             return prevBoundary<<1;  // "no"
1078         }
1079     }
1080 
         /**
          * Appends s to the text already held in buffer, re-composing around the seam.
          *
          * <p>If the buffer is not empty and findNextCompBoundary(s, 0, limit) is not 0,
          * the buffer's tail starting at findPreviousCompBoundary() is removed, joined with
          * s[0..firstStarterInSrc[ in a temporary StringBuilder, and that "middle" piece is
          * composed back into the buffer. The middle piece is always composed
          * (compose() is called with doCompose==true), independent of the doCompose argument.
          * The rest of s is then composed or appended as-is.
          *
          * @param s the text to append
          * @param doCompose true to compose the remainder of s into the buffer;
          *        false to append the remainder unchanged
          * @param onlyContiguous passed through to compose()
          * @param buffer the destination; it is modified in place
          */
1081     public void composeAndAppend(CharSequence s,
1082                                  boolean doCompose,
1083                                  boolean onlyContiguous,
1084                                  ReorderingBuffer buffer) {
1085         int src=0, limit=s.length();
1086         if(!buffer.isEmpty()) {
1087             int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1088             if(0!=firstStarterInSrc) {
                     // s does not begin at a composition boundary: pull the tail of the
                     // buffer back out and compose it together with the head of s.
1089                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1090                                                                buffer.length());
                     // Capacity: both pieces plus 16 spare chars.
1091                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1092                                                        firstStarterInSrc+16);
1093                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1094                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1095                 middle.append(s, 0, firstStarterInSrc);
1096                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1097                 src=firstStarterInSrc;
1098             }
1099         }
1100         if(doCompose) {
1101             compose(s, src, limit, onlyContiguous, true, buffer);
1102         } else {
1103             buffer.append(s, src, limit);
1104         }
1105     }
1106 
1107     // Dual functionality:
1108     // buffer!=null: normalize
1109     // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
         //
         // Works on s[src..limit[.
         // Returns prevBoundary for a quick check "no" when buffer==null;
         // otherwise returns src once the main loop has ended.
1110     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1111         // Note: In this function we use buffer->appendZeroCC() because we track
1112         // the lead and trail combining classes here, rather than leaving it to
1113         // the ReorderingBuffer.
1114         // The exception is the call to decomposeShort() which uses the buffer
1115         // in the normal way.
1116 
1117         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1118         // Similar to the prevBoundary in the compose() implementation.
1119         int prevBoundary=src;
1120         int prevSrc;
1121         int c=0;
1122         int prevFCD16=0;
1123         int fcd16=0;
1124 
1125         for(;;) {
1126             // count code units with lccc==0
1127             for(prevSrc=src; src!=limit;) {
1128                 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
                         // Store ~c (a negative value) as a marker; the real fcd16 is
                         // fetched later only if it is needed (see prevFCD16<0 below).
1129                     prevFCD16=~c;
1130                     ++src;
1131                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1132                     prevFCD16=0;
1133                     ++src;
1134                 } else {
1135                     if(UTF16.isSurrogate((char)c)) {
1136                         char c2;
1137                         if(UTF16Plus.isSurrogateLead(c)) {
1138                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1139                                 c=Character.toCodePoint((char)c, c2);
1140                             }
1141                         } else /* trail surrogate */ {
1142                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1143                                 --src;
1144                                 c=Character.toCodePoint(c2, (char)c);
1145                             }
1146                         }
1147                     }
1148                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1149                         prevFCD16=fcd16;
1150                         src+=Character.charCount(c);
1151                     } else {
1152                         break;
1153                     }
1154               }
1155             }
1156             // copy these code units all at once
1157             if(src!=prevSrc) {
1158                 if(src==limit) {
1159                     if(buffer!=null) {
1160                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1161                     }
1162                     break;
1163                 }
1164                 prevBoundary=src;
1165                 // We know that the previous character's lccc==0.
1166                 if(prevFCD16<0) {
1167                     // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1168                     int prev=~prevFCD16;
1169                     prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);



1170                     if(prevFCD16>1) {
1171                         --prevBoundary;
1172                     }

1173                 } else {
1174                     int p=src-1;
1175                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1176                         Character.isHighSurrogate(s.charAt(p-1))
1177                     ) {
1178                         --p;
1179                         // Need to fetch the previous character's FCD value because
1180                         // prevFCD16 was just for the trail surrogate code point.
1181                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1182                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1183                     }
1184                     if(prevFCD16>1) {
1185                         prevBoundary=p;
1186                     }
1187                 }
1188                 if(buffer!=null) {
1189                     // The last lccc==0 character is excluded from the
1190                     // flush-and-append call in case it needs to be modified.
1191                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1192                     buffer.append(s, prevBoundary, src);


1211                 prevFCD16=fcd16;
1212                 continue;
1213             } else if(buffer==null) {
1214                 return prevBoundary;  // quick check "no"
1215             } else {
1216                 /*
1217                  * Back out the part of the source that we copied or appended
1218                  * already but is now going to be decomposed.
1219                  * prevSrc is set to after what was copied/appended.
1220                  */
1221                 buffer.removeSuffix(prevSrc-prevBoundary);
1222                 /*
1223                  * Find the part of the source that needs to be decomposed,
1224                  * up to the next safe boundary.
1225                  */
1226                 src=findNextFCDBoundary(s, src, limit);
1227                 /*
1228                  * The source text does not fulfill the conditions for FCD.
1229                  * Decompose and reorder a limited piece of the text.
1230                  */
1231                 decomposeShort(s, prevBoundary, src, buffer);
1232                 prevBoundary=src;
1233                 prevFCD16=0;
1234             }
1235         }
1236         return src;
1237     }
1238 
1239     // Note: hasDecompBoundary() could be implemented as aliases to
1240     // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1241     // at the cost of building the FCD trie for a decomposition normalizer.
         //
         // Tests for a decomposition boundary next to code point c:
         // before==true tests the boundary before c, before==false the boundary after c.
         // The loop only repeats for algorithmic mappings, re-testing the mapped code point.
1242     public boolean hasDecompBoundary(int c, boolean before) {
1243         for(;;) {
1244             if(c<minDecompNoCP) {
1245                 return true;
1246             }
1247             int norm16=getNorm16(c);
1248             if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1249                 return true;
1250             } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1251                 return false;  // ccc!=0
1252             } else if(isDecompNoAlgorithmic(norm16)) {
1253                 c=mapAlgorithmic(c, norm16);
1254             } else {
1255                 // c decomposes, get everything from the variable-length extra data
1256                 int firstUnit=extraData.charAt(norm16);
                     // Mapping length 0: report no boundary.
1257                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1258                     return false;








1259                 }
1260                 if(!before) {















1261                     // decomp after-boundary: same as hasFCDBoundaryAfter(),
1262                     // fcd16<=1 || trailCC==0
                         // (the trail cc is the high byte of firstUnit)
1263                     if(firstUnit>0x1ff) {
1264                         return false;  // trailCC>1
1265                     }
1266                     if(firstUnit<=0xff) {
1267                         return true;  // trailCC==0
1268                     }
1269                     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1270                 }
1271                 // true if leadCC==0 (hasFCDBoundaryBefore())
                     // The lead cc is the high byte of the optional word stored before firstUnit.
1272                 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1273             }
1274         }
1275     }

1276 
         /**
          * Tests whether there is a composition boundary before code point c.
          * Code points below minCompNoMaybeCP are accepted without a data lookup;
          * all others are decided by hasCompBoundaryBefore(c, norm16) using c's norm16.
          *
          * @param c the code point to test
          * @return true if a composition boundary lies before c
          */
1277     public boolean hasCompBoundaryBefore(int c) {
1278         return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));



1279     }
1280 
         // Simple range/equality tests on a norm16 value as returned by getNorm16().

         // minMaybeYes<=norm16<=JAMO_VT; recompose() uses this as "combines backward".
1281     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
         // norm16>=minMaybeYes: a "maybe" value or one with ccc!=0.
1282     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }

         // norm16 is the single value JAMO_VT (callers treat c as a Jamo V/T).
1283     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
         // norm16 is the single value minYesNo (decompose() treats c as a Hangul syllable).
1284     private boolean isHangul(int norm16) { return norm16==minYesNo; }




         // norm16<minNoNo: quick check "yes" for composition and ccc==0.
1285     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1286 
1287     // UBool isCompYes(uint16_t norm16) const {
1288     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1289     // }
1290     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1291     //     return norm16<minNoNo || minMaybeYes<=norm16;
1292     // }
1293     // private boolean hasZeroCCFromDecompYes(int norm16) {
1294     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1295     // }
1296     private boolean isDecompYesAndZeroCC(int norm16) {
             // Accepts three ranges: everything below minYesNo, the single JAMO_VT value,
             // and minMaybeYes..MIN_NORMAL_MAYBE_YES (inclusive).
1297         return norm16<minYesNo ||
1298                norm16==JAMO_VT ||
1299                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1300     }
1301 
1302     /**
1303      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1304      * the MaybeYes which combine-forward and have ccc=0.
1305      * (Standard Unicode 5.2 normalization does not have such characters.)
          *
          * @param norm16 the norm16 value to test
          * @return true if norm16&lt;minYesNo, or norm16 equals MIN_NORMAL_MAYBE_YES or JAMO_VT
1306      */
1307     private boolean isMostDecompYesAndZeroCC(int norm16) {
1308         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1309     }
1310 
         // norm16>=limitNoNo: callers that get true map c with mapAlgorithmic() and
         // look the result up again.
1311     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1312 
1313     // For use with isCompYes().
1314     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1315     // static uint8_t getCCFromYes(uint16_t norm16) {
1316     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1317     // }
1318     private int getCCFromNoNo(int norm16) {
             // norm16 is used as an index into extraData. If the unit at that index has the
             // MAPPING_HAS_CCC_LCCC_WORD flag, the unit just before it carries the ccc in
             // its low byte; without the flag the result is 0.
1319         if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1320             return extraData.charAt(norm16-1)&0xff;

1321         } else {
1322             return 0;
1323         }
1324     }
1325 
1326     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
         // Returns the trailing combining class (tccc) of the single character
         // s[cpStart..cpLimit[: 0 when its norm16<=minYesNo, otherwise the high byte of the
         // extraData unit at index norm16.
1327     int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1328         int c;
1329         if(cpStart==(cpLimit-1)) {
                 // One code unit: read it directly.
1330             c=s.charAt(cpStart);
1331         } else {
                 // More than one code unit: read the code point at cpStart.
1332             c=Character.codePointAt(s, cpStart);
1333         }
1334         int prevNorm16=getNorm16(c);
1335         if(prevNorm16<=minYesNo) {
1336             return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
1337         } else {
1338             return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo

1339         }
1340     }
1341 
1342     // Requires algorithmic-NoNo.
         // Returns c shifted by the delta that norm16 encodes:
         // delta = norm16-(minMaybeYes-MAX_DELTA-1).
1343     private int mapAlgorithmic(int c, int norm16) {
1344         return c+norm16-(minMaybeYes-MAX_DELTA-1);
1345     }
1346 
1347     // Requires minYesNo<norm16<limitNoNo.
1348     // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1349 
1350     /**
          * Converts the norm16 of a character that does not decompose into the index of
          * its compositions list.
          * Returns -1 when norm16 is 0 or at least MIN_NORMAL_MAYBE_YES.
          *
          * @param norm16 the norm16 value of the character
1351      * @return index into maybeYesCompositions, or -1
1352      */
1353     private int getCompositionsListForDecompYes(int norm16) {
1354         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1355             return -1;
1356         } else {
                 // norm16>=minMaybeYes: the index is simply norm16-minMaybeYes.
1357             if((norm16-=minMaybeYes)<0) {
1358                 // norm16<minMaybeYes: index into extraData which is a substring at
1359                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1360                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1361                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1362             }
1363             return norm16;
1364         }
1365     }
1366 
1367     /**
          * Computes where the compositions list of a composite starts: directly after
          * the composite's mapping (one first unit plus the mapping's code units).
          *
          * @param norm16 the norm16 value of the composite; used as an index into extraData
1368      * @return index into maybeYesCompositions
1369      */
1370     private int getCompositionsListForComposite(int norm16) {
1371         // composite has both mapping & compositions list
1372         int firstUnit=extraData.charAt(norm16);
1373         return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
1374             1+  // +1 to skip the first unit with the mapping length

1375             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1376     }
1377 
1378     // Decompose a short piece of text which is likely to contain characters that
1379     // fail the quick check loop and/or where the quick check loop's overhead
1380     // is unlikely to be amortized.
1381     // Called by the compose() and makeFCD() implementations.
1382     // Public in Java for collation implementation code.
         //
         // Walks s[src..limit[ one code point at a time and passes each code point,
         // together with its norm16, to decompose(), which appends to buffer.
1383     public void decomposeShort(CharSequence s, int src, int limit,


1384                                ReorderingBuffer buffer) {
1385         while(src<limit) {
1386             int c=Character.codePointAt(s, src);







                 // Advance by one or two code units, depending on the code point.
1387             src+=Character.charCount(c);
1388             decompose(c, getNorm16(c), buffer);


1389         }
1390     }
1391 
         /**
          * Appends the decomposition of one code point to the buffer.
          * Four cases, selected by norm16: the character does not decompose (appended
          * with its combining class), it is a Hangul syllable (Hangul.decompose()),
          * it has an algorithmic mapping (map it and look again), or its mapping is
          * read from extraData and appended with its lead and trail combining classes.
          *
          * @param c the code point to decompose
          * @param norm16 the norm16 value of c
          * @param buffer the destination buffer
          */
1392     private void decompose(int c, int norm16,
1393                            ReorderingBuffer buffer) {
1394         // Only loops for 1:1 algorithmic mappings.
1395         for(;;) {
1396             // get the decomposition and the lead and trail cc's
1397             if(isDecompYes(norm16)) {
1398                 // c does not decompose
1399                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1400             } else if(isHangul(norm16)) {
1401                 // Hangul syllable: decompose algorithmically
1402                 Hangul.decompose(c, buffer);
1403             } else if(isDecompNoAlgorithmic(norm16)) {
                     // Replace c by its mapped code point and run the tests again for it.
1404                 c=mapAlgorithmic(c, norm16);
1405                 norm16=getNorm16(c);
1406                 continue;






1407             } else {
1408                 // c decomposes, get everything from the variable-length extra data
1409                 int firstUnit=extraData.charAt(norm16);

                     // firstUnit: mapping length in the MAPPING_LENGTH_MASK bits,
                     // trail cc in the high byte.
1410                 int length=firstUnit&MAPPING_LENGTH_MASK;
1411                 int leadCC, trailCC;
1412                 trailCC=firstUnit>>8;
1413                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                         // The optional word before firstUnit has the lead cc in its high byte.
1414                     leadCC=extraData.charAt(norm16-1)>>8;
1415                 } else {
1416                     leadCC=0;
1417                 }
1418                 ++norm16;  // skip over the firstUnit
1419                 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1420             }
1421             return;
1422         }
1423     }
1424 
1425     /**
1426      * Finds the recomposition result for
1427      * a forward-combining "lead" character,
1428      * specified with a pointer to its compositions list,
1429      * and a backward-combining "trail" character.
1430      *
1431      * <p>If the lead and trail characters combine, then this function returns
1432      * the following "compositeAndFwd" value:
1433      * <pre>
1434      * Bits 21..1  composite character
1435      * Bit      0  set if the composite is a forward-combining starter
1436      * </pre>
1437      * otherwise it returns -1.
1438      *
1439      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1440      * encoded as either pairs or triples of 16-bit units.
1441      * The last entry has the high bit of its first unit set.
1442      *
1443      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1444      * A linear search is used.
1445      *
1446      * <p>See normalizer2impl.h for a more detailed description
1447      * of the compositions list format.
1448      */
1449     private static int combine(String compositions, int list, int trail) {
1450         int key1, firstUnit;
1451         if(trail<COMP_1_TRAIL_LIMIT) {
1452             // trail character is 0..33FF
1453             // result entry may have 2 or 3 units
1454             key1=(trail<<1);
1455             while(key1>(firstUnit=compositions.charAt(list))) {
1456                 list+=2+(firstUnit&COMP_1_TRIPLE);
1457             }
1458             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1459                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1460                     return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1461                 } else {
1462                     return compositions.charAt(list+1);
1463                 }
1464             }
1465         } else {
1466             // trail character is 3400..10FFFF
1467             // result entry has 3 units
1468             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1469             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1470             int secondUnit;
1471             for(;;) {
1472                 if(key1>(firstUnit=compositions.charAt(list))) {
1473                     list+=2+(firstUnit&COMP_1_TRIPLE);
1474                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1475                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1476                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1477                             break;
1478                         } else {
1479                             list+=3;
1480                         }


1516         int cc, prevCC;
1517         boolean starterIsSupplementary;
1518 
1519         // Some of the following variables are not used until we have a forward-combining starter
1520         // and are only initialized now to avoid compiler warnings.
1521         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1522         starter=-1;
1523         starterIsSupplementary=false;
1524         prevCC=0;
1525 
1526         for(;;) {
1527             c=sb.codePointAt(p);
1528             p+=Character.charCount(c);
1529             norm16=getNorm16(c);
1530             cc=getCCFromYesOrMaybe(norm16);
1531             if( // this character combines backward and
1532                 isMaybe(norm16) &&
1533                 // we have seen a starter that combines forward and
1534                 compositionsList>=0 &&
1535                 // the backward-combining character is not blocked
1536                 (prevCC<cc || prevCC==0)) {

1537                 if(isJamoVT(norm16)) {
1538                     // c is a Jamo V/T, see if we can compose it with the previous character.
1539                     if(c<Hangul.JAMO_T_BASE) {
1540                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1541                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1542                         if(prev<Hangul.JAMO_L_COUNT) {
1543                             pRemove=p-1;
1544                             char syllable=(char)
1545                                 (Hangul.HANGUL_BASE+
1546                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1547                                  Hangul.JAMO_T_COUNT);
1548                             char t;
1549                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1550                                 ++p;
1551                                 syllable+=t;  // The next character was a Jamo T.
1552                             }
1553                             sb.setCharAt(starter, syllable);
1554                             // remove the Jamo V/T
1555                             sb.delete(pRemove, p);
1556                             p=pRemove;


1637                         starterIsSupplementary=true;
1638                         starter=p-2;
1639                     }
1640                 }
1641             } else if(onlyContiguous) {
1642                 // FCC: no discontiguous compositions; any intervening character blocks.
1643                 compositionsList=-1;
1644             }
1645         }
1646         buffer.flush();
1647     }
1648 
1649     /**
1650      * Does c have a composition boundary before it?
1651      * True if its decomposition begins with a character that has
1652      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1653      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1654      * (isCompYesAndZeroCC()) so we need not decompose.
          *
          * @param c      the code point to test; replaced by its algorithmic mapping while looping
          * @param norm16 the normalization data value that belongs to c
          * @return true if the first code point of c's (possibly algorithmic) mapping
          *         satisfies isCompYesAndZeroCC(); false otherwise, including when the
          *         mapping is empty or has a non-zero lead combining class
1655      */
1656     private boolean hasCompBoundaryBefore(int c, int norm16) {
1657         for(;;) {  // loops only through the algorithmic-mapping branch; every other branch returns
1658             if(isCompYesAndZeroCC(norm16)) {
1659                 return true;
1660             } else if(isMaybeOrNonZeroCC(norm16)) {
1661                 return false;
1662             } else if(isDecompNoAlgorithmic(norm16)) {
                     // Replace c by its algorithmic mapping and re-test with that code point's data.
1663                 c=mapAlgorithmic(c, norm16);
1664                 norm16=getNorm16(c);
1665             } else {
1666                 // c decomposes, get everything from the variable-length extra data
1667                 int firstUnit=extraData.charAt(norm16);
1668                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1669                     return false;  // mapping length is 0: there is no first character to inspect
1670                 }
                     // When the optional ccc/lccc word is present it sits just before firstUnit;
                     // its upper byte is tested here.
1671                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
1672                     return false;  // non-zero leadCC
1673                 }
                     // The mapping text starts right after firstUnit; test its first code point.
1674                 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));

1675             }



1676         }







1677     }
1678 
1679     private int findPreviousCompBoundary(CharSequence s, int p) {
             // Scans backward from index p, one code point at a time, and returns the start
             // index of the nearest code point for which hasCompBoundaryBefore(c) is true,
             // or 0 if the beginning of s is reached first.
1680         while(p>0) {
1681             int c=Character.codePointBefore(s, p);




1682             p-=Character.charCount(c);  // p now points at the start of c
1683             if(hasCompBoundaryBefore(c)) {
1684                 break;
1685             }
1686             // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1687             // but that's probably not worth the extra cost.
1688         }
1689         return p;
1690     }
1691 
1692     private int findNextCompBoundary(CharSequence s, int p, int limit) {
             // Scans forward from index p and returns the index of the first code point for
             // which hasCompBoundaryBefore(c, norm16) is true; if none is found, returns the
             // first index reached that is not below limit.
1693         while(p<limit) {
1694             int c=Character.codePointAt(s, p);
1695             int norm16=normTrie.get(c);
1696             if(hasCompBoundaryBefore(c, norm16)) {
1697                 break;  // p still points at the start of c
1698             }
1699             p+=Character.charCount(c);



1700         }
1701         return p;
1702     }
1703 

1704     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
             // Scans forward from index p and returns the index of the first code point that
             // is either below MIN_CCC_LCCC_CP or whose getFCD16() value has no bits set
             // above the low 8; if none is found, returns the first index not below limit.
1705         while(p<limit) {
1706             int c=Character.codePointAt(s, p);
                 // NOTE(review): <=0xff presumably means "lead combining class is 0",
                 // assuming getFCD16() packs lccc into bits 15..8 - confirm against getFCD16().
1707             if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {

1708                 break;
1709             }
1710             p+=Character.charCount(c);



1711         }
1712         return p;
1713     }
1714 
1715     /**
1716      * Get the canonical decomposition
1717      * sherman  for ComposedCharIter
1718      */
1719     public static int getDecompose(int chars[], String decomps[]) {
1720         Normalizer2 impl = Normalizer2.getNFDInstance();
1721 
1722         int length=0;
1723         int norm16 = 0;
1724         int ch = -1;
1725         int i = 0;
1726 
1727         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1728             //TBD !!!! the hack code heres save us about 50ms for startup
1729             //need a better solution/lookup
1730             if (ch == 0x30ff)


1973                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
1974                 // [current]..[p]         1 code point (c, c2) with cc
1975 
1976                 // move the code units in between up
1977                 r=p;
1978                 do {
1979                     source[--r]=source[--current];
1980                 } while (back!=current);
1981             }
1982         }
1983 
1984         // insert (c1, c2)
1985         source[current] = c1;
1986         if (c2!=0) {
1987             source[(current+1)] = c2;
1988         }
1989 
1990         // we know the cc of the last code point
1991         return trailCC;
1992     }
1993 
1994     /**
1995      * merge two UTF-16 string parts together
1996      * to canonically order (order by combining classes) their concatenation
1997      *
1998      * the two strings may already be adjacent, so that the merging is done
1999      * in-place; if the two strings are not adjacent, then the buffer holding the
2000      * first one must be large enough
2001      * the second string may or may not be ordered in itself
2002      *
2003      * before: [start]..[current] is already ordered, and
2004      *         [next]..[limit]    may be ordered in itself, but
2005      *                          is not in relation to [start..current[
2006      * after: [start..current+(limit-next)[ is ordered
2007      *
2008      * the algorithm is a simple bubble-sort that takes the characters from
2009      * src[next++] and inserts them in correct combining class order into the
2010      * preceding part of the string
2011      *
2012      * since this function is called much less often than the single-code point
2013      * insertOrdered(), it just uses that for easier maintenance


2057 
2058             if(ncArgs.next==ncArgs.limit) {
2059                 // we know the cc of the last code point
2060                 return trailCC;
2061             } else {
2062                 if(!adjacent) {
2063                     // copy the second string part
2064                     do {
2065                         source[current++]=data[ncArgs.next++];
2066                     } while(ncArgs.next!=ncArgs.limit);
2067                     ncArgs.limit=current;
2068                 }
2069                 PrevArgs prevArgs = new PrevArgs();
2070                 prevArgs.src   = data;
2071                 prevArgs.start = start;
2072                 prevArgs.current =  ncArgs.limit;
2073                 return getPrevCC(prevArgs);
2074             }
2075 
2076     }
2077 
2078     private static final class PrevArgs{  // mutable in/out argument holder for getPrevCC()
2079         char[] src;    // UTF-16 code units that getPrevCC() reads backward
2080         int start;     // lowest index getPrevCC() may look at when pairing surrogates
2081         int current;   // in: index just past the unit to read; out: index of the first unit consumed
2082         char c1;       // out: the code unit read first (the trail surrogate of a pair)
2083         char c2;       // out: the lead surrogate when a pair was consumed, otherwise 0
2084     }
2085 
2086     private static final class NextCCArgs{  // mutable in/out argument holder for getNextCC()
2087         char[] source;  // UTF-16 code units that getNextCC() reads forward
2088         int next;       // in: index of the unit to read; out: index just past the units consumed
2089         int limit;      // index at which getNextCC() stops looking for a trail surrogate
2090         char c1;        // out: the code unit read first (the lead surrogate of a pair)
2091         char c2;        // out: the trail surrogate when a pair was consumed, otherwise 0
2092     }



2093 















2094     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
             // Steps args.current back over one code point (one unit, or a surrogate pair)
             // and returns that code point's combining class. The units read are left in
             // args.c1/args.c2. Returns 0 for units below MIN_CCC_LCCC_CP and for
             // unpaired surrogates.
2095         args.c1=args.src[--args.current];
2096         args.c2=0;
2097 
2098         if (args.c1 < MIN_CCC_LCCC_CP) {
2099             return 0;  // shortcut: no combining-class lookup below this threshold
2100         } else if (UTF16.isLeadSurrogate(args.c1)) {
2101             /* unpaired first surrogate */
2102             return 0;
2103         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2104             return UCharacter.getCombiningClass(args.c1);  // ordinary single-unit character
2105         } else if (args.current!=args.start &&
2106                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
                 // c1 is a trail surrogate preceded by a lead surrogate: consume the pair.
2107             --args.current;
2108             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2109         } else {
2110             /* unpaired second surrogate */
2111             args.c2=0;  // undo the tentative assignment made in the condition above
2112             return 0;
2113         }
2114     }
2115 
2116     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
             // Advances args.next over one code point (one unit, or a surrogate pair) and
             // returns that code point's combining class. The units read are left in
             // args.c1/args.c2. Returns 0 for unpaired surrogates.
2117         args.c1=args.source[args.next++];
2118         args.c2=0;
2119 
2120         if (UTF16.isTrailSurrogate(args.c1)) {
2121             /* unpaired second surrogate */
2122             return 0;
2123         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2124             return UCharacter.getCombiningClass(args.c1);  // ordinary single-unit character
2125         } else if (args.next!=args.limit &&
2126                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
                 // c1 is a lead surrogate followed by a trail surrogate: consume the pair.
2127             ++args.next;
2128             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2129         } else {
2130             /* unpaired first surrogate */
2131             args.c2=0;  // undo the tentative assignment made in the condition above
2132             return 0;
2133         }

2134     }
2135 
2136     private VersionInfo dataVersion;
2137 
2138     // Code point thresholds for quick check codes.
2139     private int minDecompNoCP;
2140     private int minCompNoMaybeCP;

2141 
2142     // Norm16 value thresholds for quick check combinations and types of extra data.
2143     private int minYesNo;
2144     private int minYesNoMappingsOnly;
2145     private int minNoNo;



2146     private int limitNoNo;

2147     private int minMaybeYes;
2148 
2149     private Trie2_16 normTrie;
2150     private String maybeYesCompositions;
2151     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2152     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2153     private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F
2154 
2155 }
   1 /*
   2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */

  32 package sun.text.normalizer;
  33 
  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 import java.text.Normalizer;
  37 
  38 // Original filename in ICU4J: Normalizer2Impl.java
  39 public final class NormalizerImpl {

  40     public static final class Hangul {
             // Constants and static helpers for arithmetic Hangul syllable <-> Jamo mapping.
  41         /* Korean Hangul and Jamo constants */
  42         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
  43         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
  44         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo; trail index 0 means "no trail" (see decompose) */
  45 
  46         public static final int HANGUL_BASE=0xac00;
  47         public static final int HANGUL_END=0xd7a3;      /* last syllable, inclusive: equals HANGUL_LIMIT-1 */
  48 
  49         public static final int JAMO_L_COUNT=19;
  50         public static final int JAMO_V_COUNT=21;
  51         public static final int JAMO_T_COUNT=28;
  52 
  53         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
  54         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
  55 
             /** Returns true if c lies in the range HANGUL_BASE..HANGUL_LIMIT-1. */
  56         public static boolean isHangul(int c) {
  57             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
  58         }
             /**
              * Returns true if c is a Hangul syllable whose trail index is 0,
              * that is, its offset from HANGUL_BASE is a multiple of JAMO_T_COUNT.
              */
  59         public static boolean isHangulLV(int c) {

  60             c-=HANGUL_BASE;
  61             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
  62         }
  63         
  64         /**
  65          * Decomposes c, which must be a Hangul syllable, into buffer
  66          * and returns the length of the decomposition (2 or 3).
              *
              * @param c      a Hangul syllable; the range is not checked here
              * @param buffer receives the lead, vowel and (if present) trail jamo, in that order
              * @return 2 if only lead and vowel were appended, 3 if a trail jamo was appended too
              * @throws InternalError wrapping any IOException thrown by buffer.append()
  67          */
  68         public static int decompose(int c, Appendable buffer) {
  69             try {
  70                 c-=HANGUL_BASE;
  71                 int c2=c%JAMO_T_COUNT;  // trail index; 0 means the syllable has no trail jamo
  72                 c/=JAMO_T_COUNT;        // c is now leadIndex*JAMO_V_COUNT+vowelIndex
  73                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
  74                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
  75                 if(c2==0) {
  76                     return 2;
  77                 } else {
  78                     buffer.append((char)(JAMO_T_BASE+c2));
  79                     return 3;
  80                 }
  81             } catch(IOException e) {
  82                 throw new InternalError(e);
  83             }
  84         }
  85     }
  86 
  87     /**
  88      * Writable buffer that takes care of canonical ordering.
  89      * Its Appendable methods behave like the C++ implementation's
  90      * appendZeroCC() methods.
  91      * <p>
  92      * If dest is a StringBuilder, then the buffer writes directly to it.
  93      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
  94      * until no further changes are necessary and whole segments are appended.
  95      * append() methods that take combining-class values always write to the StringBuilder.
  96      * Other append() methods flush and append to the Appendable.
  97      */
  98     public static final class ReorderingBuffer implements Appendable {
  99         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
                 // Creates a buffer that writes to dest. If dest is a StringBuilder it is used
                 // directly and lastCC/reorderStart are derived from its existing content;
                 // otherwise a private, initially empty StringBuilder holds intermediate text
                 // (destCapacity is not used in that case).
 100             impl=ni;
 101             app=dest;
 102             if(app instanceof StringBuilder) {
 103                 appIsStringBuilder=true;
 104                 str=(StringBuilder)dest;
 105                 // In Java, the constructor subsumes public void init(int destCapacity) {
 106                 str.ensureCapacity(destCapacity);
 107                 reorderStart=0;  // must be 0 while previousCC() scans the existing text below
 108                 if(str.length()==0) {
 109                     lastCC=0;
 110                 } else {
 111                     setIterator();
 112                     lastCC=previousCC();
 113                     // Set reorderStart after the last code point with cc<=1 if there is one.
 114                     if(lastCC>1) {
 115                         while(previousCC()>1) {}  // walk back; previousCC() also updates codePointLimit
 116                     }
 117                     reorderStart=codePointLimit;
 118                 }
 119             } else {
 120                 appIsStringBuilder=false;
 121                 str=new StringBuilder();
 122                 reorderStart=0;
 123                 lastCC=0;
 124             }
 125         }
 126 
 127         public boolean isEmpty() { return str.length()==0; }  // true if the working StringBuilder holds no text
 128         public int length() { return str.length(); }  // length of the working StringBuilder in UTF-16 code units
 129         public int getLastCC() { return lastCC; }  // cc recorded by the most recent append; 0 after flush/remove
 130 
 131         public StringBuilder getStringBuilder() { return str; }  // exposes the working StringBuilder itself, not a copy
 132 
 133         public boolean equals(CharSequence s, int start, int limit) {
                 // Compares the entire working StringBuilder with s[start, limit) unit by unit.
 134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 135         }
 136 





 137         public void append(int c, int cc) {
                 // Adds code point c with combining class cc: appended at the end when that
                 // keeps the order (lastCC<=cc) or when cc==0, otherwise inserted further back.
 138             if(lastCC<=cc || cc==0) {
 139                 str.appendCodePoint(c);
 140                 lastCC=cc;
 141                 if(cc<=1) {
 142                     reorderStart=str.length();  // nothing is reordered across a cc<=1 character
 143                 }
 144             } else {
 145                 insert(c, cc);
 146             }
 147         }

 148         // s must be in NFD, otherwise change the implementation.
             // Adds s[start, limit), whose first code point has combining class leadCC and
             // whose last has trailCC. Does nothing for an empty range.
 149         public void append(CharSequence s, int start, int limit,
 150                            int leadCC, int trailCC) {
 151             if(start==limit) {
 152                 return;
 153             }
 154             if(lastCC<=leadCC || leadCC==0) {
                     // The text can go at the end unchanged; only reorderStart needs updating.
 155                 if(trailCC<=1) {
 156                     reorderStart=str.length()+(limit-start);
 157                 } else if(leadCC<=1) {
 158                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 159                 }
 160                 str.append(s, start, limit);
 161                 lastCC=trailCC;
 162             } else {
                     // The first code point must be inserted further back; the remaining
                     // code points then go through append(c, cc) one at a time.
 163                 int c=Character.codePointAt(s, start);
 164                 start+=Character.charCount(c);
 165                 insert(c, leadCC);  // insert first code point
 166                 while(start<limit) {
 167                     c=Character.codePointAt(s, start);
 168                     start+=Character.charCount(c);
 169                     if(start<limit) {
 170                         // s must be in NFD, otherwise we need to use getCC().
 171                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
 172                     } else {
 173                         leadCC=trailCC;  // last code point: use the caller-supplied value
 174                     }
 175                     append(c, leadCC);
 176                 }
 177             }
 178         }

 179         // The following append() methods work like C++ appendZeroCC().
 180         // They assume that the cc or trailCC of their input is 0.
 181         // Most of them implement Appendable interface methods.
 182         @Override
 183         public ReorderingBuffer append(char c) {
                 // Appends one code unit without any reordering and resets lastCC/reorderStart,
                 // so text appended later is not reordered before it.
 184             str.append(c);
 185             lastCC=0;
 186             reorderStart=str.length();
 187             return this;
 188         }

 189         public void appendZeroCC(int c) {
                 // Code point variant of append(char): appends c, then resets lastCC/reorderStart.
 190             str.appendCodePoint(c);
 191             lastCC=0;
 192             reorderStart=str.length();
 193         }
 194         @Override

 195         public ReorderingBuffer append(CharSequence s) {
                 // Appends all of s without reordering; an empty s leaves lastCC and
                 // reorderStart untouched.
 196             if(s.length()!=0) {
 197                 str.append(s);
 198                 lastCC=0;
 199                 reorderStart=str.length();
 200             }
 201             return this;
 202         }
 203         @Override

 204         public ReorderingBuffer append(CharSequence s, int start, int limit) {
                 // Appends s[start, limit) without reordering; an empty range leaves lastCC
                 // and reorderStart untouched.
 205             if(start!=limit) {
 206                 str.append(s, start, limit);
 207                 lastCC=0;
 208                 reorderStart=str.length();
 209             }
 210             return this;
 211         }

 212         /**
 213          * Flushes from the intermediate StringBuilder to the Appendable,
 214          * if they are different objects.
 215          * Used after recomposition.
 216          * Must be called at the end when writing to a non-StringBuilder Appendable.
              * In both cases lastCC is reset to 0 and reorderStart is moved to the end
              * of the (possibly emptied) StringBuilder.
              *
              * @throws InternalError wrapping any IOException thrown by the Appendable
 217          */
 218         public void flush() {
 219             if(appIsStringBuilder) {
 220                 reorderStart=str.length();  // nothing to copy: str is the destination itself
 221             } else {
 222                 try {
 223                     app.append(str);
 224                     str.setLength(0);
 225                     reorderStart=0;
 226                 } catch(IOException e) {
 227                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 228                 }
 229             }
 230             lastCC=0;
 231         }

 232         /**
 233          * Flushes from the intermediate StringBuilder to the Appendable,
 234          * if they are different objects.
 235          * Then appends the new text to the Appendable or StringBuilder.
 236          * Normally used after quick check loops find a non-empty sequence.
              *
              * @param s     source text
              * @param start start index of the text to append, inclusive
              * @param limit end index of the text to append, exclusive
              * @return this buffer
              * @throws InternalError wrapping any IOException thrown by the Appendable
 237          */
 238         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
 239             if(appIsStringBuilder) {
 240                 str.append(s, start, limit);
 241                 reorderStart=str.length();
 242             } else {
 243                 try {
                         // Write the pending text first, then the new text, straight to the destination.
 244                     app.append(str).append(s, start, limit);
 245                     str.setLength(0);
 246                     reorderStart=0;
 247                 } catch(IOException e) {
 248                     throw new InternalError(e);  // Avoid declaring "throws IOException".
 249                 }
 250             }
 251             lastCC=0;
 252             return this;
 253         }

 254         public void remove() {
                 // Discards all text in the working StringBuilder and resets the reordering state.
 255             str.setLength(0);
 256             lastCC=0;
 257             reorderStart=0;
 258         }

 259         public void removeSuffix(int suffixLength) {
                 // Deletes the last suffixLength code units and resets the reordering state
                 // so that the remaining text is treated as final.
 260             int oldLength=str.length();
 261             str.delete(oldLength-suffixLength, oldLength);
 262             lastCC=0;
 263             reorderStart=str.length();
 264         }
 265 
 266         // Inserts c somewhere before the last character.
 267         // Requires 0<cc<lastCC which implies reorderStart<limit.
 268         private void insert(int c, int cc) {
 269             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
 270             // insert c at codePointLimit, after the character with prevCC<=cc
 271             if(c<=0xffff) {
 272                 str.insert(codePointLimit, (char)c);
 273                 if(cc<=1) {
 274                     reorderStart=codePointLimit+1;
 275                 }
 276             } else {
 277                 str.insert(codePointLimit, Character.toChars(c));
 278                 if(cc<=1) {


 284         private final NormalizerImpl impl;
 285         private final Appendable app;
 286         private final StringBuilder str;
 287         private final boolean appIsStringBuilder;
 288         private int reorderStart;
 289         private int lastCC;
 290 
 291         // private backward iterator
 292         private void setIterator() { codePointStart=str.length(); }  // position the backward iterator at the end of str
 293         private void skipPrevious() {  // Requires 0<codePointStart.
                 // Moves the iterator back by one code point without looking up its cc.
 294             codePointLimit=codePointStart;
 295             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 296         }
 297         private int previousCC() {  // Returns 0 if there is no previous character.
                 // Moves the iterator back by one code point and returns its combining class.
                 // Stops at reorderStart: there it returns 0 and leaves codePointStart unchanged.
 298             codePointLimit=codePointStart;
 299             if(reorderStart>=codePointStart) {
 300                 return 0;
 301             }
 302             int c=str.codePointBefore(codePointStart);
 303             codePointStart-=Character.charCount(c);
 304             return impl.getCCFromYesOrMaybeCP(c);



 305         }

 306         private int codePointStart, codePointLimit;
 307     }
 308 
 309     // TODO: Propose as public API on the UTF16 class.
 310     // TODO: Propose widening UTF16 methods that take char to take int.
 311     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 312     public static final class UTF16Plus {
 313         /**
 314          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 315          * is it a lead surrogate?
              * Only bit 0x400 is examined, so the result is meaningless for non-surrogates.
 316          * @param c code unit or code point
 317          * @return true if bit 0x400 of c is clear (lead surrogates are U+D800..U+DBFF),
              *         false if it is set (trail surrogates are U+DC00..U+DFFF)
 318          */
 319         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 320         
 321         /**
 322          * Compares two CharSequence subsequences for binary equality.
 323          * @param s1 first sequence
 324          * @param start1 start offset in first sequence
 325          * @param limit1 limit offset in first sequence


 332         public static boolean equal(CharSequence s1, int start1, int limit1,
 333                                     CharSequence s2, int start2, int limit2) {
                 // Returns true if s1[start1, limit1) and s2[start2, limit2) have the same
                 // length and the same code units.
 334             if((limit1-start1)!=(limit2-start2)) {
 335                 return false;  // different lengths can never be equal
 336             }
 337             if(s1==s2 && start1==start2) {
 338                 return true;  // same object, same offset and same length: identical ranges
 339             }
 340             while(start1<limit1) {
 341                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 342                     return false;
 343                 }
 344             }
 345             return true;
 346         }
 347     }
 348 
 349     public NormalizerImpl() {}  // creates an instance with no data; the load() methods fill it in and return this
 350 
 351     private static final class IsAcceptable implements ICUBinary.Authenticate {
             // Version check passed to ICUBinary.readHeaderAndDataVersion() by load().

 352         public boolean isDataVersionAcceptable(byte version[]) {
 353             return version[0]==3;  // accept only data whose first version byte is 3
 354         }
 355     }

 356     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 357     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 358 
 359     public NormalizerImpl load(ByteBuffer bytes) {
             // Reads serialized Normalizer2 ("Nrm2") data from bytes in stream order:
             // header, indexes, normTrie, composition/mapping data, smallFCD.
             // Returns this so that construction and loading can be chained.
             // An IOException raised inside the try block is rethrown as InternalError.
 360         try {
 361             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 362             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 363             if(indexesLength<=IX_MIN_LCCC_CP) {
 364                 throw new InternalError("Normalizer2 data: not enough indexes");
 365             }
 366             int[] inIndexes=new int[indexesLength];
 367             inIndexes[0]=indexesLength*4;  // re-create the first index, which was consumed above
 368             for(int i=1; i<indexesLength; ++i) {
 369                 inIndexes[i]=bytes.getInt();
 370             }
 371 
                 // Copy the code point and norm16 thresholds out of the index array.
 372             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 373             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 374             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 375 
 376             minYesNo=inIndexes[IX_MIN_YES_NO];
 377             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 378             minNoNo=inIndexes[IX_MIN_NO_NO];
 379             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 380             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 381             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 382             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 383             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 384             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 385             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 386 
 387             // Read the normTrie.
 388             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 389             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 390             normTrie=Trie2_16.createFromSerialized(bytes);
 391             int trieLength=normTrie.getSerializedLength();
 392             if(trieLength>(nextOffset-offset)) {
 393                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
 394             }
 395             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 396 
 397             // Read the composition and mapping data.
 398             offset=nextOffset;
 399             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
 400             int numChars=(nextOffset-offset)/2;  // section size in bytes -> number of 16-bit units
 401             char[] chars;
                 // When the section is empty, maybeYesCompositions and extraData are not assigned here.
 402             if(numChars!=0) {
 403                 chars=new char[numChars];
 404                 for(int i=0; i<numChars; ++i) {
 405                     chars[i]=bytes.getChar();
 406                 }
 407                 maybeYesCompositions=new String(chars);
                     // extraData is the tail of the same text, starting at the unit offset
                     // derived from MIN_NORMAL_MAYBE_YES-minMaybeYes.
 408                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 409             }
 410 
 411             // smallFCD: new in formatVersion 2
 412             offset=nextOffset;  // NOTE(review): offset is not read again in this method
 413             smallFCD=new byte[0x100];
 414             bytes.get(smallFCD);



















 415 
 416             return this;
 417         } catch(IOException e) {
 418             throw new InternalError(e);
 419         }
 420     }

 421     public NormalizerImpl load(String name) {
             // Obtains the data identified by name through ICUBinary.getRequiredData() and
             // hands it to the other load() overload; returns this.
 422         return load(ICUBinary.getRequiredData(name));
 423     }
 424 



 425        
 426     public int getNorm16(int c) { return normTrie.get(c); }  // raw normTrie value (norm16) for code point c
 427     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }  // norm16 in [limitNoNo, minMaybeYes)
 428     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }  // norm16 in [minNoNo, minMaybeYes)
 429     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }  // norm16 outside [minYesNo, minMaybeYes)
 430 
 431     public int getCC(int norm16) {
             // Returns the combining class encoded in, or referenced by, a norm16 value.
 432         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 433             return getCCFromNormalYesOrMaybe(norm16);  // cc is stored in the value itself
 434         }
 435         if(norm16<minNoNo || limitNoNo<=norm16) {
 436             return 0;  // values outside [minNoNo, limitNoNo) report cc 0 here
 437         }
 438         return getCCFromNoNo(norm16);
 439     }
 440     public static int getCCFromNormalYesOrMaybe(int norm16) {
             // Extracts the 8-bit field that starts at bit OFFSET_SHIFT of norm16.
 441         return (norm16 >> OFFSET_SHIFT) & 0xff;
 442     }
 443     public static int getCCFromYesOrMaybe(int norm16) {
             // Values below MIN_NORMAL_MAYBE_YES yield 0; others carry the cc in the value itself.
 444         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 445     }
 446     public int getCCFromYesOrMaybeCP(int c) {
             // Code point variant of getCCFromYesOrMaybe(); skips the trie lookup for
             // code points below minCompNoMaybeCP and returns 0 for them.
 447         if (c < minCompNoMaybeCP) { return 0; }
 448         return getCCFromYesOrMaybe(getNorm16(c));
 449     }
 450 
 451     /**
 452      * Returns the FCD data for code point c.
          * Two shortcuts avoid the full lookup: code points below minDecompNoCP, and
          * code points up to U+FFFF whose smallFCD bit is clear, return 0 directly.
 453      * @param c A Unicode code point.
 454      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
 455      */
 456     public int getFCD16(int c) {
 457         if(c<minDecompNoCP) {
 458             return 0;


 459         } else if(c<=0xffff) {
 460             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
 461         }
 462         return getFCD16FromNormData(c);
 463     }



 464     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
 465     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
 466         // 0<=lead<=0xffff
 467         byte bits=smallFCD[lead>>8];  // one byte per block of 256 code units
 468         if(bits==0) { return false; }
             // Within that byte, bit (lead>>5)&7 stands for the group of 32 code units containing lead.
 469         return ((bits>>((lead>>5)&7))&1)!=0;
 470     }
 471 
 472     /** Gets the FCD value from the regular normalization data. */
 473     public int getFCD16FromNormData(int c) {
             // The result packs two 8-bit values: the upper byte comes from the optional
             // ccc/lccc word (lccc), the lower byte from the mapping's first unit (tccc).


 474         int norm16=getNorm16(c);
 475         if (norm16 >= limitNoNo) {
 476             if(norm16>=MIN_NORMAL_MAYBE_YES) {


 477                 // combining mark
 478                 norm16=getCCFromNormalYesOrMaybe(norm16);
 479                 return norm16|(norm16<<8);  // the same cc serves as both lccc and tccc
 480             } else if(norm16>=minMaybeYes) {
 481                 return 0;
 482             } else {  // isDecompNoAlgorithmic(norm16)
 483                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 484                 if (deltaTrailCC <= DELTA_TCCC_1) {
 485                     return deltaTrailCC >> OFFSET_SHIFT;  // small tccc carried in the delta bits; lccc stays 0
 486                 }
 487                 // Maps to an isCompYesAndZeroCC.
 488                 c=mapAlgorithmic(c, norm16);
 489                 norm16=getNorm16(c);  // continue below with the mapped code point's data
 490             }
 491         }
 492         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 493             // no decomposition or Hangul syllable, all zeros
 494             return 0;
 495         }
 496         // c decomposes, get everything from the variable-length extra data
 497         int mapping=norm16>>OFFSET_SHIFT;  // index of the mapping's first unit in extraData
 498         int firstUnit=extraData.charAt(mapping);





 499         int fcd16=firstUnit>>8;  // tccc
 500         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 501             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 502         }
 503         return fcd16;
 504     }



 505 
 506     /**
 507      * Gets the decomposition for one code point.
          * The result comes from one of three places: an algorithmic mapping
          * (mapAlgorithmic), Hangul.decompose() for Hangul LV/LVT values, or the
          * mapping string stored in extraData.
 508      * @param c code point
 509      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 510      */
 511     public String getDecomposition(int c) {

 512         int norm16;
 513         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {

 514             // c does not decompose
 515             return null;
 516         }
             // decomp stays -1 unless an algorithmic mapping is applied below.
 517         int decomp = -1;
 518         if(isDecompNoAlgorithmic(norm16)) {
 519             // Maps to an isCompYesAndZeroCC.

 520             decomp=c=mapAlgorithmic(c, norm16);
 521             // The mapping might decompose further.
 522             norm16 = getNorm16(c);



 523         }
 524         if (norm16 < minYesNo) {
                 // No stored mapping for (the possibly remapped) c:
                 // the result is the algorithmic mapping if there was one, else null.
 525             if(decomp<0) {
 526                 return null;
 527             } else {
 528                 return UTF16.valueOf(decomp);
 529             }
 530         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 531             // Hangul syllable: decompose algorithmically
 532             StringBuilder buffer=new StringBuilder();
 533             Hangul.decompose(c, buffer);
 534             return buffer.toString();
 535         }
 536         // c decomposes, get everything from the variable-length extra data
             // The first unit's low MAPPING_LENGTH_MASK bits give the length;
             // the mapping text starts at the following unit.
 537         int mapping=norm16>>OFFSET_SHIFT;
 538         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 539         return extraData.substring(mapping, mapping+length);
 540     }
 541         
 542     // Fixed norm16 values.
 543     public static final int MIN_YES_YES_WITH_CC=0xfe02;
 544     public static final int JAMO_VT=0xfe00;
 545     public static final int MIN_NORMAL_MAYBE_YES=0xfc00;  // at or above this, getCC() decodes the cc from norm16 itself
 546     public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
 547     public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
 548 
 549     // norm16 bit 0 is comp-boundary-after.
 550     public static final int HAS_COMP_BOUNDARY_AFTER=1;
 551     public static final int OFFSET_SHIFT=1;  // extraData offsets and cc fields are read as norm16>>OFFSET_SHIFT
 552 
 553     // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
 554     // tccc (0, 1, >1) for quick FCC boundary-after tests.
 555     public static final int DELTA_TCCC_0=0;
 556     public static final int DELTA_TCCC_1=2;
 557     public static final int DELTA_TCCC_GT_1=4;
 558     public static final int DELTA_TCCC_MASK=6;
 559     public static final int DELTA_SHIFT=3;
 560 



 561     public static final int MAX_DELTA=0x40;
 562 
 563     // Byte offsets from the start of the data, after the generic header.
 564     public static final int IX_NORM_TRIE_OFFSET=0;
 565     public static final int IX_EXTRA_DATA_OFFSET=1;
 566     public static final int IX_SMALL_FCD_OFFSET=2;
 567     public static final int IX_RESERVED3_OFFSET=3;
 568     public static final int IX_TOTAL_SIZE=7;
 569     public static final int MIN_CCC_LCCC_CP=0x300;
 570     // Code point thresholds for quick check codes.
 571     public static final int IX_MIN_DECOMP_NO_CP=8;
 572     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
 573 
 574     // Norm16 value thresholds for quick check combinations and types of extra data.
 575 
 576     /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
 577     public static final int IX_MIN_YES_NO=10;
 578     /** Mappings are comp-normalized. */
 579     public static final int IX_MIN_NO_NO=11;
 580     public static final int IX_LIMIT_NO_NO=12;
 581     public static final int IX_MIN_MAYBE_YES=13;
 582 
 583     /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
 584     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
 585     /** Mappings are not comp-normalized but have a comp boundary before. */
 586     public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
 587     /** Mappings do not have a comp boundary before. */
 588     public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
 589     /** Mappings to the empty string. */
 590     public static final int IX_MIN_NO_NO_EMPTY=17;
 591 
 592     public static final int IX_MIN_LCCC_CP=18;
 593     public static final int IX_COUNT=20;
 594 
         // Flags and length field in the first unit of a mapping stored in extraData.
 595     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;  // the unit before the first unit holds lccc in bits 15..8
 596     public static final int MAPPING_HAS_RAW_MAPPING=0x40;
 597     // unused bit 0x20;
 598     public static final int MAPPING_LENGTH_MASK=0x1f;  // length of the mapping text that follows the first unit
 599 
 600     public static final int COMP_1_LAST_TUPLE=0x8000;
 601     public static final int COMP_1_TRIPLE=1;
 602     public static final int COMP_1_TRAIL_LIMIT=0x3400;
 603     public static final int COMP_1_TRAIL_MASK=0x7ffe;
 604     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
 605     public static final int COMP_2_TRAIL_SHIFT=6;
 606     public static final int COMP_2_TRAIL_MASK=0xffc0;
 607 
 608     // higher-level functionality ------------------------------------------ ***
 609 
 610     /**
 611      * Decomposes s[src, limit[ and writes the result to dest.
 612      * limit can be NULL if src is NUL-terminated.
 613      * destLengthEstimate is the initial dest buffer capacity and can be -1.
 614      */
 615     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
 616                    int destLengthEstimate) {
 617         if(destLengthEstimate<0) {


 681             // Check one above-minimum, relevant code point.
 682             src+=Character.charCount(c);
 683             if(buffer!=null) {
 684                 decompose(c, norm16, buffer);
 685             } else {
 686                 if(isDecompYes(norm16)) {
 687                     int cc=getCCFromYesOrMaybe(norm16);
 688                     if(prevCC<=cc || cc==0) {
 689                         prevCC=cc;
 690                         if(cc<=1) {
 691                             prevBoundary=src;
 692                         }
 693                         continue;
 694                     }
 695                 }
 696                 return prevBoundary;  // "no" or cc out of order
 697             }
 698         }
 699         return src;
 700     }

 701     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
             // Appends s to buffer. With doDecompose, s is decomposed into buffer;
             // otherwise s is appended unchanged, except that the leading run of
             // code points with cc!=0 is appended together with its first and last cc.
 702         int limit=s.length();
 703         if(limit==0) {
                 // Empty input: nothing to append.
 704             return;
 705         }
 706         if(doDecompose) {
 707             decompose(s, 0, limit, buffer);
 708             return;
 709         }
 710         // Just merge the strings at the boundary.
 711         int c=Character.codePointAt(s, 0);
 712         int src=0;
             // firstCC: cc of the first code point of s.
             // prevCC: cc of the last code point in the leading cc!=0 run.
             // Both are 0 (and src stays 0) when s starts with a cc==0 code point.
 713         int firstCC, prevCC, cc;
 714         firstCC=prevCC=cc=getCC(getNorm16(c));
 715         while(cc!=0) {
 716             prevCC=cc;
 717             src+=Character.charCount(c);
 718             if(src>=limit) {
                     // The whole of s consists of cc!=0 code points.
 719                 break;
 720             }
 721             c=Character.codePointAt(s, src);
 722             cc=getCC(getNorm16(c));
 723         };
             // s[0, src[ is the leading cc!=0 run; the rest of s is appended as-is.
             // NOTE(review): presumably the cc values let the buffer reorder the run
             // against its current content -- confirm in ReorderingBuffer.append().
 724         buffer.append(s, 0, src, firstCC, prevCC);
 725         buffer.append(s, src, limit);
 726     }
 727 
 728     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 729     // doCompose: normalize
 730     // !doCompose: isNormalized (buffer must be empty and initialized)
         // s[src, limit[ is the input span; onlyContiguous selects the FCC variant
         // (see the "FCC" notes in the body).
         // Returns true when the scan reaches limit. Returns false only when !doCompose,
         // at the first place where the text is found not to be in composed form.
 731     public boolean compose(CharSequence s, int src, int limit,
 732                            boolean onlyContiguous,
 733                            boolean doCompose,
 734                            ReorderingBuffer buffer) {
             // prevBoundary: start of the text that has been scanned but not yet appended to buffer.
 735         int prevBoundary=src;
 736         int minNoMaybeCP=minCompNoMaybeCP;
 737 
 738         for (;;) {
 739             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 740             // or with (compYes && ccc==0) properties.










 741             int prevSrc;
 742             int c = 0;
 743             int norm16 = 0;
 744             for (;;) {
 745                 if (src == limit) {
                         // End of input: flush whatever is still pending.
 746                     if (prevBoundary != limit && doCompose) {
 747                         buffer.append(s, prevBoundary, limit);
 748                     }
 749                     return true;
 750                 }
 751                 if( (c=s.charAt(src))<minNoMaybeCP ||
 752                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 753                 ) {
 754                     ++src;
 755                 } else {
                         // prevSrc marks the start of the current character, src the position after it.
 756                     prevSrc = src++;
 757                     if(!UTF16.isSurrogate((char)c)) {
 758                         break;
 759                     } else {
                             // Surrogate code unit: combine it with its partner, if present,
                             // and look up the norm16 of the full code point.
 760                         char c2;
 761                         if(UTF16Plus.isSurrogateLead(c)) {
 762                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
 763                                 ++src;
 764                                 c=Character.toCodePoint((char)c, c2);
 765                             }
 766                         } else /* trail surrogate */ {
 767                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
 768                                 --prevSrc;
 769                                 c=Character.toCodePoint(c2, (char)c);
 770                             }
 771                         }
 772                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {


 773                             break;
 774                         }
 775                     }
 776                 }





 777             }
 778             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 779             // The current character is either a "noNo" (has a mapping)
 780             // or a "maybeYes" (combines backward)
 781             // or a "yesYes" with ccc!=0.
 782             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 783 
 784             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 785             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 786                 if (!doCompose) {
 787                     return false;
 788                 }
 789                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 790                 // In this case, we need not decompose around the current character.
 791                 if (isDecompNoAlgorithmic(norm16)) {
 792                     // Maps to a single isCompYesAndZeroCC character
 793                     // which also implies hasCompBoundaryBefore.
 794                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 795                             hasCompBoundaryBefore(s, src, limit)) {
 796                         if (prevBoundary != prevSrc) {
 797                             buffer.append(s, prevBoundary, prevSrc);
 798                         }
 799                         buffer.append(mapAlgorithmic(c, norm16), 0);
 800                         prevBoundary = src;
 801                         continue;




 802                     }
 803                 } else if (norm16 < minNoNoCompBoundaryBefore) {
 804                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
 805                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 806                             hasCompBoundaryBefore(s, src, limit)) {
 807                         if (prevBoundary != prevSrc) {
 808                             buffer.append(s, prevBoundary, prevSrc);
 809                         }
                             // Copy the stored mapping: length from the first unit, text after it.
 810                         int mapping = norm16 >> OFFSET_SHIFT;
 811                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
 812                         buffer.append(extraData, mapping, mapping + length);
 813                         prevBoundary = src;
 814                         continue;
 815                     }
 816                 } else if (norm16 >= minNoNoEmpty) {
 817                     // The current character maps to nothing.
 818                     // Simply omit it from the output if there is a boundary before _or_ after it.
 819                     // The character itself implies no boundaries.
 820                     if (hasCompBoundaryBefore(s, src, limit) ||
 821                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
 822                         if (prevBoundary != prevSrc) {
 823                             buffer.append(s, prevBoundary, prevSrc);
 824                         }
 825                         prevBoundary = src;
 826                         continue;
 827                     }
 828                 }
 829                 // Other "noNo" type, or need to examine more text around this character:
 830                 // Fall through to the slow path.
 831             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
 832                 char prev=s.charAt(prevSrc-1);

 833                 if(c<Hangul.JAMO_T_BASE) {
 834                     // The current character is a Jamo Vowel,
 835                     // compose with previous Jamo L and following Jamo T.
 836                     char l = (char)(prev-Hangul.JAMO_L_BASE);
 837                     if(l<Hangul.JAMO_L_COUNT) {
 838                         if (!doCompose) {
 839                             return false;
 840                         }
                             // t: Jamo T index to compose with (0 = none), or -1 for the slow path.
 841                         int t;
 842                         if (src != limit &&
 843                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
 844                                 t < Hangul.JAMO_T_COUNT) {
 845                             // The next character is a Jamo T.

 846                             ++src;
 847                         } else if (hasCompBoundaryBefore(s, src, limit)) {
 848                             // No Jamo T follows, not even via decomposition.
 849                             t = 0;
 850                         } else {
 851                             t = -1;
 852                         }
 853                         if (t >= 0) {
 854                             int syllable = Hangul.HANGUL_BASE +
 855                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
 856                                 Hangul.JAMO_T_COUNT + t;
 857                             --prevSrc;  // Replace the Jamo L as well.
 858                             if (prevBoundary != prevSrc) {
 859                                 buffer.append(s, prevBoundary, prevSrc);
 860                             }
 861                             buffer.append((char)syllable);
 862                             prevBoundary = src;
 863                             continue;
 864                         }
 865                         // If we see L+V+x where x!=T then we drop to the slow path,
 866                         // decompose and recompose.
 867                         // This is to deal with NFKC finding normal L and V but a
 868                         // compatibility variant of a T.
 869                         // We need to either fully compose that combination here
 870                         // (which would complicate the code and may not work with strange custom data)
 871                         // or use the slow path.



 872                     }
 873                 } else if (Hangul.isHangulLV(prev)) {
 874                     // The current character is a Jamo Trailing consonant,
 875                     // compose with previous Hangul LV that does not contain a Jamo T.
 876                     if (!doCompose) {
 877                         return false;
 878                     }
 879                     int syllable = prev + c - Hangul.JAMO_T_BASE;
 880                     --prevSrc;  // Replace the Hangul LV as well.
 881                     if (prevBoundary != prevSrc) {
 882                         buffer.append(s, prevBoundary, prevSrc);






 883                     }
 884                     buffer.append((char)syllable);
 885                     prevBoundary = src;
 886                     continue;
 887                 }
 888                 // No matching context, or may need to decompose surrounding text first:
 889                 // Fall through to the slow path.
 890             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
 891                 // One or more combining marks that do not combine-back:
 892                 // Check for canonical order, copy unchanged if ok and
 893                 // if followed by a character with a boundary-before.
 894                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
 895                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {





























 896                     // Fails FCD test, need to decompose and contiguously recompose.
 897                     if (!doCompose) {
 898                         return false;
 899                     }






 900                 } else {
 901                     // If !onlyContiguous (not FCC), then we ignore the tccc of
 902                     // the previous character which passed the quick check "yes && ccc==0" test.
                         // n16: norm16 of the code point at src while scanning the combining marks.
 903                     int n16;
 904                     for (;;) {
 905                         if (src == limit) {
 906                             if (doCompose) {
 907                                 buffer.append(s, prevBoundary, limit);
 908                             }
 909                             return true;
 910                         }
 911                         int prevCC = cc;
 912                         c = Character.codePointAt(s, src);
 913                         n16 = normTrie.get(c);
 914                         if (n16 >= MIN_YES_YES_WITH_CC) {
 915                             cc = getCCFromNormalYesOrMaybe(n16);
 916                             if (prevCC > cc) {
                                     // Combining marks out of canonical order.
 917                                 if (!doCompose) {
 918                                     return false;
 919                                 }
 920                                 break;
 921                             }
 922                         } else {
 923                             break;
 924                         }
 925                         src += Character.charCount(c);
 926                     }
 927                     // src is after the last in-order combining mark.
 928                     // If there is a boundary here, then we continue with no change.
 929                     if (norm16HasCompBoundaryBefore(n16)) {
 930                         if (isCompYesAndZeroCC(n16)) {
 931                             src += Character.charCount(c);
 932                         }
 933                         continue;
 934                     }
 935                     // Use the slow path. There is no boundary in [prevSrc, src[.
 936                 }



 937             }
 938 
 939             // Slow path: Find the nearest boundaries around the current character,
 940             // decompose and recompose.
 941             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                     // Include the preceding code point unless it has a comp boundary after it.
 942                 c = Character.codePointBefore(s, prevSrc);
 943                 norm16 = normTrie.get(c);
 944                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
 945                     prevSrc -= Character.charCount(c);
 946                 }
 947             }
 948             if (doCompose && prevBoundary != prevSrc) {
 949                 buffer.append(s, prevBoundary, prevSrc);
 950             }
 951             int recomposeStartIndex=buffer.length();
 952             // We know there is not a boundary here.
 953             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
 954                            buffer);
 955             // Decompose until the next boundary.
 956             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
 957                                  buffer);
 958             recompose(buffer, recomposeStartIndex, onlyContiguous);
 959             if(!doCompose) {
                     // isNormalized mode: the recomposed text must equal the source span s[prevSrc, src[.
 960                 if(!buffer.equals(s, prevSrc, src)) {
 961                     return false;
 962                 }
                     // NOTE(review): presumably empties the buffer for the next round -- confirm
                     // in ReorderingBuffer.remove().
 963                 buffer.remove();

 964             }


 965             prevBoundary=src;
 966         }

 967     }
 968 
 969     /**
 970      * Very similar to compose(): Make the same changes in both places if relevant.
 971      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
 972      * !doSpan: quickCheck
          * The input is s[src, limit[; onlyContiguous selects the FCC variant
          * (see the "FCC" notes in the body).
 973      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
 974      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 975      *         then the quick check result is "no"
 976      */
 977     public int composeQuickCheck(CharSequence s, int src, int limit,
 978                                  boolean onlyContiguous, boolean doSpan) {
             // qcResult is bit 0 of the return value; it is set to 1 below once a
             // character with norm16<MIN_YES_YES_WITH_CC is seen in !doSpan mode.
 979         int qcResult=0;






 980         int prevBoundary=src;
 981         int minNoMaybeCP=minCompNoMaybeCP;



 982 
 983         for(;;) {
 984             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 985             // or with (compYes && ccc==0) properties.
 986             int prevSrc;
 987             int c = 0;
 988             int norm16 = 0;
 989             for (;;) {
 990                 if(src==limit) {
 991                     return (src<<1)|qcResult;  // "yes" or "maybe"
 992                 }
 993                 if( (c=s.charAt(src))<minNoMaybeCP ||
 994                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 995                 ) {
 996                     ++src;
 997                 } else {
                         // prevSrc marks the start of the current character, src the position after it.
 998                     prevSrc = src++;
 999                     if(!UTF16.isSurrogate((char)c)) {
1000                         break;
1001                     } else {
                             // Surrogate code unit: combine it with its partner, if present,
                             // and look up the norm16 of the full code point.
1002                         char c2;
1003                         if(UTF16Plus.isSurrogateLead(c)) {
1004                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005                                 ++src;
1006                                 c=Character.toCodePoint((char)c, c2);
1007                             }
1008                         } else /* trail surrogate */ {
1009                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010                                 --prevSrc;
1011                                 c=Character.toCodePoint(c2, (char)c);
1012                             }
1013                         }
1014                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {


1015                             break;
1016                         }
1017                     }
1018                 }







1019             }
1020             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021             // The current character is either a "noNo" (has a mapping)
1022             // or a "maybeYes" (combines backward)
1023             // or a "yesYes" with ccc!=0.
1024             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025 
                 // prevNorm16 stays INERT unless the code point before prevSrc has no
                 // comp boundary after it; in that case prevBoundary backs up over that
                 // code point and prevNorm16 becomes its norm16.
1026             int prevNorm16 = INERT;
1027             if (prevBoundary != prevSrc) {
1028                 prevBoundary = prevSrc;
1029                 if (!norm16HasCompBoundaryBefore(norm16)) {
1030                     c = Character.codePointBefore(s, prevSrc);
1031                     int n16 = getNorm16(c);
1032                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033                         prevBoundary -= Character.charCount(c);
1034                         prevNorm16 = n16;
1035                     }
1036                 }
1037             }
1038 






1039             if(isMaybeOrNonZeroCC(norm16)) {
1040                 int cc=getCCFromYesOrMaybe(norm16);
1041                 if (onlyContiguous /* FCC */ && cc != 0 &&
1042                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1043                     // The [prevBoundary..prevSrc[ character
1044                     // passed the quick check "yes && ccc==0" test
1045                     // but is out of canonical order with the current combining mark.
                         // Nothing to do here: falls through to the "no" return below.
1046                 } else {
1047                     // If !onlyContiguous (not FCC), then we ignore the tccc of
1048                     // the previous character which passed the quick check "yes && ccc==0" test.
1049                     for (;;) {
1050                         if (norm16 < MIN_YES_YES_WITH_CC) {
1051                             if (!doSpan) {
1052                                 qcResult = 1;
1053                             } else {
1054                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1055                             }
1056                         }
1057                         if (src == limit) {
1058                             return (src<<1) | qcResult;  // "yes" or "maybe"
1059                         }
1060                         int prevCC = cc;
1061                         c = Character.codePointAt(s, src);
1062                         norm16 = getNorm16(c);
1063                         if (isMaybeOrNonZeroCC(norm16)) {
1064                             cc = getCCFromYesOrMaybe(norm16);
                                 // Stop when a non-zero cc is lower than the previous one (out of order).
1065                             if (!(prevCC <= cc || cc == 0)) {
1066                                 break;
1067                             }
1068                         } else {
1069                             break;
1070                         }
1071                         src += Character.charCount(c);
1072                     }
1073                     // src is after the last in-order combining mark.
1074                     if (isCompYesAndZeroCC(norm16)) {
                             // The character at src is compYes with cc==0: step over it and resume the fast path.
1075                         prevBoundary = src;
1076                         src += Character.charCount(c);
1077                         continue;
1078                     }
1079                 }
1080             }
1081             return prevBoundary<<1;  // "no"
1082         }
1083     }

1084     public void composeAndAppend(CharSequence s,
1085                                  boolean doCompose,
1086                                  boolean onlyContiguous,
1087                                  ReorderingBuffer buffer) {
             // Appends s to buffer. If buffer already has content and
             // findNextCompBoundary() does not report position 0 of s, the tail of
             // the buffer and the head of s are first recomposed together.
             // The remainder of s is composed when doCompose is set, else appended as-is.
1088         int src=0, limit=s.length();
1089         if(!buffer.isEmpty()) {
1090             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1091             if(0!=firstStarterInSrc) {
1092                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1093                                                                buffer.length(), onlyContiguous);
                     // middle = buffer[lastStarterInDest, end[ + s[0, firstStarterInSrc[,
                     // presized for both parts plus 16 spare chars.
1094                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1095                                                        firstStarterInSrc+16);
1096                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
                     // Take that tail out of the buffer; it is re-added by compose() below.
1097                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1098                 middle.append(s, 0, firstStarterInSrc);
                     // The merged middle part is always composed, regardless of doCompose.
1099                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1100                 src=firstStarterInSrc;
1101             }
1102         }
1103         if(doCompose) {
1104             compose(s, src, limit, onlyContiguous, true, buffer);
1105         } else {
1106             buffer.append(s, src, limit);
1107         }
1108     }

1109     // Dual functionality:
1110     // buffer!=NULL: normalize
1111     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1112     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1113         // Note: In this function we use buffer->appendZeroCC() because we track
1114         // the lead and trail combining classes here, rather than leaving it to
1115         // the ReorderingBuffer.
1116         // The exception is the call to decomposeShort() which uses the buffer
1117         // in the normal way.
1118 
1119         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120         // Similar to the prevBoundary in the compose() implementation.
1121         int prevBoundary=src;
1122         int prevSrc;
1123         int c=0;
1124         int prevFCD16=0;
1125         int fcd16=0;
1126 
1127         for(;;) {
1128             // count code units with lccc==0
1129             for(prevSrc=src; src!=limit;) {
1130                 if((c=s.charAt(src))<minLcccCP) {
1131                     prevFCD16=~c;
1132                     ++src;
1133                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134                     prevFCD16=0;
1135                     ++src;
1136                 } else {
1137                     if(UTF16.isSurrogate((char)c)) {
1138                         char c2;
1139                         if(UTF16Plus.isSurrogateLead(c)) {
1140                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141                                 c=Character.toCodePoint((char)c, c2);
1142                             }
1143                         } else /* trail surrogate */ {
1144                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145                                 --src;
1146                                 c=Character.toCodePoint(c2, (char)c);
1147                             }
1148                         }
1149                     }
1150                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151                         prevFCD16=fcd16;
1152                         src+=Character.charCount(c);
1153                     } else {
1154                         break;
1155                     }
1156                 }
1157             }
1158             // copy these code units all at once
1159             if(src!=prevSrc) {
1160                 if(src==limit) {
1161                     if(buffer!=null) {
1162                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163                     }
1164                     break;
1165                 }
1166                 prevBoundary=src;
1167                 // We know that the previous character's lccc==0.
1168                 if(prevFCD16<0) {
1169                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1170                     int prev=~prevFCD16;
1171                     if(prev<minDecompNoCP) {
1172                         prevFCD16=0;
1173                     } else {
1174                         prevFCD16=getFCD16FromNormData(prev);
1175                         if(prevFCD16>1) {
1176                             --prevBoundary;
1177                         }
1178                     }
1179                 } else {
1180                     int p=src-1;
1181                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1182                         Character.isHighSurrogate(s.charAt(p-1))
1183                     ) {
1184                         --p;
1185                         // Need to fetch the previous character's FCD value because
1186                         // prevFCD16 was just for the trail surrogate code point.
1187                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1188                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1189                     }
1190                     if(prevFCD16>1) {
1191                         prevBoundary=p;
1192                     }
1193                 }
1194                 if(buffer!=null) {
1195                     // The last lccc==0 character is excluded from the
1196                     // flush-and-append call in case it needs to be modified.
1197                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1198                     buffer.append(s, prevBoundary, src);


1217                 prevFCD16=fcd16;
1218                 continue;
1219             } else if(buffer==null) {
1220                 return prevBoundary;  // quick check "no"
1221             } else {
1222                 /*
1223                  * Back out the part of the source that we copied or appended
1224                  * already but is now going to be decomposed.
1225                  * prevSrc is set to after what was copied/appended.
1226                  */
1227                 buffer.removeSuffix(prevSrc-prevBoundary);
1228                 /*
1229                  * Find the part of the source that needs to be decomposed,
1230                  * up to the next safe boundary.
1231                  */
1232                 src=findNextFCDBoundary(s, src, limit);
1233                 /*
1234                  * The source text does not fulfill the conditions for FCD.
1235                  * Decompose and reorder a limited piece of the text.
1236                  */
1237                 decomposeShort(s, prevBoundary, src, false, false, buffer);
1238                 prevBoundary=src;
1239                 prevFCD16=0;
1240             }
1241         }
1242         return src;
1243     }
1244     
1245     public boolean hasDecompBoundaryBefore(int c) {
1246         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1247             norm16HasDecompBoundaryBefore(getNorm16(c));




1248     }
1249     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1250         if (norm16 < minNoNoCompNoMaybeCC) {
1251             return true;
1252         }
1253         if (norm16 >= limitNoNo) {
1254             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1255         }

1256         // c decomposes, get everything from the variable-length extra data
1257         int mapping=norm16>>OFFSET_SHIFT;
1258         int firstUnit=extraData.charAt(mapping);
1259         // true if leadCC==0 (hasFCDBoundaryBefore())
1260         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1261     }
1262     public boolean hasDecompBoundaryAfter(int c) {
1263         if (c < minDecompNoCP) {
1264             return true;
1265         }
1266         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1267             return true;
1268         }
1269         return norm16HasDecompBoundaryAfter(getNorm16(c));
1270     }
1271     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1272         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1273             return true;
1274         }
1275         if (norm16 >= limitNoNo) {
1276             if (isMaybeOrNonZeroCC(norm16)) {
1277                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1278             }
1279             // Maps to an isCompYesAndZeroCC.
1280             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1281         }
1282         // c decomposes, get everything from the variable-length extra data
1283         int mapping=norm16>>OFFSET_SHIFT;
1284         int firstUnit=extraData.charAt(mapping);
1285         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1286         // fcd16<=1 || trailCC==0
1287         if(firstUnit>0x1ff) {
1288             return false;  // trailCC>1
1289         }
1290         if(firstUnit<=0xff) {
1291             return true;  // trailCC==0
1292         }
1293         // if(trailCC==1) test leadCC==0, same as checking for before-boundary

1294         // true if leadCC==0 (hasFCDBoundaryBefore())
1295         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;


1296     }
1297     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1298 
1299     public boolean hasCompBoundaryBefore(int c) {
1300         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1301     }
1302     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1303         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1304     }
1305     
1306     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1307     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1308     private static boolean isInert(int norm16) { return norm16==INERT; }
1309     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1310     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
1311     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
1312     private boolean isHangulLVT(int norm16) {
1313         return norm16==hangulLVT();
1314     }
1315     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }

1316     // UBool isCompYes(uint16_t norm16) const {
1317     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1318     // }
1319     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1320     //     return norm16<minNoNo || minMaybeYes<=norm16;
1321     // }
1322     // private boolean hasZeroCCFromDecompYes(int norm16) {
1323     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1324     // }
1325     private boolean isDecompYesAndZeroCC(int norm16) {
1326         return norm16<minYesNo ||
1327                norm16==JAMO_VT ||
1328                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1329     }

1330     /**
1331      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1332      * the MaybeYes which combine-forward and have ccc=0.
1333      * (Standard Unicode 10 normalization does not have such characters.)
1334      */
1335     private boolean isMostDecompYesAndZeroCC(int norm16) {
1336         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1337     }

1338     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1339 
1340     // For use with isCompYes().
1341     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1342     // static uint8_t getCCFromYes(uint16_t norm16) {
1343     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1344     // }
1345     private int getCCFromNoNo(int norm16) {
1346         int mapping=norm16>>OFFSET_SHIFT;
1347         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1348             return extraData.charAt(mapping-1)&0xff;
1349         } else {
1350             return 0;
1351         }
1352     }
1353     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1354         if(norm16<=minYesNo) {
1355             return 0;  // yesYes and Hangul LV have ccc=tccc=0









1356         } else {
1357             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1358             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1359         }
1360     }
1361 
1362     // Requires algorithmic-NoNo.
1363     private int mapAlgorithmic(int c, int norm16) {
1364         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1365     }
1366 
1367     // Requires minYesNo<norm16<limitNoNo.
1368     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1369 
1370     /**
1371      * @return index into maybeYesCompositions, or -1
1372      */
1373     private int getCompositionsListForDecompYes(int norm16) {
1374         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1375             return -1;
1376         } else {
1377             if((norm16-=minMaybeYes)<0) {
1378                 // norm16<minMaybeYes: index into extraData which is a substring at
1379                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1380                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1381                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1382             }
1383             return norm16>>OFFSET_SHIFT;
1384         }
1385     }

1386     /**
1387      * @return index into maybeYesCompositions
1388      */
1389     private int getCompositionsListForComposite(int norm16) {
1390         // A composite has both mapping & compositions list.
1391         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1392         int firstUnit=maybeYesCompositions.charAt(list);
1393         return list+  // mapping in maybeYesCompositions
1394             1+  // +1 to skip the first unit with the mapping length
1395             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1396     }
1397     
1398     // Decompose a short piece of text which is likely to contain characters that
1399     // fail the quick check loop and/or where the quick check loop's overhead
1400     // is unlikely to be amortized.
1401     // Called by the compose() and makeFCD() implementations.
1402     // Public in Java for collation implementation code.
1403     private int decomposeShort(
1404             CharSequence s, int src, int limit,
1405             boolean stopAtCompBoundary, boolean onlyContiguous,
1406             ReorderingBuffer buffer) {
1407         while(src<limit) {
1408             int c=Character.codePointAt(s, src);
1409             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1410                 return src;
1411             }
1412             int norm16 = getNorm16(c);
1413             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414                 return src;
1415             }
1416             src+=Character.charCount(c);
1417             decompose(c, norm16, buffer);
1418             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419                 return src;
1420             }
1421         }
1422         return src;
1423     }
1424     private void decompose(int c, int norm16, ReorderingBuffer buffer) {


1425         // get the decomposition and the lead and trail cc's
1426         if (norm16 >= limitNoNo) {
1427             if (isMaybeOrNonZeroCC(norm16)) {
1428                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429                 return;
1430             }
1431             // Maps to an isCompYesAndZeroCC.

1432             c=mapAlgorithmic(c, norm16);
1433             norm16=getNorm16(c);
1434         }
1435         if (norm16 < minYesNo) {
1436             // c does not decompose
1437             buffer.append(c, 0);
1438         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439             // Hangul syllable: decompose algorithmically
1440             Hangul.decompose(c, buffer);
1441         } else {
1442             // c decomposes, get everything from the variable-length extra data
1443             int mapping=norm16>>OFFSET_SHIFT;
1444             int firstUnit=extraData.charAt(mapping);
1445             int length=firstUnit&MAPPING_LENGTH_MASK;
1446             int leadCC, trailCC;
1447             trailCC=firstUnit>>8;
1448             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449                 leadCC=extraData.charAt(mapping-1)>>8;
1450             } else {
1451                 leadCC=0;
1452             }
1453             ++mapping;  // skip over the firstUnit
1454             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);


1455         }
1456     }
1457 
1458     /**
1459      * Finds the recomposition result for
1460      * a forward-combining "lead" character,
1461      * specified with a pointer to its compositions list,
1462      * and a backward-combining "trail" character.
1463      *
1464      * <p>If the lead and trail characters combine, then this function returns
1465      * the following "compositeAndFwd" value:
1466      * <pre>
1467      * Bits 21..1  composite character
1468      * Bit      0  set if the composite is a forward-combining starter
1469      * </pre>
1470      * otherwise it returns -1.
1471      *
1472      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473      * encoded as either pairs or triples of 16-bit units.
1474      * The last entry has the high bit of its first unit set.
1475      *
1476      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1477      * A linear search is used.
1478      *
1479      * <p>See normalizer2impl.h for a more detailed description
1480      * of the compositions list format.
1481      */
1482     private static int combine(String compositions, int list, int trail) {
1483         int key1, firstUnit;
1484         if(trail<COMP_1_TRAIL_LIMIT) {
1485             // trail character is 0..33FF
1486             // result entry may have 2 or 3 units
1487             key1=(trail<<1);
1488             while(key1>(firstUnit=compositions.charAt(list))) {
1489                 list+=2+(firstUnit&COMP_1_TRIPLE);
1490             }
1491             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1492                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1493                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1494                 } else {
1495                     return compositions.charAt(list+1);
1496                 }
1497             }
1498         } else {
1499             // trail character is 3400..10FFFF
1500             // result entry has 3 units
1501             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1502             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1503             int secondUnit;
1504             for(;;) {
1505                 if(key1>(firstUnit=compositions.charAt(list))) {
1506                     list+=2+(firstUnit&COMP_1_TRIPLE);
1507                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1508                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1509                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1510                             break;
1511                         } else {
1512                             list+=3;
1513                         }


1549         int cc, prevCC;
1550         boolean starterIsSupplementary;
1551 
1552         // Some of the following variables are not used until we have a forward-combining starter
1553         // and are only initialized now to avoid compiler warnings.
1554         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1555         starter=-1;
1556         starterIsSupplementary=false;
1557         prevCC=0;
1558 
1559         for(;;) {
1560             c=sb.codePointAt(p);
1561             p+=Character.charCount(c);
1562             norm16=getNorm16(c);
1563             cc=getCCFromYesOrMaybe(norm16);
1564             if( // this character combines backward and
1565                 isMaybe(norm16) &&
1566                 // we have seen a starter that combines forward and
1567                 compositionsList>=0 &&
1568                 // the backward-combining character is not blocked
1569                 (prevCC<cc || prevCC==0)
1570             ) {
1571                 if(isJamoVT(norm16)) {
1572                     // c is a Jamo V/T, see if we can compose it with the previous character.
1573                     if(c<Hangul.JAMO_T_BASE) {
1574                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1575                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1576                         if(prev<Hangul.JAMO_L_COUNT) {
1577                             pRemove=p-1;
1578                             char syllable=(char)
1579                                 (Hangul.HANGUL_BASE+
1580                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1581                                  Hangul.JAMO_T_COUNT);
1582                             char t;
1583                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1584                                 ++p;
1585                                 syllable+=t;  // The next character was a Jamo T.
1586                             }
1587                             sb.setCharAt(starter, syllable);
1588                             // remove the Jamo V/T
1589                             sb.delete(pRemove, p);
1590                             p=pRemove;


1671                         starterIsSupplementary=true;
1672                         starter=p-2;
1673                     }
1674                 }
1675             } else if(onlyContiguous) {
1676                 // FCC: no discontiguous compositions; any intervening character blocks.
1677                 compositionsList=-1;
1678             }
1679         }
1680         buffer.flush();
1681     }
1682 
1683     /**
1684      * Does c have a composition boundary before it?
1685      * True if its decomposition begins with a character that has
1686      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1687      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1688      * (isCompYesAndZeroCC()) so we need not decompose.
1689      */
1690     private boolean hasCompBoundaryBefore(int c, int norm16) {
1691         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);












1692     }
1693     private boolean norm16HasCompBoundaryBefore(int norm16) {
1694         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1695     }
1696     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1697         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1698     }
1699     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1700         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1701             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1702     }
1703     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1704         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1705     }
1706     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
1707     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1708         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1709             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1710     }
1711 
1712     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1713         while(p>0) {
1714             int c=Character.codePointBefore(s, p);
1715             int norm16 = getNorm16(c);
1716             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1717                 break;
1718             }
1719             p-=Character.charCount(c);
1720             if(hasCompBoundaryBefore(c, norm16)) {
1721                 break;
1722             }


1723         }
1724         return p;
1725     }
1726     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {

1727         while(p<limit) {
1728             int c=Character.codePointAt(s, p);
1729             int norm16=normTrie.get(c);
1730             if(hasCompBoundaryBefore(c, norm16)) {
1731                 break;
1732             }
1733             p+=Character.charCount(c);
1734             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1735                 break;
1736             }
1737         }
1738         return p;
1739     }
1740 
1741     
1742     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1743         while(p<limit) {
1744             int c=Character.codePointAt(s, p);
1745             int norm16;
1746             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1747                 break;
1748             }
1749             p+=Character.charCount(c);
1750             if (norm16HasDecompBoundaryAfter(norm16)) {
1751                 break;
1752             }
1753         }
1754         return p;
1755     }
1756     
1757     /**
1758      * Get the canonical decomposition
1759      * sherman  for ComposedCharIter
1760      */
1761     public static int getDecompose(int chars[], String decomps[]) {
1762         Normalizer2 impl = Normalizer2.getNFDInstance();
1763 
1764         int length=0;
1765         int norm16 = 0;
1766         int ch = -1;
1767         int i = 0;
1768 
1769         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1770             //TBD !!!! the hack code heres save us about 50ms for startup
1771             //need a better solution/lookup
1772             if (ch == 0x30ff)


2015                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2016                 // [current]..[p]         1 code point (c, c2) with cc
2017 
2018                 // move the code units in between up
2019                 r=p;
2020                 do {
2021                     source[--r]=source[--current];
2022                 } while (back!=current);
2023             }
2024         }
2025 
2026         // insert (c1, c2)
2027         source[current] = c1;
2028         if (c2!=0) {
2029             source[(current+1)] = c2;
2030         }
2031 
2032         // we know the cc of the last code point
2033         return trailCC;
2034     }

2035     /**
2036      * merge two UTF-16 string parts together
2037      * to canonically order (order by combining classes) their concatenation
2038      *
2039      * the two strings may already be adjacent, so that the merging is done
2040      * in-place if the two strings are not adjacent, then the buffer holding the
2041      * first one must be large enough
2042      * the second string may or may not be ordered in itself
2043      *
2044      * before: [start]..[current] is already ordered, and
2045      *         [next]..[limit]    may be ordered in itself, but
2046      *                          is not in relation to [start..current[
2047      * after: [start..current+(limit-next)[ is ordered
2048      *
2049      * the algorithm is a simple bubble-sort that takes the characters from
2050      * src[next++] and inserts them in correct combining class order into the
2051      * preceding part of the string
2052      *
2053      * since this function is called much less often than the single-code point
2054      * insertOrdered(), it just uses that for easier maintenance


2098 
2099             if(ncArgs.next==ncArgs.limit) {
2100                 // we know the cc of the last code point
2101                 return trailCC;
2102             } else {
2103                 if(!adjacent) {
2104                     // copy the second string part
2105                     do {
2106                         source[current++]=data[ncArgs.next++];
2107                     } while(ncArgs.next!=ncArgs.limit);
2108                     ncArgs.limit=current;
2109                 }
2110                 PrevArgs prevArgs = new PrevArgs();
2111                 prevArgs.src   = data;
2112                 prevArgs.start = start;
2113                 prevArgs.current =  ncArgs.limit;
2114                 return getPrevCC(prevArgs);
2115             }
2116 
2117     }

2118     private static final class PrevArgs{
2119         char[] src;
2120         int start;
2121         int current;
2122         char c1;
2123         char c2;
2124     }
2125 
2126     private static final class NextCCArgs{
2127         char[] source;
2128         int next;
2129         int limit;
2130         char c1;
2131         char c2;
2132     }
2133     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2134         args.c1=args.source[args.next++];
2135         args.c2=0;
2136 
2137         if (UTF16.isTrailSurrogate(args.c1)) {
2138             /* unpaired second surrogate */
2139             return 0;
2140         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2141             return UCharacter.getCombiningClass(args.c1);
2142         } else if (args.next!=args.limit &&
2143                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2144             ++args.next;
2145             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2146         } else {
2147             /* unpaired first surrogate */
2148             args.c2=0;
2149             return 0;
2150         }
2151     }
2152     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2153         args.c1=args.src[--args.current];
2154         args.c2=0;
2155 
2156         if (args.c1 < MIN_CCC_LCCC_CP) {
2157             return 0;
2158         } else if (UTF16.isLeadSurrogate(args.c1)) {
2159             /* unpaired first surrogate */
2160             return 0;
2161         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2162             return UCharacter.getCombiningClass(args.c1);
2163         } else if (args.current!=args.start &&
2164                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2165             --args.current;
2166             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2167         } else {
2168             /* unpaired second surrogate */
2169             args.c2=0;
2170             return 0;
2171         }
2172     }
2173 
2174     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2175         if (start == p) {














2176             return 0;
2177         }
2178         return getFCD16(Character.codePointBefore(s, p));
2179     }
2180 
2181     private VersionInfo dataVersion;
2182 
2183     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2184     private int minDecompNoCP;
2185     private int minCompNoMaybeCP;
2186     private int minLcccCP;
2187 
2188     // Norm16 value thresholds for quick check combinations and types of extra data.
2189     private int minYesNo;
2190     private int minYesNoMappingsOnly;
2191     private int minNoNo;
2192     private int minNoNoCompBoundaryBefore;
2193     private int minNoNoCompNoMaybeCC;
2194     private int minNoNoEmpty;
2195     private int limitNoNo;
2196     private int centerNoNoDelta;
2197     private int minMaybeYes;
2198 
2199     private Trie2_16 normTrie;
2200     private String maybeYesCompositions;
2201     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2202     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

2203 
2204    }
< prev index next >