< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

Print this page
rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:
   1 /*
   2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


 128         public int length() { return str.length(); }  // number of chars currently held in str
 129         public int getLastCC() { return lastCC; }  // lastCC as left by the most recent append
 130 
 131         public StringBuilder getStringBuilder() { return str; }  // hands out the backing builder itself, not a copy
 132 
 133         public boolean equals(CharSequence s, int start, int limit) {
                 // True when the entire content of str matches s[start, limit) char for char;
                 // the comparison itself is delegated to UTF16Plus.equal.
 134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 135         }
 136 
 137         public void append(int c, int cc) {
                 // Adds code point c, whose combining class is cc.
                 // A starter (cc==0), or a mark whose cc is not below lastCC, is written
                 // directly at the end; anything else is handed to insert(c, cc), which is
                 // defined outside this view.
 138             if(lastCC<=cc || cc==0) {
 139                 str.appendCodePoint(c);
 140                 lastCC=cc;
 141                 if(cc<=1) {
                         // cc of 0 or 1: reorderStart moves up to the current end of str.
 142                     reorderStart=str.length();
 143                 }
 144             } else {
 145                 insert(c, cc);
 146             }
 147         }
 148         // s must be in NFD, otherwise change the implementation.
             // Appends s[start, limit). leadCC is the combining class of its first code point,
             // trailCC that of its last one. An empty range is a no-op.
 149         public void append(CharSequence s, int start, int limit,
 150                            int leadCC, int trailCC) {
 151             if(start==limit) {
 152                 return;
 153             }
 154             if(lastCC<=leadCC || leadCC==0) {
                     // Fast path: the range is copied to the end of str unchanged.
                     // reorderStart is computed before the copy, so str.length() below is
                     // still the length without the new text.
 155                 if(trailCC<=1) {
 156                     reorderStart=str.length()+(limit-start);
 157                 } else if(leadCC<=1) {
 158                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 159                 }
 160                 str.append(s, start, limit);
 161                 lastCC=trailCC;
 162             } else {
                     // Slow path: the first code point goes through insert(); every following
                     // code point goes through append(int, int) with its own combining class.
 163                 int c=Character.codePointAt(s, start);
 164                 start+=Character.charCount(c);
 165                 insert(c, leadCC);  // insert first code point
 166                 while(start<limit) {
 167                     c=Character.codePointAt(s, start);
 168                     start+=Character.charCount(c);
 169                     if(start<limit) {
 170                         // s must be in NFD, otherwise we need to use getCC().
 171                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));



 172                     } else {
                             // c is the last code point of the range: its class is the
                             // caller-supplied trailCC, no lookup needed.
 173                         leadCC=trailCC;
 174                     }
 175                     append(c, leadCC);
 176                 }
 177             }
 178         }
 179         // The following append() methods work like C++ appendZeroCC().
 180         // They assume that the cc or trailCC of their input is 0.
 181         // Most of them implement Appendable interface methods.
 182         @Override
 183         public ReorderingBuffer append(char c) {
                 // Appends a single char as if its combining class were 0: lastCC is reset
                 // to 0 and reorderStart moves to the new end of str. Returns this.
 184             str.append(c);
 185             lastCC=0;
 186             reorderStart=str.length();
 187             return this;
 188         }
 189         public void appendZeroCC(int c) {
 190             str.appendCodePoint(c);
 191             lastCC=0;


 294             codePointLimit=codePointStart;
 295             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 296         }
 297         private int previousCC() {  // Returns 0 if there is no previous character.
                 // Moves the [codePointStart, codePointLimit) window one code point backward
                 // in str and returns that code point's combining class.
                 // codePointLimit is updated even when 0 is returned early.
 298             codePointLimit=codePointStart;
 299             if(reorderStart>=codePointStart) {
                     // The walk has reached reorderStart; nothing before it is examined.
 300                 return 0;
 301             }
 302             int c=str.codePointBefore(codePointStart);
 303             codePointStart-=Character.charCount(c);
 304             return impl.getCCFromYesOrMaybeCP(c);
 305         }
 306         private int codePointStart, codePointLimit;  // char offsets in str; after previousCC() steps back, they bracket the code point just passed
 307     }
 308 
 309     // TODO: Propose as public API on the UTF16 class.
 310     // TODO: Propose widening UTF16 methods that take char to take int.
 311     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 312     public static final class UTF16Plus {
 313         /**






 314          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 315          * is it a lead surrogate?
 316          * @param c code unit or code point
 317          * @return true or false
 318          */
             // Bit 0x400 is clear for U+D800..U+DBFF (lead) and set for U+DC00..U+DFFF (trail),
             // so the answer is only meaningful when c is already known to be a surrogate.
 319         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 320 
 321         /**
 322          * Compares two CharSequence subsequences for binary equality.
 323          * @param s1 first sequence
 324          * @param start1 start offset in first sequence
 325          * @param limit1 limit offset in first sequence
 326          * @param s2 second sequence
 327          * @param start2 start offset in second sequence
 328          * @param limit2 limit offset in second sequence
 329          * @return true if s1.subSequence(start1, limit1) contains the same text
 330          *              as s2.subSequence(start2, limit2)
 331          */
 332         public static boolean equal(CharSequence s1, int start1, int limit1,
 333                                     CharSequence s2, int start2, int limit2) {
                 // Ranges of different length can never match.
 334             if((limit1-start1)!=(limit2-start2)) {
 335                 return false;
 336             }
                 // Same object and same start; the lengths were just found equal, so the two
                 // ranges are one and the same.
 337             if(s1==s2 && start1==start2) {
 338                 return true;
 339             }
                 // Compare char by char; only start1 needs a bound check because both ranges
                 // have the same length.
 340             while(start1<limit1) {
 341                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 342                     return false;
 343                 }
 344             }
 345             return true;
 346         }
 347     }
 348 
 349     public NormalizerImpl() {}  // does nothing; the data fields are filled in by load()
 350 
 351     private static final class IsAcceptable implements ICUBinary.Authenticate {
             // Accepts a data file only when the first byte of its version array is 3.
             // NOTE(review): presumably version[0] is the format major version - confirm against ICUBinary.
 352         public boolean isDataVersionAcceptable(byte version[]) {
 353             return version[0]==3;
 354         }
 355     }
 356     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();  // shared checker passed to ICUBinary.readHeaderAndDataVersion in load()
 357     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 358 
 359     public NormalizerImpl load(ByteBuffer bytes) {
             // Fills this instance from serialized normalization data and returns this.
             // The buffer is consumed strictly in order: header, int indexes, trie (plus
             // padding), 16-bit composition/mapping units, then the smallFCD table.
             // Throws InternalError when the data is too short for its own indexes/trie, or
             // when an IOException is raised while reading.
 360         try {
 361             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 362             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 363             if(indexesLength<=IX_MIN_LCCC_CP) {
 364                 throw new InternalError("Normalizer2 data: not enough indexes");
 365             }
 366             int[] inIndexes=new int[indexesLength];
                 // Slot 0 was already consumed by the getInt() above; put its value back,
                 // then read the remaining indexes from the buffer.
 367             inIndexes[0]=indexesLength*4;
 368             for(int i=1; i<indexesLength; ++i) {
 369                 inIndexes[i]=bytes.getInt();
 370             }
 371 
 372             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 373             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 374             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 375 
 376             minYesNo=inIndexes[IX_MIN_YES_NO];
 377             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 378             minNoNo=inIndexes[IX_MIN_NO_NO];
 379             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 380             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 381             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 382             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 383             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 384             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 385             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 386 
 387             // Read the normTrie.
 388             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 389             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 390             normTrie=Trie2_16.createFromSerialized(bytes);
 391             int trieLength=normTrie.getSerializedLength();

                 // The trie must fit inside the byte range the indexes reserve for it.
 392             if(trieLength>(nextOffset-offset)) {
 393                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
 394             }
 395             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 396 
 397             // Read the composition and mapping data.
 398             offset=nextOffset;
 399             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
                 // The section length is in bytes; each unit read below is one 16-bit char.
 400             int numChars=(nextOffset-offset)/2;
 401             char[] chars;
                 // When the section is empty, maybeYesCompositions and extraData are not
                 // assigned by this method.
 402             if(numChars!=0) {
 403                 chars=new char[numChars];
 404                 for(int i=0; i<numChars; ++i) {
 405                     chars[i]=bytes.getChar();
 406                 }
 407                 maybeYesCompositions=new String(chars);
                     // extraData is the tail of the same string, starting at the unit index
                     // computed from MIN_NORMAL_MAYBE_YES and minMaybeYes.
 408                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 409             }
 410 
 411             // smallFCD: new in formatVersion 2
                 // NOTE(review): offset is assigned here but not read again in this method.
 412             offset=nextOffset;
 413             smallFCD=new byte[0x100];
                 // Reads exactly 0x100 bytes into the table.
 414             bytes.get(smallFCD);
 415 
 416             return this;
 417         } catch(IOException e) {
                 // The IOException is wrapped as the cause of the InternalError.
 418             throw new InternalError(e);
 419         }
 420     }
 421     public NormalizerImpl load(String name) {
             // Convenience overload: obtains the bytes for name through
             // ICUBinary.getRequiredData and delegates to load(ByteBuffer).
 422         return load(ICUBinary.getRequiredData(name));
 423     }
 424 
 425 
 426     public int getNorm16(int c) { return normTrie.get(c); }  // value stored in normTrie for code point c




 427     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }  // norm16 in [limitNoNo, minMaybeYes)
 428     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }  // norm16 in [minNoNo, minMaybeYes)
 429     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }  // norm16 outside [minYesNo, minMaybeYes)
 430 
 431     public int getCC(int norm16) {
             // Derives the combining class from a norm16 value, by range:
             //   norm16 >= MIN_NORMAL_MAYBE_YES      -> taken from the bits of norm16 itself
             //   norm16 < minNoNo, or >= limitNoNo   -> 0
             //   otherwise, [minNoNo, limitNoNo)     -> getCCFromNoNo (defined outside this view)
 432         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 433             return getCCFromNormalYesOrMaybe(norm16);
 434         }
 435         if(norm16<minNoNo || limitNoNo<=norm16) {
 436             return 0;
 437         }
 438         return getCCFromNoNo(norm16);
 439     }
 440     public static int getCCFromNormalYesOrMaybe(int norm16) {
             // Drops the low OFFSET_SHIFT bits and keeps the next 8 bits (result 0..255).
             // No range check is done here; callers in this file test
             // norm16>=MIN_NORMAL_MAYBE_YES first.
 441         return (norm16 >> OFFSET_SHIFT) & 0xff;
 442     }
 443     public static int getCCFromYesOrMaybe(int norm16) {
             // Range-checked variant: anything below MIN_NORMAL_MAYBE_YES yields 0.
 444         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 445     }
 446     public int getCCFromYesOrMaybeCP(int c) {


 469         return ((bits>>((lead>>5)&7))&1)!=0;
 470     }
 471 
 472     /** Gets the FCD value from the regular normalization data. */
         // The result packs two combining classes: lccc in bits 15..8, tccc in bits 7..0.
 473     public int getFCD16FromNormData(int c) {
 474         int norm16=getNorm16(c);
 475         if (norm16 >= limitNoNo) {
 476             if(norm16>=MIN_NORMAL_MAYBE_YES) {
 477                 // combining mark
                     // The same class is used for both halves of the result.
 478                 norm16=getCCFromNormalYesOrMaybe(norm16);
 479                 return norm16|(norm16<<8);
 480             } else if(norm16>=minMaybeYes) {
 481                 return 0;
 482             } else {  // isDecompNoAlgorithmic(norm16)
 483                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 484                 if (deltaTrailCC <= DELTA_TCCC_1) {
                         // The masked bits, shifted down, are returned directly.
 485                     return deltaTrailCC >> OFFSET_SHIFT;
 486                 }
 487                 // Maps to an isCompYesAndZeroCC.
                     // Continue below with the data of the mapped code point.
 488                 c=mapAlgorithmic(c, norm16);
 489                 norm16=getNorm16(c);
 490             }
 491         }
 492         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 493             // no decomposition or Hangul syllable, all zeros
 494             return 0;
 495         }
 496         // c decomposes, get everything from the variable-length extra data
 497         int mapping=norm16>>OFFSET_SHIFT;
 498         int firstUnit=extraData.charAt(mapping);
 499         int fcd16=firstUnit>>8;  // tccc
 500         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                 // When the flag is set, the unit just before the mapping holds lccc in
                 // its high byte.
 501             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 502         }
 503         return fcd16;
 504     }
 505 
 506     /**
 507      * Gets the decomposition for one code point.
 508      * @param c code point
 509      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 510      */
 511     public String getDecomposition(int c) {
 512         int norm16;
             // Short-circuit: norm16 is only looked up when c is at or above minDecompNoCP.
 513         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 514             // c does not decompose
 515             return null;
 516         }
             // decomp stays -1 unless an algorithmic mapping is applied below.
 517         int decomp = -1;
 518         if(isDecompNoAlgorithmic(norm16)) {
 519             // Maps to an isCompYesAndZeroCC.
 520             decomp=c=mapAlgorithmic(c, norm16);
 521             // The mapping might decompose further.
 522             norm16 = getNorm16(c);
 523         }
 524         if (norm16 < minYesNo) {
                 // No stored mapping for c: the result is the algorithmically mapped
                 // code point if there was one, otherwise null.
 525             if(decomp<0) {
 526                 return null;
 527             } else {
 528                 return UTF16.valueOf(decomp);
 529             }
 530         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 531             // Hangul syllable: decompose algorithmically
 532             StringBuilder buffer=new StringBuilder();
 533             Hangul.decompose(c, buffer);
 534             return buffer.toString();
 535         }
 536         // c decomposes, get everything from the variable-length extra data
 537         int mapping=norm16>>OFFSET_SHIFT;
             // The first unit carries the mapping length; the mapping text follows it.
 538         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 539         return extraData.substring(mapping, mapping+length);
 540     }
 541 
 542     // Fixed norm16 values.


 624 
 625     // Dual functionality:
 626     // buffer!=null: normalize
 627     // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
 628     public int decompose(CharSequence s, int src, int limit,
 629                          ReorderingBuffer buffer) {
 630         int minNoCP=minDecompNoCP;
 631 
 632         int prevSrc;
 633         int c=0;
 634         int norm16=0;
 635 
 636         // only for quick check
 637         int prevBoundary=src;
 638         int prevCC=0;
 639 
 640         for(;;) {
 641             // count code units below the minimum or with irrelevant data for the quick check
 642             for(prevSrc=src; src!=limit;) {
 643                 if( (c=s.charAt(src))<minNoCP ||
 644                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 645                 ) {
 646                     ++src;
 647                 } else if(!UTF16.isSurrogate((char)c)) {
 648                     break;
 649                 } else {
 650                     char c2;
 651                     if(UTF16Plus.isSurrogateLead(c)) {
 652                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
 653                             c=Character.toCodePoint((char)c, c2);
 654                         }
 655                     } else /* trail surrogate */ {
 656                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
 657                             --src;
 658                             c=Character.toCodePoint(c2, (char)c);
 659                         }
 660                     }
 661                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
 662                         src+=Character.charCount(c);
 663                     } else {
 664                         break;
 665                     }



 666                 }
 667             }
 668             // copy these code units all at once
 669             if(src!=prevSrc) {
 670                 if(buffer!=null) {
 671                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
 672                 } else {
 673                     prevCC=0;
 674                     prevBoundary=src;
 675                 }
 676             }
 677             if(src==limit) {
 678                 break;
 679             }
 680 
 681             // Check one above-minimum, relevant code point.
 682             src+=Character.charCount(c);
 683             if(buffer!=null) {
 684                 decompose(c, norm16, buffer);
 685             } else {


 704             return;
 705         }
 706         if(doDecompose) {
 707             decompose(s, 0, limit, buffer);
 708             return;
 709         }
 710         // Just merge the strings at the boundary.
 711         int c=Character.codePointAt(s, 0);
 712         int src=0;
 713         int firstCC, prevCC, cc;
 714         firstCC=prevCC=cc=getCC(getNorm16(c));
 715         while(cc!=0) {
 716             prevCC=cc;
 717             src+=Character.charCount(c);
 718             if(src>=limit) {
 719                 break;
 720             }
 721             c=Character.codePointAt(s, src);
 722             cc=getCC(getNorm16(c));
 723         };
 724         buffer.append(s, 0, src, firstCC, prevCC);
 725         buffer.append(s, src, limit);
 726     }
 727 
 728     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 729     // doCompose: normalize
 730     // !doCompose: isNormalized (buffer must be empty and initialized)
 731     public boolean compose(CharSequence s, int src, int limit,
 732                            boolean onlyContiguous,
 733                            boolean doCompose,
 734                            ReorderingBuffer buffer) {
 735         int prevBoundary=src;
 736         int minNoMaybeCP=minCompNoMaybeCP;
 737 
 738         for (;;) {
 739             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 740             // or with (compYes && ccc==0) properties.
 741             int prevSrc;
 742             int c = 0;
 743             int norm16 = 0;
 744             for (;;) {
 745                 if (src == limit) {
 746                     if (prevBoundary != limit && doCompose) {
 747                         buffer.append(s, prevBoundary, limit);
 748                     }
 749                     return true;
 750                 }
 751                 if( (c=s.charAt(src))<minNoMaybeCP ||
 752                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 753                 ) {
 754                     ++src;
 755                 } else {
 756                     prevSrc = src++;
 757                     if(!UTF16.isSurrogate((char)c)) {
 758                         break;
 759                     } else {
 760                         char c2;
 761                         if(UTF16Plus.isSurrogateLead(c)) {
 762                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
 763                                 ++src;
 764                                 c=Character.toCodePoint((char)c, c2);
 765                             }
 766                         } else /* trail surrogate */ {
 767                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
 768                                 --prevSrc;
 769                                 c=Character.toCodePoint(c2, (char)c);
 770                             }
 771                         }
 772                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
 773                             break;
 774                         }
 775                     }
 776                 }
 777             }

 778             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 779             // The current character is either a "noNo" (has a mapping)
 780             // or a "maybeYes" (combines backward)
 781             // or a "yesYes" with ccc!=0.
 782             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 783 
 784             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 785             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 786                 if (!doCompose) {
 787                     return false;
 788                 }
 789                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 790                 // In this case, we need not decompose around the current character.
 791                 if (isDecompNoAlgorithmic(norm16)) {
 792                     // Maps to a single isCompYesAndZeroCC character
 793                     // which also implies hasCompBoundaryBefore.
 794                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 795                             hasCompBoundaryBefore(s, src, limit)) {
 796                         if (prevBoundary != prevSrc) {
 797                             buffer.append(s, prevBoundary, prevSrc);


 974      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 975      *         then the quick check result is "no"
 976      */
 977     public int composeQuickCheck(CharSequence s, int src, int limit,
 978                                  boolean onlyContiguous, boolean doSpan) {
 979         int qcResult=0;
 980         int prevBoundary=src;
 981         int minNoMaybeCP=minCompNoMaybeCP;
 982 
 983         for(;;) {
 984             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 985             // or with (compYes && ccc==0) properties.
 986             int prevSrc;
 987             int c = 0;
 988             int norm16 = 0;
 989             for (;;) {
 990                 if(src==limit) {
 991                     return (src<<1)|qcResult;  // "yes" or "maybe"
 992                 }
 993                 if( (c=s.charAt(src))<minNoMaybeCP ||
 994                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
 995                 ) {
 996                     ++src;
 997                 } else {
 998                     prevSrc = src++;
 999                     if(!UTF16.isSurrogate((char)c)) {
1000                         break;
1001                     } else {
1002                         char c2;
1003                         if(UTF16Plus.isSurrogateLead(c)) {
1004                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005                                 ++src;
1006                                 c=Character.toCodePoint((char)c, c2);
1007                             }
1008                         } else /* trail surrogate */ {
1009                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010                                 --prevSrc;
1011                                 c=Character.toCodePoint(c2, (char)c);
1012                             }
1013                         }
1014                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1015                             break;
1016                         }
1017                     }
1018                 }
1019             }

1020             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021             // The current character is either a "noNo" (has a mapping)
1022             // or a "maybeYes" (combines backward)
1023             // or a "yesYes" with ccc!=0.
1024             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025 
1026             int prevNorm16 = INERT;
1027             if (prevBoundary != prevSrc) {
1028                 prevBoundary = prevSrc;
1029                 if (!norm16HasCompBoundaryBefore(norm16)) {
1030                     c = Character.codePointBefore(s, prevSrc);
1031                     int n16 = getNorm16(c);
1032                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033                         prevBoundary -= Character.charCount(c);
1034                         prevNorm16 = n16;
1035                     }
1036                 }
1037             }
1038 
1039             if(isMaybeOrNonZeroCC(norm16)) {


1117         // in the normal way.
1118 
1119         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120         // Similar to the prevBoundary in the compose() implementation.
1121         int prevBoundary=src;
1122         int prevSrc;
1123         int c=0;
1124         int prevFCD16=0;
1125         int fcd16=0;
1126 
1127         for(;;) {
1128             // count code units with lccc==0
1129             for(prevSrc=src; src!=limit;) {
1130                 if((c=s.charAt(src))<minLcccCP) {
1131                     prevFCD16=~c;
1132                     ++src;
1133                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134                     prevFCD16=0;
1135                     ++src;
1136                 } else {
1137                     if(UTF16.isSurrogate((char)c)) {
1138                         char c2;
1139                         if(UTF16Plus.isSurrogateLead(c)) {
1140                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141                                 c=Character.toCodePoint((char)c, c2);
1142                             }
1143                         } else /* trail surrogate */ {
1144                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145                                 --src;
1146                                 c=Character.toCodePoint(c2, (char)c);
1147                             }
1148                         }
1149                     }
1150                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151                         prevFCD16=fcd16;
1152                         src+=Character.charCount(c);
1153                     } else {
1154                         break;
1155                     }
1156                 }
1157             }
1158             // copy these code units all at once
1159             if(src!=prevSrc) {
1160                 if(src==limit) {
1161                     if(buffer!=null) {
1162                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163                     }
1164                     break;
1165                 }
1166                 prevBoundary=src;
1167                 // We know that the previous character's lccc==0.


1413             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414                 return src;
1415             }
1416             src+=Character.charCount(c);
1417             decompose(c, norm16, buffer);
1418             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419                 return src;
1420             }
1421         }
1422         return src;
1423     }
1424     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
             // Writes the decomposition of the single code point c (whose trie value is
             // norm16) into buffer, using whichever of the four cases below applies.
1425         // get the decomposition and the lead and trail cc's
1426         if (norm16 >= limitNoNo) {
1427             if (isMaybeOrNonZeroCC(norm16)) {
                     // c is kept as is and appended with its own combining class.
1428                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429                 return;
1430             }
1431             // Maps to an isCompYesAndZeroCC.
                 // Continue below with the mapped code point and its trie value.
1432             c=mapAlgorithmic(c, norm16);
1433             norm16=getNorm16(c);
1434         }
1435         if (norm16 < minYesNo) {
1436             // c does not decompose
1437             buffer.append(c, 0);
1438         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439             // Hangul syllable: decompose algorithmically
1440             Hangul.decompose(c, buffer);
1441         } else {
1442             // c decomposes, get everything from the variable-length extra data
1443             int mapping=norm16>>OFFSET_SHIFT;
1444             int firstUnit=extraData.charAt(mapping);
1445             int length=firstUnit&MAPPING_LENGTH_MASK;
1446             int leadCC, trailCC;
                 // trailCC is the high byte of the first unit. leadCC is the high byte of
                 // the preceding unit when the flag is set, otherwise 0.
1447             trailCC=firstUnit>>8;
1448             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449                 leadCC=extraData.charAt(mapping-1)>>8;
1450             } else {
1451                 leadCC=0;
1452             }
1453             ++mapping;  // skip over the firstUnit
1454             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1455         }
1456     }
1457 
1458     /**
1459      * Finds the recomposition result for
1460      * a forward-combining "lead" character,
1461      * specified with a pointer to its compositions list,
1462      * and a backward-combining "trail" character.
1463      *
1464      * <p>If the lead and trail characters combine, then this function returns
1465      * the following "compositeAndFwd" value:
1466      * <pre>
1467      * Bits 21..1  composite character
1468      * Bit      0  set if the composite is a forward-combining starter
1469      * </pre>
1470      * otherwise it returns -1.
1471      *
1472      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473      * encoded as either pairs or triples of 16-bit units.
1474      * The last entry has the high bit of its first unit set.


1626                     } else if(composite>0xffff) {
1627                         // The composite is longer than the starter,
1628                         // move the intermediate characters back one.
1629                         starterIsSupplementary=true;
1630                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1631                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1632                         ++p;
1633                     } else {
1634                         // both are on the BMP
1635                         sb.setCharAt(starter, (char)composite);
1636                     }
1637 
1638                     // Keep prevCC because we removed the combining mark.
1639 
1640                     if(p==sb.length()) {
1641                         break;
1642                     }
1643                     // Is the composite a starter that combines forward?
1644                     if((compositeAndFwd&1)!=0) {
1645                         compositionsList=
1646                             getCompositionsListForComposite(getNorm16(composite));
1647                     } else {
1648                         compositionsList=-1;
1649                     }
1650 
1651                     // We combined; continue with looking for compositions.
1652                     continue;
1653                 }
1654             }
1655 
1656             // no combination this time
1657             prevCC=cc;
1658             if(p==sb.length()) {
1659                 break;
1660             }
1661 
1662             // If c did not combine, then check if it is a starter.
1663             if(cc==0) {
1664                 // Found a new starter.
1665                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1666                     // It may combine with something, prepare for it.


2179     }
2180 
2181     private VersionInfo dataVersion;  // set by load() from ICUBinary.readHeaderAndDataVersion
2182 
2183     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
         // All three are copied from the data file's index array in load().
2184     private int minDecompNoCP;
2185     private int minCompNoMaybeCP;
2186     private int minLcccCP;
2187 
2188     // Norm16 value thresholds for quick check combinations and types of extra data.
         // Copied from the index array in load(), except centerNoNoDelta, which load()
         // computes as (minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1.
2189     private int minYesNo;
2190     private int minYesNoMappingsOnly;
2191     private int minNoNo;
2192     private int minNoNoCompBoundaryBefore;
2193     private int minNoNoCompNoMaybeCC;
2194     private int minNoNoEmpty;
2195     private int limitNoNo;
2196     private int centerNoNoDelta;
2197     private int minMaybeYes;
2198 
2199     private Trie2_16 normTrie;
         // maybeYesCompositions holds every 16-bit unit of the composition/mapping section;
         // extraData is a substring of it. load() leaves both unassigned when that section
         // is empty.
2200     private String maybeYesCompositions;
2201     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2202     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2203 
2204    }
   1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


 128         public int length() { return str.length(); }
 129         public int getLastCC() { return lastCC; }
 130 
 131         public StringBuilder getStringBuilder() { return str; }
 132 
 133         public boolean equals(CharSequence s, int start, int limit) {
 134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
 135         }
 136 
 137         public void append(int c, int cc) {
 138             if(lastCC<=cc || cc==0) {
 139                 str.appendCodePoint(c);
 140                 lastCC=cc;
 141                 if(cc<=1) {
 142                     reorderStart=str.length();
 143                 }
 144             } else {
 145                 insert(c, cc);
 146             }
 147         }
 148         public void append(CharSequence s, int start, int limit, boolean isNFD,

 149                            int leadCC, int trailCC) {
 150             if(start==limit) {
 151                 return;
 152             }
 153             if(lastCC<=leadCC || leadCC==0) {
 154                 if(trailCC<=1) {
 155                     reorderStart=str.length()+(limit-start);
 156                 } else if(leadCC<=1) {
 157                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
 158                 }
 159                 str.append(s, start, limit);
 160                 lastCC=trailCC;
 161             } else {
 162                 int c=Character.codePointAt(s, start);
 163                 start+=Character.charCount(c);
 164                 insert(c, leadCC);  // insert first code point
 165                 while(start<limit) {
 166                     c=Character.codePointAt(s, start);
 167                     start+=Character.charCount(c);
 168                     if(start<limit) {
 169                         if (isNFD) {
 170                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
 171                         } else {
 172                             leadCC = impl.getCC(impl.getNorm16(c));
 173                         }
 174                     } else {
 175                         leadCC=trailCC;
 176                     }
 177                     append(c, leadCC);
 178                 }
 179             }
 180         }
 181         // The following append() methods work like C++ appendZeroCC().
 182         // They assume that the cc or trailCC of their input is 0.
 183         // Most of them implement Appendable interface methods.
 184         @Override
 185         public ReorderingBuffer append(char c) {
 186             str.append(c);
 187             lastCC=0;
 188             reorderStart=str.length();
 189             return this;
 190         }
 191         public void appendZeroCC(int c) {
 192             str.appendCodePoint(c);
 193             lastCC=0;


 296             codePointLimit=codePointStart;
 297             codePointStart=str.offsetByCodePoints(codePointStart, -1);
 298         }
 299         private int previousCC() {  // Returns 0 if there is no previous character.
 300             codePointLimit=codePointStart;
 301             if(reorderStart>=codePointStart) {
 302                 return 0;
 303             }
 304             int c=str.codePointBefore(codePointStart);
 305             codePointStart-=Character.charCount(c);
 306             return impl.getCCFromYesOrMaybeCP(c);
 307         }
 308         private int codePointStart, codePointLimit;
 309     }
 310 
 311     // TODO: Propose as public API on the UTF16 class.
 312     // TODO: Propose widening UTF16 methods that take char to take int.
 313     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
 314     public static final class UTF16Plus {
 315         /**
 316          * Is this code point a lead surrogate (U+d800..U+dbff)?
 317          * @param c code unit or code point
 318          * @return true or false
 319          */
 320         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
 321         /**
 322          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
 323          * is it a lead surrogate?
 324          * @param c code unit or code point
 325          * @return true or false
 326          */
 327         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
 328 
 329         /**
 330          * Compares two CharSequence subsequences for binary equality.
 331          * @param s1 first sequence
 332          * @param start1 start offset in first sequence
 333          * @param limit1 limit offset in first sequence
 334          * @param s2 second sequence
 335          * @param start2 start offset in second sequence
 336          * @param limit2 limit offset in second sequence
 337          * @return true if s1.subSequence(start1, limit1) contains the same text
 338          *              as s2.subSequence(start2, limit2)
 339          */
 340         public static boolean equal(CharSequence s1, int start1, int limit1,
 341                                     CharSequence s2, int start2, int limit2) {
 342             if((limit1-start1)!=(limit2-start2)) {
 343                 return false;
 344             }
 345             if(s1==s2 && start1==start2) {
 346                 return true;
 347             }
 348             while(start1<limit1) {
 349                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
 350                     return false;
 351                 }
 352             }
 353             return true;
 354         }
 355     }
 356 
 357     public NormalizerImpl() {}
 358 
 359     private static final class IsAcceptable implements ICUBinary.Authenticate {
 360         public boolean isDataVersionAcceptable(byte version[]) {
 361             return version[0]==4;
 362         }
 363     }
 364     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
 365     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
 366 
 367     public NormalizerImpl load(ByteBuffer bytes) {
 368         try {
 369             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
 370             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
 371             if(indexesLength<=IX_MIN_LCCC_CP) {
 372                 throw new InternalError("Normalizer2 data: not enough indexes");
 373             }
 374             int[] inIndexes=new int[indexesLength];
 375             inIndexes[0]=indexesLength*4;
 376             for(int i=1; i<indexesLength; ++i) {
 377                 inIndexes[i]=bytes.getInt();
 378             }
 379 
 380             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
 381             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
 382             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
 383 
 384             minYesNo=inIndexes[IX_MIN_YES_NO];
 385             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
 386             minNoNo=inIndexes[IX_MIN_NO_NO];
 387             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
 388             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
 389             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
 390             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
 391             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
 392             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
 393             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
 394 
 395             // Read the normTrie.
 396             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
 397             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
 398             int triePosition = bytes.position();
 399             normTrie = CodePointTrie.Fast16.fromBinary(bytes);
 400             int trieLength = bytes.position() - triePosition;
 401             if(trieLength>(nextOffset-offset)) {
 402                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
 403             }
 404             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
 405 
 406             // Read the composition and mapping data.
 407             offset=nextOffset;
 408             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
 409             int numChars=(nextOffset-offset)/2;

 410             if(numChars!=0) {
 411                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);




 412                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
 413             }
 414 
 415             // smallFCD: new in formatVersion 2
 416             offset=nextOffset;
 417             smallFCD=new byte[0x100];
 418             bytes.get(smallFCD);
 419 
 420             return this;
 421         } catch(IOException e) {
 422             throw new InternalError(e);
 423         }
 424     }
 425     public NormalizerImpl load(String name) {
 426         return load(ICUBinary.getRequiredData(name));
 427     }
 428 
 429     // The trie stores values for lead surrogate code *units*.
 430     // Surrogate code *points* are inert.
 431     public int getNorm16(int c) {
 432         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
 433     }
 434     public int getRawNorm16(int c) { return normTrie.get(c); }
 435     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
 436     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
 437     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
 438 
 439     public int getCC(int norm16) {
 440         if(norm16>=MIN_NORMAL_MAYBE_YES) {
 441             return getCCFromNormalYesOrMaybe(norm16);
 442         }
 443         if(norm16<minNoNo || limitNoNo<=norm16) {
 444             return 0;
 445         }
 446         return getCCFromNoNo(norm16);
 447     }
 448     public static int getCCFromNormalYesOrMaybe(int norm16) {
 449         return (norm16 >> OFFSET_SHIFT) & 0xff;
 450     }
 451     public static int getCCFromYesOrMaybe(int norm16) {
 452         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
 453     }
 454     public int getCCFromYesOrMaybeCP(int c) {


 477         return ((bits>>((lead>>5)&7))&1)!=0;
 478     }
 479 
 480     /** Gets the FCD value from the regular normalization data. */
 481     public int getFCD16FromNormData(int c) {
 482         int norm16=getNorm16(c);
 483         if (norm16 >= limitNoNo) {
 484             if(norm16>=MIN_NORMAL_MAYBE_YES) {
 485                 // combining mark
 486                 norm16=getCCFromNormalYesOrMaybe(norm16);
 487                 return norm16|(norm16<<8);
 488             } else if(norm16>=minMaybeYes) {
 489                 return 0;
 490             } else {  // isDecompNoAlgorithmic(norm16)
 491                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
 492                 if (deltaTrailCC <= DELTA_TCCC_1) {
 493                     return deltaTrailCC >> OFFSET_SHIFT;
 494                 }
 495                 // Maps to an isCompYesAndZeroCC.
 496                 c=mapAlgorithmic(c, norm16);
 497                 norm16=getRawNorm16(c);
 498             }
 499         }
 500         if(norm16<=minYesNo || isHangulLVT(norm16)) {
 501             // no decomposition or Hangul syllable, all zeros
 502             return 0;
 503         }
 504         // c decomposes, get everything from the variable-length extra data
 505         int mapping=norm16>>OFFSET_SHIFT;
 506         int firstUnit=extraData.charAt(mapping);
 507         int fcd16=firstUnit>>8;  // tccc
 508         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
 509             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
 510         }
 511         return fcd16;
 512     }
 513 
 514     /**
 515      * Gets the decomposition for one code point.
 516      * @param c code point
 517      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
 518      */
 519     public String getDecomposition(int c) {
 520         int norm16;
 521         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
 522             // c does not decompose
 523             return null;
 524         }
 525         int decomp = -1;
 526         if(isDecompNoAlgorithmic(norm16)) {
 527             // Maps to an isCompYesAndZeroCC.
 528             decomp=c=mapAlgorithmic(c, norm16);
 529             // The mapping might decompose further.
 530             norm16 = getRawNorm16(c);
 531         }
 532         if (norm16 < minYesNo) {
 533             if(decomp<0) {
 534                 return null;
 535             } else {
 536                 return UTF16.valueOf(decomp);
 537             }
 538         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
 539             // Hangul syllable: decompose algorithmically
 540             StringBuilder buffer=new StringBuilder();
 541             Hangul.decompose(c, buffer);
 542             return buffer.toString();
 543         }
 544         // c decomposes, get everything from the variable-length extra data
 545         int mapping=norm16>>OFFSET_SHIFT;
 546         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
 547         return extraData.substring(mapping, mapping+length);
 548     }
 549 
 550     // Fixed norm16 values.


 632 
 633     // Dual functionality:
 634     // buffer!=NULL: normalize
 635     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
 636     public int decompose(CharSequence s, int src, int limit,
 637                          ReorderingBuffer buffer) {
 638         int minNoCP=minDecompNoCP;
 639 
 640         int prevSrc;
 641         int c=0;
 642         int norm16=0;
 643 
 644         // only for quick check
 645         int prevBoundary=src;
 646         int prevCC=0;
 647 
 648         for(;;) {
 649             // count code units below the minimum or with irrelevant data for the quick check
 650             for(prevSrc=src; src!=limit;) {
 651                 if( (c=s.charAt(src))<minNoCP ||
 652                     isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 653                 ) {
 654                     ++src;
 655                 } else if(!UTF16Plus.isLeadSurrogate(c)) {
 656                     break;
 657                 } else {
 658                     char c2;
 659                     if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
 660                         c = Character.toCodePoint((char)c, c2);
 661                         norm16 = normTrie.suppGet(c);
 662                         if (isMostDecompYesAndZeroCC(norm16)) {
 663                             src += 2;







 664                         } else {
 665                             break;
 666                         }
 667                     } else {
 668                         ++src;  // unpaired lead surrogate: inert
 669                     }
 670                 }
 671             }
 672             // copy these code units all at once
 673             if(src!=prevSrc) {
 674                 if(buffer!=null) {
 675                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
 676                 } else {
 677                     prevCC=0;
 678                     prevBoundary=src;
 679                 }
 680             }
 681             if(src==limit) {
 682                 break;
 683             }
 684 
 685             // Check one above-minimum, relevant code point.
 686             src+=Character.charCount(c);
 687             if(buffer!=null) {
 688                 decompose(c, norm16, buffer);
 689             } else {


 708             return;
 709         }
 710         if(doDecompose) {
 711             decompose(s, 0, limit, buffer);
 712             return;
 713         }
 714         // Just merge the strings at the boundary.
 715         int c=Character.codePointAt(s, 0);
 716         int src=0;
 717         int firstCC, prevCC, cc;
 718         firstCC=prevCC=cc=getCC(getNorm16(c));
 719         while(cc!=0) {
 720             prevCC=cc;
 721             src+=Character.charCount(c);
 722             if(src>=limit) {
 723                 break;
 724             }
 725             c=Character.codePointAt(s, src);
 726             cc=getCC(getNorm16(c));
 727         };
 728         buffer.append(s, 0, src, false, firstCC, prevCC);
 729         buffer.append(s, src, limit);
 730     }
 731 
 732     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
 733     // doCompose: normalize
 734     // !doCompose: isNormalized (buffer must be empty and initialized)
 735     public boolean compose(CharSequence s, int src, int limit,
 736                            boolean onlyContiguous,
 737                            boolean doCompose,
 738                            ReorderingBuffer buffer) {
 739         int prevBoundary=src;
 740         int minNoMaybeCP=minCompNoMaybeCP;
 741 
 742         for (;;) {
 743             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 744             // or with (compYes && ccc==0) properties.
 745             int prevSrc;
 746             int c = 0;
 747             int norm16 = 0;
 748             for (;;) {
 749                 if (src == limit) {
 750                     if (prevBoundary != limit && doCompose) {
 751                         buffer.append(s, prevBoundary, limit);
 752                     }
 753                     return true;
 754                 }
 755                 if( (c=s.charAt(src))<minNoMaybeCP ||
 756                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 757                 ) {
 758                     ++src;
 759                 } else {
 760                     prevSrc = src++;
 761                     if (!UTF16Plus.isLeadSurrogate(c)) {
 762                         break;
 763                     } else {
 764                         char c2;
 765                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {

 766                             ++src;
 767                             c = Character.toCodePoint((char)c, c2);
 768                             norm16 = normTrie.suppGet(c);
 769                             if (!isCompYesAndZeroCC(norm16)) {






 770                                 break;
 771                             }
 772                         }
 773                     }
 774                 }
 775             }
 776             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 777             // The current character is either a "noNo" (has a mapping)
 778             // or a "maybeYes" (combines backward)
 779             // or a "yesYes" with ccc!=0.
 780             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
 781 
 782             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
 783             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
 784                 if (!doCompose) {
 785                     return false;
 786                 }
 787                 // Fast path for mapping a character that is immediately surrounded by boundaries.
 788                 // In this case, we need not decompose around the current character.
 789                 if (isDecompNoAlgorithmic(norm16)) {
 790                     // Maps to a single isCompYesAndZeroCC character
 791                     // which also implies hasCompBoundaryBefore.
 792                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
 793                             hasCompBoundaryBefore(s, src, limit)) {
 794                         if (prevBoundary != prevSrc) {
 795                             buffer.append(s, prevBoundary, prevSrc);


 972      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 973      *         then the quick check result is "no"
 974      */
 975     public int composeQuickCheck(CharSequence s, int src, int limit,
 976                                  boolean onlyContiguous, boolean doSpan) {
 977         int qcResult=0;
 978         int prevBoundary=src;
 979         int minNoMaybeCP=minCompNoMaybeCP;
 980 
 981         for(;;) {
 982             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
 983             // or with (compYes && ccc==0) properties.
 984             int prevSrc;
 985             int c = 0;
 986             int norm16 = 0;
 987             for (;;) {
 988                 if(src==limit) {
 989                     return (src<<1)|qcResult;  // "yes" or "maybe"
 990                 }
 991                 if( (c=s.charAt(src))<minNoMaybeCP ||
 992                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
 993                 ) {
 994                     ++src;
 995                 } else {
 996                     prevSrc = src++;
 997                     if (!UTF16Plus.isLeadSurrogate(c)) {
 998                         break;
 999                     } else {
1000                         char c2;
1001                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {

1002                             ++src;
1003                             c = Character.toCodePoint((char)c, c2);
1004                             norm16 = normTrie.suppGet(c);
1005                             if (!isCompYesAndZeroCC(norm16)) {






1006                                 break;
1007                             }
1008                         }
1009                     }
1010                 }
1011             }
1012             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1013             // The current character is either a "noNo" (has a mapping)
1014             // or a "maybeYes" (combines backward)
1015             // or a "yesYes" with ccc!=0.
1016             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1017 
1018             int prevNorm16 = INERT;
1019             if (prevBoundary != prevSrc) {
1020                 prevBoundary = prevSrc;
1021                 if (!norm16HasCompBoundaryBefore(norm16)) {
1022                     c = Character.codePointBefore(s, prevSrc);
1023                     int n16 = getNorm16(c);
1024                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1025                         prevBoundary -= Character.charCount(c);
1026                         prevNorm16 = n16;
1027                     }
1028                 }
1029             }
1030 
1031             if(isMaybeOrNonZeroCC(norm16)) {


1109         // in the normal way.
1110 
1111         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1112         // Similar to the prevBoundary in the compose() implementation.
1113         int prevBoundary=src;
1114         int prevSrc;
1115         int c=0;
1116         int prevFCD16=0;
1117         int fcd16=0;
1118 
1119         for(;;) {
1120             // count code units with lccc==0
1121             for(prevSrc=src; src!=limit;) {
1122                 if((c=s.charAt(src))<minLcccCP) {
1123                     prevFCD16=~c;
1124                     ++src;
1125                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1126                     prevFCD16=0;
1127                     ++src;
1128                 } else {
1129                     if (UTF16Plus.isLeadSurrogate(c)) {
1130                         char c2;
1131                         if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1132                             c = Character.toCodePoint((char)c, c2);







1133                         }
1134                     }
1135                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1136                         prevFCD16=fcd16;
1137                         src+=Character.charCount(c);
1138                     } else {
1139                         break;
1140                     }
1141                 }
1142             }
1143             // copy these code units all at once
1144             if(src!=prevSrc) {
1145                 if(src==limit) {
1146                     if(buffer!=null) {
1147                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1148                     }
1149                     break;
1150                 }
1151                 prevBoundary=src;
1152                 // We know that the previous character's lccc==0.


1398             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1399                 return src;
1400             }
1401             src+=Character.charCount(c);
1402             decompose(c, norm16, buffer);
1403             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1404                 return src;
1405             }
1406         }
1407         return src;
1408     }
1409     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1410         // get the decomposition and the lead and trail cc's
1411         if (norm16 >= limitNoNo) {
1412             if (isMaybeOrNonZeroCC(norm16)) {
1413                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1414                 return;
1415             }
1416             // Maps to an isCompYesAndZeroCC.
1417             c=mapAlgorithmic(c, norm16);
1418             norm16=getRawNorm16(c);
1419         }
1420         if (norm16 < minYesNo) {
1421             // c does not decompose
1422             buffer.append(c, 0);
1423         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1424             // Hangul syllable: decompose algorithmically
1425             Hangul.decompose(c, buffer);
1426         } else {
1427             // c decomposes, get everything from the variable-length extra data
1428             int mapping=norm16>>OFFSET_SHIFT;
1429             int firstUnit=extraData.charAt(mapping);
1430             int length=firstUnit&MAPPING_LENGTH_MASK;
1431             int leadCC, trailCC;
1432             trailCC=firstUnit>>8;
1433             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1434                 leadCC=extraData.charAt(mapping-1)>>8;
1435             } else {
1436                 leadCC=0;
1437             }
1438             ++mapping;  // skip over the firstUnit
1439             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1440         }
1441     }
1442 
1443     /**
1444      * Finds the recomposition result for
1445      * a forward-combining "lead" character,
1446      * specified with a pointer to its compositions list,
1447      * and a backward-combining "trail" character.
1448      *
1449      * <p>If the lead and trail characters combine, then this function returns
1450      * the following "compositeAndFwd" value:
1451      * <pre>
1452      * Bits 21..1  composite character
1453      * Bit      0  set if the composite is a forward-combining starter
1454      * </pre>
1455      * otherwise it returns -1.
1456      *
1457      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1458      * encoded as either pairs or triples of 16-bit units.
1459      * The last entry has the high bit of its first unit set.


1611                     } else if(composite>0xffff) {
1612                         // The composite is longer than the starter,
1613                         // move the intermediate characters back one.
1614                         starterIsSupplementary=true;
1615                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1616                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1617                         ++p;
1618                     } else {
1619                         // both are on the BMP
1620                         sb.setCharAt(starter, (char)composite);
1621                     }
1622 
1623                     // Keep prevCC because we removed the combining mark.
1624 
1625                     if(p==sb.length()) {
1626                         break;
1627                     }
1628                     // Is the composite a starter that combines forward?
1629                     if((compositeAndFwd&1)!=0) {
1630                         compositionsList=
1631                             getCompositionsListForComposite(getRawNorm16(composite));
1632                     } else {
1633                         compositionsList=-1;
1634                     }
1635 
1636                     // We combined; continue with looking for compositions.
1637                     continue;
1638                 }
1639             }
1640 
1641             // no combination this time
1642             prevCC=cc;
1643             if(p==sb.length()) {
1644                 break;
1645             }
1646 
1647             // If c did not combine, then check if it is a starter.
1648             if(cc==0) {
1649                 // Found a new starter.
1650                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1651                     // It may combine with something, prepare for it.


2164     }
2165 
2166     private VersionInfo dataVersion;  // result of ICUBinary.readHeaderAndDataVersion() in load()
2167 
2168     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2169     private int minDecompNoCP;
2170     private int minCompNoMaybeCP;
2171     private int minLcccCP;
2172 
2173     // Norm16 value thresholds for quick check combinations and types of extra data.
2174     private int minYesNo;
2175     private int minYesNoMappingsOnly;
2176     private int minNoNo;
2177     private int minNoNoCompBoundaryBefore;
2178     private int minNoNoCompNoMaybeCC;
2179     private int minNoNoEmpty;
2180     private int limitNoNo;
2181     private int centerNoNoDelta;  // derived in load(): (minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1
2182     private int minMaybeYes;  // load() asserts that this is a multiple of 8
2183 
2184     private CodePointTrie.Fast16 normTrie;  // read in load() with CodePointTrie.Fast16.fromBinary()
2185     private String maybeYesCompositions;  // 16-bit units read in load(); not assigned when the data has none
2186     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2187     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2188 }

< prev index next >