1 /* 2 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 package sun.text.normalizer; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 import java.text.Normalizer; 37 38 // Original filename in ICU4J: Normalizer2Impl.java 39 public final class NormalizerImpl { 40 public static final class Hangul { 41 /* Korean Hangul and Jamo constants */ 42 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 43 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 44 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 45 46 public static final int HANGUL_BASE=0xac00; 47 public static final int HANGUL_END=0xd7a3; 48 49 public static final int JAMO_L_COUNT=19; 50 public static final int JAMO_V_COUNT=21; 51 public static final int JAMO_T_COUNT=28; 52 53 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 54 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 55 56 public static boolean isHangul(int c) { 57 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 58 } 59 public static boolean isHangulLV(int c) { 60 c-=HANGUL_BASE; 61 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 62 } 63 64 /** 65 * Decomposes c, which must be a Hangul syllable, into buffer 66 * and returns the length of the decomposition (2 or 3). 67 */ 68 public static int decompose(int c, Appendable buffer) { 69 try { 70 c-=HANGUL_BASE; 71 int c2=c%JAMO_T_COUNT; 72 c/=JAMO_T_COUNT; 73 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 74 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 75 if(c2==0) { 76 return 2; 77 } else { 78 buffer.append((char)(JAMO_T_BASE+c2)); 79 return 3; 80 } 81 } catch(IOException e) { 82 throw new InternalError(e); 83 } 84 } 85 } 86 87 /** 88 * Writable buffer that takes care of canonical ordering. 89 * Its Appendable methods behave like the C++ implementation's 90 * appendZeroCC() methods. 91 * <p> 92 * If dest is a StringBuilder, then the buffer writes directly to it. 93 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 94 * until no further changes are necessary and whole segments are appended. 95 * append() methods that take combining-class values always write to the StringBuilder. 96 * Other append() methods flush and append to the Appendable. 97 */ 98 public static final class ReorderingBuffer implements Appendable { 99 public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { 100 impl=ni; 101 app=dest; 102 if (app instanceof StringBuilder) { 103 appIsStringBuilder=true; 104 str=(StringBuilder)dest; 105 // In Java, the constructor subsumes public void init(int destCapacity) 106 str.ensureCapacity(destCapacity); 107 reorderStart=0; 108 if(str.length()==0) { 109 lastCC=0; 110 } else { 111 setIterator(); 112 lastCC=previousCC(); 113 // Set reorderStart after the last code point with cc<=1 if there is one. 114 if(lastCC>1) { 115 while(previousCC()>1) {} 116 } 117 reorderStart=codePointLimit; 118 } 119 } else { 120 appIsStringBuilder=false; 121 str=new StringBuilder(); 122 reorderStart=0; 123 lastCC=0; 124 } 125 } 126 127 public boolean isEmpty() { return str.length()==0; } 128 public int length() { return str.length(); } 129 public int getLastCC() { return lastCC; } 130 131 public StringBuilder getStringBuilder() { return str; } 132 133 public boolean equals(CharSequence s, int start, int limit) { 134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 135 } 136 137 public void append(int c, int cc) { 138 if(lastCC<=cc || cc==0) { 139 str.appendCodePoint(c); 140 lastCC=cc; 141 if(cc<=1) { 142 reorderStart=str.length(); 143 } 144 } else { 145 insert(c, cc); 146 } 147 } 148 // s must be in NFD, otherwise change the implementation. 149 public void append(CharSequence s, int start, int limit, 150 int leadCC, int trailCC) { 151 if(start==limit) { 152 return; 153 } 154 if(lastCC<=leadCC || leadCC==0) { 155 if(trailCC<=1) { 156 reorderStart=str.length()+(limit-start); 157 } else if(leadCC<=1) { 158 reorderStart=str.length()+1; // Ok if not a code point boundary. 159 } 160 str.append(s, start, limit); 161 lastCC=trailCC; 162 } else { 163 int c=Character.codePointAt(s, start); 164 start+=Character.charCount(c); 165 insert(c, leadCC); // insert first code point 166 while(start<limit) { 167 c=Character.codePointAt(s, start); 168 start+=Character.charCount(c); 169 if(start<limit) { 170 // s must be in NFD, otherwise we need to use getCC(). 171 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 172 } else { 173 leadCC=trailCC; 174 } 175 append(c, leadCC); 176 } 177 } 178 } 179 // The following append() methods work like C++ appendZeroCC(). 180 // They assume that the cc or trailCC of their input is 0. 181 // Most of them implement Appendable interface methods. 182 @Override 183 public ReorderingBuffer append(char c) { 184 str.append(c); 185 lastCC=0; 186 reorderStart=str.length(); 187 return this; 188 } 189 public void appendZeroCC(int c) { 190 str.appendCodePoint(c); 191 lastCC=0; 192 reorderStart=str.length(); 193 } 194 @Override 195 public ReorderingBuffer append(CharSequence s) { 196 if(s.length()!=0) { 197 str.append(s); 198 lastCC=0; 199 reorderStart=str.length(); 200 } 201 return this; 202 } 203 @Override 204 public ReorderingBuffer append(CharSequence s, int start, int limit) { 205 if(start!=limit) { 206 str.append(s, start, limit); 207 lastCC=0; 208 reorderStart=str.length(); 209 } 210 return this; 211 } 212 /** 213 * Flushes from the intermediate StringBuilder to the Appendable, 214 * if they are different objects. 215 * Used after recomposition. 216 * Must be called at the end when writing to a non-StringBuilder Appendable. 217 */ 218 public void flush() { 219 if(appIsStringBuilder) { 220 reorderStart=str.length(); 221 } else { 222 try { 223 app.append(str); 224 str.setLength(0); 225 reorderStart=0; 226 } catch(IOException e) { 227 throw new InternalError(e); // Avoid declaring "throws IOException". 228 } 229 } 230 lastCC=0; 231 } 232 /** 233 * Flushes from the intermediate StringBuilder to the Appendable, 234 * if they are different objects. 235 * Then appends the new text to the Appendable or StringBuilder. 236 * Normally used after quick check loops find a non-empty sequence. 237 */ 238 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 239 if(appIsStringBuilder) { 240 str.append(s, start, limit); 241 reorderStart=str.length(); 242 } else { 243 try { 244 app.append(str).append(s, start, limit); 245 str.setLength(0); 246 reorderStart=0; 247 } catch(IOException e) { 248 throw new InternalError(e); // Avoid declaring "throws IOException". 249 } 250 } 251 lastCC=0; 252 return this; 253 } 254 public void remove() { 255 str.setLength(0); 256 lastCC=0; 257 reorderStart=0; 258 } 259 public void removeSuffix(int suffixLength) { 260 int oldLength=str.length(); 261 str.delete(oldLength-suffixLength, oldLength); 262 lastCC=0; 263 reorderStart=str.length(); 264 } 265 266 // Inserts c somewhere before the last character. 267 // Requires 0<cc<lastCC which implies reorderStart<limit. 268 private void insert(int c, int cc) { 269 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 270 // insert c at codePointLimit, after the character with prevCC<=cc 271 if(c<=0xffff) { 272 str.insert(codePointLimit, (char)c); 273 if(cc<=1) { 274 reorderStart=codePointLimit+1; 275 } 276 } else { 277 str.insert(codePointLimit, Character.toChars(c)); 278 if(cc<=1) { 279 reorderStart=codePointLimit+2; 280 } 281 } 282 } 283 284 private final NormalizerImpl impl; 285 private final Appendable app; 286 private final StringBuilder str; 287 private final boolean appIsStringBuilder; 288 private int reorderStart; 289 private int lastCC; 290 291 // private backward iterator 292 private void setIterator() { codePointStart=str.length(); } 293 private void skipPrevious() { // Requires 0<codePointStart. 294 codePointLimit=codePointStart; 295 codePointStart=str.offsetByCodePoints(codePointStart, -1); 296 } 297 private int previousCC() { // Returns 0 if there is no previous character. 298 codePointLimit=codePointStart; 299 if(reorderStart>=codePointStart) { 300 return 0; 301 } 302 int c=str.codePointBefore(codePointStart); 303 codePointStart-=Character.charCount(c); 304 return impl.getCCFromYesOrMaybeCP(c); 305 } 306 private int codePointStart, codePointLimit; 307 } 308 309 // TODO: Propose as public API on the UTF16 class. 310 // TODO: Propose widening UTF16 methods that take char to take int. 311 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 312 public static final class UTF16Plus { 313 /** 314 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 315 * is it a lead surrogate? 316 * @param c code unit or code point 317 * @return true or false 318 */ 319 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 320 321 /** 322 * Compares two CharSequence subsequences for binary equality. 323 * @param s1 first sequence 324 * @param start1 start offset in first sequence 325 * @param limit1 limit offset in first sequence 326 * @param s2 second sequence 327 * @param start2 start offset in second sequence 328 * @param limit2 limit offset in second sequence 329 * @return true if s1.subSequence(start1, limit1) contains the same text 330 * as s2.subSequence(start2, limit2) 331 */ 332 public static boolean equal(CharSequence s1, int start1, int limit1, 333 CharSequence s2, int start2, int limit2) { 334 if((limit1-start1)!=(limit2-start2)) { 335 return false; 336 } 337 if(s1==s2 && start1==start2) { 338 return true; 339 } 340 while(start1<limit1) { 341 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 342 return false; 343 } 344 } 345 return true; 346 } 347 } 348 349 public NormalizerImpl() {} 350 351 private static final class IsAcceptable implements ICUBinary.Authenticate { 352 public boolean isDataVersionAcceptable(byte[] version) { 353 return version[0]==3; 354 } 355 } 356 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 357 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 358 359 public NormalizerImpl load(ByteBuffer bytes) { 360 try { 361 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 362 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 363 if(indexesLength<=IX_MIN_LCCC_CP) { 364 throw new InternalError("Normalizer2 data: not enough indexes"); 365 } 366 int[] inIndexes=new int[indexesLength]; 367 inIndexes[0]=indexesLength*4; 368 for(int i=1; i<indexesLength; ++i) { 369 inIndexes[i]=bytes.getInt(); 370 } 371 372 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 373 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 374 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 375 376 minYesNo=inIndexes[IX_MIN_YES_NO]; 377 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 378 minNoNo=inIndexes[IX_MIN_NO_NO]; 379 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 380 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 381 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 382 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 383 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 384 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 385 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 386 387 // Read the normTrie. 388 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 389 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 390 normTrie=Trie2_16.createFromSerialized(bytes); 391 int trieLength=normTrie.getSerializedLength(); 392 if(trieLength>(nextOffset-offset)) { 393 throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); 394 } 395 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 396 397 // Read the composition and mapping data. 398 offset=nextOffset; 399 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 400 int numChars=(nextOffset-offset)/2; 401 char[] chars; 402 if(numChars!=0) { 403 chars=new char[numChars]; 404 for(int i=0; i<numChars; ++i) { 405 chars[i]=bytes.getChar(); 406 } 407 maybeYesCompositions=new String(chars); 408 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 409 } 410 411 // smallFCD: new in formatVersion 2 412 offset=nextOffset; 413 smallFCD=new byte[0x100]; 414 bytes.get(smallFCD); 415 416 return this; 417 } catch(IOException e) { 418 throw new InternalError(e); 419 } 420 } 421 public NormalizerImpl load(String name) { 422 return load(ICUBinary.getRequiredData(name)); 423 } 424 425 426 public int getNorm16(int c) { return normTrie.get(c); } 427 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 428 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 429 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 430 431 public int getCC(int norm16) { 432 if(norm16>=MIN_NORMAL_MAYBE_YES) { 433 return getCCFromNormalYesOrMaybe(norm16); 434 } 435 if(norm16<minNoNo || limitNoNo<=norm16) { 436 return 0; 437 } 438 return getCCFromNoNo(norm16); 439 } 440 public static int getCCFromNormalYesOrMaybe(int norm16) { 441 return (norm16 >> OFFSET_SHIFT) & 0xff; 442 } 443 public static int getCCFromYesOrMaybe(int norm16) { 444 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 445 } 446 public int getCCFromYesOrMaybeCP(int c) { 447 if (c < minCompNoMaybeCP) { return 0; } 448 return getCCFromYesOrMaybe(getNorm16(c)); 449 } 450 451 /** 452 * Returns the FCD data for code point c. 453 * @param c A Unicode code point. 454 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 455 */ 456 public int getFCD16(int c) { 457 if(c<minDecompNoCP) { 458 return 0; 459 } else if(c<=0xffff) { 460 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 461 } 462 return getFCD16FromNormData(c); 463 } 464 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 465 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 466 // 0<=lead<=0xffff 467 byte bits=smallFCD[lead>>8]; 468 if(bits==0) { return false; } 469 return ((bits>>((lead>>5)&7))&1)!=0; 470 } 471 472 /** Gets the FCD value from the regular normalization data. */ 473 public int getFCD16FromNormData(int c) { 474 int norm16=getNorm16(c); 475 if (norm16 >= limitNoNo) { 476 if(norm16>=MIN_NORMAL_MAYBE_YES) { 477 // combining mark 478 norm16=getCCFromNormalYesOrMaybe(norm16); 479 return norm16|(norm16<<8); 480 } else if(norm16>=minMaybeYes) { 481 return 0; 482 } else { // isDecompNoAlgorithmic(norm16) 483 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 484 if (deltaTrailCC <= DELTA_TCCC_1) { 485 return deltaTrailCC >> OFFSET_SHIFT; 486 } 487 // Maps to an isCompYesAndZeroCC. 488 c=mapAlgorithmic(c, norm16); 489 norm16=getNorm16(c); 490 } 491 } 492 if(norm16<=minYesNo || isHangulLVT(norm16)) { 493 // no decomposition or Hangul syllable, all zeros 494 return 0; 495 } 496 // c decomposes, get everything from the variable-length extra data 497 int mapping=norm16>>OFFSET_SHIFT; 498 int firstUnit=extraData.charAt(mapping); 499 int fcd16=firstUnit>>8; // tccc 500 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 501 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 502 } 503 return fcd16; 504 } 505 506 /** 507 * Gets the decomposition for one code point. 508 * @param c code point 509 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 510 */ 511 public String getDecomposition(int c) { 512 int norm16; 513 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 514 // c does not decompose 515 return null; 516 } 517 int decomp = -1; 518 if(isDecompNoAlgorithmic(norm16)) { 519 // Maps to an isCompYesAndZeroCC. 520 decomp=c=mapAlgorithmic(c, norm16); 521 // The mapping might decompose further. 522 norm16 = getNorm16(c); 523 } 524 if (norm16 < minYesNo) { 525 if(decomp<0) { 526 return null; 527 } else { 528 return UTF16.valueOf(decomp); 529 } 530 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 531 // Hangul syllable: decompose algorithmically 532 StringBuilder buffer=new StringBuilder(); 533 Hangul.decompose(c, buffer); 534 return buffer.toString(); 535 } 536 // c decomposes, get everything from the variable-length extra data 537 int mapping=norm16>>OFFSET_SHIFT; 538 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 539 return extraData.substring(mapping, mapping+length); 540 } 541 542 // Fixed norm16 values. 543 public static final int MIN_YES_YES_WITH_CC=0xfe02; 544 public static final int JAMO_VT=0xfe00; 545 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 546 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 547 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 548 549 // norm16 bit 0 is comp-boundary-after. 550 public static final int HAS_COMP_BOUNDARY_AFTER=1; 551 public static final int OFFSET_SHIFT=1; 552 553 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 554 // tccc (0, 1, >1) for quick FCC boundary-after tests. 555 public static final int DELTA_TCCC_0=0; 556 public static final int DELTA_TCCC_1=2; 557 public static final int DELTA_TCCC_GT_1=4; 558 public static final int DELTA_TCCC_MASK=6; 559 public static final int DELTA_SHIFT=3; 560 561 public static final int MAX_DELTA=0x40; 562 563 // Byte offsets from the start of the data, after the generic header. 564 public static final int IX_NORM_TRIE_OFFSET=0; 565 public static final int IX_EXTRA_DATA_OFFSET=1; 566 public static final int IX_SMALL_FCD_OFFSET=2; 567 public static final int IX_RESERVED3_OFFSET=3; 568 public static final int IX_TOTAL_SIZE=7; 569 public static final int MIN_CCC_LCCC_CP=0x300; 570 // Code point thresholds for quick check codes. 571 public static final int IX_MIN_DECOMP_NO_CP=8; 572 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 573 574 // Norm16 value thresholds for quick check combinations and types of extra data. 575 576 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 577 public static final int IX_MIN_YES_NO=10; 578 /** Mappings are comp-normalized. */ 579 public static final int IX_MIN_NO_NO=11; 580 public static final int IX_LIMIT_NO_NO=12; 581 public static final int IX_MIN_MAYBE_YES=13; 582 583 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 584 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 585 /** Mappings are not comp-normalized but have a comp boundary before. */ 586 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 587 /** Mappings do not have a comp boundary before. */ 588 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 589 /** Mappings to the empty string. */ 590 public static final int IX_MIN_NO_NO_EMPTY=17; 591 592 public static final int IX_MIN_LCCC_CP=18; 593 public static final int IX_COUNT=20; 594 595 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 596 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 597 // unused bit 0x20; 598 public static final int MAPPING_LENGTH_MASK=0x1f; 599 600 public static final int COMP_1_LAST_TUPLE=0x8000; 601 public static final int COMP_1_TRIPLE=1; 602 public static final int COMP_1_TRAIL_LIMIT=0x3400; 603 public static final int COMP_1_TRAIL_MASK=0x7ffe; 604 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 605 public static final int COMP_2_TRAIL_SHIFT=6; 606 public static final int COMP_2_TRAIL_MASK=0xffc0; 607 608 // higher-level functionality ------------------------------------------ *** 609 610 /** 611 * Decomposes s[src, limit[ and writes the result to dest. 612 * limit can be NULL if src is NUL-terminated. 613 * destLengthEstimate is the initial dest buffer capacity and can be -1. 614 */ 615 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 616 int destLengthEstimate) { 617 if(destLengthEstimate<0) { 618 destLengthEstimate=limit-src; 619 } 620 dest.setLength(0); 621 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 622 decompose(s, src, limit, buffer); 623 } 624 625 // Dual functionality: 626 // buffer!=NULL: normalize 627 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 628 public int decompose(CharSequence s, int src, int limit, 629 ReorderingBuffer buffer) { 630 int minNoCP=minDecompNoCP; 631 632 int prevSrc; 633 int c=0; 634 int norm16=0; 635 636 // only for quick check 637 int prevBoundary=src; 638 int prevCC=0; 639 640 for(;;) { 641 // count code units below the minimum or with irrelevant data for the quick check 642 for(prevSrc=src; src!=limit;) { 643 if( (c=s.charAt(src))<minNoCP || 644 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 645 ) { 646 ++src; 647 } else if(!UTF16.isSurrogate((char)c)) { 648 break; 649 } else { 650 char c2; 651 if(UTF16Plus.isSurrogateLead(c)) { 652 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 653 c=Character.toCodePoint((char)c, c2); 654 } 655 } else /* trail surrogate */ { 656 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 657 --src; 658 c=Character.toCodePoint(c2, (char)c); 659 } 660 } 661 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 662 src+=Character.charCount(c); 663 } else { 664 break; 665 } 666 } 667 } 668 // copy these code units all at once 669 if(src!=prevSrc) { 670 if(buffer!=null) { 671 buffer.flushAndAppendZeroCC(s, prevSrc, src); 672 } else { 673 prevCC=0; 674 prevBoundary=src; 675 } 676 } 677 if(src==limit) { 678 break; 679 } 680 681 // Check one above-minimum, relevant code point. 682 src+=Character.charCount(c); 683 if(buffer!=null) { 684 decompose(c, norm16, buffer); 685 } else { 686 if(isDecompYes(norm16)) { 687 int cc=getCCFromYesOrMaybe(norm16); 688 if(prevCC<=cc || cc==0) { 689 prevCC=cc; 690 if(cc<=1) { 691 prevBoundary=src; 692 } 693 continue; 694 } 695 } 696 return prevBoundary; // "no" or cc out of order 697 } 698 } 699 return src; 700 } 701 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 702 int limit=s.length(); 703 if(limit==0) { 704 return; 705 } 706 if(doDecompose) { 707 decompose(s, 0, limit, buffer); 708 return; 709 } 710 // Just merge the strings at the boundary. 711 int c=Character.codePointAt(s, 0); 712 int src=0; 713 int firstCC, prevCC, cc; 714 firstCC=prevCC=cc=getCC(getNorm16(c)); 715 while(cc!=0) { 716 prevCC=cc; 717 src+=Character.charCount(c); 718 if(src>=limit) { 719 break; 720 } 721 c=Character.codePointAt(s, src); 722 cc=getCC(getNorm16(c)); 723 }; 724 buffer.append(s, 0, src, firstCC, prevCC); 725 buffer.append(s, src, limit); 726 } 727 728 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 729 // doCompose: normalize 730 // !doCompose: isNormalized (buffer must be empty and initialized) 731 public boolean compose(CharSequence s, int src, int limit, 732 boolean onlyContiguous, 733 boolean doCompose, 734 ReorderingBuffer buffer) { 735 int prevBoundary=src; 736 int minNoMaybeCP=minCompNoMaybeCP; 737 738 for (;;) { 739 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 740 // or with (compYes && ccc==0) properties. 741 int prevSrc; 742 int c = 0; 743 int norm16 = 0; 744 for (;;) { 745 if (src == limit) { 746 if (prevBoundary != limit && doCompose) { 747 buffer.append(s, prevBoundary, limit); 748 } 749 return true; 750 } 751 if( (c=s.charAt(src))<minNoMaybeCP || 752 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 753 ) { 754 ++src; 755 } else { 756 prevSrc = src++; 757 if(!UTF16.isSurrogate((char)c)) { 758 break; 759 } else { 760 char c2; 761 if(UTF16Plus.isSurrogateLead(c)) { 762 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 763 ++src; 764 c=Character.toCodePoint((char)c, c2); 765 } 766 } else /* trail surrogate */ { 767 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 768 --prevSrc; 769 c=Character.toCodePoint(c2, (char)c); 770 } 771 } 772 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 773 break; 774 } 775 } 776 } 777 } 778 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 779 // The current character is either a "noNo" (has a mapping) 780 // or a "maybeYes" (combines backward) 781 // or a "yesYes" with ccc!=0. 782 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 783 784 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 785 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 786 if (!doCompose) { 787 return false; 788 } 789 // Fast path for mapping a character that is immediately surrounded by boundaries. 790 // In this case, we need not decompose around the current character. 791 if (isDecompNoAlgorithmic(norm16)) { 792 // Maps to a single isCompYesAndZeroCC character 793 // which also implies hasCompBoundaryBefore. 794 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 795 hasCompBoundaryBefore(s, src, limit)) { 796 if (prevBoundary != prevSrc) { 797 buffer.append(s, prevBoundary, prevSrc); 798 } 799 buffer.append(mapAlgorithmic(c, norm16), 0); 800 prevBoundary = src; 801 continue; 802 } 803 } else if (norm16 < minNoNoCompBoundaryBefore) { 804 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 805 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 806 hasCompBoundaryBefore(s, src, limit)) { 807 if (prevBoundary != prevSrc) { 808 buffer.append(s, prevBoundary, prevSrc); 809 } 810 int mapping = norm16 >> OFFSET_SHIFT; 811 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 812 buffer.append(extraData, mapping, mapping + length); 813 prevBoundary = src; 814 continue; 815 } 816 } else if (norm16 >= minNoNoEmpty) { 817 // The current character maps to nothing. 818 // Simply omit it from the output if there is a boundary before _or_ after it. 819 // The character itself implies no boundaries. 820 if (hasCompBoundaryBefore(s, src, limit) || 821 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 822 if (prevBoundary != prevSrc) { 823 buffer.append(s, prevBoundary, prevSrc); 824 } 825 prevBoundary = src; 826 continue; 827 } 828 } 829 // Other "noNo" type, or need to examine more text around this character: 830 // Fall through to the slow path. 831 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 832 char prev=s.charAt(prevSrc-1); 833 if(c<Hangul.JAMO_T_BASE) { 834 // The current character is a Jamo Vowel, 835 // compose with previous Jamo L and following Jamo T. 836 char l = (char)(prev-Hangul.JAMO_L_BASE); 837 if(l<Hangul.JAMO_L_COUNT) { 838 if (!doCompose) { 839 return false; 840 } 841 int t; 842 if (src != limit && 843 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 844 t < Hangul.JAMO_T_COUNT) { 845 // The next character is a Jamo T. 846 ++src; 847 } else if (hasCompBoundaryBefore(s, src, limit)) { 848 // No Jamo T follows, not even via decomposition. 849 t = 0; 850 } else { 851 t = -1; 852 } 853 if (t >= 0) { 854 int syllable = Hangul.HANGUL_BASE + 855 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 856 Hangul.JAMO_T_COUNT + t; 857 --prevSrc; // Replace the Jamo L as well. 858 if (prevBoundary != prevSrc) { 859 buffer.append(s, prevBoundary, prevSrc); 860 } 861 buffer.append((char)syllable); 862 prevBoundary = src; 863 continue; 864 } 865 // If we see L+V+x where x!=T then we drop to the slow path, 866 // decompose and recompose. 867 // This is to deal with NFKC finding normal L and V but a 868 // compatibility variant of a T. 869 // We need to either fully compose that combination here 870 // (which would complicate the code and may not work with strange custom data) 871 // or use the slow path. 872 } 873 } else if (Hangul.isHangulLV(prev)) { 874 // The current character is a Jamo Trailing consonant, 875 // compose with previous Hangul LV that does not contain a Jamo T. 876 if (!doCompose) { 877 return false; 878 } 879 int syllable = prev + c - Hangul.JAMO_T_BASE; 880 --prevSrc; // Replace the Hangul LV as well. 881 if (prevBoundary != prevSrc) { 882 buffer.append(s, prevBoundary, prevSrc); 883 } 884 buffer.append((char)syllable); 885 prevBoundary = src; 886 continue; 887 } 888 // No matching context, or may need to decompose surrounding text first: 889 // Fall through to the slow path. 890 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 891 // One or more combining marks that do not combine-back: 892 // Check for canonical order, copy unchanged if ok and 893 // if followed by a character with a boundary-before. 894 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 895 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 896 // Fails FCD test, need to decompose and contiguously recompose. 897 if (!doCompose) { 898 return false; 899 } 900 } else { 901 // If !onlyContiguous (not FCC), then we ignore the tccc of 902 // the previous character which passed the quick check "yes && ccc==0" test. 903 int n16; 904 for (;;) { 905 if (src == limit) { 906 if (doCompose) { 907 buffer.append(s, prevBoundary, limit); 908 } 909 return true; 910 } 911 int prevCC = cc; 912 c = Character.codePointAt(s, src); 913 n16 = normTrie.get(c); 914 if (n16 >= MIN_YES_YES_WITH_CC) { 915 cc = getCCFromNormalYesOrMaybe(n16); 916 if (prevCC > cc) { 917 if (!doCompose) { 918 return false; 919 } 920 break; 921 } 922 } else { 923 break; 924 } 925 src += Character.charCount(c); 926 } 927 // p is after the last in-order combining mark. 928 // If there is a boundary here, then we continue with no change. 929 if (norm16HasCompBoundaryBefore(n16)) { 930 if (isCompYesAndZeroCC(n16)) { 931 src += Character.charCount(c); 932 } 933 continue; 934 } 935 // Use the slow path. There is no boundary in [prevSrc, src[. 936 } 937 } 938 939 // Slow path: Find the nearest boundaries around the current character, 940 // decompose and recompose. 941 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 942 c = Character.codePointBefore(s, prevSrc); 943 norm16 = normTrie.get(c); 944 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 945 prevSrc -= Character.charCount(c); 946 } 947 } 948 if (doCompose && prevBoundary != prevSrc) { 949 buffer.append(s, prevBoundary, prevSrc); 950 } 951 int recomposeStartIndex=buffer.length(); 952 // We know there is not a boundary here. 953 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 954 buffer); 955 // Decompose until the next boundary. 956 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 957 buffer); 958 recompose(buffer, recomposeStartIndex, onlyContiguous); 959 if(!doCompose) { 960 if(!buffer.equals(s, prevSrc, src)) { 961 return false; 962 } 963 buffer.remove(); 964 } 965 prevBoundary=src; 966 } 967 } 968 969 /** 970 * Very similar to compose(): Make the same changes in both places if relevant. 971 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 972 * !doSpan: quickCheck 973 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 974 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 975 * then the quick check result is "no" 976 */ 977 public int composeQuickCheck(CharSequence s, int src, int limit, 978 boolean onlyContiguous, boolean doSpan) { 979 int qcResult=0; 980 int prevBoundary=src; 981 int minNoMaybeCP=minCompNoMaybeCP; 982 983 for(;;) { 984 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 985 // or with (compYes && ccc==0) properties. 986 int prevSrc; 987 int c = 0; 988 int norm16 = 0; 989 for (;;) { 990 if(src==limit) { 991 return (src<<1)|qcResult; // "yes" or "maybe" 992 } 993 if( (c=s.charAt(src))<minNoMaybeCP || 994 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 995 ) { 996 ++src; 997 } else { 998 prevSrc = src++; 999 if(!UTF16.isSurrogate((char)c)) { 1000 break; 1001 } else { 1002 char c2; 1003 if(UTF16Plus.isSurrogateLead(c)) { 1004 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1005 ++src; 1006 c=Character.toCodePoint((char)c, c2); 1007 } 1008 } else /* trail surrogate */ { 1009 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1010 --prevSrc; 1011 c=Character.toCodePoint(c2, (char)c); 1012 } 1013 } 1014 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1015 break; 1016 } 1017 } 1018 } 1019 } 1020 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1021 // The current character is either a "noNo" (has a mapping) 1022 // or a "maybeYes" (combines backward) 1023 // or a "yesYes" with ccc!=0. 1024 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1025 1026 int prevNorm16 = INERT; 1027 if (prevBoundary != prevSrc) { 1028 prevBoundary = prevSrc; 1029 if (!norm16HasCompBoundaryBefore(norm16)) { 1030 c = Character.codePointBefore(s, prevSrc); 1031 int n16 = getNorm16(c); 1032 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1033 prevBoundary -= Character.charCount(c); 1034 prevNorm16 = n16; 1035 } 1036 } 1037 } 1038 1039 if(isMaybeOrNonZeroCC(norm16)) { 1040 int cc=getCCFromYesOrMaybe(norm16); 1041 if (onlyContiguous /* FCC */ && cc != 0 && 1042 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1043 // The [prevBoundary..prevSrc[ character 1044 // passed the quick check "yes && ccc==0" test 1045 // but is out of canonical order with the current combining mark. 1046 } else { 1047 // If !onlyContiguous (not FCC), then we ignore the tccc of 1048 // the previous character which passed the quick check "yes && ccc==0" test. 1049 for (;;) { 1050 if (norm16 < MIN_YES_YES_WITH_CC) { 1051 if (!doSpan) { 1052 qcResult = 1; 1053 } else { 1054 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1055 } 1056 } 1057 if (src == limit) { 1058 return (src<<1) | qcResult; // "yes" or "maybe" 1059 } 1060 int prevCC = cc; 1061 c = Character.codePointAt(s, src); 1062 norm16 = getNorm16(c); 1063 if (isMaybeOrNonZeroCC(norm16)) { 1064 cc = getCCFromYesOrMaybe(norm16); 1065 if (!(prevCC <= cc || cc == 0)) { 1066 break; 1067 } 1068 } else { 1069 break; 1070 } 1071 src += Character.charCount(c); 1072 } 1073 // src is after the last in-order combining mark. 1074 if (isCompYesAndZeroCC(norm16)) { 1075 prevBoundary = src; 1076 src += Character.charCount(c); 1077 continue; 1078 } 1079 } 1080 } 1081 return prevBoundary<<1; // "no" 1082 } 1083 } 1084 public void composeAndAppend(CharSequence s, 1085 boolean doCompose, 1086 boolean onlyContiguous, 1087 ReorderingBuffer buffer) { 1088 int src=0, limit=s.length(); 1089 if(!buffer.isEmpty()) { 1090 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1091 if(0!=firstStarterInSrc) { 1092 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1093 buffer.length(), onlyContiguous); 1094 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1095 firstStarterInSrc+16); 1096 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1097 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1098 middle.append(s, 0, firstStarterInSrc); 1099 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1100 src=firstStarterInSrc; 1101 } 1102 } 1103 if(doCompose) { 1104 compose(s, src, limit, onlyContiguous, true, buffer); 1105 } else { 1106 buffer.append(s, src, limit); 1107 } 1108 } 1109 // Dual functionality: 1110 // buffer!=NULL: normalize 1111 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1112 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1113 // Note: In this function we use buffer->appendZeroCC() because we track 1114 // the lead and trail combining classes here, rather than leaving it to 1115 // the ReorderingBuffer. 1116 // The exception is the call to decomposeShort() which uses the buffer 1117 // in the normal way. 1118 1119 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1120 // Similar to the prevBoundary in the compose() implementation. 1121 int prevBoundary=src; 1122 int prevSrc; 1123 int c=0; 1124 int prevFCD16=0; 1125 int fcd16=0; 1126 1127 for(;;) { 1128 // count code units with lccc==0 1129 for(prevSrc=src; src!=limit;) { 1130 if((c=s.charAt(src))<minLcccCP) { 1131 prevFCD16=~c; 1132 ++src; 1133 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1134 prevFCD16=0; 1135 ++src; 1136 } else { 1137 if(UTF16.isSurrogate((char)c)) { 1138 char c2; 1139 if(UTF16Plus.isSurrogateLead(c)) { 1140 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1141 c=Character.toCodePoint((char)c, c2); 1142 } 1143 } else /* trail surrogate */ { 1144 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1145 --src; 1146 c=Character.toCodePoint(c2, (char)c); 1147 } 1148 } 1149 } 1150 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1151 prevFCD16=fcd16; 1152 src+=Character.charCount(c); 1153 } else { 1154 break; 1155 } 1156 } 1157 } 1158 // copy these code units all at once 1159 if(src!=prevSrc) { 1160 if(src==limit) { 1161 if(buffer!=null) { 1162 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1163 } 1164 break; 1165 } 1166 prevBoundary=src; 1167 // We know that the previous character's lccc==0. 1168 if(prevFCD16<0) { 1169 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1170 int prev=~prevFCD16; 1171 if(prev<minDecompNoCP) { 1172 prevFCD16=0; 1173 } else { 1174 prevFCD16=getFCD16FromNormData(prev); 1175 if(prevFCD16>1) { 1176 --prevBoundary; 1177 } 1178 } 1179 } else { 1180 int p=src-1; 1181 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1182 Character.isHighSurrogate(s.charAt(p-1)) 1183 ) { 1184 --p; 1185 // Need to fetch the previous character's FCD value because 1186 // prevFCD16 was just for the trail surrogate code point. 1187 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1188 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1189 } 1190 if(prevFCD16>1) { 1191 prevBoundary=p; 1192 } 1193 } 1194 if(buffer!=null) { 1195 // The last lccc==0 character is excluded from the 1196 // flush-and-append call in case it needs to be modified. 1197 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1198 buffer.append(s, prevBoundary, src); 1199 } 1200 // The start of the current character (c). 1201 prevSrc=src; 1202 } else if(src==limit) { 1203 break; 1204 } 1205 1206 src+=Character.charCount(c); 1207 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1208 // Check for proper order, and decompose locally if necessary. 1209 if((prevFCD16&0xff)<=(fcd16>>8)) { 1210 // proper order: prev tccc <= current lccc 1211 if((fcd16&0xff)<=1) { 1212 prevBoundary=src; 1213 } 1214 if(buffer!=null) { 1215 buffer.appendZeroCC(c); 1216 } 1217 prevFCD16=fcd16; 1218 continue; 1219 } else if(buffer==null) { 1220 return prevBoundary; // quick check "no" 1221 } else { 1222 /* 1223 * Back out the part of the source that we copied or appended 1224 * already but is now going to be decomposed. 1225 * prevSrc is set to after what was copied/appended. 1226 */ 1227 buffer.removeSuffix(prevSrc-prevBoundary); 1228 /* 1229 * Find the part of the source that needs to be decomposed, 1230 * up to the next safe boundary. 1231 */ 1232 src=findNextFCDBoundary(s, src, limit); 1233 /* 1234 * The source text does not fulfill the conditions for FCD. 1235 * Decompose and reorder a limited piece of the text. 1236 */ 1237 decomposeShort(s, prevBoundary, src, false, false, buffer); 1238 prevBoundary=src; 1239 prevFCD16=0; 1240 } 1241 } 1242 return src; 1243 } 1244 1245 public boolean hasDecompBoundaryBefore(int c) { 1246 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1247 norm16HasDecompBoundaryBefore(getNorm16(c)); 1248 } 1249 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1250 if (norm16 < minNoNoCompNoMaybeCC) { 1251 return true; 1252 } 1253 if (norm16 >= limitNoNo) { 1254 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1255 } 1256 // c decomposes, get everything from the variable-length extra data 1257 int mapping=norm16>>OFFSET_SHIFT; 1258 int firstUnit=extraData.charAt(mapping); 1259 // true if leadCC==0 (hasFCDBoundaryBefore()) 1260 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1261 } 1262 public boolean hasDecompBoundaryAfter(int c) { 1263 if (c < minDecompNoCP) { 1264 return true; 1265 } 1266 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1267 return true; 1268 } 1269 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1270 } 1271 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1272 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1273 return true; 1274 } 1275 if (norm16 >= limitNoNo) { 1276 if (isMaybeOrNonZeroCC(norm16)) { 1277 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1278 } 1279 // Maps to an isCompYesAndZeroCC. 1280 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1281 } 1282 // c decomposes, get everything from the variable-length extra data 1283 int mapping=norm16>>OFFSET_SHIFT; 1284 int firstUnit=extraData.charAt(mapping); 1285 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1286 // fcd16<=1 || trailCC==0 1287 if(firstUnit>0x1ff) { 1288 return false; // trailCC>1 1289 } 1290 if(firstUnit<=0xff) { 1291 return true; // trailCC==0 1292 } 1293 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1294 // true if leadCC==0 (hasFCDBoundaryBefore()) 1295 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1296 } 1297 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1298 1299 public boolean hasCompBoundaryBefore(int c) { 1300 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1301 } 1302 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1303 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1304 } 1305 1306 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1307 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1308 private static boolean isInert(int norm16) { return norm16==INERT; } 1309 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1310 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 1311 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } 1312 private boolean isHangulLVT(int norm16) { 1313 return norm16==hangulLVT(); 1314 } 1315 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1316 // UBool isCompYes(uint16_t norm16) const { 1317 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1318 // } 1319 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1320 // return norm16<minNoNo || minMaybeYes<=norm16; 1321 // } 1322 // private boolean hasZeroCCFromDecompYes(int norm16) { 1323 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1324 // } 1325 private boolean isDecompYesAndZeroCC(int norm16) { 1326 return norm16<minYesNo || 1327 norm16==JAMO_VT || 1328 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1329 } 1330 /** 1331 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1332 * the MaybeYes which combine-forward and have ccc=0. 1333 * (Standard Unicode 10 normalization does not have such characters.) 1334 */ 1335 private boolean isMostDecompYesAndZeroCC(int norm16) { 1336 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1337 } 1338 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1339 1340 // For use with isCompYes(). 1341 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1342 // static uint8_t getCCFromYes(uint16_t norm16) { 1343 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1344 // } 1345 private int getCCFromNoNo(int norm16) { 1346 int mapping=norm16>>OFFSET_SHIFT; 1347 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1348 return extraData.charAt(mapping-1)&0xff; 1349 } else { 1350 return 0; 1351 } 1352 } 1353 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1354 if(norm16<=minYesNo) { 1355 return 0; // yesYes and Hangul LV have ccc=tccc=0 1356 } else { 1357 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1358 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1359 } 1360 } 1361 1362 // Requires algorithmic-NoNo. 1363 private int mapAlgorithmic(int c, int norm16) { 1364 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1365 } 1366 1367 // Requires minYesNo<norm16<limitNoNo. 1368 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1369 1370 /** 1371 * @return index into maybeYesCompositions, or -1 1372 */ 1373 private int getCompositionsListForDecompYes(int norm16) { 1374 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1375 return -1; 1376 } else { 1377 if((norm16-=minMaybeYes)<0) { 1378 // norm16<minMaybeYes: index into extraData which is a substring at 1379 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1380 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1381 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1382 } 1383 return norm16>>OFFSET_SHIFT; 1384 } 1385 } 1386 /** 1387 * @return index into maybeYesCompositions 1388 */ 1389 private int getCompositionsListForComposite(int norm16) { 1390 // A composite has both mapping & compositions list. 1391 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1392 int firstUnit=maybeYesCompositions.charAt(list); 1393 return list+ // mapping in maybeYesCompositions 1394 1+ // +1 to skip the first unit with the mapping length 1395 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1396 } 1397 1398 // Decompose a short piece of text which is likely to contain characters that 1399 // fail the quick check loop and/or where the quick check loop's overhead 1400 // is unlikely to be amortized. 1401 // Called by the compose() and makeFCD() implementations. 1402 // Public in Java for collation implementation code. 1403 private int decomposeShort( 1404 CharSequence s, int src, int limit, 1405 boolean stopAtCompBoundary, boolean onlyContiguous, 1406 ReorderingBuffer buffer) { 1407 while(src<limit) { 1408 int c=Character.codePointAt(s, src); 1409 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1410 return src; 1411 } 1412 int norm16 = getNorm16(c); 1413 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1414 return src; 1415 } 1416 src+=Character.charCount(c); 1417 decompose(c, norm16, buffer); 1418 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1419 return src; 1420 } 1421 } 1422 return src; 1423 } 1424 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1425 // get the decomposition and the lead and trail cc's 1426 if (norm16 >= limitNoNo) { 1427 if (isMaybeOrNonZeroCC(norm16)) { 1428 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1429 return; 1430 } 1431 // Maps to an isCompYesAndZeroCC. 1432 c=mapAlgorithmic(c, norm16); 1433 norm16=getNorm16(c); 1434 } 1435 if (norm16 < minYesNo) { 1436 // c does not decompose 1437 buffer.append(c, 0); 1438 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1439 // Hangul syllable: decompose algorithmically 1440 Hangul.decompose(c, buffer); 1441 } else { 1442 // c decomposes, get everything from the variable-length extra data 1443 int mapping=norm16>>OFFSET_SHIFT; 1444 int firstUnit=extraData.charAt(mapping); 1445 int length=firstUnit&MAPPING_LENGTH_MASK; 1446 int leadCC, trailCC; 1447 trailCC=firstUnit>>8; 1448 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1449 leadCC=extraData.charAt(mapping-1)>>8; 1450 } else { 1451 leadCC=0; 1452 } 1453 ++mapping; // skip over the firstUnit 1454 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); 1455 } 1456 } 1457 1458 /** 1459 * Finds the recomposition result for 1460 * a forward-combining "lead" character, 1461 * specified with a pointer to its compositions list, 1462 * and a backward-combining "trail" character. 1463 * 1464 * <p>If the lead and trail characters combine, then this function returns 1465 * the following "compositeAndFwd" value: 1466 * <pre> 1467 * Bits 21..1 composite character 1468 * Bit 0 set if the composite is a forward-combining starter 1469 * </pre> 1470 * otherwise it returns -1. 1471 * 1472 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1473 * encoded as either pairs or triples of 16-bit units. 1474 * The last entry has the high bit of its first unit set. 1475 * 1476 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1477 * A linear search is used. 1478 * 1479 * <p>See normalizer2impl.h for a more detailed description 1480 * of the compositions list format. 1481 */ 1482 private static int combine(String compositions, int list, int trail) { 1483 int key1, firstUnit; 1484 if(trail<COMP_1_TRAIL_LIMIT) { 1485 // trail character is 0..33FF 1486 // result entry may have 2 or 3 units 1487 key1=(trail<<1); 1488 while(key1>(firstUnit=compositions.charAt(list))) { 1489 list+=2+(firstUnit&COMP_1_TRIPLE); 1490 } 1491 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1492 if((firstUnit&COMP_1_TRIPLE)!=0) { 1493 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1494 } else { 1495 return compositions.charAt(list+1); 1496 } 1497 } 1498 } else { 1499 // trail character is 3400..10FFFF 1500 // result entry has 3 units 1501 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1502 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1503 int secondUnit; 1504 for(;;) { 1505 if(key1>(firstUnit=compositions.charAt(list))) { 1506 list+=2+(firstUnit&COMP_1_TRIPLE); 1507 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1508 if(key2>(secondUnit=compositions.charAt(list+1))) { 1509 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1510 break; 1511 } else { 1512 list+=3; 1513 } 1514 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1515 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1516 } else { 1517 break; 1518 } 1519 } else { 1520 break; 1521 } 1522 } 1523 } 1524 return -1; 1525 } 1526 1527 /* 1528 * Recomposes the buffer text starting at recomposeStartIndex 1529 * (which is in NFD - decomposed and canonically ordered), 1530 * and truncates the buffer contents. 1531 * 1532 * Note that recomposition never lengthens the text: 1533 * Any character consists of either one or two code units; 1534 * a composition may contain at most one more code unit than the original starter, 1535 * while the combining mark that is removed has at least one code unit. 1536 */ 1537 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1538 boolean onlyContiguous) { 1539 StringBuilder sb=buffer.getStringBuilder(); 1540 int p=recomposeStartIndex; 1541 if(p==sb.length()) { 1542 return; 1543 } 1544 1545 int starter, pRemove; 1546 int compositionsList; 1547 int c, compositeAndFwd; 1548 int norm16; 1549 int cc, prevCC; 1550 boolean starterIsSupplementary; 1551 1552 // Some of the following variables are not used until we have a forward-combining starter 1553 // and are only initialized now to avoid compiler warnings. 1554 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1555 starter=-1; 1556 starterIsSupplementary=false; 1557 prevCC=0; 1558 1559 for(;;) { 1560 c=sb.codePointAt(p); 1561 p+=Character.charCount(c); 1562 norm16=getNorm16(c); 1563 cc=getCCFromYesOrMaybe(norm16); 1564 if( // this character combines backward and 1565 isMaybe(norm16) && 1566 // we have seen a starter that combines forward and 1567 compositionsList>=0 && 1568 // the backward-combining character is not blocked 1569 (prevCC<cc || prevCC==0) 1570 ) { 1571 if(isJamoVT(norm16)) { 1572 // c is a Jamo V/T, see if we can compose it with the previous character. 1573 if(c<Hangul.JAMO_T_BASE) { 1574 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1575 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1576 if(prev<Hangul.JAMO_L_COUNT) { 1577 pRemove=p-1; 1578 char syllable=(char) 1579 (Hangul.HANGUL_BASE+ 1580 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1581 Hangul.JAMO_T_COUNT); 1582 char t; 1583 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1584 ++p; 1585 syllable+=t; // The next character was a Jamo T. 1586 } 1587 sb.setCharAt(starter, syllable); 1588 // remove the Jamo V/T 1589 sb.delete(pRemove, p); 1590 p=pRemove; 1591 } 1592 } 1593 /* 1594 * No "else" for Jamo T: 1595 * Since the input is in NFD, there are no Hangul LV syllables that 1596 * a Jamo T could combine with. 1597 * All Jamo Ts are combined above when handling Jamo Vs. 1598 */ 1599 if(p==sb.length()) { 1600 break; 1601 } 1602 compositionsList=-1; 1603 continue; 1604 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1605 // The starter and the combining mark (c) do combine. 1606 int composite=compositeAndFwd>>1; 1607 1608 // Remove the combining mark. 1609 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1610 sb.delete(pRemove, p); 1611 p=pRemove; 1612 // Replace the starter with the composite. 1613 if(starterIsSupplementary) { 1614 if(composite>0xffff) { 1615 // both are supplementary 1616 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1617 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1618 } else { 1619 sb.setCharAt(starter, (char)c); 1620 sb.deleteCharAt(starter+1); 1621 // The composite is shorter than the starter, 1622 // move the intermediate characters forward one. 1623 starterIsSupplementary=false; 1624 --p; 1625 } 1626 } else if(composite>0xffff) { 1627 // The composite is longer than the starter, 1628 // move the intermediate characters back one. 1629 starterIsSupplementary=true; 1630 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1631 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1632 ++p; 1633 } else { 1634 // both are on the BMP 1635 sb.setCharAt(starter, (char)composite); 1636 } 1637 1638 // Keep prevCC because we removed the combining mark. 1639 1640 if(p==sb.length()) { 1641 break; 1642 } 1643 // Is the composite a starter that combines forward? 1644 if((compositeAndFwd&1)!=0) { 1645 compositionsList= 1646 getCompositionsListForComposite(getNorm16(composite)); 1647 } else { 1648 compositionsList=-1; 1649 } 1650 1651 // We combined; continue with looking for compositions. 1652 continue; 1653 } 1654 } 1655 1656 // no combination this time 1657 prevCC=cc; 1658 if(p==sb.length()) { 1659 break; 1660 } 1661 1662 // If c did not combine, then check if it is a starter. 1663 if(cc==0) { 1664 // Found a new starter. 1665 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 1666 // It may combine with something, prepare for it. 1667 if(c<=0xffff) { 1668 starterIsSupplementary=false; 1669 starter=p-1; 1670 } else { 1671 starterIsSupplementary=true; 1672 starter=p-2; 1673 } 1674 } 1675 } else if(onlyContiguous) { 1676 // FCC: no discontiguous compositions; any intervening character blocks. 1677 compositionsList=-1; 1678 } 1679 } 1680 buffer.flush(); 1681 } 1682 1683 /** 1684 * Does c have a composition boundary before it? 1685 * True if its decomposition begins with a character that has 1686 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1687 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1688 * (isCompYesAndZeroCC()) so we need not decompose. 1689 */ 1690 private boolean hasCompBoundaryBefore(int c, int norm16) { 1691 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 1692 } 1693 private boolean norm16HasCompBoundaryBefore(int norm16) { 1694 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 1695 } 1696 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 1697 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 1698 } 1699 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 1700 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1701 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 1702 } 1703 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 1704 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 1705 } 1706 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 1707 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 1708 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 1709 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 1710 } 1711 1712 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 1713 while(p>0) { 1714 int c=Character.codePointBefore(s, p); 1715 int norm16 = getNorm16(c); 1716 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1717 break; 1718 } 1719 p-=Character.charCount(c); 1720 if(hasCompBoundaryBefore(c, norm16)) { 1721 break; 1722 } 1723 } 1724 return p; 1725 } 1726 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 1727 while(p<limit) { 1728 int c=Character.codePointAt(s, p); 1729 int norm16=normTrie.get(c); 1730 if(hasCompBoundaryBefore(c, norm16)) { 1731 break; 1732 } 1733 p+=Character.charCount(c); 1734 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1735 break; 1736 } 1737 } 1738 return p; 1739 } 1740 1741 1742 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 1743 while(p<limit) { 1744 int c=Character.codePointAt(s, p); 1745 int norm16; 1746 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 1747 break; 1748 } 1749 p+=Character.charCount(c); 1750 if (norm16HasDecompBoundaryAfter(norm16)) { 1751 break; 1752 } 1753 } 1754 return p; 1755 } 1756 1757 /** 1758 * Get the canonical decomposition 1759 * sherman for ComposedCharIter 1760 */ 1761 public static int getDecompose(int[] chars, String[] decomps) { 1762 Normalizer2 impl = Normalizer2.getNFDInstance(); 1763 1764 int length=0; 1765 int norm16 = 0; 1766 int ch = -1; 1767 int i = 0; 1768 1769 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 1770 //TBD !!!! the hack code heres save us about 50ms for startup 1771 //need a better solution/lookup 1772 if (ch == 0x30ff) 1773 ch = 0xf900; 1774 else if (ch == 0x115bc) 1775 ch = 0x1d15e; 1776 else if (ch == 0x1d1c1) 1777 ch = 0x2f800; 1778 1779 String s = impl.getDecomposition(ch); 1780 1781 if(s != null && i < chars.length) { 1782 chars[i] = ch; 1783 decomps[i++] = s; 1784 } 1785 } 1786 return i; 1787 } 1788 1789 //------------------------------------------------------ 1790 // special method for Collation (RBTableBuilder.build()) 1791 //------------------------------------------------------ 1792 private static boolean needSingleQuotation(char c) { 1793 return (c >= 0x0009 && c <= 0x000D) || 1794 (c >= 0x0020 && c <= 0x002F) || 1795 (c >= 0x003A && c <= 0x0040) || 1796 (c >= 0x005B && c <= 0x0060) || 1797 (c >= 0x007B && c <= 0x007E); 1798 } 1799 1800 public static String canonicalDecomposeWithSingleQuotation(String string) { 1801 Normalizer2 impl = Normalizer2.getNFDInstance(); 1802 char[] src = string.toCharArray(); 1803 int srcIndex = 0; 1804 int srcLimit = src.length; 1805 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 1806 int destIndex = 0; 1807 int destLimit = dest.length; 1808 1809 int prevSrc; 1810 String norm; 1811 int reorderStartIndex, length; 1812 char c1, c2; 1813 int cp; 1814 int minNoMaybe = 0x00c0; 1815 int cc, prevCC, trailCC; 1816 char[] p; 1817 int pStart; 1818 1819 // initialize 1820 reorderStartIndex = 0; 1821 prevCC = 0; 1822 norm = null; 1823 cp = 0; 1824 pStart = 0; 1825 1826 cc = trailCC = -1; // initialize to bogus value 1827 c1 = 0; 1828 for (;;) { 1829 prevSrc=srcIndex; 1830 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 1831 while (srcIndex != srcLimit && 1832 ((c1 = src[srcIndex]) < minNoMaybe || 1833 (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || 1834 (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables 1835 prevCC = 0; 1836 srcIndex += (cp < 0x10000) ? 1 : 2; 1837 } 1838 1839 // copy these code units all at once 1840 if (srcIndex != prevSrc) { 1841 length = srcIndex - prevSrc; 1842 if ((destIndex + length) <= destLimit) { 1843 System.arraycopy(src,prevSrc,dest,destIndex,length); 1844 } 1845 1846 destIndex += length; 1847 reorderStartIndex = destIndex; 1848 } 1849 1850 // end of source reached? 1851 if (srcIndex == srcLimit) { 1852 break; 1853 } 1854 1855 // cp already contains *src and norm32 is set for it, increment src 1856 srcIndex += (cp < 0x10000) ? 1 : 2; 1857 1858 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1859 c2 = 0; 1860 length = 1; 1861 1862 if (Character.isHighSurrogate(c1) 1863 || Character.isLowSurrogate(c1)) { 1864 norm = null; 1865 } 1866 } else { 1867 length = 2; 1868 c2 = src[srcIndex-1]; 1869 } 1870 1871 // get the decomposition and the lead and trail cc's 1872 if (norm == null) { 1873 // cp does not decompose 1874 cc = trailCC = UCharacter.getCombiningClass(cp); 1875 p = null; 1876 pStart = -1; 1877 } else { 1878 1879 pStart = 0; 1880 p = norm.toCharArray(); 1881 length = p.length; 1882 int cpNum = norm.codePointCount(0, length); 1883 cc= UCharacter.getCombiningClass(norm.codePointAt(0)); 1884 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); 1885 if (length == 1) { 1886 // fastpath a single code unit from decomposition 1887 c1 = p[pStart]; 1888 c2 = 0; 1889 p = null; 1890 pStart = -1; 1891 } 1892 } 1893 1894 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 1895 // buffer overflow 1896 char[] tmpBuf = new char[destLimit * 2]; 1897 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 1898 dest = tmpBuf; 1899 destLimit = dest.length; 1900 } 1901 1902 // append the decomposition to the destination buffer, assume length>0 1903 { 1904 int reorderSplit = destIndex; 1905 if (p == null) { 1906 // fastpath: single code point 1907 if (needSingleQuotation(c1)) { 1908 //if we need single quotation, no need to consider "prevCC" 1909 //and it must NOT be a supplementary pair 1910 dest[destIndex++] = '\''; 1911 dest[destIndex++] = c1; 1912 dest[destIndex++] = '\''; 1913 trailCC = 0; 1914 } else if(cc != 0 && cc < prevCC) { 1915 // (c1, c2) is out of order with respect to the preceding 1916 // text 1917 destIndex += length; 1918 trailCC = insertOrdered(dest, reorderStartIndex, 1919 reorderSplit, destIndex, c1, c2, cc); 1920 } else { 1921 // just append (c1, c2) 1922 dest[destIndex++] = c1; 1923 if(c2 != 0) { 1924 dest[destIndex++] = c2; 1925 } 1926 } 1927 } else { 1928 // general: multiple code points (ordered by themselves) 1929 // from decomposition 1930 if (needSingleQuotation(p[pStart])) { 1931 dest[destIndex++] = '\''; 1932 dest[destIndex++] = p[pStart++]; 1933 dest[destIndex++] = '\''; 1934 length--; 1935 do { 1936 dest[destIndex++] = p[pStart++]; 1937 } while(--length > 0); 1938 } else if (cc != 0 && cc < prevCC) { 1939 destIndex += length; 1940 trailCC = mergeOrdered(dest, reorderStartIndex, 1941 reorderSplit, p, pStart, 1942 pStart+length); 1943 } else { 1944 // just append the decomposition 1945 do { 1946 dest[destIndex++] = p[pStart++]; 1947 } while (--length > 0); 1948 } 1949 } 1950 } 1951 prevCC = trailCC; 1952 if(prevCC == 0) { 1953 reorderStartIndex = destIndex; 1954 } 1955 } 1956 1957 return new String(dest, 0, destIndex); 1958 } 1959 1960 /** 1961 * simpler, single-character version of mergeOrdered() - 1962 * bubble-insert one single code point into the preceding string 1963 * which is already canonically ordered 1964 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 1965 * 1966 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 1967 * 1968 * before: src[start]..src[current] is already ordered, and 1969 * src[current]..src[p] may or may not hold (c, c2) but 1970 * must be exactly the same length as (c, c2) 1971 * after: src[start]..src[p] is ordered 1972 * 1973 * @return the trailing combining class 1974 */ 1975 private static int/*unsigned byte*/ insertOrdered(char[] source, 1976 int start, 1977 int current, int p, 1978 char c1, char c2, 1979 int/*unsigned byte*/ cc) { 1980 int back, preBack; 1981 int r; 1982 int prevCC, trailCC=cc; 1983 1984 if (start<current && cc!=0) { 1985 // search for the insertion point where cc>=prevCC 1986 preBack=back=current; 1987 1988 PrevArgs prevArgs = new PrevArgs(); 1989 prevArgs.current = current; 1990 prevArgs.start = start; 1991 prevArgs.src = source; 1992 prevArgs.c1 = c1; 1993 prevArgs.c2 = c2; 1994 1995 // get the prevCC 1996 prevCC=getPrevCC(prevArgs); 1997 preBack = prevArgs.current; 1998 1999 if(cc<prevCC) { 2000 // this will be the last code point, so keep its cc 2001 trailCC=prevCC; 2002 back=preBack; 2003 while(start<preBack) { 2004 prevCC=getPrevCC(prevArgs); 2005 preBack=prevArgs.current; 2006 if(cc>=prevCC) { 2007 break; 2008 } 2009 back=preBack; 2010 } 2011 2012 // this is where we are right now with all these indicies: 2013 // [start]..[pPreBack] 0..? code points that we can ignore 2014 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 2015 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 2016 // [current]..[p] 1 code point (c, c2) with cc 2017 2018 // move the code units in between up 2019 r=p; 2020 do { 2021 source[--r]=source[--current]; 2022 } while (back!=current); 2023 } 2024 } 2025 2026 // insert (c1, c2) 2027 source[current] = c1; 2028 if (c2!=0) { 2029 source[(current+1)] = c2; 2030 } 2031 2032 // we know the cc of the last code point 2033 return trailCC; 2034 } 2035 /** 2036 * merge two UTF-16 string parts together 2037 * to canonically order (order by combining classes) their concatenation 2038 * 2039 * the two strings may already be adjacent, so that the merging is done 2040 * in-place if the two strings are not adjacent, then the buffer holding the 2041 * first one must be large enough 2042 * the second string may or may not be ordered in itself 2043 * 2044 * before: [start]..[current] is already ordered, and 2045 * [next]..[limit] may be ordered in itself, but 2046 * is not in relation to [start..current[ 2047 * after: [start..current+(limit-next)[ is ordered 2048 * 2049 * the algorithm is a simple bubble-sort that takes the characters from 2050 * src[next++] and inserts them in correct combining class order into the 2051 * preceding part of the string 2052 * 2053 * since this function is called much less often than the single-code point 2054 * insertOrdered(), it just uses that for easier maintenance 2055 * 2056 * @return the trailing combining class 2057 */ 2058 private static int /*unsigned byte*/ mergeOrdered(char[] source, 2059 int start, 2060 int current, 2061 char[] data, 2062 int next, 2063 int limit) { 2064 int r; 2065 int /*unsigned byte*/ cc, trailCC=0; 2066 boolean adjacent; 2067 2068 adjacent= current==next; 2069 NextCCArgs ncArgs = new NextCCArgs(); 2070 ncArgs.source = data; 2071 ncArgs.next = next; 2072 ncArgs.limit = limit; 2073 2074 if(start!=current) { 2075 2076 while(ncArgs.next<ncArgs.limit) { 2077 cc=getNextCC(ncArgs); 2078 if(cc==0) { 2079 // does not bubble back 2080 trailCC=0; 2081 if(adjacent) { 2082 current=ncArgs.next; 2083 } else { 2084 data[current++]=ncArgs.c1; 2085 if(ncArgs.c2!=0) { 2086 data[current++]=ncArgs.c2; 2087 } 2088 } 2089 break; 2090 } else { 2091 r=current+(ncArgs.c2==0 ? 1 : 2); 2092 trailCC=insertOrdered(source,start, current, r, 2093 ncArgs.c1, ncArgs.c2, cc); 2094 current=r; 2095 } 2096 } 2097 } 2098 2099 if(ncArgs.next==ncArgs.limit) { 2100 // we know the cc of the last code point 2101 return trailCC; 2102 } else { 2103 if(!adjacent) { 2104 // copy the second string part 2105 do { 2106 source[current++]=data[ncArgs.next++]; 2107 } while(ncArgs.next!=ncArgs.limit); 2108 ncArgs.limit=current; 2109 } 2110 PrevArgs prevArgs = new PrevArgs(); 2111 prevArgs.src = data; 2112 prevArgs.start = start; 2113 prevArgs.current = ncArgs.limit; 2114 return getPrevCC(prevArgs); 2115 } 2116 2117 } 2118 private static final class PrevArgs{ 2119 char[] src; 2120 int start; 2121 int current; 2122 char c1; 2123 char c2; 2124 } 2125 2126 private static final class NextCCArgs{ 2127 char[] source; 2128 int next; 2129 int limit; 2130 char c1; 2131 char c2; 2132 } 2133 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 2134 args.c1=args.source[args.next++]; 2135 args.c2=0; 2136 2137 if (UTF16.isTrailSurrogate(args.c1)) { 2138 /* unpaired second surrogate */ 2139 return 0; 2140 } else if (!UTF16.isLeadSurrogate(args.c1)) { 2141 return UCharacter.getCombiningClass(args.c1); 2142 } else if (args.next!=args.limit && 2143 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 2144 ++args.next; 2145 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); 2146 } else { 2147 /* unpaired first surrogate */ 2148 args.c2=0; 2149 return 0; 2150 } 2151 } 2152 private static int /*unsigned*/ getPrevCC(PrevArgs args) { 2153 args.c1=args.src[--args.current]; 2154 args.c2=0; 2155 2156 if (args.c1 < MIN_CCC_LCCC_CP) { 2157 return 0; 2158 } else if (UTF16.isLeadSurrogate(args.c1)) { 2159 /* unpaired first surrogate */ 2160 return 0; 2161 } else if (!UTF16.isTrailSurrogate(args.c1)) { 2162 return UCharacter.getCombiningClass(args.c1); 2163 } else if (args.current!=args.start && 2164 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 2165 --args.current; 2166 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); 2167 } else { 2168 /* unpaired second surrogate */ 2169 args.c2=0; 2170 return 0; 2171 } 2172 } 2173 2174 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2175 if (start == p) { 2176 return 0; 2177 } 2178 return getFCD16(Character.codePointBefore(s, p)); 2179 } 2180 2181 private VersionInfo dataVersion; 2182 2183 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2184 private int minDecompNoCP; 2185 private int minCompNoMaybeCP; 2186 private int minLcccCP; 2187 2188 // Norm16 value thresholds for quick check combinations and types of extra data. 2189 private int minYesNo; 2190 private int minYesNoMappingsOnly; 2191 private int minNoNo; 2192 private int minNoNoCompBoundaryBefore; 2193 private int minNoNoCompNoMaybeCC; 2194 private int minNoNoEmpty; 2195 private int limitNoNo; 2196 private int centerNoNoDelta; 2197 private int minMaybeYes; 2198 2199 private Trie2_16 normTrie; 2200 private String maybeYesCompositions; 2201 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2202 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2203 2204 }