1 /* 2 * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 33 package sun.text.normalizer; 34 35 import java.io.IOException; 36 import java.nio.ByteBuffer; 37 import java.text.Normalizer; 38 39 // Original filename in ICU4J: Normalizer2Impl.java 40 public final class NormalizerImpl { 41 42 public static final class Hangul { 43 /* Korean Hangul and Jamo constants */ 44 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 45 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 46 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 47 48 public static final int HANGUL_BASE=0xac00; 49 public static final int HANGUL_END=0xd7a3; 50 51 public static final int JAMO_L_COUNT=19; 52 public static final int JAMO_V_COUNT=21; 53 public static final int JAMO_T_COUNT=28; 54 55 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 56 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 57 58 public static boolean isHangul(int c) { 59 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 60 } 61 62 public static boolean isHangulWithoutJamoT(char c) { 63 c-=HANGUL_BASE; 64 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 65 } 66 67 /** 68 * Decomposes c, which must be a Hangul syllable, into buffer 69 * and returns the length of the decomposition (2 or 3). 70 */ 71 public static int decompose(int c, Appendable buffer) { 72 try { 73 c-=HANGUL_BASE; 74 int c2=c%JAMO_T_COUNT; 75 c/=JAMO_T_COUNT; 76 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 77 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 78 if(c2==0) { 79 return 2; 80 } else { 81 buffer.append((char)(JAMO_T_BASE+c2)); 82 return 3; 83 } 84 } catch(IOException e) { 85 throw new InternalError(e); 86 } 87 } 88 } 89 90 /** 91 * Writable buffer that takes care of canonical ordering. 92 * Its Appendable methods behave like the C++ implementation's 93 * appendZeroCC() methods. 94 * <p> 95 * If dest is a StringBuilder, then the buffer writes directly to it. 96 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 97 * until no further changes are necessary and whole segments are appended. 98 * append() methods that take combining-class values always write to the StringBuilder. 99 * Other append() methods flush and append to the Appendable. 100 */ 101 public static final class ReorderingBuffer implements Appendable { 102 public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { 103 impl=ni; 104 app=dest; 105 if (app instanceof StringBuilder) { 106 appIsStringBuilder=true; 107 str=(StringBuilder)dest; 108 // In Java, the constructor subsumes public void init(int destCapacity) 109 str.ensureCapacity(destCapacity); 110 reorderStart=0; 111 if(str.length()==0) { 112 lastCC=0; 113 } else { 114 setIterator(); 115 lastCC=previousCC(); 116 // Set reorderStart after the last code point with cc<=1 if there is one. 117 if(lastCC>1) { 118 while(previousCC()>1) {} 119 } 120 reorderStart=codePointLimit; 121 } 122 } else { 123 appIsStringBuilder=false; 124 str=new StringBuilder(); 125 reorderStart=0; 126 lastCC=0; 127 } 128 } 129 130 public boolean isEmpty() { return str.length()==0; } 131 public int length() { return str.length(); } 132 public int getLastCC() { return lastCC; } 133 134 public StringBuilder getStringBuilder() { return str; } 135 136 public boolean equals(CharSequence s, int start, int limit) { 137 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 138 } 139 140 // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 141 public void setLastChar(char c) { 142 str.setCharAt(str.length()-1, c); 143 } 144 145 public void append(int c, int cc) { 146 if(lastCC<=cc || cc==0) { 147 str.appendCodePoint(c); 148 lastCC=cc; 149 if(cc<=1) { 150 reorderStart=str.length(); 151 } 152 } else { 153 insert(c, cc); 154 } 155 } 156 157 // s must be in NFD, otherwise change the implementation. 158 public void append(CharSequence s, int start, int limit, 159 int leadCC, int trailCC) { 160 if(start==limit) { 161 return; 162 } 163 if(lastCC<=leadCC || leadCC==0) { 164 if(trailCC<=1) { 165 reorderStart=str.length()+(limit-start); 166 } else if(leadCC<=1) { 167 reorderStart=str.length()+1; // Ok if not a code point boundary. 168 } 169 str.append(s, start, limit); 170 lastCC=trailCC; 171 } else { 172 int c=Character.codePointAt(s, start); 173 start+=Character.charCount(c); 174 insert(c, leadCC); // insert first code point 175 while(start<limit) { 176 c=Character.codePointAt(s, start); 177 start+=Character.charCount(c); 178 if(start<limit) { 179 // s must be in NFD, otherwise we need to use getCC(). 180 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 181 } else { 182 leadCC=trailCC; 183 } 184 append(c, leadCC); 185 } 186 } 187 } 188 189 // The following append() methods work like C++ appendZeroCC(). 190 // They assume that the cc or trailCC of their input is 0. 191 // Most of them implement Appendable interface methods. 192 // @Override when we switch to Java 6 193 public ReorderingBuffer append(char c) { 194 str.append(c); 195 lastCC=0; 196 reorderStart=str.length(); 197 return this; 198 } 199 200 public void appendZeroCC(int c) { 201 str.appendCodePoint(c); 202 lastCC=0; 203 reorderStart=str.length(); 204 } 205 206 // @Override when we switch to Java 6 207 public ReorderingBuffer append(CharSequence s) { 208 if(s.length()!=0) { 209 str.append(s); 210 lastCC=0; 211 reorderStart=str.length(); 212 } 213 return this; 214 } 215 216 // @Override when we switch to Java 6 217 public ReorderingBuffer append(CharSequence s, int start, int limit) { 218 if(start!=limit) { 219 str.append(s, start, limit); 220 lastCC=0; 221 reorderStart=str.length(); 222 } 223 return this; 224 } 225 226 /** 227 * Flushes from the intermediate StringBuilder to the Appendable, 228 * if they are different objects. 229 * Used after recomposition. 230 * Must be called at the end when writing to a non-StringBuilder Appendable. 231 */ 232 public void flush() { 233 if(appIsStringBuilder) { 234 reorderStart=str.length(); 235 } else { 236 try { 237 app.append(str); 238 str.setLength(0); 239 reorderStart=0; 240 } catch(IOException e) { 241 throw new InternalError(e); // Avoid declaring "throws IOException". 242 } 243 } 244 lastCC=0; 245 } 246 247 /** 248 * Flushes from the intermediate StringBuilder to the Appendable, 249 * if they are different objects. 250 * Then appends the new text to the Appendable or StringBuilder. 251 * Normally used after quick check loops find a non-empty sequence. 252 */ 253 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 254 if(appIsStringBuilder) { 255 str.append(s, start, limit); 256 reorderStart=str.length(); 257 } else { 258 try { 259 app.append(str).append(s, start, limit); 260 str.setLength(0); 261 reorderStart=0; 262 } catch(IOException e) { 263 throw new InternalError(e); // Avoid declaring "throws IOException". 264 } 265 } 266 lastCC=0; 267 return this; 268 } 269 270 public void remove() { 271 str.setLength(0); 272 lastCC=0; 273 reorderStart=0; 274 } 275 276 public void removeSuffix(int suffixLength) { 277 int oldLength=str.length(); 278 str.delete(oldLength-suffixLength, oldLength); 279 lastCC=0; 280 reorderStart=str.length(); 281 } 282 283 // Inserts c somewhere before the last character. 284 // Requires 0<cc<lastCC which implies reorderStart<limit. 285 private void insert(int c, int cc) { 286 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 287 // insert c at codePointLimit, after the character with prevCC<=cc 288 if(c<=0xffff) { 289 str.insert(codePointLimit, (char)c); 290 if(cc<=1) { 291 reorderStart=codePointLimit+1; 292 } 293 } else { 294 str.insert(codePointLimit, Character.toChars(c)); 295 if(cc<=1) { 296 reorderStart=codePointLimit+2; 297 } 298 } 299 } 300 301 private final NormalizerImpl impl; 302 private final Appendable app; 303 private final StringBuilder str; 304 private final boolean appIsStringBuilder; 305 private int reorderStart; 306 private int lastCC; 307 308 // private backward iterator 309 private void setIterator() { codePointStart=str.length(); } 310 private void skipPrevious() { // Requires 0<codePointStart. 311 codePointLimit=codePointStart; 312 codePointStart=str.offsetByCodePoints(codePointStart, -1); 313 } 314 private int previousCC() { // Returns 0 if there is no previous character. 315 codePointLimit=codePointStart; 316 if(reorderStart>=codePointStart) { 317 return 0; 318 } 319 int c=str.codePointBefore(codePointStart); 320 codePointStart-=Character.charCount(c); 321 if(c<MIN_CCC_LCCC_CP) { 322 return 0; 323 } 324 return getCCFromYesOrMaybe(impl.getNorm16(c)); 325 } 326 327 private int codePointStart, codePointLimit; 328 } 329 330 // TODO: Propose as public API on the UTF16 class. 331 // TODO: Propose widening UTF16 methods that take char to take int. 332 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 333 public static final class UTF16Plus { 334 /** 335 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 336 * is it a lead surrogate? 337 * @param c code unit or code point 338 * @return true or false 339 */ 340 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 341 342 /** 343 * Compares two CharSequence subsequences for binary equality. 344 * @param s1 first sequence 345 * @param start1 start offset in first sequence 346 * @param limit1 limit offset in first sequence 347 * @param s2 second sequence 348 * @param start2 start offset in second sequence 349 * @param limit2 limit offset in second sequence 350 * @return true if s1.subSequence(start1, limit1) contains the same text 351 * as s2.subSequence(start2, limit2) 352 */ 353 public static boolean equal(CharSequence s1, int start1, int limit1, 354 CharSequence s2, int start2, int limit2) { 355 if((limit1-start1)!=(limit2-start2)) { 356 return false; 357 } 358 if(s1==s2 && start1==start2) { 359 return true; 360 } 361 while(start1<limit1) { 362 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 363 return false; 364 } 365 } 366 return true; 367 } 368 } 369 370 public NormalizerImpl() {} 371 372 private static final class IsAcceptable implements ICUBinary.Authenticate { 373 // @Override when we switch to Java 6 374 public boolean isDataVersionAcceptable(byte version[]) { 375 return version[0]==2; 376 } 377 } 378 379 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 380 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 381 382 public NormalizerImpl load(ByteBuffer bytes) { 383 try { 384 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 385 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 386 if(indexesLength<=IX_MIN_MAYBE_YES) { 387 throw new IOException("Normalizer2 data: not enough indexes"); 388 } 389 int[] inIndexes=new int[indexesLength]; 390 inIndexes[0]=indexesLength*4; 391 for(int i=1; i<indexesLength; ++i) { 392 inIndexes[i]=bytes.getInt(); 393 } 394 395 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 396 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 397 398 minYesNo=inIndexes[IX_MIN_YES_NO]; 399 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 400 minNoNo=inIndexes[IX_MIN_NO_NO]; 401 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 402 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 403 404 // Read the normTrie. 405 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 406 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 407 normTrie=Trie2_16.createFromSerialized(bytes); 408 int trieLength=normTrie.getSerializedLength(); 409 if(trieLength>(nextOffset-offset)) { 410 throw new IOException("Normalizer2 data: not enough bytes for normTrie"); 411 } 412 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 413 414 // Read the composition and mapping data. 415 offset=nextOffset; 416 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 417 int numChars=(nextOffset-offset)/2; 418 char[] chars; 419 if(numChars!=0) { 420 chars=new char[numChars]; 421 for(int i=0; i<numChars; ++i) { 422 chars[i]=bytes.getChar(); 423 } 424 maybeYesCompositions=new String(chars); 425 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); 426 } 427 428 // smallFCD: new in formatVersion 2 429 offset=nextOffset; 430 smallFCD=new byte[0x100]; 431 for(int i=0; i<0x100; ++i) { 432 smallFCD[i]=bytes.get(); 433 } 434 435 // Build tccc180[]. 436 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 437 tccc180=new int[0x180]; 438 int bits=0; 439 for(int c=0; c<0x180; bits>>=1) { 440 if((c&0xff)==0) { 441 bits=smallFCD[c>>8]; // one byte per 0x100 code points 442 } 443 if((bits&1)!=0) { 444 for(int i=0; i<0x20; ++i, ++c) { 445 tccc180[c]=getFCD16FromNormData(c)&0xff; 446 } 447 } else { 448 c+=0x20; 449 } 450 } 451 452 return this; 453 } catch(IOException e) { 454 throw new InternalError(e); 455 } 456 } 457 458 public NormalizerImpl load(String name) { 459 return load(ICUBinary.getRequiredData(name)); 460 } 461 462 public int getNorm16(int c) { 463 return normTrie.get(c); 464 } 465 466 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 467 468 public int getCC(int norm16) { 469 if(norm16>=MIN_NORMAL_MAYBE_YES) { 470 return norm16&0xff; 471 } 472 if(norm16<minNoNo || limitNoNo<=norm16) { 473 return 0; 474 } 475 return getCCFromNoNo(norm16); 476 } 477 478 public static int getCCFromYesOrMaybe(int norm16) { 479 return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; 480 } 481 482 /** 483 * Returns the FCD data for code point c. 484 * @param c A Unicode code point. 485 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 486 */ 487 public int getFCD16(int c) { 488 if(c<0) { 489 return 0; 490 } else if(c<0x180) { 491 return tccc180[c]; 492 } else if(c<=0xffff) { 493 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 494 } 495 return getFCD16FromNormData(c); 496 } 497 498 /** Returns the FCD data for U+0000<=c<U+0180. */ 499 public int getFCD16FromBelow180(int c) { return tccc180[c]; } 500 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 501 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 502 // 0<=lead<=0xffff 503 byte bits=smallFCD[lead>>8]; 504 if(bits==0) { return false; } 505 return ((bits>>((lead>>5)&7))&1)!=0; 506 } 507 508 /** Gets the FCD value from the regular normalization data. */ 509 public int getFCD16FromNormData(int c) { 510 // Only loops for 1:1 algorithmic mappings. 511 for(;;) { 512 int norm16=getNorm16(c); 513 if(norm16<=minYesNo) { 514 // no decomposition or Hangul syllable, all zeros 515 return 0; 516 } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 517 // combining mark 518 norm16&=0xff; 519 return norm16|(norm16<<8); 520 } else if(norm16>=minMaybeYes) { 521 return 0; 522 } else if(isDecompNoAlgorithmic(norm16)) { 523 c=mapAlgorithmic(c, norm16); 524 } else { 525 // c decomposes, get everything from the variable-length extra data 526 int firstUnit=extraData.charAt(norm16); 527 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 528 // A character that is deleted (maps to an empty string) must 529 // get the worst-case lccc and tccc values because arbitrary 530 // characters on both sides will become adjacent. 531 return 0x1ff; 532 } else { 533 int fcd16=firstUnit>>8; // tccc 534 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 535 fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc 536 } 537 return fcd16; 538 } 539 } 540 } 541 } 542 543 /** 544 * Gets the decomposition for one code point. 545 * @param c code point 546 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 547 */ 548 public String getDecomposition(int c) { 549 int decomp=-1; 550 int norm16; 551 for(;;) { 552 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 553 // c does not decompose 554 } else if(isHangul(norm16)) { 555 // Hangul syllable: decompose algorithmically 556 StringBuilder buffer=new StringBuilder(); 557 Hangul.decompose(c, buffer); 558 return buffer.toString(); 559 } else if(isDecompNoAlgorithmic(norm16)) { 560 decomp=c=mapAlgorithmic(c, norm16); 561 continue; 562 } else { 563 // c decomposes, get everything from the variable-length extra data 564 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; 565 return extraData.substring(norm16, norm16+length); 566 } 567 if(decomp<0) { 568 return null; 569 } else { 570 return UTF16.valueOf(decomp); 571 } 572 } 573 } 574 575 public static final int MIN_CCC_LCCC_CP=0x300; 576 577 public static final int MIN_YES_YES_WITH_CC=0xff01; 578 public static final int JAMO_VT=0xff00; 579 public static final int MIN_NORMAL_MAYBE_YES=0xfe00; 580 public static final int MAX_DELTA=0x40; 581 582 // Byte offsets from the start of the data, after the generic header. 583 public static final int IX_NORM_TRIE_OFFSET=0; 584 public static final int IX_EXTRA_DATA_OFFSET=1; 585 public static final int IX_SMALL_FCD_OFFSET=2; 586 587 // Code point thresholds for quick check codes. 588 public static final int IX_MIN_DECOMP_NO_CP=8; 589 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 590 591 // Norm16 value thresholds for quick check combinations and types of extra data. 592 // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 593 public static final int IX_MIN_YES_NO=10; 594 public static final int IX_MIN_NO_NO=11; 595 public static final int IX_LIMIT_NO_NO=12; 596 public static final int IX_MIN_MAYBE_YES=13; 597 598 // Mappings only in [minYesNoMappingsOnly..minNoNo[. 599 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 600 601 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 602 public static final int MAPPING_LENGTH_MASK=0x1f; 603 604 public static final int COMP_1_LAST_TUPLE=0x8000; 605 public static final int COMP_1_TRIPLE=1; 606 public static final int COMP_1_TRAIL_LIMIT=0x3400; 607 public static final int COMP_1_TRAIL_MASK=0x7ffe; 608 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 609 public static final int COMP_2_TRAIL_SHIFT=6; 610 public static final int COMP_2_TRAIL_MASK=0xffc0; 611 612 // higher-level functionality ------------------------------------------ *** 613 614 /** 615 * Decomposes s[src, limit[ and writes the result to dest. 616 * limit can be NULL if src is NUL-terminated. 617 * destLengthEstimate is the initial dest buffer capacity and can be -1. 618 */ 619 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 620 int destLengthEstimate) { 621 if(destLengthEstimate<0) { 622 destLengthEstimate=limit-src; 623 } 624 dest.setLength(0); 625 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 626 decompose(s, src, limit, buffer); 627 } 628 629 // Dual functionality: 630 // buffer!=NULL: normalize 631 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 632 public int decompose(CharSequence s, int src, int limit, 633 ReorderingBuffer buffer) { 634 int minNoCP=minDecompNoCP; 635 636 int prevSrc; 637 int c=0; 638 int norm16=0; 639 640 // only for quick check 641 int prevBoundary=src; 642 int prevCC=0; 643 644 for(;;) { 645 // count code units below the minimum or with irrelevant data for the quick check 646 for(prevSrc=src; src!=limit;) { 647 if( (c=s.charAt(src))<minNoCP || 648 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 649 ) { 650 ++src; 651 } else if(!UTF16.isSurrogate((char)c)) { 652 break; 653 } else { 654 char c2; 655 if(UTF16Plus.isSurrogateLead(c)) { 656 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 657 c=Character.toCodePoint((char)c, c2); 658 } 659 } else /* trail surrogate */ { 660 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 661 --src; 662 c=Character.toCodePoint(c2, (char)c); 663 } 664 } 665 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 666 src+=Character.charCount(c); 667 } else { 668 break; 669 } 670 } 671 } 672 // copy these code units all at once 673 if(src!=prevSrc) { 674 if(buffer!=null) { 675 buffer.flushAndAppendZeroCC(s, prevSrc, src); 676 } else { 677 prevCC=0; 678 prevBoundary=src; 679 } 680 } 681 if(src==limit) { 682 break; 683 } 684 685 // Check one above-minimum, relevant code point. 686 src+=Character.charCount(c); 687 if(buffer!=null) { 688 decompose(c, norm16, buffer); 689 } else { 690 if(isDecompYes(norm16)) { 691 int cc=getCCFromYesOrMaybe(norm16); 692 if(prevCC<=cc || cc==0) { 693 prevCC=cc; 694 if(cc<=1) { 695 prevBoundary=src; 696 } 697 continue; 698 } 699 } 700 return prevBoundary; // "no" or cc out of order 701 } 702 } 703 return src; 704 } 705 706 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 707 int limit=s.length(); 708 if(limit==0) { 709 return; 710 } 711 if(doDecompose) { 712 decompose(s, 0, limit, buffer); 713 return; 714 } 715 // Just merge the strings at the boundary. 716 int c=Character.codePointAt(s, 0); 717 int src=0; 718 int firstCC, prevCC, cc; 719 firstCC=prevCC=cc=getCC(getNorm16(c)); 720 while(cc!=0) { 721 prevCC=cc; 722 src+=Character.charCount(c); 723 if(src>=limit) { 724 break; 725 } 726 c=Character.codePointAt(s, src); 727 cc=getCC(getNorm16(c)); 728 }; 729 buffer.append(s, 0, src, firstCC, prevCC); 730 buffer.append(s, src, limit); 731 } 732 733 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 734 // doCompose: normalize 735 // !doCompose: isNormalized (buffer must be empty and initialized) 736 public boolean compose(CharSequence s, int src, int limit, 737 boolean onlyContiguous, 738 boolean doCompose, 739 ReorderingBuffer buffer) { 740 int minNoMaybeCP=minCompNoMaybeCP; 741 742 /* 743 * prevBoundary points to the last character before the current one 744 * that has a composition boundary before it with ccc==0 and quick check "yes". 745 * Keeping track of prevBoundary saves us looking for a composition boundary 746 * when we find a "no" or "maybe". 747 * 748 * When we back out from prevSrc back to prevBoundary, 749 * then we also remove those same characters (which had been simply copied 750 * or canonically-order-inserted) from the ReorderingBuffer. 751 * Therefore, at all times, the [prevBoundary..prevSrc[ source units 752 * must correspond 1:1 to destination units at the end of the destination buffer. 753 */ 754 int prevBoundary=src; 755 int prevSrc; 756 int c=0; 757 int norm16=0; 758 759 // only for isNormalized 760 int prevCC=0; 761 762 for(;;) { 763 // count code units below the minimum or with irrelevant data for the quick check 764 for(prevSrc=src; src!=limit;) { 765 if( (c=s.charAt(src))<minNoMaybeCP || 766 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 767 ) { 768 ++src; 769 } else if(!UTF16.isSurrogate((char)c)) { 770 break; 771 } else { 772 char c2; 773 if(UTF16Plus.isSurrogateLead(c)) { 774 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 775 c=Character.toCodePoint((char)c, c2); 776 } 777 } else /* trail surrogate */ { 778 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 779 --src; 780 c=Character.toCodePoint(c2, (char)c); 781 } 782 } 783 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 784 src+=Character.charCount(c); 785 } else { 786 break; 787 } 788 } 789 } 790 // copy these code units all at once 791 if(src!=prevSrc) { 792 if(src==limit) { 793 if(doCompose) { 794 buffer.flushAndAppendZeroCC(s, prevSrc, src); 795 } 796 break; 797 } 798 // Set prevBoundary to the last character in the quick check loop. 799 prevBoundary=src-1; 800 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 801 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 802 ) { 803 --prevBoundary; 804 } 805 if(doCompose) { 806 // The last "quick check yes" character is excluded from the 807 // flush-and-append call in case it needs to be modified. 808 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 809 buffer.append(s, prevBoundary, src); 810 } else { 811 prevCC=0; 812 } 813 // The start of the current character (c). 814 prevSrc=src; 815 } else if(src==limit) { 816 break; 817 } 818 819 src+=Character.charCount(c); 820 /* 821 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 822 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 823 * or has ccc!=0. 824 * Check for Jamo V/T, then for regular characters. 825 * c is not a Hangul syllable or Jamo L because those have "yes" properties. 826 */ 827 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 828 char prev=s.charAt(prevSrc-1); 829 boolean needToDecompose=false; 830 if(c<Hangul.JAMO_T_BASE) { 831 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 832 prev-=Hangul.JAMO_L_BASE; 833 if(prev<Hangul.JAMO_L_COUNT) { 834 if(!doCompose) { 835 return false; 836 } 837 char syllable=(char) 838 (Hangul.HANGUL_BASE+ 839 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 840 Hangul.JAMO_T_COUNT); 841 char t; 842 if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 843 ++src; 844 syllable+=t; // The next character was a Jamo T. 845 prevBoundary=src; 846 buffer.setLastChar(syllable); 847 continue; 848 } 849 // If we see L+V+x where x!=T then we drop to the slow path, 850 // decompose and recompose. 851 // This is to deal with NFKC finding normal L and V but a 852 // compatibility variant of a T. We need to either fully compose that 853 // combination here (which would complicate the code and may not work 854 // with strange custom data) or use the slow path -- or else our replacing 855 // two input characters (L+V) with one output character (LV syllable) 856 // would violate the invariant that [prevBoundary..prevSrc[ has the same 857 // length as what we appended to the buffer since prevBoundary. 858 needToDecompose=true; 859 } 860 } else if(Hangul.isHangulWithoutJamoT(prev)) { 861 // c is a Jamo Trailing consonant, 862 // compose with previous Hangul LV that does not contain a Jamo T. 863 if(!doCompose) { 864 return false; 865 } 866 buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); 867 prevBoundary=src; 868 continue; 869 } 870 if(!needToDecompose) { 871 // The Jamo V/T did not compose into a Hangul syllable. 872 if(doCompose) { 873 buffer.append((char)c); 874 } else { 875 prevCC=0; 876 } 877 continue; 878 } 879 } 880 /* 881 * Source buffer pointers: 882 * 883 * all done quick check current char not yet 884 * "yes" but (c) processed 885 * may combine 886 * forward 887 * [-------------[-------------[-------------[-------------[ 888 * | | | | | 889 * orig. src prevBoundary prevSrc src limit 890 * 891 * 892 * Destination buffer pointers inside the ReorderingBuffer: 893 * 894 * all done might take not filled yet 895 * characters for 896 * reordering 897 * [-------------[-------------[-------------[ 898 * | | | | 899 * start reorderStart limit | 900 * +remainingCap.+ 901 */ 902 if(norm16>=MIN_YES_YES_WITH_CC) { 903 int cc=norm16&0xff; // cc!=0 904 if( onlyContiguous && // FCC 905 (doCompose ? buffer.getLastCC() : prevCC)==0 && 906 prevBoundary<prevSrc && 907 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 908 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 909 // passed the quick check "yes && ccc==0" test. 910 // Check whether the last character was a "yesYes" or a "yesNo". 911 // If a "yesNo", then we get its trailing ccc from its 912 // mapping and check for canonical order. 913 // All other cases are ok. 914 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 915 ) { 916 // Fails FCD test, need to decompose and contiguously recompose. 917 if(!doCompose) { 918 return false; 919 } 920 } else if(doCompose) { 921 buffer.append(c, cc); 922 continue; 923 } else if(prevCC<=cc) { 924 prevCC=cc; 925 continue; 926 } else { 927 return false; 928 } 929 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 930 return false; 931 } 932 933 /* 934 * Find appropriate boundaries around this character, 935 * decompose the source text from between the boundaries, 936 * and recompose it. 937 * 938 * We may need to remove the last few characters from the ReorderingBuffer 939 * to account for source text that was copied or appended 940 * but needs to take part in the recomposition. 941 */ 942 943 /* 944 * Find the last composition boundary in [prevBoundary..src[. 945 * It is either the decomposition of the current character (at prevSrc), 946 * or prevBoundary. 947 */ 948 if(hasCompBoundaryBefore(c, norm16)) { 949 prevBoundary=prevSrc; 950 } else if(doCompose) { 951 buffer.removeSuffix(prevSrc-prevBoundary); 952 } 953 954 // Find the next composition boundary in [src..limit[ - 955 // modifies src to point to the next starter. 956 src=findNextCompBoundary(s, src, limit); 957 958 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 959 int recomposeStartIndex=buffer.length(); 960 decomposeShort(s, prevBoundary, src, buffer); 961 recompose(buffer, recomposeStartIndex, onlyContiguous); 962 if(!doCompose) { 963 if(!buffer.equals(s, prevBoundary, src)) { 964 return false; 965 } 966 buffer.remove(); 967 prevCC=0; 968 } 969 970 // Move to the next starter. We never need to look back before this point again. 971 prevBoundary=src; 972 } 973 return true; 974 } 975 976 /** 977 * Very similar to compose(): Make the same changes in both places if relevant. 978 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 979 * !doSpan: quickCheck 980 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 981 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 982 * then the quick check result is "no" 983 */ 984 public int composeQuickCheck(CharSequence s, int src, int limit, 985 boolean onlyContiguous, boolean doSpan) { 986 int qcResult=0; 987 int minNoMaybeCP=minCompNoMaybeCP; 988 989 /* 990 * prevBoundary points to the last character before the current one 991 * that has a composition boundary before it with ccc==0 and quick check "yes". 992 */ 993 int prevBoundary=src; 994 int prevSrc; 995 int c=0; 996 int norm16=0; 997 int prevCC=0; 998 999 for(;;) { 1000 // count code units below the minimum or with irrelevant data for the quick check 1001 for(prevSrc=src;;) { 1002 if(src==limit) { 1003 return (src<<1)|qcResult; // "yes" or "maybe" 1004 } 1005 if( (c=s.charAt(src))<minNoMaybeCP || 1006 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 1007 ) { 1008 ++src; 1009 } else if(!UTF16.isSurrogate((char)c)) { 1010 break; 1011 } else { 1012 char c2; 1013 if(UTF16Plus.isSurrogateLead(c)) { 1014 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1015 c=Character.toCodePoint((char)c, c2); 1016 } 1017 } else /* trail surrogate */ { 1018 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1019 --src; 1020 c=Character.toCodePoint(c2, (char)c); 1021 } 1022 } 1023 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1024 src+=Character.charCount(c); 1025 } else { 1026 break; 1027 } 1028 } 1029 } 1030 if(src!=prevSrc) { 1031 // Set prevBoundary to the last character in the quick check loop. 1032 prevBoundary=src-1; 1033 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && 1034 Character.isHighSurrogate(s.charAt(prevBoundary-1)) 1035 ) { 1036 --prevBoundary; 1037 } 1038 prevCC=0; 1039 // The start of the current character (c). 1040 prevSrc=src; 1041 } 1042 1043 src+=Character.charCount(c); 1044 /* 1045 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1046 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1047 * or has ccc!=0. 1048 */ 1049 if(isMaybeOrNonZeroCC(norm16)) { 1050 int cc=getCCFromYesOrMaybe(norm16); 1051 if( onlyContiguous && // FCC 1052 cc!=0 && 1053 prevCC==0 && 1054 prevBoundary<prevSrc && 1055 // prevCC==0 && prevBoundary<prevSrc tell us that 1056 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1057 // passed the quick check "yes && ccc==0" test. 1058 // Check whether the last character was a "yesYes" or a "yesNo". 1059 // If a "yesNo", then we get its trailing ccc from its 1060 // mapping and check for canonical order. 1061 // All other cases are ok. 1062 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc 1063 ) { 1064 // Fails FCD test. 1065 } else if(prevCC<=cc || cc==0) { 1066 prevCC=cc; 1067 if(norm16<MIN_YES_YES_WITH_CC) { 1068 if(!doSpan) { 1069 qcResult=1; 1070 } else { 1071 return prevBoundary<<1; // spanYes does not care to know it's "maybe" 1072 } 1073 } 1074 continue; 1075 } 1076 } 1077 return prevBoundary<<1; // "no" 1078 } 1079 } 1080 1081 public void composeAndAppend(CharSequence s, 1082 boolean doCompose, 1083 boolean onlyContiguous, 1084 ReorderingBuffer buffer) { 1085 int src=0, limit=s.length(); 1086 if(!buffer.isEmpty()) { 1087 int firstStarterInSrc=findNextCompBoundary(s, 0, limit); 1088 if(0!=firstStarterInSrc) { 1089 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1090 buffer.length()); 1091 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1092 firstStarterInSrc+16); 1093 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1094 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1095 middle.append(s, 0, firstStarterInSrc); 1096 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1097 src=firstStarterInSrc; 1098 } 1099 } 1100 if(doCompose) { 1101 compose(s, src, limit, onlyContiguous, true, buffer); 1102 } else { 1103 buffer.append(s, src, limit); 1104 } 1105 } 1106 1107 // Dual functionality: 1108 // buffer!=NULL: normalize 1109 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1110 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1111 // Note: In this function we use buffer->appendZeroCC() because we track 1112 // the lead and trail combining classes here, rather than leaving it to 1113 // the ReorderingBuffer. 1114 // The exception is the call to decomposeShort() which uses the buffer 1115 // in the normal way. 1116 1117 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1118 // Similar to the prevBoundary in the compose() implementation. 1119 int prevBoundary=src; 1120 int prevSrc; 1121 int c=0; 1122 int prevFCD16=0; 1123 int fcd16=0; 1124 1125 for(;;) { 1126 // count code units with lccc==0 1127 for(prevSrc=src; src!=limit;) { 1128 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { 1129 prevFCD16=~c; 1130 ++src; 1131 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1132 prevFCD16=0; 1133 ++src; 1134 } else { 1135 if(UTF16.isSurrogate((char)c)) { 1136 char c2; 1137 if(UTF16Plus.isSurrogateLead(c)) { 1138 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1139 c=Character.toCodePoint((char)c, c2); 1140 } 1141 } else /* trail surrogate */ { 1142 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1143 --src; 1144 c=Character.toCodePoint(c2, (char)c); 1145 } 1146 } 1147 } 1148 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1149 prevFCD16=fcd16; 1150 src+=Character.charCount(c); 1151 } else { 1152 break; 1153 } 1154 } 1155 } 1156 // copy these code units all at once 1157 if(src!=prevSrc) { 1158 if(src==limit) { 1159 if(buffer!=null) { 1160 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1161 } 1162 break; 1163 } 1164 prevBoundary=src; 1165 // We know that the previous character's lccc==0. 1166 if(prevFCD16<0) { 1167 // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1168 int prev=~prevFCD16; 1169 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 1170 if(prevFCD16>1) { 1171 --prevBoundary; 1172 } 1173 } else { 1174 int p=src-1; 1175 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1176 Character.isHighSurrogate(s.charAt(p-1)) 1177 ) { 1178 --p; 1179 // Need to fetch the previous character's FCD value because 1180 // prevFCD16 was just for the trail surrogate code point. 1181 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1182 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1183 } 1184 if(prevFCD16>1) { 1185 prevBoundary=p; 1186 } 1187 } 1188 if(buffer!=null) { 1189 // The last lccc==0 character is excluded from the 1190 // flush-and-append call in case it needs to be modified. 1191 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1192 buffer.append(s, prevBoundary, src); 1193 } 1194 // The start of the current character (c). 1195 prevSrc=src; 1196 } else if(src==limit) { 1197 break; 1198 } 1199 1200 src+=Character.charCount(c); 1201 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1202 // Check for proper order, and decompose locally if necessary. 1203 if((prevFCD16&0xff)<=(fcd16>>8)) { 1204 // proper order: prev tccc <= current lccc 1205 if((fcd16&0xff)<=1) { 1206 prevBoundary=src; 1207 } 1208 if(buffer!=null) { 1209 buffer.appendZeroCC(c); 1210 } 1211 prevFCD16=fcd16; 1212 continue; 1213 } else if(buffer==null) { 1214 return prevBoundary; // quick check "no" 1215 } else { 1216 /* 1217 * Back out the part of the source that we copied or appended 1218 * already but is now going to be decomposed. 1219 * prevSrc is set to after what was copied/appended. 1220 */ 1221 buffer.removeSuffix(prevSrc-prevBoundary); 1222 /* 1223 * Find the part of the source that needs to be decomposed, 1224 * up to the next safe boundary. 1225 */ 1226 src=findNextFCDBoundary(s, src, limit); 1227 /* 1228 * The source text does not fulfill the conditions for FCD. 1229 * Decompose and reorder a limited piece of the text. 1230 */ 1231 decomposeShort(s, prevBoundary, src, buffer); 1232 prevBoundary=src; 1233 prevFCD16=0; 1234 } 1235 } 1236 return src; 1237 } 1238 1239 // Note: hasDecompBoundary() could be implemented as aliases to 1240 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 1241 // at the cost of building the FCD trie for a decomposition normalizer. 1242 public boolean hasDecompBoundary(int c, boolean before) { 1243 for(;;) { 1244 if(c<minDecompNoCP) { 1245 return true; 1246 } 1247 int norm16=getNorm16(c); 1248 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 1249 return true; 1250 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 1251 return false; // ccc!=0 1252 } else if(isDecompNoAlgorithmic(norm16)) { 1253 c=mapAlgorithmic(c, norm16); 1254 } else { 1255 // c decomposes, get everything from the variable-length extra data 1256 int firstUnit=extraData.charAt(norm16); 1257 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1258 return false; 1259 } 1260 if(!before) { 1261 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1262 // fcd16<=1 || trailCC==0 1263 if(firstUnit>0x1ff) { 1264 return false; // trailCC>1 1265 } 1266 if(firstUnit<=0xff) { 1267 return true; // trailCC==0 1268 } 1269 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1270 } 1271 // true if leadCC==0 (hasFCDBoundaryBefore()) 1272 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; 1273 } 1274 } 1275 } 1276 1277 public boolean hasCompBoundaryBefore(int c) { 1278 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 1279 } 1280 1281 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1282 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1283 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1284 private boolean isHangul(int norm16) { return norm16==minYesNo; } 1285 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1286 1287 // UBool isCompYes(uint16_t norm16) const { 1288 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1289 // } 1290 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1291 // return norm16<minNoNo || minMaybeYes<=norm16; 1292 // } 1293 // private boolean hasZeroCCFromDecompYes(int norm16) { 1294 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1295 // } 1296 private boolean isDecompYesAndZeroCC(int norm16) { 1297 return norm16<minYesNo || 1298 norm16==JAMO_VT || 1299 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1300 } 1301 1302 /** 1303 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1304 * the MaybeYes which combine-forward and have ccc=0. 1305 * (Standard Unicode 5.2 normalization does not have such characters.) 1306 */ 1307 private boolean isMostDecompYesAndZeroCC(int norm16) { 1308 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1309 } 1310 1311 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1312 1313 // For use with isCompYes(). 1314 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1315 // static uint8_t getCCFromYes(uint16_t norm16) { 1316 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 1317 // } 1318 private int getCCFromNoNo(int norm16) { 1319 if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1320 return extraData.charAt(norm16-1)&0xff; 1321 } else { 1322 return 0; 1323 } 1324 } 1325 1326 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 1327 int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { 1328 int c; 1329 if(cpStart==(cpLimit-1)) { 1330 c=s.charAt(cpStart); 1331 } else { 1332 c=Character.codePointAt(s, cpStart); 1333 } 1334 int prevNorm16=getNorm16(c); 1335 if(prevNorm16<=minYesNo) { 1336 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 1337 } else { 1338 return extraData.charAt(prevNorm16)>>8; // tccc from yesNo 1339 } 1340 } 1341 1342 // Requires algorithmic-NoNo. 1343 private int mapAlgorithmic(int c, int norm16) { 1344 return c+norm16-(minMaybeYes-MAX_DELTA-1); 1345 } 1346 1347 // Requires minYesNo<norm16<limitNoNo. 1348 // private int getMapping(int norm16) { return /*extraData+*/norm16; } 1349 1350 /** 1351 * @return index into maybeYesCompositions, or -1 1352 */ 1353 private int getCompositionsListForDecompYes(int norm16) { 1354 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 1355 return -1; 1356 } else { 1357 if((norm16-=minMaybeYes)<0) { 1358 // norm16<minMaybeYes: index into extraData which is a substring at 1359 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1360 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1361 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1362 } 1363 return norm16; 1364 } 1365 } 1366 1367 /** 1368 * @return index into maybeYesCompositions 1369 */ 1370 private int getCompositionsListForComposite(int norm16) { 1371 // composite has both mapping & compositions list 1372 int firstUnit=extraData.charAt(norm16); 1373 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions 1374 1+ // +1 to skip the first unit with the mapping lenth 1375 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1376 } 1377 1378 // Decompose a short piece of text which is likely to contain characters that 1379 // fail the quick check loop and/or where the quick check loop's overhead 1380 // is unlikely to be amortized. 1381 // Called by the compose() and makeFCD() implementations. 1382 // Public in Java for collation implementation code. 1383 public void decomposeShort(CharSequence s, int src, int limit, 1384 ReorderingBuffer buffer) { 1385 while(src<limit) { 1386 int c=Character.codePointAt(s, src); 1387 src+=Character.charCount(c); 1388 decompose(c, getNorm16(c), buffer); 1389 } 1390 } 1391 1392 private void decompose(int c, int norm16, 1393 ReorderingBuffer buffer) { 1394 // Only loops for 1:1 algorithmic mappings. 1395 for(;;) { 1396 // get the decomposition and the lead and trail cc's 1397 if(isDecompYes(norm16)) { 1398 // c does not decompose 1399 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1400 } else if(isHangul(norm16)) { 1401 // Hangul syllable: decompose algorithmically 1402 Hangul.decompose(c, buffer); 1403 } else if(isDecompNoAlgorithmic(norm16)) { 1404 c=mapAlgorithmic(c, norm16); 1405 norm16=getNorm16(c); 1406 continue; 1407 } else { 1408 // c decomposes, get everything from the variable-length extra data 1409 int firstUnit=extraData.charAt(norm16); 1410 int length=firstUnit&MAPPING_LENGTH_MASK; 1411 int leadCC, trailCC; 1412 trailCC=firstUnit>>8; 1413 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1414 leadCC=extraData.charAt(norm16-1)>>8; 1415 } else { 1416 leadCC=0; 1417 } 1418 ++norm16; // skip over the firstUnit 1419 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); 1420 } 1421 return; 1422 } 1423 } 1424 1425 /** 1426 * Finds the recomposition result for 1427 * a forward-combining "lead" character, 1428 * specified with a pointer to its compositions list, 1429 * and a backward-combining "trail" character. 1430 * 1431 * <p>If the lead and trail characters combine, then this function returns 1432 * the following "compositeAndFwd" value: 1433 * <pre> 1434 * Bits 21..1 composite character 1435 * Bit 0 set if the composite is a forward-combining starter 1436 * </pre> 1437 * otherwise it returns -1. 1438 * 1439 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1440 * encoded as either pairs or triples of 16-bit units. 1441 * The last entry has the high bit of its first unit set. 1442 * 1443 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1444 * A linear search is used. 1445 * 1446 * <p>See normalizer2impl.h for a more detailed description 1447 * of the compositions list format. 1448 */ 1449 private static int combine(String compositions, int list, int trail) { 1450 int key1, firstUnit; 1451 if(trail<COMP_1_TRAIL_LIMIT) { 1452 // trail character is 0..33FF 1453 // result entry may have 2 or 3 units 1454 key1=(trail<<1); 1455 while(key1>(firstUnit=compositions.charAt(list))) { 1456 list+=2+(firstUnit&COMP_1_TRIPLE); 1457 } 1458 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1459 if((firstUnit&COMP_1_TRIPLE)!=0) { 1460 return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1461 } else { 1462 return compositions.charAt(list+1); 1463 } 1464 } 1465 } else { 1466 // trail character is 3400..10FFFF 1467 // result entry has 3 units 1468 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1469 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1470 int secondUnit; 1471 for(;;) { 1472 if(key1>(firstUnit=compositions.charAt(list))) { 1473 list+=2+(firstUnit&COMP_1_TRIPLE); 1474 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1475 if(key2>(secondUnit=compositions.charAt(list+1))) { 1476 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1477 break; 1478 } else { 1479 list+=3; 1480 } 1481 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1482 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1483 } else { 1484 break; 1485 } 1486 } else { 1487 break; 1488 } 1489 } 1490 } 1491 return -1; 1492 } 1493 1494 /* 1495 * Recomposes the buffer text starting at recomposeStartIndex 1496 * (which is in NFD - decomposed and canonically ordered), 1497 * and truncates the buffer contents. 1498 * 1499 * Note that recomposition never lengthens the text: 1500 * Any character consists of either one or two code units; 1501 * a composition may contain at most one more code unit than the original starter, 1502 * while the combining mark that is removed has at least one code unit. 1503 */ 1504 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1505 boolean onlyContiguous) { 1506 StringBuilder sb=buffer.getStringBuilder(); 1507 int p=recomposeStartIndex; 1508 if(p==sb.length()) { 1509 return; 1510 } 1511 1512 int starter, pRemove; 1513 int compositionsList; 1514 int c, compositeAndFwd; 1515 int norm16; 1516 int cc, prevCC; 1517 boolean starterIsSupplementary; 1518 1519 // Some of the following variables are not used until we have a forward-combining starter 1520 // and are only initialized now to avoid compiler warnings. 1521 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1522 starter=-1; 1523 starterIsSupplementary=false; 1524 prevCC=0; 1525 1526 for(;;) { 1527 c=sb.codePointAt(p); 1528 p+=Character.charCount(c); 1529 norm16=getNorm16(c); 1530 cc=getCCFromYesOrMaybe(norm16); 1531 if( // this character combines backward and 1532 isMaybe(norm16) && 1533 // we have seen a starter that combines forward and 1534 compositionsList>=0 && 1535 // the backward-combining character is not blocked 1536 (prevCC<cc || prevCC==0)) { 1537 if(isJamoVT(norm16)) { 1538 // c is a Jamo V/T, see if we can compose it with the previous character. 1539 if(c<Hangul.JAMO_T_BASE) { 1540 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1541 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1542 if(prev<Hangul.JAMO_L_COUNT) { 1543 pRemove=p-1; 1544 char syllable=(char) 1545 (Hangul.HANGUL_BASE+ 1546 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1547 Hangul.JAMO_T_COUNT); 1548 char t; 1549 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1550 ++p; 1551 syllable+=t; // The next character was a Jamo T. 1552 } 1553 sb.setCharAt(starter, syllable); 1554 // remove the Jamo V/T 1555 sb.delete(pRemove, p); 1556 p=pRemove; 1557 } 1558 } 1559 /* 1560 * No "else" for Jamo T: 1561 * Since the input is in NFD, there are no Hangul LV syllables that 1562 * a Jamo T could combine with. 1563 * All Jamo Ts are combined above when handling Jamo Vs. 1564 */ 1565 if(p==sb.length()) { 1566 break; 1567 } 1568 compositionsList=-1; 1569 continue; 1570 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1571 // The starter and the combining mark (c) do combine. 1572 int composite=compositeAndFwd>>1; 1573 1574 // Remove the combining mark. 1575 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1576 sb.delete(pRemove, p); 1577 p=pRemove; 1578 // Replace the starter with the composite. 1579 if(starterIsSupplementary) { 1580 if(composite>0xffff) { 1581 // both are supplementary 1582 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1583 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1584 } else { 1585 sb.setCharAt(starter, (char)c); 1586 sb.deleteCharAt(starter+1); 1587 // The composite is shorter than the starter, 1588 // move the intermediate characters forward one. 1589 starterIsSupplementary=false; 1590 --p; 1591 } 1592 } else if(composite>0xffff) { 1593 // The composite is longer than the starter, 1594 // move the intermediate characters back one. 1595 starterIsSupplementary=true; 1596 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1597 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1598 ++p; 1599 } else { 1600 // both are on the BMP 1601 sb.setCharAt(starter, (char)composite); 1602 } 1603 1604 // Keep prevCC because we removed the combining mark. 1605 1606 if(p==sb.length()) { 1607 break; 1608 } 1609 // Is the composite a starter that combines forward? 1610 if((compositeAndFwd&1)!=0) { 1611 compositionsList= 1612 getCompositionsListForComposite(getNorm16(composite)); 1613 } else { 1614 compositionsList=-1; 1615 } 1616 1617 // We combined; continue with looking for compositions. 1618 continue; 1619 } 1620 } 1621 1622 // no combination this time 1623 prevCC=cc; 1624 if(p==sb.length()) { 1625 break; 1626 } 1627 1628 // If c did not combine, then check if it is a starter. 1629 if(cc==0) { 1630 // Found a new starter. 1631 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 1632 // It may combine with something, prepare for it. 1633 if(c<=0xffff) { 1634 starterIsSupplementary=false; 1635 starter=p-1; 1636 } else { 1637 starterIsSupplementary=true; 1638 starter=p-2; 1639 } 1640 } 1641 } else if(onlyContiguous) { 1642 // FCC: no discontiguous compositions; any intervening character blocks. 1643 compositionsList=-1; 1644 } 1645 } 1646 buffer.flush(); 1647 } 1648 1649 /** 1650 * Does c have a composition boundary before it? 1651 * True if its decomposition begins with a character that has 1652 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1653 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1654 * (isCompYesAndZeroCC()) so we need not decompose. 1655 */ 1656 private boolean hasCompBoundaryBefore(int c, int norm16) { 1657 for(;;) { 1658 if(isCompYesAndZeroCC(norm16)) { 1659 return true; 1660 } else if(isMaybeOrNonZeroCC(norm16)) { 1661 return false; 1662 } else if(isDecompNoAlgorithmic(norm16)) { 1663 c=mapAlgorithmic(c, norm16); 1664 norm16=getNorm16(c); 1665 } else { 1666 // c decomposes, get everything from the variable-length extra data 1667 int firstUnit=extraData.charAt(norm16); 1668 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1669 return false; 1670 } 1671 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { 1672 return false; // non-zero leadCC 1673 } 1674 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); 1675 } 1676 } 1677 } 1678 1679 private int findPreviousCompBoundary(CharSequence s, int p) { 1680 while(p>0) { 1681 int c=Character.codePointBefore(s, p); 1682 p-=Character.charCount(c); 1683 if(hasCompBoundaryBefore(c)) { 1684 break; 1685 } 1686 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1687 // but that's probably not worth the extra cost. 1688 } 1689 return p; 1690 } 1691 1692 private int findNextCompBoundary(CharSequence s, int p, int limit) { 1693 while(p<limit) { 1694 int c=Character.codePointAt(s, p); 1695 int norm16=normTrie.get(c); 1696 if(hasCompBoundaryBefore(c, norm16)) { 1697 break; 1698 } 1699 p+=Character.charCount(c); 1700 } 1701 return p; 1702 } 1703 1704 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 1705 while(p<limit) { 1706 int c=Character.codePointAt(s, p); 1707 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { 1708 break; 1709 } 1710 p+=Character.charCount(c); 1711 } 1712 return p; 1713 } 1714 1715 /** 1716 * Get the canonical decomposition 1717 * sherman for ComposedCharIter 1718 */ 1719 public static int getDecompose(int chars[], String decomps[]) { 1720 Normalizer2 impl = Normalizer2.getNFDInstance(); 1721 1722 int length=0; 1723 int norm16 = 0; 1724 int ch = -1; 1725 int i = 0; 1726 1727 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 1728 //TBD !!!! the hack code heres save us about 50ms for startup 1729 //need a better solution/lookup 1730 if (ch == 0x30ff) 1731 ch = 0xf900; 1732 else if (ch == 0x115bc) 1733 ch = 0x1d15e; 1734 else if (ch == 0x1d1c1) 1735 ch = 0x2f800; 1736 1737 String s = impl.getDecomposition(ch); 1738 1739 if(s != null && i < chars.length) { 1740 chars[i] = ch; 1741 decomps[i++] = s; 1742 } 1743 } 1744 return i; 1745 } 1746 1747 //------------------------------------------------------ 1748 // special method for Collation (RBTableBuilder.build()) 1749 //------------------------------------------------------ 1750 private static boolean needSingleQuotation(char c) { 1751 return (c >= 0x0009 && c <= 0x000D) || 1752 (c >= 0x0020 && c <= 0x002F) || 1753 (c >= 0x003A && c <= 0x0040) || 1754 (c >= 0x005B && c <= 0x0060) || 1755 (c >= 0x007B && c <= 0x007E); 1756 } 1757 1758 public static String canonicalDecomposeWithSingleQuotation(String string) { 1759 Normalizer2 impl = Normalizer2.getNFDInstance(); 1760 char[] src = string.toCharArray(); 1761 int srcIndex = 0; 1762 int srcLimit = src.length; 1763 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 1764 int destIndex = 0; 1765 int destLimit = dest.length; 1766 1767 int prevSrc; 1768 String norm; 1769 int reorderStartIndex, length; 1770 char c1, c2; 1771 int cp; 1772 int minNoMaybe = 0x00c0; 1773 int cc, prevCC, trailCC; 1774 char[] p; 1775 int pStart; 1776 1777 // initialize 1778 reorderStartIndex = 0; 1779 prevCC = 0; 1780 norm = null; 1781 cp = 0; 1782 pStart = 0; 1783 1784 cc = trailCC = -1; // initialize to bogus value 1785 c1 = 0; 1786 for (;;) { 1787 prevSrc=srcIndex; 1788 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 1789 while (srcIndex != srcLimit && 1790 ((c1 = src[srcIndex]) < minNoMaybe || 1791 (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || 1792 (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables 1793 prevCC = 0; 1794 srcIndex += (cp < 0x10000) ? 1 : 2; 1795 } 1796 1797 // copy these code units all at once 1798 if (srcIndex != prevSrc) { 1799 length = srcIndex - prevSrc; 1800 if ((destIndex + length) <= destLimit) { 1801 System.arraycopy(src,prevSrc,dest,destIndex,length); 1802 } 1803 1804 destIndex += length; 1805 reorderStartIndex = destIndex; 1806 } 1807 1808 // end of source reached? 1809 if (srcIndex == srcLimit) { 1810 break; 1811 } 1812 1813 // cp already contains *src and norm32 is set for it, increment src 1814 srcIndex += (cp < 0x10000) ? 1 : 2; 1815 1816 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1817 c2 = 0; 1818 length = 1; 1819 1820 if (Character.isHighSurrogate(c1) 1821 || Character.isLowSurrogate(c1)) { 1822 norm = null; 1823 } 1824 } else { 1825 length = 2; 1826 c2 = src[srcIndex-1]; 1827 } 1828 1829 // get the decomposition and the lead and trail cc's 1830 if (norm == null) { 1831 // cp does not decompose 1832 cc = trailCC = UCharacter.getCombiningClass(cp); 1833 p = null; 1834 pStart = -1; 1835 } else { 1836 1837 pStart = 0; 1838 p = norm.toCharArray(); 1839 length = p.length; 1840 int cpNum = norm.codePointCount(0, length); 1841 cc= UCharacter.getCombiningClass(norm.codePointAt(0)); 1842 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); 1843 if (length == 1) { 1844 // fastpath a single code unit from decomposition 1845 c1 = p[pStart]; 1846 c2 = 0; 1847 p = null; 1848 pStart = -1; 1849 } 1850 } 1851 1852 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 1853 // buffer overflow 1854 char[] tmpBuf = new char[destLimit * 2]; 1855 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 1856 dest = tmpBuf; 1857 destLimit = dest.length; 1858 } 1859 1860 // append the decomposition to the destination buffer, assume length>0 1861 { 1862 int reorderSplit = destIndex; 1863 if (p == null) { 1864 // fastpath: single code point 1865 if (needSingleQuotation(c1)) { 1866 //if we need single quotation, no need to consider "prevCC" 1867 //and it must NOT be a supplementary pair 1868 dest[destIndex++] = '\''; 1869 dest[destIndex++] = c1; 1870 dest[destIndex++] = '\''; 1871 trailCC = 0; 1872 } else if(cc != 0 && cc < prevCC) { 1873 // (c1, c2) is out of order with respect to the preceding 1874 // text 1875 destIndex += length; 1876 trailCC = insertOrdered(dest, reorderStartIndex, 1877 reorderSplit, destIndex, c1, c2, cc); 1878 } else { 1879 // just append (c1, c2) 1880 dest[destIndex++] = c1; 1881 if(c2 != 0) { 1882 dest[destIndex++] = c2; 1883 } 1884 } 1885 } else { 1886 // general: multiple code points (ordered by themselves) 1887 // from decomposition 1888 if (needSingleQuotation(p[pStart])) { 1889 dest[destIndex++] = '\''; 1890 dest[destIndex++] = p[pStart++]; 1891 dest[destIndex++] = '\''; 1892 length--; 1893 do { 1894 dest[destIndex++] = p[pStart++]; 1895 } while(--length > 0); 1896 } else if (cc != 0 && cc < prevCC) { 1897 destIndex += length; 1898 trailCC = mergeOrdered(dest, reorderStartIndex, 1899 reorderSplit, p, pStart, 1900 pStart+length); 1901 } else { 1902 // just append the decomposition 1903 do { 1904 dest[destIndex++] = p[pStart++]; 1905 } while (--length > 0); 1906 } 1907 } 1908 } 1909 prevCC = trailCC; 1910 if(prevCC == 0) { 1911 reorderStartIndex = destIndex; 1912 } 1913 } 1914 1915 return new String(dest, 0, destIndex); 1916 } 1917 1918 /** 1919 * simpler, single-character version of mergeOrdered() - 1920 * bubble-insert one single code point into the preceding string 1921 * which is already canonically ordered 1922 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 1923 * 1924 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 1925 * 1926 * before: src[start]..src[current] is already ordered, and 1927 * src[current]..src[p] may or may not hold (c, c2) but 1928 * must be exactly the same length as (c, c2) 1929 * after: src[start]..src[p] is ordered 1930 * 1931 * @return the trailing combining class 1932 */ 1933 private static int/*unsigned byte*/ insertOrdered(char[] source, 1934 int start, 1935 int current, int p, 1936 char c1, char c2, 1937 int/*unsigned byte*/ cc) { 1938 int back, preBack; 1939 int r; 1940 int prevCC, trailCC=cc; 1941 1942 if (start<current && cc!=0) { 1943 // search for the insertion point where cc>=prevCC 1944 preBack=back=current; 1945 1946 PrevArgs prevArgs = new PrevArgs(); 1947 prevArgs.current = current; 1948 prevArgs.start = start; 1949 prevArgs.src = source; 1950 prevArgs.c1 = c1; 1951 prevArgs.c2 = c2; 1952 1953 // get the prevCC 1954 prevCC=getPrevCC(prevArgs); 1955 preBack = prevArgs.current; 1956 1957 if(cc<prevCC) { 1958 // this will be the last code point, so keep its cc 1959 trailCC=prevCC; 1960 back=preBack; 1961 while(start<preBack) { 1962 prevCC=getPrevCC(prevArgs); 1963 preBack=prevArgs.current; 1964 if(cc>=prevCC) { 1965 break; 1966 } 1967 back=preBack; 1968 } 1969 1970 // this is where we are right now with all these indicies: 1971 // [start]..[pPreBack] 0..? code points that we can ignore 1972 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 1973 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 1974 // [current]..[p] 1 code point (c, c2) with cc 1975 1976 // move the code units in between up 1977 r=p; 1978 do { 1979 source[--r]=source[--current]; 1980 } while (back!=current); 1981 } 1982 } 1983 1984 // insert (c1, c2) 1985 source[current] = c1; 1986 if (c2!=0) { 1987 source[(current+1)] = c2; 1988 } 1989 1990 // we know the cc of the last code point 1991 return trailCC; 1992 } 1993 1994 /** 1995 * merge two UTF-16 string parts together 1996 * to canonically order (order by combining classes) their concatenation 1997 * 1998 * the two strings may already be adjacent, so that the merging is done 1999 * in-place if the two strings are not adjacent, then the buffer holding the 2000 * first one must be large enough 2001 * the second string may or may not be ordered in itself 2002 * 2003 * before: [start]..[current] is already ordered, and 2004 * [next]..[limit] may be ordered in itself, but 2005 * is not in relation to [start..current[ 2006 * after: [start..current+(limit-next)[ is ordered 2007 * 2008 * the algorithm is a simple bubble-sort that takes the characters from 2009 * src[next++] and inserts them in correct combining class order into the 2010 * preceding part of the string 2011 * 2012 * since this function is called much less often than the single-code point 2013 * insertOrdered(), it just uses that for easier maintenance 2014 * 2015 * @return the trailing combining class 2016 */ 2017 private static int /*unsigned byte*/ mergeOrdered(char[] source, 2018 int start, 2019 int current, 2020 char[] data, 2021 int next, 2022 int limit) { 2023 int r; 2024 int /*unsigned byte*/ cc, trailCC=0; 2025 boolean adjacent; 2026 2027 adjacent= current==next; 2028 NextCCArgs ncArgs = new NextCCArgs(); 2029 ncArgs.source = data; 2030 ncArgs.next = next; 2031 ncArgs.limit = limit; 2032 2033 if(start!=current) { 2034 2035 while(ncArgs.next<ncArgs.limit) { 2036 cc=getNextCC(ncArgs); 2037 if(cc==0) { 2038 // does not bubble back 2039 trailCC=0; 2040 if(adjacent) { 2041 current=ncArgs.next; 2042 } else { 2043 data[current++]=ncArgs.c1; 2044 if(ncArgs.c2!=0) { 2045 data[current++]=ncArgs.c2; 2046 } 2047 } 2048 break; 2049 } else { 2050 r=current+(ncArgs.c2==0 ? 1 : 2); 2051 trailCC=insertOrdered(source,start, current, r, 2052 ncArgs.c1, ncArgs.c2, cc); 2053 current=r; 2054 } 2055 } 2056 } 2057 2058 if(ncArgs.next==ncArgs.limit) { 2059 // we know the cc of the last code point 2060 return trailCC; 2061 } else { 2062 if(!adjacent) { 2063 // copy the second string part 2064 do { 2065 source[current++]=data[ncArgs.next++]; 2066 } while(ncArgs.next!=ncArgs.limit); 2067 ncArgs.limit=current; 2068 } 2069 PrevArgs prevArgs = new PrevArgs(); 2070 prevArgs.src = data; 2071 prevArgs.start = start; 2072 prevArgs.current = ncArgs.limit; 2073 return getPrevCC(prevArgs); 2074 } 2075 2076 } 2077 2078 private static final class PrevArgs{ 2079 char[] src; 2080 int start; 2081 int current; 2082 char c1; 2083 char c2; 2084 } 2085 2086 private static final class NextCCArgs{ 2087 char[] source; 2088 int next; 2089 int limit; 2090 char c1; 2091 char c2; 2092 } 2093 2094 private static int /*unsigned*/ getPrevCC(PrevArgs args) { 2095 args.c1=args.src[--args.current]; 2096 args.c2=0; 2097 2098 if (args.c1 < MIN_CCC_LCCC_CP) { 2099 return 0; 2100 } else if (UTF16.isLeadSurrogate(args.c1)) { 2101 /* unpaired first surrogate */ 2102 return 0; 2103 } else if (!UTF16.isTrailSurrogate(args.c1)) { 2104 return UCharacter.getCombiningClass(args.c1); 2105 } else if (args.current!=args.start && 2106 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 2107 --args.current; 2108 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); 2109 } else { 2110 /* unpaired second surrogate */ 2111 args.c2=0; 2112 return 0; 2113 } 2114 } 2115 2116 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 2117 args.c1=args.source[args.next++]; 2118 args.c2=0; 2119 2120 if (UTF16.isTrailSurrogate(args.c1)) { 2121 /* unpaired second surrogate */ 2122 return 0; 2123 } else if (!UTF16.isLeadSurrogate(args.c1)) { 2124 return UCharacter.getCombiningClass(args.c1); 2125 } else if (args.next!=args.limit && 2126 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 2127 ++args.next; 2128 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); 2129 } else { 2130 /* unpaired first surrogate */ 2131 args.c2=0; 2132 return 0; 2133 } 2134 } 2135 2136 private VersionInfo dataVersion; 2137 2138 // Code point thresholds for quick check codes. 2139 private int minDecompNoCP; 2140 private int minCompNoMaybeCP; 2141 2142 // Norm16 value thresholds for quick check combinations and types of extra data. 2143 private int minYesNo; 2144 private int minYesNoMappingsOnly; 2145 private int minNoNo; 2146 private int limitNoNo; 2147 private int minMaybeYes; 2148 2149 private Trie2_16 normTrie; 2150 private String maybeYesCompositions; 2151 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2152 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2153 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F 2154 2155 }