1 /* 2 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 package sun.text.normalizer; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 import java.text.Normalizer; 37 38 // Original filename in ICU4J: Normalizer2Impl.java 39 public final class NormalizerImpl { 40 public static final class Hangul { 41 /* Korean Hangul and Jamo constants */ 42 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 43 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 44 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 45 46 public static final int HANGUL_BASE=0xac00; 47 public static final int HANGUL_END=0xd7a3; 48 49 public static final int JAMO_L_COUNT=19; 50 public static final int JAMO_V_COUNT=21; 51 public static final int JAMO_T_COUNT=28; 52 53 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 54 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 55 56 public static boolean isHangul(int c) { 57 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 58 } 59 public static boolean isHangulLV(int c) { 60 c-=HANGUL_BASE; 61 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 62 } 63 64 /** 65 * Decomposes c, which must be a Hangul syllable, into buffer 66 * and returns the length of the decomposition (2 or 3). 67 */ 68 public static int decompose(int c, Appendable buffer) { 69 try { 70 c-=HANGUL_BASE; 71 int c2=c%JAMO_T_COUNT; 72 c/=JAMO_T_COUNT; 73 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 74 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 75 if(c2==0) { 76 return 2; 77 } else { 78 buffer.append((char)(JAMO_T_BASE+c2)); 79 return 3; 80 } 81 } catch(IOException e) { 82 throw new InternalError(e); 83 } 84 } 85 } 86 87 /** 88 * Writable buffer that takes care of canonical ordering. 89 * Its Appendable methods behave like the C++ implementation's 90 * appendZeroCC() methods. 91 * <p> 92 * If dest is a StringBuilder, then the buffer writes directly to it. 93 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 94 * until no further changes are necessary and whole segments are appended. 95 * append() methods that take combining-class values always write to the StringBuilder. 96 * Other append() methods flush and append to the Appendable. 97 */ 98 public static final class ReorderingBuffer implements Appendable { 99 public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { 100 impl=ni; 101 app=dest; 102 if (app instanceof StringBuilder) { 103 appIsStringBuilder=true; 104 str=(StringBuilder)dest; 105 // In Java, the constructor subsumes public void init(int destCapacity) 106 str.ensureCapacity(destCapacity); 107 reorderStart=0; 108 if(str.length()==0) { 109 lastCC=0; 110 } else { 111 setIterator(); 112 lastCC=previousCC(); 113 // Set reorderStart after the last code point with cc<=1 if there is one. 114 if(lastCC>1) { 115 while(previousCC()>1) {} 116 } 117 reorderStart=codePointLimit; 118 } 119 } else { 120 appIsStringBuilder=false; 121 str=new StringBuilder(); 122 reorderStart=0; 123 lastCC=0; 124 } 125 } 126 127 public boolean isEmpty() { return str.length()==0; } 128 public int length() { return str.length(); } 129 public int getLastCC() { return lastCC; } 130 131 public StringBuilder getStringBuilder() { return str; } 132 133 public boolean equals(CharSequence s, int start, int limit) { 134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 135 } 136 137 public void append(int c, int cc) { 138 if(lastCC<=cc || cc==0) { 139 str.appendCodePoint(c); 140 lastCC=cc; 141 if(cc<=1) { 142 reorderStart=str.length(); 143 } 144 } else { 145 insert(c, cc); 146 } 147 } 148 public void append(CharSequence s, int start, int limit, boolean isNFD, 149 int leadCC, int trailCC) { 150 if(start==limit) { 151 return; 152 } 153 if(lastCC<=leadCC || leadCC==0) { 154 if(trailCC<=1) { 155 reorderStart=str.length()+(limit-start); 156 } else if(leadCC<=1) { 157 reorderStart=str.length()+1; // Ok if not a code point boundary. 158 } 159 str.append(s, start, limit); 160 lastCC=trailCC; 161 } else { 162 int c=Character.codePointAt(s, start); 163 start+=Character.charCount(c); 164 insert(c, leadCC); // insert first code point 165 while(start<limit) { 166 c=Character.codePointAt(s, start); 167 start+=Character.charCount(c); 168 if(start<limit) { 169 if (isNFD) { 170 leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); 171 } else { 172 leadCC = impl.getCC(impl.getNorm16(c)); 173 } 174 } else { 175 leadCC=trailCC; 176 } 177 append(c, leadCC); 178 } 179 } 180 } 181 // The following append() methods work like C++ appendZeroCC(). 182 // They assume that the cc or trailCC of their input is 0. 183 // Most of them implement Appendable interface methods. 184 @Override 185 public ReorderingBuffer append(char c) { 186 str.append(c); 187 lastCC=0; 188 reorderStart=str.length(); 189 return this; 190 } 191 public void appendZeroCC(int c) { 192 str.appendCodePoint(c); 193 lastCC=0; 194 reorderStart=str.length(); 195 } 196 @Override 197 public ReorderingBuffer append(CharSequence s) { 198 if(s.length()!=0) { 199 str.append(s); 200 lastCC=0; 201 reorderStart=str.length(); 202 } 203 return this; 204 } 205 @Override 206 public ReorderingBuffer append(CharSequence s, int start, int limit) { 207 if(start!=limit) { 208 str.append(s, start, limit); 209 lastCC=0; 210 reorderStart=str.length(); 211 } 212 return this; 213 } 214 /** 215 * Flushes from the intermediate StringBuilder to the Appendable, 216 * if they are different objects. 217 * Used after recomposition. 218 * Must be called at the end when writing to a non-StringBuilder Appendable. 219 */ 220 public void flush() { 221 if(appIsStringBuilder) { 222 reorderStart=str.length(); 223 } else { 224 try { 225 app.append(str); 226 str.setLength(0); 227 reorderStart=0; 228 } catch(IOException e) { 229 throw new InternalError(e); // Avoid declaring "throws IOException". 230 } 231 } 232 lastCC=0; 233 } 234 /** 235 * Flushes from the intermediate StringBuilder to the Appendable, 236 * if they are different objects. 237 * Then appends the new text to the Appendable or StringBuilder. 238 * Normally used after quick check loops find a non-empty sequence. 239 */ 240 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 241 if(appIsStringBuilder) { 242 str.append(s, start, limit); 243 reorderStart=str.length(); 244 } else { 245 try { 246 app.append(str).append(s, start, limit); 247 str.setLength(0); 248 reorderStart=0; 249 } catch(IOException e) { 250 throw new InternalError(e); // Avoid declaring "throws IOException". 251 } 252 } 253 lastCC=0; 254 return this; 255 } 256 public void remove() { 257 str.setLength(0); 258 lastCC=0; 259 reorderStart=0; 260 } 261 public void removeSuffix(int suffixLength) { 262 int oldLength=str.length(); 263 str.delete(oldLength-suffixLength, oldLength); 264 lastCC=0; 265 reorderStart=str.length(); 266 } 267 268 // Inserts c somewhere before the last character. 269 // Requires 0<cc<lastCC which implies reorderStart<limit. 270 private void insert(int c, int cc) { 271 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 272 // insert c at codePointLimit, after the character with prevCC<=cc 273 if(c<=0xffff) { 274 str.insert(codePointLimit, (char)c); 275 if(cc<=1) { 276 reorderStart=codePointLimit+1; 277 } 278 } else { 279 str.insert(codePointLimit, Character.toChars(c)); 280 if(cc<=1) { 281 reorderStart=codePointLimit+2; 282 } 283 } 284 } 285 286 private final NormalizerImpl impl; 287 private final Appendable app; 288 private final StringBuilder str; 289 private final boolean appIsStringBuilder; 290 private int reorderStart; 291 private int lastCC; 292 293 // private backward iterator 294 private void setIterator() { codePointStart=str.length(); } 295 private void skipPrevious() { // Requires 0<codePointStart. 296 codePointLimit=codePointStart; 297 codePointStart=str.offsetByCodePoints(codePointStart, -1); 298 } 299 private int previousCC() { // Returns 0 if there is no previous character. 300 codePointLimit=codePointStart; 301 if(reorderStart>=codePointStart) { 302 return 0; 303 } 304 int c=str.codePointBefore(codePointStart); 305 codePointStart-=Character.charCount(c); 306 return impl.getCCFromYesOrMaybeCP(c); 307 } 308 private int codePointStart, codePointLimit; 309 } 310 311 // TODO: Propose as public API on the UTF16 class. 312 // TODO: Propose widening UTF16 methods that take char to take int. 313 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 314 public static final class UTF16Plus { 315 /** 316 * Is this code point a lead surrogate (U+d800..U+dbff)? 317 * @param c code unit or code point 318 * @return true or false 319 */ 320 public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } 321 /** 322 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 323 * is it a lead surrogate? 324 * @param c code unit or code point 325 * @return true or false 326 */ 327 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 328 329 /** 330 * Compares two CharSequence subsequences for binary equality. 331 * @param s1 first sequence 332 * @param start1 start offset in first sequence 333 * @param limit1 limit offset in first sequence 334 * @param s2 second sequence 335 * @param start2 start offset in second sequence 336 * @param limit2 limit offset in second sequence 337 * @return true if s1.subSequence(start1, limit1) contains the same text 338 * as s2.subSequence(start2, limit2) 339 */ 340 public static boolean equal(CharSequence s1, int start1, int limit1, 341 CharSequence s2, int start2, int limit2) { 342 if((limit1-start1)!=(limit2-start2)) { 343 return false; 344 } 345 if(s1==s2 && start1==start2) { 346 return true; 347 } 348 while(start1<limit1) { 349 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 350 return false; 351 } 352 } 353 return true; 354 } 355 } 356 357 public NormalizerImpl() {} 358 359 private static final class IsAcceptable implements ICUBinary.Authenticate { 360 public boolean isDataVersionAcceptable(byte version[]) { 361 return version[0]==4; 362 } 363 } 364 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 365 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 366 367 public NormalizerImpl load(ByteBuffer bytes) { 368 try { 369 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 370 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 371 if(indexesLength<=IX_MIN_LCCC_CP) { 372 throw new InternalError("Normalizer2 data: not enough indexes"); 373 } 374 int[] inIndexes=new int[indexesLength]; 375 inIndexes[0]=indexesLength*4; 376 for(int i=1; i<indexesLength; ++i) { 377 inIndexes[i]=bytes.getInt(); 378 } 379 380 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 381 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 382 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 383 384 minYesNo=inIndexes[IX_MIN_YES_NO]; 385 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 386 minNoNo=inIndexes[IX_MIN_NO_NO]; 387 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 388 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 389 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 390 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 391 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 392 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 393 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 394 395 // Read the normTrie. 396 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 397 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 398 int triePosition = bytes.position(); 399 normTrie = CodePointTrie.Fast16.fromBinary(bytes); 400 int trieLength = bytes.position() - triePosition; 401 if(trieLength>(nextOffset-offset)) { 402 throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); 403 } 404 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 405 406 // Read the composition and mapping data. 407 offset=nextOffset; 408 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 409 int numChars=(nextOffset-offset)/2; 410 if(numChars!=0) { 411 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 412 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 413 } 414 415 // smallFCD: new in formatVersion 2 416 offset=nextOffset; 417 smallFCD=new byte[0x100]; 418 bytes.get(smallFCD); 419 420 return this; 421 } catch(IOException e) { 422 throw new InternalError(e); 423 } 424 } 425 public NormalizerImpl load(String name) { 426 return load(ICUBinary.getRequiredData(name)); 427 } 428 429 // The trie stores values for lead surrogate code *units*. 430 // Surrogate code *points* are inert. 431 public int getNorm16(int c) { 432 return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); 433 } 434 public int getRawNorm16(int c) { return normTrie.get(c); } 435 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 436 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 437 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 438 439 public int getCC(int norm16) { 440 if(norm16>=MIN_NORMAL_MAYBE_YES) { 441 return getCCFromNormalYesOrMaybe(norm16); 442 } 443 if(norm16<minNoNo || limitNoNo<=norm16) { 444 return 0; 445 } 446 return getCCFromNoNo(norm16); 447 } 448 public static int getCCFromNormalYesOrMaybe(int norm16) { 449 return (norm16 >> OFFSET_SHIFT) & 0xff; 450 } 451 public static int getCCFromYesOrMaybe(int norm16) { 452 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 453 } 454 public int getCCFromYesOrMaybeCP(int c) { 455 if (c < minCompNoMaybeCP) { return 0; } 456 return getCCFromYesOrMaybe(getNorm16(c)); 457 } 458 459 /** 460 * Returns the FCD data for code point c. 461 * @param c A Unicode code point. 462 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 463 */ 464 public int getFCD16(int c) { 465 if(c<minDecompNoCP) { 466 return 0; 467 } else if(c<=0xffff) { 468 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 469 } 470 return getFCD16FromNormData(c); 471 } 472 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 473 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 474 // 0<=lead<=0xffff 475 byte bits=smallFCD[lead>>8]; 476 if(bits==0) { return false; } 477 return ((bits>>((lead>>5)&7))&1)!=0; 478 } 479 480 /** Gets the FCD value from the regular normalization data. */ 481 public int getFCD16FromNormData(int c) { 482 int norm16=getNorm16(c); 483 if (norm16 >= limitNoNo) { 484 if(norm16>=MIN_NORMAL_MAYBE_YES) { 485 // combining mark 486 norm16=getCCFromNormalYesOrMaybe(norm16); 487 return norm16|(norm16<<8); 488 } else if(norm16>=minMaybeYes) { 489 return 0; 490 } else { // isDecompNoAlgorithmic(norm16) 491 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 492 if (deltaTrailCC <= DELTA_TCCC_1) { 493 return deltaTrailCC >> OFFSET_SHIFT; 494 } 495 // Maps to an isCompYesAndZeroCC. 496 c=mapAlgorithmic(c, norm16); 497 norm16=getRawNorm16(c); 498 } 499 } 500 if(norm16<=minYesNo || isHangulLVT(norm16)) { 501 // no decomposition or Hangul syllable, all zeros 502 return 0; 503 } 504 // c decomposes, get everything from the variable-length extra data 505 int mapping=norm16>>OFFSET_SHIFT; 506 int firstUnit=extraData.charAt(mapping); 507 int fcd16=firstUnit>>8; // tccc 508 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 509 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 510 } 511 return fcd16; 512 } 513 514 /** 515 * Gets the decomposition for one code point. 516 * @param c code point 517 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 518 */ 519 public String getDecomposition(int c) { 520 int norm16; 521 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 522 // c does not decompose 523 return null; 524 } 525 int decomp = -1; 526 if(isDecompNoAlgorithmic(norm16)) { 527 // Maps to an isCompYesAndZeroCC. 528 decomp=c=mapAlgorithmic(c, norm16); 529 // The mapping might decompose further. 530 norm16 = getRawNorm16(c); 531 } 532 if (norm16 < minYesNo) { 533 if(decomp<0) { 534 return null; 535 } else { 536 return UTF16.valueOf(decomp); 537 } 538 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 539 // Hangul syllable: decompose algorithmically 540 StringBuilder buffer=new StringBuilder(); 541 Hangul.decompose(c, buffer); 542 return buffer.toString(); 543 } 544 // c decomposes, get everything from the variable-length extra data 545 int mapping=norm16>>OFFSET_SHIFT; 546 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 547 return extraData.substring(mapping, mapping+length); 548 } 549 550 // Fixed norm16 values. 551 public static final int MIN_YES_YES_WITH_CC=0xfe02; 552 public static final int JAMO_VT=0xfe00; 553 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 554 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 555 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 556 557 // norm16 bit 0 is comp-boundary-after. 558 public static final int HAS_COMP_BOUNDARY_AFTER=1; 559 public static final int OFFSET_SHIFT=1; 560 561 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 562 // tccc (0, 1, >1) for quick FCC boundary-after tests. 563 public static final int DELTA_TCCC_0=0; 564 public static final int DELTA_TCCC_1=2; 565 public static final int DELTA_TCCC_GT_1=4; 566 public static final int DELTA_TCCC_MASK=6; 567 public static final int DELTA_SHIFT=3; 568 569 public static final int MAX_DELTA=0x40; 570 571 // Byte offsets from the start of the data, after the generic header. 572 public static final int IX_NORM_TRIE_OFFSET=0; 573 public static final int IX_EXTRA_DATA_OFFSET=1; 574 public static final int IX_SMALL_FCD_OFFSET=2; 575 public static final int IX_RESERVED3_OFFSET=3; 576 public static final int IX_TOTAL_SIZE=7; 577 public static final int MIN_CCC_LCCC_CP=0x300; 578 // Code point thresholds for quick check codes. 579 public static final int IX_MIN_DECOMP_NO_CP=8; 580 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 581 582 // Norm16 value thresholds for quick check combinations and types of extra data. 583 584 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 585 public static final int IX_MIN_YES_NO=10; 586 /** Mappings are comp-normalized. */ 587 public static final int IX_MIN_NO_NO=11; 588 public static final int IX_LIMIT_NO_NO=12; 589 public static final int IX_MIN_MAYBE_YES=13; 590 591 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 592 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 593 /** Mappings are not comp-normalized but have a comp boundary before. */ 594 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 595 /** Mappings do not have a comp boundary before. */ 596 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 597 /** Mappings to the empty string. */ 598 public static final int IX_MIN_NO_NO_EMPTY=17; 599 600 public static final int IX_MIN_LCCC_CP=18; 601 public static final int IX_COUNT=20; 602 603 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 604 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 605 // unused bit 0x20; 606 public static final int MAPPING_LENGTH_MASK=0x1f; 607 608 public static final int COMP_1_LAST_TUPLE=0x8000; 609 public static final int COMP_1_TRIPLE=1; 610 public static final int COMP_1_TRAIL_LIMIT=0x3400; 611 public static final int COMP_1_TRAIL_MASK=0x7ffe; 612 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 613 public static final int COMP_2_TRAIL_SHIFT=6; 614 public static final int COMP_2_TRAIL_MASK=0xffc0; 615 616 // higher-level functionality ------------------------------------------ *** 617 618 /** 619 * Decomposes s[src, limit[ and writes the result to dest. 620 * limit can be NULL if src is NUL-terminated. 621 * destLengthEstimate is the initial dest buffer capacity and can be -1. 622 */ 623 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 624 int destLengthEstimate) { 625 if(destLengthEstimate<0) { 626 destLengthEstimate=limit-src; 627 } 628 dest.setLength(0); 629 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 630 decompose(s, src, limit, buffer); 631 } 632 633 // Dual functionality: 634 // buffer!=NULL: normalize 635 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 636 public int decompose(CharSequence s, int src, int limit, 637 ReorderingBuffer buffer) { 638 int minNoCP=minDecompNoCP; 639 640 int prevSrc; 641 int c=0; 642 int norm16=0; 643 644 // only for quick check 645 int prevBoundary=src; 646 int prevCC=0; 647 648 for(;;) { 649 // count code units below the minimum or with irrelevant data for the quick check 650 for(prevSrc=src; src!=limit;) { 651 if( (c=s.charAt(src))<minNoCP || 652 isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 653 ) { 654 ++src; 655 } else if(!UTF16Plus.isLeadSurrogate(c)) { 656 break; 657 } else { 658 char c2; 659 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 660 c = Character.toCodePoint((char)c, c2); 661 norm16 = normTrie.suppGet(c); 662 if (isMostDecompYesAndZeroCC(norm16)) { 663 src += 2; 664 } else { 665 break; 666 } 667 } else { 668 ++src; // unpaired lead surrogate: inert 669 } 670 } 671 } 672 // copy these code units all at once 673 if(src!=prevSrc) { 674 if(buffer!=null) { 675 buffer.flushAndAppendZeroCC(s, prevSrc, src); 676 } else { 677 prevCC=0; 678 prevBoundary=src; 679 } 680 } 681 if(src==limit) { 682 break; 683 } 684 685 // Check one above-minimum, relevant code point. 686 src+=Character.charCount(c); 687 if(buffer!=null) { 688 decompose(c, norm16, buffer); 689 } else { 690 if(isDecompYes(norm16)) { 691 int cc=getCCFromYesOrMaybe(norm16); 692 if(prevCC<=cc || cc==0) { 693 prevCC=cc; 694 if(cc<=1) { 695 prevBoundary=src; 696 } 697 continue; 698 } 699 } 700 return prevBoundary; // "no" or cc out of order 701 } 702 } 703 return src; 704 } 705 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 706 int limit=s.length(); 707 if(limit==0) { 708 return; 709 } 710 if(doDecompose) { 711 decompose(s, 0, limit, buffer); 712 return; 713 } 714 // Just merge the strings at the boundary. 715 int c=Character.codePointAt(s, 0); 716 int src=0; 717 int firstCC, prevCC, cc; 718 firstCC=prevCC=cc=getCC(getNorm16(c)); 719 while(cc!=0) { 720 prevCC=cc; 721 src+=Character.charCount(c); 722 if(src>=limit) { 723 break; 724 } 725 c=Character.codePointAt(s, src); 726 cc=getCC(getNorm16(c)); 727 }; 728 buffer.append(s, 0, src, false, firstCC, prevCC); 729 buffer.append(s, src, limit); 730 } 731 732 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 733 // doCompose: normalize 734 // !doCompose: isNormalized (buffer must be empty and initialized) 735 public boolean compose(CharSequence s, int src, int limit, 736 boolean onlyContiguous, 737 boolean doCompose, 738 ReorderingBuffer buffer) { 739 int prevBoundary=src; 740 int minNoMaybeCP=minCompNoMaybeCP; 741 742 for (;;) { 743 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 744 // or with (compYes && ccc==0) properties. 745 int prevSrc; 746 int c = 0; 747 int norm16 = 0; 748 for (;;) { 749 if (src == limit) { 750 if (prevBoundary != limit && doCompose) { 751 buffer.append(s, prevBoundary, limit); 752 } 753 return true; 754 } 755 if( (c=s.charAt(src))<minNoMaybeCP || 756 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 757 ) { 758 ++src; 759 } else { 760 prevSrc = src++; 761 if (!UTF16Plus.isLeadSurrogate(c)) { 762 break; 763 } else { 764 char c2; 765 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 766 ++src; 767 c = Character.toCodePoint((char)c, c2); 768 norm16 = normTrie.suppGet(c); 769 if (!isCompYesAndZeroCC(norm16)) { 770 break; 771 } 772 } 773 } 774 } 775 } 776 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 777 // The current character is either a "noNo" (has a mapping) 778 // or a "maybeYes" (combines backward) 779 // or a "yesYes" with ccc!=0. 780 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 781 782 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 783 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 784 if (!doCompose) { 785 return false; 786 } 787 // Fast path for mapping a character that is immediately surrounded by boundaries. 788 // In this case, we need not decompose around the current character. 789 if (isDecompNoAlgorithmic(norm16)) { 790 // Maps to a single isCompYesAndZeroCC character 791 // which also implies hasCompBoundaryBefore. 792 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 793 hasCompBoundaryBefore(s, src, limit)) { 794 if (prevBoundary != prevSrc) { 795 buffer.append(s, prevBoundary, prevSrc); 796 } 797 buffer.append(mapAlgorithmic(c, norm16), 0); 798 prevBoundary = src; 799 continue; 800 } 801 } else if (norm16 < minNoNoCompBoundaryBefore) { 802 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 803 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 804 hasCompBoundaryBefore(s, src, limit)) { 805 if (prevBoundary != prevSrc) { 806 buffer.append(s, prevBoundary, prevSrc); 807 } 808 int mapping = norm16 >> OFFSET_SHIFT; 809 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 810 buffer.append(extraData, mapping, mapping + length); 811 prevBoundary = src; 812 continue; 813 } 814 } else if (norm16 >= minNoNoEmpty) { 815 // The current character maps to nothing. 816 // Simply omit it from the output if there is a boundary before _or_ after it. 817 // The character itself implies no boundaries. 818 if (hasCompBoundaryBefore(s, src, limit) || 819 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 820 if (prevBoundary != prevSrc) { 821 buffer.append(s, prevBoundary, prevSrc); 822 } 823 prevBoundary = src; 824 continue; 825 } 826 } 827 // Other "noNo" type, or need to examine more text around this character: 828 // Fall through to the slow path. 829 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 830 char prev=s.charAt(prevSrc-1); 831 if(c<Hangul.JAMO_T_BASE) { 832 // The current character is a Jamo Vowel, 833 // compose with previous Jamo L and following Jamo T. 834 char l = (char)(prev-Hangul.JAMO_L_BASE); 835 if(l<Hangul.JAMO_L_COUNT) { 836 if (!doCompose) { 837 return false; 838 } 839 int t; 840 if (src != limit && 841 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 842 t < Hangul.JAMO_T_COUNT) { 843 // The next character is a Jamo T. 844 ++src; 845 } else if (hasCompBoundaryBefore(s, src, limit)) { 846 // No Jamo T follows, not even via decomposition. 847 t = 0; 848 } else { 849 t = -1; 850 } 851 if (t >= 0) { 852 int syllable = Hangul.HANGUL_BASE + 853 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 854 Hangul.JAMO_T_COUNT + t; 855 --prevSrc; // Replace the Jamo L as well. 856 if (prevBoundary != prevSrc) { 857 buffer.append(s, prevBoundary, prevSrc); 858 } 859 buffer.append((char)syllable); 860 prevBoundary = src; 861 continue; 862 } 863 // If we see L+V+x where x!=T then we drop to the slow path, 864 // decompose and recompose. 865 // This is to deal with NFKC finding normal L and V but a 866 // compatibility variant of a T. 867 // We need to either fully compose that combination here 868 // (which would complicate the code and may not work with strange custom data) 869 // or use the slow path. 870 } 871 } else if (Hangul.isHangulLV(prev)) { 872 // The current character is a Jamo Trailing consonant, 873 // compose with previous Hangul LV that does not contain a Jamo T. 874 if (!doCompose) { 875 return false; 876 } 877 int syllable = prev + c - Hangul.JAMO_T_BASE; 878 --prevSrc; // Replace the Hangul LV as well. 879 if (prevBoundary != prevSrc) { 880 buffer.append(s, prevBoundary, prevSrc); 881 } 882 buffer.append((char)syllable); 883 prevBoundary = src; 884 continue; 885 } 886 // No matching context, or may need to decompose surrounding text first: 887 // Fall through to the slow path. 888 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 889 // One or more combining marks that do not combine-back: 890 // Check for canonical order, copy unchanged if ok and 891 // if followed by a character with a boundary-before. 892 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 893 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 894 // Fails FCD test, need to decompose and contiguously recompose. 895 if (!doCompose) { 896 return false; 897 } 898 } else { 899 // If !onlyContiguous (not FCC), then we ignore the tccc of 900 // the previous character which passed the quick check "yes && ccc==0" test. 901 int n16; 902 for (;;) { 903 if (src == limit) { 904 if (doCompose) { 905 buffer.append(s, prevBoundary, limit); 906 } 907 return true; 908 } 909 int prevCC = cc; 910 c = Character.codePointAt(s, src); 911 n16 = normTrie.get(c); 912 if (n16 >= MIN_YES_YES_WITH_CC) { 913 cc = getCCFromNormalYesOrMaybe(n16); 914 if (prevCC > cc) { 915 if (!doCompose) { 916 return false; 917 } 918 break; 919 } 920 } else { 921 break; 922 } 923 src += Character.charCount(c); 924 } 925 // p is after the last in-order combining mark. 926 // If there is a boundary here, then we continue with no change. 927 if (norm16HasCompBoundaryBefore(n16)) { 928 if (isCompYesAndZeroCC(n16)) { 929 src += Character.charCount(c); 930 } 931 continue; 932 } 933 // Use the slow path. There is no boundary in [prevSrc, src[. 934 } 935 } 936 937 // Slow path: Find the nearest boundaries around the current character, 938 // decompose and recompose. 939 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 940 c = Character.codePointBefore(s, prevSrc); 941 norm16 = normTrie.get(c); 942 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 943 prevSrc -= Character.charCount(c); 944 } 945 } 946 if (doCompose && prevBoundary != prevSrc) { 947 buffer.append(s, prevBoundary, prevSrc); 948 } 949 int recomposeStartIndex=buffer.length(); 950 // We know there is not a boundary here. 951 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 952 buffer); 953 // Decompose until the next boundary. 954 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 955 buffer); 956 recompose(buffer, recomposeStartIndex, onlyContiguous); 957 if(!doCompose) { 958 if(!buffer.equals(s, prevSrc, src)) { 959 return false; 960 } 961 buffer.remove(); 962 } 963 prevBoundary=src; 964 } 965 } 966 967 /** 968 * Very similar to compose(): Make the same changes in both places if relevant. 969 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 970 * !doSpan: quickCheck 971 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 972 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 973 * then the quick check result is "no" 974 */ 975 public int composeQuickCheck(CharSequence s, int src, int limit, 976 boolean onlyContiguous, boolean doSpan) { 977 int qcResult=0; 978 int prevBoundary=src; 979 int minNoMaybeCP=minCompNoMaybeCP; 980 981 for(;;) { 982 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 983 // or with (compYes && ccc==0) properties. 984 int prevSrc; 985 int c = 0; 986 int norm16 = 0; 987 for (;;) { 988 if(src==limit) { 989 return (src<<1)|qcResult; // "yes" or "maybe" 990 } 991 if( (c=s.charAt(src))<minNoMaybeCP || 992 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 993 ) { 994 ++src; 995 } else { 996 prevSrc = src++; 997 if (!UTF16Plus.isLeadSurrogate(c)) { 998 break; 999 } else { 1000 char c2; 1001 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1002 ++src; 1003 c = Character.toCodePoint((char)c, c2); 1004 norm16 = normTrie.suppGet(c); 1005 if (!isCompYesAndZeroCC(norm16)) { 1006 break; 1007 } 1008 } 1009 } 1010 } 1011 } 1012 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1013 // The current character is either a "noNo" (has a mapping) 1014 // or a "maybeYes" (combines backward) 1015 // or a "yesYes" with ccc!=0. 1016 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1017 1018 int prevNorm16 = INERT; 1019 if (prevBoundary != prevSrc) { 1020 prevBoundary = prevSrc; 1021 if (!norm16HasCompBoundaryBefore(norm16)) { 1022 c = Character.codePointBefore(s, prevSrc); 1023 int n16 = getNorm16(c); 1024 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1025 prevBoundary -= Character.charCount(c); 1026 prevNorm16 = n16; 1027 } 1028 } 1029 } 1030 1031 if(isMaybeOrNonZeroCC(norm16)) { 1032 int cc=getCCFromYesOrMaybe(norm16); 1033 if (onlyContiguous /* FCC */ && cc != 0 && 1034 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1035 // The [prevBoundary..prevSrc[ character 1036 // passed the quick check "yes && ccc==0" test 1037 // but is out of canonical order with the current combining mark. 1038 } else { 1039 // If !onlyContiguous (not FCC), then we ignore the tccc of 1040 // the previous character which passed the quick check "yes && ccc==0" test. 1041 for (;;) { 1042 if (norm16 < MIN_YES_YES_WITH_CC) { 1043 if (!doSpan) { 1044 qcResult = 1; 1045 } else { 1046 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1047 } 1048 } 1049 if (src == limit) { 1050 return (src<<1) | qcResult; // "yes" or "maybe" 1051 } 1052 int prevCC = cc; 1053 c = Character.codePointAt(s, src); 1054 norm16 = getNorm16(c); 1055 if (isMaybeOrNonZeroCC(norm16)) { 1056 cc = getCCFromYesOrMaybe(norm16); 1057 if (!(prevCC <= cc || cc == 0)) { 1058 break; 1059 } 1060 } else { 1061 break; 1062 } 1063 src += Character.charCount(c); 1064 } 1065 // src is after the last in-order combining mark. 1066 if (isCompYesAndZeroCC(norm16)) { 1067 prevBoundary = src; 1068 src += Character.charCount(c); 1069 continue; 1070 } 1071 } 1072 } 1073 return prevBoundary<<1; // "no" 1074 } 1075 } 1076 public void composeAndAppend(CharSequence s, 1077 boolean doCompose, 1078 boolean onlyContiguous, 1079 ReorderingBuffer buffer) { 1080 int src=0, limit=s.length(); 1081 if(!buffer.isEmpty()) { 1082 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1083 if(0!=firstStarterInSrc) { 1084 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1085 buffer.length(), onlyContiguous); 1086 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1087 firstStarterInSrc+16); 1088 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1089 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1090 middle.append(s, 0, firstStarterInSrc); 1091 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1092 src=firstStarterInSrc; 1093 } 1094 } 1095 if(doCompose) { 1096 compose(s, src, limit, onlyContiguous, true, buffer); 1097 } else { 1098 buffer.append(s, src, limit); 1099 } 1100 } 1101 // Dual functionality: 1102 // buffer!=NULL: normalize 1103 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1104 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1105 // Note: In this function we use buffer->appendZeroCC() because we track 1106 // the lead and trail combining classes here, rather than leaving it to 1107 // the ReorderingBuffer. 1108 // The exception is the call to decomposeShort() which uses the buffer 1109 // in the normal way. 1110 1111 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1112 // Similar to the prevBoundary in the compose() implementation. 1113 int prevBoundary=src; 1114 int prevSrc; 1115 int c=0; 1116 int prevFCD16=0; 1117 int fcd16=0; 1118 1119 for(;;) { 1120 // count code units with lccc==0 1121 for(prevSrc=src; src!=limit;) { 1122 if((c=s.charAt(src))<minLcccCP) { 1123 prevFCD16=~c; 1124 ++src; 1125 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1126 prevFCD16=0; 1127 ++src; 1128 } else { 1129 if (UTF16Plus.isLeadSurrogate(c)) { 1130 char c2; 1131 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1132 c = Character.toCodePoint((char)c, c2); 1133 } 1134 } 1135 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1136 prevFCD16=fcd16; 1137 src+=Character.charCount(c); 1138 } else { 1139 break; 1140 } 1141 } 1142 } 1143 // copy these code units all at once 1144 if(src!=prevSrc) { 1145 if(src==limit) { 1146 if(buffer!=null) { 1147 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1148 } 1149 break; 1150 } 1151 prevBoundary=src; 1152 // We know that the previous character's lccc==0. 1153 if(prevFCD16<0) { 1154 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1155 int prev=~prevFCD16; 1156 if(prev<minDecompNoCP) { 1157 prevFCD16=0; 1158 } else { 1159 prevFCD16=getFCD16FromNormData(prev); 1160 if(prevFCD16>1) { 1161 --prevBoundary; 1162 } 1163 } 1164 } else { 1165 int p=src-1; 1166 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1167 Character.isHighSurrogate(s.charAt(p-1)) 1168 ) { 1169 --p; 1170 // Need to fetch the previous character's FCD value because 1171 // prevFCD16 was just for the trail surrogate code point. 1172 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1173 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1174 } 1175 if(prevFCD16>1) { 1176 prevBoundary=p; 1177 } 1178 } 1179 if(buffer!=null) { 1180 // The last lccc==0 character is excluded from the 1181 // flush-and-append call in case it needs to be modified. 1182 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1183 buffer.append(s, prevBoundary, src); 1184 } 1185 // The start of the current character (c). 1186 prevSrc=src; 1187 } else if(src==limit) { 1188 break; 1189 } 1190 1191 src+=Character.charCount(c); 1192 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1193 // Check for proper order, and decompose locally if necessary. 1194 if((prevFCD16&0xff)<=(fcd16>>8)) { 1195 // proper order: prev tccc <= current lccc 1196 if((fcd16&0xff)<=1) { 1197 prevBoundary=src; 1198 } 1199 if(buffer!=null) { 1200 buffer.appendZeroCC(c); 1201 } 1202 prevFCD16=fcd16; 1203 continue; 1204 } else if(buffer==null) { 1205 return prevBoundary; // quick check "no" 1206 } else { 1207 /* 1208 * Back out the part of the source that we copied or appended 1209 * already but is now going to be decomposed. 1210 * prevSrc is set to after what was copied/appended. 1211 */ 1212 buffer.removeSuffix(prevSrc-prevBoundary); 1213 /* 1214 * Find the part of the source that needs to be decomposed, 1215 * up to the next safe boundary. 1216 */ 1217 src=findNextFCDBoundary(s, src, limit); 1218 /* 1219 * The source text does not fulfill the conditions for FCD. 1220 * Decompose and reorder a limited piece of the text. 1221 */ 1222 decomposeShort(s, prevBoundary, src, false, false, buffer); 1223 prevBoundary=src; 1224 prevFCD16=0; 1225 } 1226 } 1227 return src; 1228 } 1229 1230 public boolean hasDecompBoundaryBefore(int c) { 1231 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1232 norm16HasDecompBoundaryBefore(getNorm16(c)); 1233 } 1234 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1235 if (norm16 < minNoNoCompNoMaybeCC) { 1236 return true; 1237 } 1238 if (norm16 >= limitNoNo) { 1239 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1240 } 1241 // c decomposes, get everything from the variable-length extra data 1242 int mapping=norm16>>OFFSET_SHIFT; 1243 int firstUnit=extraData.charAt(mapping); 1244 // true if leadCC==0 (hasFCDBoundaryBefore()) 1245 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1246 } 1247 public boolean hasDecompBoundaryAfter(int c) { 1248 if (c < minDecompNoCP) { 1249 return true; 1250 } 1251 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1252 return true; 1253 } 1254 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1255 } 1256 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1257 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1258 return true; 1259 } 1260 if (norm16 >= limitNoNo) { 1261 if (isMaybeOrNonZeroCC(norm16)) { 1262 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1263 } 1264 // Maps to an isCompYesAndZeroCC. 1265 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1266 } 1267 // c decomposes, get everything from the variable-length extra data 1268 int mapping=norm16>>OFFSET_SHIFT; 1269 int firstUnit=extraData.charAt(mapping); 1270 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1271 // fcd16<=1 || trailCC==0 1272 if(firstUnit>0x1ff) { 1273 return false; // trailCC>1 1274 } 1275 if(firstUnit<=0xff) { 1276 return true; // trailCC==0 1277 } 1278 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1279 // true if leadCC==0 (hasFCDBoundaryBefore()) 1280 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1281 } 1282 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1283 1284 public boolean hasCompBoundaryBefore(int c) { 1285 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1286 } 1287 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1288 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1289 } 1290 1291 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1292 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1293 private static boolean isInert(int norm16) { return norm16==INERT; } 1294 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1295 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 1296 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } 1297 private boolean isHangulLVT(int norm16) { 1298 return norm16==hangulLVT(); 1299 } 1300 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1301 // UBool isCompYes(uint16_t norm16) const { 1302 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1303 // } 1304 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1305 // return norm16<minNoNo || minMaybeYes<=norm16; 1306 // } 1307 // private boolean hasZeroCCFromDecompYes(int norm16) { 1308 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1309 // } 1310 private boolean isDecompYesAndZeroCC(int norm16) { 1311 return norm16<minYesNo || 1312 norm16==JAMO_VT || 1313 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1314 } 1315 /** 1316 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1317 * the MaybeYes which combine-forward and have ccc=0. 1318 * (Standard Unicode 10 normalization does not have such characters.) 1319 */ 1320 private boolean isMostDecompYesAndZeroCC(int norm16) { 1321 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1322 } 1323 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1324 1325 // For use with isCompYes(). 1326 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1327 // static uint8_t getCCFromYes(uint16_t norm16) { 1328 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1329 // } 1330 private int getCCFromNoNo(int norm16) { 1331 int mapping=norm16>>OFFSET_SHIFT; 1332 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1333 return extraData.charAt(mapping-1)&0xff; 1334 } else { 1335 return 0; 1336 } 1337 } 1338 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1339 if(norm16<=minYesNo) { 1340 return 0; // yesYes and Hangul LV have ccc=tccc=0 1341 } else { 1342 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1343 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1344 } 1345 } 1346 1347 // Requires algorithmic-NoNo. 1348 private int mapAlgorithmic(int c, int norm16) { 1349 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1350 } 1351 1352 // Requires minYesNo<norm16<limitNoNo. 1353 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1354 1355 /** 1356 * @return index into maybeYesCompositions, or -1 1357 */ 1358 private int getCompositionsListForDecompYes(int norm16) { 1359 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1360 return -1; 1361 } else { 1362 if((norm16-=minMaybeYes)<0) { 1363 // norm16<minMaybeYes: index into extraData which is a substring at 1364 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1365 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1366 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1367 } 1368 return norm16>>OFFSET_SHIFT; 1369 } 1370 } 1371 /** 1372 * @return index into maybeYesCompositions 1373 */ 1374 private int getCompositionsListForComposite(int norm16) { 1375 // A composite has both mapping & compositions list. 1376 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1377 int firstUnit=maybeYesCompositions.charAt(list); 1378 return list+ // mapping in maybeYesCompositions 1379 1+ // +1 to skip the first unit with the mapping length 1380 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1381 } 1382 1383 // Decompose a short piece of text which is likely to contain characters that 1384 // fail the quick check loop and/or where the quick check loop's overhead 1385 // is unlikely to be amortized. 1386 // Called by the compose() and makeFCD() implementations. 1387 // Public in Java for collation implementation code. 1388 private int decomposeShort( 1389 CharSequence s, int src, int limit, 1390 boolean stopAtCompBoundary, boolean onlyContiguous, 1391 ReorderingBuffer buffer) { 1392 while(src<limit) { 1393 int c=Character.codePointAt(s, src); 1394 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1395 return src; 1396 } 1397 int norm16 = getNorm16(c); 1398 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1399 return src; 1400 } 1401 src+=Character.charCount(c); 1402 decompose(c, norm16, buffer); 1403 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1404 return src; 1405 } 1406 } 1407 return src; 1408 } 1409 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1410 // get the decomposition and the lead and trail cc's 1411 if (norm16 >= limitNoNo) { 1412 if (isMaybeOrNonZeroCC(norm16)) { 1413 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1414 return; 1415 } 1416 // Maps to an isCompYesAndZeroCC. 1417 c=mapAlgorithmic(c, norm16); 1418 norm16=getRawNorm16(c); 1419 } 1420 if (norm16 < minYesNo) { 1421 // c does not decompose 1422 buffer.append(c, 0); 1423 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1424 // Hangul syllable: decompose algorithmically 1425 Hangul.decompose(c, buffer); 1426 } else { 1427 // c decomposes, get everything from the variable-length extra data 1428 int mapping=norm16>>OFFSET_SHIFT; 1429 int firstUnit=extraData.charAt(mapping); 1430 int length=firstUnit&MAPPING_LENGTH_MASK; 1431 int leadCC, trailCC; 1432 trailCC=firstUnit>>8; 1433 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1434 leadCC=extraData.charAt(mapping-1)>>8; 1435 } else { 1436 leadCC=0; 1437 } 1438 ++mapping; // skip over the firstUnit 1439 buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); 1440 } 1441 } 1442 1443 /** 1444 * Finds the recomposition result for 1445 * a forward-combining "lead" character, 1446 * specified with a pointer to its compositions list, 1447 * and a backward-combining "trail" character. 1448 * 1449 * <p>If the lead and trail characters combine, then this function returns 1450 * the following "compositeAndFwd" value: 1451 * <pre> 1452 * Bits 21..1 composite character 1453 * Bit 0 set if the composite is a forward-combining starter 1454 * </pre> 1455 * otherwise it returns -1. 1456 * 1457 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1458 * encoded as either pairs or triples of 16-bit units. 1459 * The last entry has the high bit of its first unit set. 1460 * 1461 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1462 * A linear search is used. 1463 * 1464 * <p>See normalizer2impl.h for a more detailed description 1465 * of the compositions list format. 1466 */ 1467 private static int combine(String compositions, int list, int trail) { 1468 int key1, firstUnit; 1469 if(trail<COMP_1_TRAIL_LIMIT) { 1470 // trail character is 0..33FF 1471 // result entry may have 2 or 3 units 1472 key1=(trail<<1); 1473 while(key1>(firstUnit=compositions.charAt(list))) { 1474 list+=2+(firstUnit&COMP_1_TRIPLE); 1475 } 1476 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1477 if((firstUnit&COMP_1_TRIPLE)!=0) { 1478 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1479 } else { 1480 return compositions.charAt(list+1); 1481 } 1482 } 1483 } else { 1484 // trail character is 3400..10FFFF 1485 // result entry has 3 units 1486 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1487 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1488 int secondUnit; 1489 for(;;) { 1490 if(key1>(firstUnit=compositions.charAt(list))) { 1491 list+=2+(firstUnit&COMP_1_TRIPLE); 1492 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1493 if(key2>(secondUnit=compositions.charAt(list+1))) { 1494 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1495 break; 1496 } else { 1497 list+=3; 1498 } 1499 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1500 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1501 } else { 1502 break; 1503 } 1504 } else { 1505 break; 1506 } 1507 } 1508 } 1509 return -1; 1510 } 1511 1512 /* 1513 * Recomposes the buffer text starting at recomposeStartIndex 1514 * (which is in NFD - decomposed and canonically ordered), 1515 * and truncates the buffer contents. 1516 * 1517 * Note that recomposition never lengthens the text: 1518 * Any character consists of either one or two code units; 1519 * a composition may contain at most one more code unit than the original starter, 1520 * while the combining mark that is removed has at least one code unit. 1521 */ 1522 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1523 boolean onlyContiguous) { 1524 StringBuilder sb=buffer.getStringBuilder(); 1525 int p=recomposeStartIndex; 1526 if(p==sb.length()) { 1527 return; 1528 } 1529 1530 int starter, pRemove; 1531 int compositionsList; 1532 int c, compositeAndFwd; 1533 int norm16; 1534 int cc, prevCC; 1535 boolean starterIsSupplementary; 1536 1537 // Some of the following variables are not used until we have a forward-combining starter 1538 // and are only initialized now to avoid compiler warnings. 1539 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1540 starter=-1; 1541 starterIsSupplementary=false; 1542 prevCC=0; 1543 1544 for(;;) { 1545 c=sb.codePointAt(p); 1546 p+=Character.charCount(c); 1547 norm16=getNorm16(c); 1548 cc=getCCFromYesOrMaybe(norm16); 1549 if( // this character combines backward and 1550 isMaybe(norm16) && 1551 // we have seen a starter that combines forward and 1552 compositionsList>=0 && 1553 // the backward-combining character is not blocked 1554 (prevCC<cc || prevCC==0) 1555 ) { 1556 if(isJamoVT(norm16)) { 1557 // c is a Jamo V/T, see if we can compose it with the previous character. 1558 if(c<Hangul.JAMO_T_BASE) { 1559 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1560 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1561 if(prev<Hangul.JAMO_L_COUNT) { 1562 pRemove=p-1; 1563 char syllable=(char) 1564 (Hangul.HANGUL_BASE+ 1565 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1566 Hangul.JAMO_T_COUNT); 1567 char t; 1568 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1569 ++p; 1570 syllable+=t; // The next character was a Jamo T. 1571 } 1572 sb.setCharAt(starter, syllable); 1573 // remove the Jamo V/T 1574 sb.delete(pRemove, p); 1575 p=pRemove; 1576 } 1577 } 1578 /* 1579 * No "else" for Jamo T: 1580 * Since the input is in NFD, there are no Hangul LV syllables that 1581 * a Jamo T could combine with. 1582 * All Jamo Ts are combined above when handling Jamo Vs. 1583 */ 1584 if(p==sb.length()) { 1585 break; 1586 } 1587 compositionsList=-1; 1588 continue; 1589 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1590 // The starter and the combining mark (c) do combine. 1591 int composite=compositeAndFwd>>1; 1592 1593 // Remove the combining mark. 1594 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1595 sb.delete(pRemove, p); 1596 p=pRemove; 1597 // Replace the starter with the composite. 1598 if(starterIsSupplementary) { 1599 if(composite>0xffff) { 1600 // both are supplementary 1601 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1602 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1603 } else { 1604 sb.setCharAt(starter, (char)c); 1605 sb.deleteCharAt(starter+1); 1606 // The composite is shorter than the starter, 1607 // move the intermediate characters forward one. 1608 starterIsSupplementary=false; 1609 --p; 1610 } 1611 } else if(composite>0xffff) { 1612 // The composite is longer than the starter, 1613 // move the intermediate characters back one. 1614 starterIsSupplementary=true; 1615 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1616 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1617 ++p; 1618 } else { 1619 // both are on the BMP 1620 sb.setCharAt(starter, (char)composite); 1621 } 1622 1623 // Keep prevCC because we removed the combining mark. 1624 1625 if(p==sb.length()) { 1626 break; 1627 } 1628 // Is the composite a starter that combines forward? 1629 if((compositeAndFwd&1)!=0) { 1630 compositionsList= 1631 getCompositionsListForComposite(getRawNorm16(composite)); 1632 } else { 1633 compositionsList=-1; 1634 } 1635 1636 // We combined; continue with looking for compositions. 1637 continue; 1638 } 1639 } 1640 1641 // no combination this time 1642 prevCC=cc; 1643 if(p==sb.length()) { 1644 break; 1645 } 1646 1647 // If c did not combine, then check if it is a starter. 1648 if(cc==0) { 1649 // Found a new starter. 1650 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 1651 // It may combine with something, prepare for it. 1652 if(c<=0xffff) { 1653 starterIsSupplementary=false; 1654 starter=p-1; 1655 } else { 1656 starterIsSupplementary=true; 1657 starter=p-2; 1658 } 1659 } 1660 } else if(onlyContiguous) { 1661 // FCC: no discontiguous compositions; any intervening character blocks. 1662 compositionsList=-1; 1663 } 1664 } 1665 buffer.flush(); 1666 } 1667 1668 /** 1669 * Does c have a composition boundary before it? 1670 * True if its decomposition begins with a character that has 1671 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1672 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1673 * (isCompYesAndZeroCC()) so we need not decompose. 1674 */ 1675 private boolean hasCompBoundaryBefore(int c, int norm16) { 1676 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 1677 } 1678 private boolean norm16HasCompBoundaryBefore(int norm16) { 1679 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 1680 } 1681 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 1682 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 1683 } 1684 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 1685 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1686 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 1687 } 1688 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 1689 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 1690 } 1691 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 1692 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 1693 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 1694 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 1695 } 1696 1697 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 1698 while(p>0) { 1699 int c=Character.codePointBefore(s, p); 1700 int norm16 = getNorm16(c); 1701 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1702 break; 1703 } 1704 p-=Character.charCount(c); 1705 if(hasCompBoundaryBefore(c, norm16)) { 1706 break; 1707 } 1708 } 1709 return p; 1710 } 1711 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 1712 while(p<limit) { 1713 int c=Character.codePointAt(s, p); 1714 int norm16=normTrie.get(c); 1715 if(hasCompBoundaryBefore(c, norm16)) { 1716 break; 1717 } 1718 p+=Character.charCount(c); 1719 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1720 break; 1721 } 1722 } 1723 return p; 1724 } 1725 1726 1727 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 1728 while(p<limit) { 1729 int c=Character.codePointAt(s, p); 1730 int norm16; 1731 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 1732 break; 1733 } 1734 p+=Character.charCount(c); 1735 if (norm16HasDecompBoundaryAfter(norm16)) { 1736 break; 1737 } 1738 } 1739 return p; 1740 } 1741 1742 /** 1743 * Get the canonical decomposition 1744 * sherman for ComposedCharIter 1745 */ 1746 public static int getDecompose(int chars[], String decomps[]) { 1747 Normalizer2 impl = Normalizer2.getNFDInstance(); 1748 1749 int length=0; 1750 int norm16 = 0; 1751 int ch = -1; 1752 int i = 0; 1753 1754 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 1755 //TBD !!!! the hack code heres save us about 50ms for startup 1756 //need a better solution/lookup 1757 if (ch == 0x30ff) 1758 ch = 0xf900; 1759 else if (ch == 0x115bc) 1760 ch = 0x1d15e; 1761 else if (ch == 0x1d1c1) 1762 ch = 0x2f800; 1763 1764 String s = impl.getDecomposition(ch); 1765 1766 if(s != null && i < chars.length) { 1767 chars[i] = ch; 1768 decomps[i++] = s; 1769 } 1770 } 1771 return i; 1772 } 1773 1774 //------------------------------------------------------ 1775 // special method for Collation (RBTableBuilder.build()) 1776 //------------------------------------------------------ 1777 private static boolean needSingleQuotation(char c) { 1778 return (c >= 0x0009 && c <= 0x000D) || 1779 (c >= 0x0020 && c <= 0x002F) || 1780 (c >= 0x003A && c <= 0x0040) || 1781 (c >= 0x005B && c <= 0x0060) || 1782 (c >= 0x007B && c <= 0x007E); 1783 } 1784 1785 public static String canonicalDecomposeWithSingleQuotation(String string) { 1786 Normalizer2 impl = Normalizer2.getNFDInstance(); 1787 char[] src = string.toCharArray(); 1788 int srcIndex = 0; 1789 int srcLimit = src.length; 1790 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 1791 int destIndex = 0; 1792 int destLimit = dest.length; 1793 1794 int prevSrc; 1795 String norm; 1796 int reorderStartIndex, length; 1797 char c1, c2; 1798 int cp; 1799 int minNoMaybe = 0x00c0; 1800 int cc, prevCC, trailCC; 1801 char[] p; 1802 int pStart; 1803 1804 // initialize 1805 reorderStartIndex = 0; 1806 prevCC = 0; 1807 norm = null; 1808 cp = 0; 1809 pStart = 0; 1810 1811 cc = trailCC = -1; // initialize to bogus value 1812 c1 = 0; 1813 for (;;) { 1814 prevSrc=srcIndex; 1815 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 1816 while (srcIndex != srcLimit && 1817 ((c1 = src[srcIndex]) < minNoMaybe || 1818 (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || 1819 (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables 1820 prevCC = 0; 1821 srcIndex += (cp < 0x10000) ? 1 : 2; 1822 } 1823 1824 // copy these code units all at once 1825 if (srcIndex != prevSrc) { 1826 length = srcIndex - prevSrc; 1827 if ((destIndex + length) <= destLimit) { 1828 System.arraycopy(src,prevSrc,dest,destIndex,length); 1829 } 1830 1831 destIndex += length; 1832 reorderStartIndex = destIndex; 1833 } 1834 1835 // end of source reached? 1836 if (srcIndex == srcLimit) { 1837 break; 1838 } 1839 1840 // cp already contains *src and norm32 is set for it, increment src 1841 srcIndex += (cp < 0x10000) ? 1 : 2; 1842 1843 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1844 c2 = 0; 1845 length = 1; 1846 1847 if (Character.isHighSurrogate(c1) 1848 || Character.isLowSurrogate(c1)) { 1849 norm = null; 1850 } 1851 } else { 1852 length = 2; 1853 c2 = src[srcIndex-1]; 1854 } 1855 1856 // get the decomposition and the lead and trail cc's 1857 if (norm == null) { 1858 // cp does not decompose 1859 cc = trailCC = UCharacter.getCombiningClass(cp); 1860 p = null; 1861 pStart = -1; 1862 } else { 1863 1864 pStart = 0; 1865 p = norm.toCharArray(); 1866 length = p.length; 1867 int cpNum = norm.codePointCount(0, length); 1868 cc= UCharacter.getCombiningClass(norm.codePointAt(0)); 1869 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); 1870 if (length == 1) { 1871 // fastpath a single code unit from decomposition 1872 c1 = p[pStart]; 1873 c2 = 0; 1874 p = null; 1875 pStart = -1; 1876 } 1877 } 1878 1879 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 1880 // buffer overflow 1881 char[] tmpBuf = new char[destLimit * 2]; 1882 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 1883 dest = tmpBuf; 1884 destLimit = dest.length; 1885 } 1886 1887 // append the decomposition to the destination buffer, assume length>0 1888 { 1889 int reorderSplit = destIndex; 1890 if (p == null) { 1891 // fastpath: single code point 1892 if (needSingleQuotation(c1)) { 1893 //if we need single quotation, no need to consider "prevCC" 1894 //and it must NOT be a supplementary pair 1895 dest[destIndex++] = '\''; 1896 dest[destIndex++] = c1; 1897 dest[destIndex++] = '\''; 1898 trailCC = 0; 1899 } else if(cc != 0 && cc < prevCC) { 1900 // (c1, c2) is out of order with respect to the preceding 1901 // text 1902 destIndex += length; 1903 trailCC = insertOrdered(dest, reorderStartIndex, 1904 reorderSplit, destIndex, c1, c2, cc); 1905 } else { 1906 // just append (c1, c2) 1907 dest[destIndex++] = c1; 1908 if(c2 != 0) { 1909 dest[destIndex++] = c2; 1910 } 1911 } 1912 } else { 1913 // general: multiple code points (ordered by themselves) 1914 // from decomposition 1915 if (needSingleQuotation(p[pStart])) { 1916 dest[destIndex++] = '\''; 1917 dest[destIndex++] = p[pStart++]; 1918 dest[destIndex++] = '\''; 1919 length--; 1920 do { 1921 dest[destIndex++] = p[pStart++]; 1922 } while(--length > 0); 1923 } else if (cc != 0 && cc < prevCC) { 1924 destIndex += length; 1925 trailCC = mergeOrdered(dest, reorderStartIndex, 1926 reorderSplit, p, pStart, 1927 pStart+length); 1928 } else { 1929 // just append the decomposition 1930 do { 1931 dest[destIndex++] = p[pStart++]; 1932 } while (--length > 0); 1933 } 1934 } 1935 } 1936 prevCC = trailCC; 1937 if(prevCC == 0) { 1938 reorderStartIndex = destIndex; 1939 } 1940 } 1941 1942 return new String(dest, 0, destIndex); 1943 } 1944 1945 /** 1946 * simpler, single-character version of mergeOrdered() - 1947 * bubble-insert one single code point into the preceding string 1948 * which is already canonically ordered 1949 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 1950 * 1951 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 1952 * 1953 * before: src[start]..src[current] is already ordered, and 1954 * src[current]..src[p] may or may not hold (c, c2) but 1955 * must be exactly the same length as (c, c2) 1956 * after: src[start]..src[p] is ordered 1957 * 1958 * @return the trailing combining class 1959 */ 1960 private static int/*unsigned byte*/ insertOrdered(char[] source, 1961 int start, 1962 int current, int p, 1963 char c1, char c2, 1964 int/*unsigned byte*/ cc) { 1965 int back, preBack; 1966 int r; 1967 int prevCC, trailCC=cc; 1968 1969 if (start<current && cc!=0) { 1970 // search for the insertion point where cc>=prevCC 1971 preBack=back=current; 1972 1973 PrevArgs prevArgs = new PrevArgs(); 1974 prevArgs.current = current; 1975 prevArgs.start = start; 1976 prevArgs.src = source; 1977 prevArgs.c1 = c1; 1978 prevArgs.c2 = c2; 1979 1980 // get the prevCC 1981 prevCC=getPrevCC(prevArgs); 1982 preBack = prevArgs.current; 1983 1984 if(cc<prevCC) { 1985 // this will be the last code point, so keep its cc 1986 trailCC=prevCC; 1987 back=preBack; 1988 while(start<preBack) { 1989 prevCC=getPrevCC(prevArgs); 1990 preBack=prevArgs.current; 1991 if(cc>=prevCC) { 1992 break; 1993 } 1994 back=preBack; 1995 } 1996 1997 // this is where we are right now with all these indicies: 1998 // [start]..[pPreBack] 0..? code points that we can ignore 1999 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 2000 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 2001 // [current]..[p] 1 code point (c, c2) with cc 2002 2003 // move the code units in between up 2004 r=p; 2005 do { 2006 source[--r]=source[--current]; 2007 } while (back!=current); 2008 } 2009 } 2010 2011 // insert (c1, c2) 2012 source[current] = c1; 2013 if (c2!=0) { 2014 source[(current+1)] = c2; 2015 } 2016 2017 // we know the cc of the last code point 2018 return trailCC; 2019 } 2020 /** 2021 * merge two UTF-16 string parts together 2022 * to canonically order (order by combining classes) their concatenation 2023 * 2024 * the two strings may already be adjacent, so that the merging is done 2025 * in-place if the two strings are not adjacent, then the buffer holding the 2026 * first one must be large enough 2027 * the second string may or may not be ordered in itself 2028 * 2029 * before: [start]..[current] is already ordered, and 2030 * [next]..[limit] may be ordered in itself, but 2031 * is not in relation to [start..current[ 2032 * after: [start..current+(limit-next)[ is ordered 2033 * 2034 * the algorithm is a simple bubble-sort that takes the characters from 2035 * src[next++] and inserts them in correct combining class order into the 2036 * preceding part of the string 2037 * 2038 * since this function is called much less often than the single-code point 2039 * insertOrdered(), it just uses that for easier maintenance 2040 * 2041 * @return the trailing combining class 2042 */ 2043 private static int /*unsigned byte*/ mergeOrdered(char[] source, 2044 int start, 2045 int current, 2046 char[] data, 2047 int next, 2048 int limit) { 2049 int r; 2050 int /*unsigned byte*/ cc, trailCC=0; 2051 boolean adjacent; 2052 2053 adjacent= current==next; 2054 NextCCArgs ncArgs = new NextCCArgs(); 2055 ncArgs.source = data; 2056 ncArgs.next = next; 2057 ncArgs.limit = limit; 2058 2059 if(start!=current) { 2060 2061 while(ncArgs.next<ncArgs.limit) { 2062 cc=getNextCC(ncArgs); 2063 if(cc==0) { 2064 // does not bubble back 2065 trailCC=0; 2066 if(adjacent) { 2067 current=ncArgs.next; 2068 } else { 2069 data[current++]=ncArgs.c1; 2070 if(ncArgs.c2!=0) { 2071 data[current++]=ncArgs.c2; 2072 } 2073 } 2074 break; 2075 } else { 2076 r=current+(ncArgs.c2==0 ? 1 : 2); 2077 trailCC=insertOrdered(source,start, current, r, 2078 ncArgs.c1, ncArgs.c2, cc); 2079 current=r; 2080 } 2081 } 2082 } 2083 2084 if(ncArgs.next==ncArgs.limit) { 2085 // we know the cc of the last code point 2086 return trailCC; 2087 } else { 2088 if(!adjacent) { 2089 // copy the second string part 2090 do { 2091 source[current++]=data[ncArgs.next++]; 2092 } while(ncArgs.next!=ncArgs.limit); 2093 ncArgs.limit=current; 2094 } 2095 PrevArgs prevArgs = new PrevArgs(); 2096 prevArgs.src = data; 2097 prevArgs.start = start; 2098 prevArgs.current = ncArgs.limit; 2099 return getPrevCC(prevArgs); 2100 } 2101 2102 } 2103 private static final class PrevArgs{ 2104 char[] src; 2105 int start; 2106 int current; 2107 char c1; 2108 char c2; 2109 } 2110 2111 private static final class NextCCArgs{ 2112 char[] source; 2113 int next; 2114 int limit; 2115 char c1; 2116 char c2; 2117 } 2118 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 2119 args.c1=args.source[args.next++]; 2120 args.c2=0; 2121 2122 if (UTF16.isTrailSurrogate(args.c1)) { 2123 /* unpaired second surrogate */ 2124 return 0; 2125 } else if (!UTF16.isLeadSurrogate(args.c1)) { 2126 return UCharacter.getCombiningClass(args.c1); 2127 } else if (args.next!=args.limit && 2128 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 2129 ++args.next; 2130 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); 2131 } else { 2132 /* unpaired first surrogate */ 2133 args.c2=0; 2134 return 0; 2135 } 2136 } 2137 private static int /*unsigned*/ getPrevCC(PrevArgs args) { 2138 args.c1=args.src[--args.current]; 2139 args.c2=0; 2140 2141 if (args.c1 < MIN_CCC_LCCC_CP) { 2142 return 0; 2143 } else if (UTF16.isLeadSurrogate(args.c1)) { 2144 /* unpaired first surrogate */ 2145 return 0; 2146 } else if (!UTF16.isTrailSurrogate(args.c1)) { 2147 return UCharacter.getCombiningClass(args.c1); 2148 } else if (args.current!=args.start && 2149 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 2150 --args.current; 2151 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); 2152 } else { 2153 /* unpaired second surrogate */ 2154 args.c2=0; 2155 return 0; 2156 } 2157 } 2158 2159 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2160 if (start == p) { 2161 return 0; 2162 } 2163 return getFCD16(Character.codePointBefore(s, p)); 2164 } 2165 2166 private VersionInfo dataVersion; 2167 2168 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2169 private int minDecompNoCP; 2170 private int minCompNoMaybeCP; 2171 private int minLcccCP; 2172 2173 // Norm16 value thresholds for quick check combinations and types of extra data. 2174 private int minYesNo; 2175 private int minYesNoMappingsOnly; 2176 private int minNoNo; 2177 private int minNoNoCompBoundaryBefore; 2178 private int minNoNoCompNoMaybeCC; 2179 private int minNoNoEmpty; 2180 private int limitNoNo; 2181 private int centerNoNoDelta; 2182 private int minMaybeYes; 2183 2184 private CodePointTrie.Fast16 normTrie; 2185 private String maybeYesCompositions; 2186 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2187 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2188 }