1 /* 2 * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 package jdk.internal.icu.impl; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 37 import jdk.internal.icu.lang.UCharacter; 38 import jdk.internal.icu.text.Normalizer2; 39 import jdk.internal.icu.text.UTF16; 40 import jdk.internal.icu.util.CodePointTrie; 41 import jdk.internal.icu.util.VersionInfo; 42 43 // Original filename in ICU4J: Normalizer2Impl.java 44 public final class NormalizerImpl { 45 public static final class Hangul { 46 /* Korean Hangul and Jamo constants */ 47 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 48 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 49 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 50 51 public static final int HANGUL_BASE=0xac00; 52 public static final int HANGUL_END=0xd7a3; 53 54 public static final int JAMO_L_COUNT=19; 55 public static final int JAMO_V_COUNT=21; 56 public static final int JAMO_T_COUNT=28; 57 58 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 59 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 60 61 public static boolean isHangul(int c) { 62 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 63 } 64 public static boolean isHangulLV(int c) { 65 c-=HANGUL_BASE; 66 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 67 } 68 69 /** 70 * Decomposes c, which must be a Hangul syllable, into buffer 71 * and returns the length of the decomposition (2 or 3). 72 */ 73 public static int decompose(int c, Appendable buffer) { 74 try { 75 c-=HANGUL_BASE; 76 int c2=c%JAMO_T_COUNT; 77 c/=JAMO_T_COUNT; 78 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 79 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 80 if(c2==0) { 81 return 2; 82 } else { 83 buffer.append((char)(JAMO_T_BASE+c2)); 84 return 3; 85 } 86 } catch(IOException e) { 87 throw new InternalError(e); 88 } 89 } 90 } 91 92 /** 93 * Writable buffer that takes care of canonical ordering. 94 * Its Appendable methods behave like the C++ implementation's 95 * appendZeroCC() methods. 96 * <p> 97 * If dest is a StringBuilder, then the buffer writes directly to it. 98 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 99 * until no further changes are necessary and whole segments are appended. 100 * append() methods that take combining-class values always write to the StringBuilder. 101 * Other append() methods flush and append to the Appendable. 102 */ 103 public static final class ReorderingBuffer implements Appendable { 104 public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { 105 impl=ni; 106 app=dest; 107 if (app instanceof StringBuilder) { 108 appIsStringBuilder=true; 109 str=(StringBuilder)dest; 110 // In Java, the constructor subsumes public void init(int destCapacity) 111 str.ensureCapacity(destCapacity); 112 reorderStart=0; 113 if(str.length()==0) { 114 lastCC=0; 115 } else { 116 setIterator(); 117 lastCC=previousCC(); 118 // Set reorderStart after the last code point with cc<=1 if there is one. 119 if(lastCC>1) { 120 while(previousCC()>1) {} 121 } 122 reorderStart=codePointLimit; 123 } 124 } else { 125 appIsStringBuilder=false; 126 str=new StringBuilder(); 127 reorderStart=0; 128 lastCC=0; 129 } 130 } 131 132 public boolean isEmpty() { return str.length()==0; } 133 public int length() { return str.length(); } 134 public int getLastCC() { return lastCC; } 135 136 public StringBuilder getStringBuilder() { return str; } 137 138 public boolean equals(CharSequence s, int start, int limit) { 139 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 140 } 141 142 public void append(int c, int cc) { 143 if(lastCC<=cc || cc==0) { 144 str.appendCodePoint(c); 145 lastCC=cc; 146 if(cc<=1) { 147 reorderStart=str.length(); 148 } 149 } else { 150 insert(c, cc); 151 } 152 } 153 public void append(CharSequence s, int start, int limit, boolean isNFD, 154 int leadCC, int trailCC) { 155 if(start==limit) { 156 return; 157 } 158 if(lastCC<=leadCC || leadCC==0) { 159 if(trailCC<=1) { 160 reorderStart=str.length()+(limit-start); 161 } else if(leadCC<=1) { 162 reorderStart=str.length()+1; // Ok if not a code point boundary. 163 } 164 str.append(s, start, limit); 165 lastCC=trailCC; 166 } else { 167 int c=Character.codePointAt(s, start); 168 start+=Character.charCount(c); 169 insert(c, leadCC); // insert first code point 170 while(start<limit) { 171 c=Character.codePointAt(s, start); 172 start+=Character.charCount(c); 173 if(start<limit) { 174 if (isNFD) { 175 leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); 176 } else { 177 leadCC = impl.getCC(impl.getNorm16(c)); 178 } 179 } else { 180 leadCC=trailCC; 181 } 182 append(c, leadCC); 183 } 184 } 185 } 186 // The following append() methods work like C++ appendZeroCC(). 187 // They assume that the cc or trailCC of their input is 0. 188 // Most of them implement Appendable interface methods. 189 @Override 190 public ReorderingBuffer append(char c) { 191 str.append(c); 192 lastCC=0; 193 reorderStart=str.length(); 194 return this; 195 } 196 public void appendZeroCC(int c) { 197 str.appendCodePoint(c); 198 lastCC=0; 199 reorderStart=str.length(); 200 } 201 @Override 202 public ReorderingBuffer append(CharSequence s) { 203 if(s.length()!=0) { 204 str.append(s); 205 lastCC=0; 206 reorderStart=str.length(); 207 } 208 return this; 209 } 210 @Override 211 public ReorderingBuffer append(CharSequence s, int start, int limit) { 212 if(start!=limit) { 213 str.append(s, start, limit); 214 lastCC=0; 215 reorderStart=str.length(); 216 } 217 return this; 218 } 219 /** 220 * Flushes from the intermediate StringBuilder to the Appendable, 221 * if they are different objects. 222 * Used after recomposition. 223 * Must be called at the end when writing to a non-StringBuilder Appendable. 224 */ 225 public void flush() { 226 if(appIsStringBuilder) { 227 reorderStart=str.length(); 228 } else { 229 try { 230 app.append(str); 231 str.setLength(0); 232 reorderStart=0; 233 } catch(IOException e) { 234 throw new InternalError(e); // Avoid declaring "throws IOException". 235 } 236 } 237 lastCC=0; 238 } 239 /** 240 * Flushes from the intermediate StringBuilder to the Appendable, 241 * if they are different objects. 242 * Then appends the new text to the Appendable or StringBuilder. 243 * Normally used after quick check loops find a non-empty sequence. 244 */ 245 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 246 if(appIsStringBuilder) { 247 str.append(s, start, limit); 248 reorderStart=str.length(); 249 } else { 250 try { 251 app.append(str).append(s, start, limit); 252 str.setLength(0); 253 reorderStart=0; 254 } catch(IOException e) { 255 throw new InternalError(e); // Avoid declaring "throws IOException". 256 } 257 } 258 lastCC=0; 259 return this; 260 } 261 public void remove() { 262 str.setLength(0); 263 lastCC=0; 264 reorderStart=0; 265 } 266 public void removeSuffix(int suffixLength) { 267 int oldLength=str.length(); 268 str.delete(oldLength-suffixLength, oldLength); 269 lastCC=0; 270 reorderStart=str.length(); 271 } 272 273 // Inserts c somewhere before the last character. 274 // Requires 0<cc<lastCC which implies reorderStart<limit. 275 private void insert(int c, int cc) { 276 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 277 // insert c at codePointLimit, after the character with prevCC<=cc 278 if(c<=0xffff) { 279 str.insert(codePointLimit, (char)c); 280 if(cc<=1) { 281 reorderStart=codePointLimit+1; 282 } 283 } else { 284 str.insert(codePointLimit, Character.toChars(c)); 285 if(cc<=1) { 286 reorderStart=codePointLimit+2; 287 } 288 } 289 } 290 291 private final NormalizerImpl impl; 292 private final Appendable app; 293 private final StringBuilder str; 294 private final boolean appIsStringBuilder; 295 private int reorderStart; 296 private int lastCC; 297 298 // private backward iterator 299 private void setIterator() { codePointStart=str.length(); } 300 private void skipPrevious() { // Requires 0<codePointStart. 301 codePointLimit=codePointStart; 302 codePointStart=str.offsetByCodePoints(codePointStart, -1); 303 } 304 private int previousCC() { // Returns 0 if there is no previous character. 305 codePointLimit=codePointStart; 306 if(reorderStart>=codePointStart) { 307 return 0; 308 } 309 int c=str.codePointBefore(codePointStart); 310 codePointStart-=Character.charCount(c); 311 return impl.getCCFromYesOrMaybeCP(c); 312 } 313 private int codePointStart, codePointLimit; 314 } 315 316 // TODO: Propose as public API on the UTF16 class. 317 // TODO: Propose widening UTF16 methods that take char to take int. 318 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 319 public static final class UTF16Plus { 320 /** 321 * Is this code point a lead surrogate (U+d800..U+dbff)? 322 * @param c code unit or code point 323 * @return true or false 324 */ 325 public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } 326 /** 327 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 328 * is it a lead surrogate? 329 * @param c code unit or code point 330 * @return true or false 331 */ 332 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 333 334 /** 335 * Compares two CharSequence subsequences for binary equality. 336 * @param s1 first sequence 337 * @param start1 start offset in first sequence 338 * @param limit1 limit offset in first sequence 339 * @param s2 second sequence 340 * @param start2 start offset in second sequence 341 * @param limit2 limit offset in second sequence 342 * @return true if s1.subSequence(start1, limit1) contains the same text 343 * as s2.subSequence(start2, limit2) 344 */ 345 public static boolean equal(CharSequence s1, int start1, int limit1, 346 CharSequence s2, int start2, int limit2) { 347 if((limit1-start1)!=(limit2-start2)) { 348 return false; 349 } 350 if(s1==s2 && start1==start2) { 351 return true; 352 } 353 while(start1<limit1) { 354 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 355 return false; 356 } 357 } 358 return true; 359 } 360 } 361 362 public NormalizerImpl() {} 363 364 private static final class IsAcceptable implements ICUBinary.Authenticate { 365 public boolean isDataVersionAcceptable(byte version[]) { 366 return version[0]==4; 367 } 368 } 369 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 370 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 371 372 public NormalizerImpl load(ByteBuffer bytes) { 373 try { 374 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 375 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 376 if(indexesLength<=IX_MIN_LCCC_CP) { 377 throw new InternalError("Normalizer2 data: not enough indexes"); 378 } 379 int[] inIndexes=new int[indexesLength]; 380 inIndexes[0]=indexesLength*4; 381 for(int i=1; i<indexesLength; ++i) { 382 inIndexes[i]=bytes.getInt(); 383 } 384 385 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 386 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 387 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 388 389 minYesNo=inIndexes[IX_MIN_YES_NO]; 390 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 391 minNoNo=inIndexes[IX_MIN_NO_NO]; 392 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 393 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 394 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 395 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 396 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 397 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 398 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 399 400 // Read the normTrie. 401 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 402 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 403 int triePosition = bytes.position(); 404 normTrie = CodePointTrie.Fast16.fromBinary(bytes); 405 int trieLength = bytes.position() - triePosition; 406 if(trieLength>(nextOffset-offset)) { 407 throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); 408 } 409 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 410 411 // Read the composition and mapping data. 412 offset=nextOffset; 413 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 414 int numChars=(nextOffset-offset)/2; 415 if(numChars!=0) { 416 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 417 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 418 } 419 420 // smallFCD: new in formatVersion 2 421 offset=nextOffset; 422 smallFCD=new byte[0x100]; 423 bytes.get(smallFCD); 424 425 return this; 426 } catch(IOException e) { 427 throw new InternalError(e); 428 } 429 } 430 public NormalizerImpl load(String name) { 431 return load(ICUBinary.getRequiredData(name)); 432 } 433 434 // The trie stores values for lead surrogate code *units*. 435 // Surrogate code *points* are inert. 436 public int getNorm16(int c) { 437 return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); 438 } 439 public int getRawNorm16(int c) { return normTrie.get(c); } 440 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } 441 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } 442 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 443 444 public int getCC(int norm16) { 445 if(norm16>=MIN_NORMAL_MAYBE_YES) { 446 return getCCFromNormalYesOrMaybe(norm16); 447 } 448 if(norm16<minNoNo || limitNoNo<=norm16) { 449 return 0; 450 } 451 return getCCFromNoNo(norm16); 452 } 453 public static int getCCFromNormalYesOrMaybe(int norm16) { 454 return (norm16 >> OFFSET_SHIFT) & 0xff; 455 } 456 public static int getCCFromYesOrMaybe(int norm16) { 457 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 458 } 459 public int getCCFromYesOrMaybeCP(int c) { 460 if (c < minCompNoMaybeCP) { return 0; } 461 return getCCFromYesOrMaybe(getNorm16(c)); 462 } 463 464 /** 465 * Returns the FCD data for code point c. 466 * @param c A Unicode code point. 467 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 468 */ 469 public int getFCD16(int c) { 470 if(c<minDecompNoCP) { 471 return 0; 472 } else if(c<=0xffff) { 473 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 474 } 475 return getFCD16FromNormData(c); 476 } 477 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 478 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 479 // 0<=lead<=0xffff 480 byte bits=smallFCD[lead>>8]; 481 if(bits==0) { return false; } 482 return ((bits>>((lead>>5)&7))&1)!=0; 483 } 484 485 /** Gets the FCD value from the regular normalization data. */ 486 public int getFCD16FromNormData(int c) { 487 int norm16=getNorm16(c); 488 if (norm16 >= limitNoNo) { 489 if(norm16>=MIN_NORMAL_MAYBE_YES) { 490 // combining mark 491 norm16=getCCFromNormalYesOrMaybe(norm16); 492 return norm16|(norm16<<8); 493 } else if(norm16>=minMaybeYes) { 494 return 0; 495 } else { // isDecompNoAlgorithmic(norm16) 496 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 497 if (deltaTrailCC <= DELTA_TCCC_1) { 498 return deltaTrailCC >> OFFSET_SHIFT; 499 } 500 // Maps to an isCompYesAndZeroCC. 501 c=mapAlgorithmic(c, norm16); 502 norm16=getRawNorm16(c); 503 } 504 } 505 if(norm16<=minYesNo || isHangulLVT(norm16)) { 506 // no decomposition or Hangul syllable, all zeros 507 return 0; 508 } 509 // c decomposes, get everything from the variable-length extra data 510 int mapping=norm16>>OFFSET_SHIFT; 511 int firstUnit=extraData.charAt(mapping); 512 int fcd16=firstUnit>>8; // tccc 513 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 514 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 515 } 516 return fcd16; 517 } 518 519 /** 520 * Gets the decomposition for one code point. 521 * @param c code point 522 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 523 */ 524 public String getDecomposition(int c) { 525 int norm16; 526 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 527 // c does not decompose 528 return null; 529 } 530 int decomp = -1; 531 if(isDecompNoAlgorithmic(norm16)) { 532 // Maps to an isCompYesAndZeroCC. 533 decomp=c=mapAlgorithmic(c, norm16); 534 // The mapping might decompose further. 535 norm16 = getRawNorm16(c); 536 } 537 if (norm16 < minYesNo) { 538 if(decomp<0) { 539 return null; 540 } else { 541 return UTF16.valueOf(decomp); 542 } 543 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 544 // Hangul syllable: decompose algorithmically 545 StringBuilder buffer=new StringBuilder(); 546 Hangul.decompose(c, buffer); 547 return buffer.toString(); 548 } 549 // c decomposes, get everything from the variable-length extra data 550 int mapping=norm16>>OFFSET_SHIFT; 551 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 552 return extraData.substring(mapping, mapping+length); 553 } 554 555 // Fixed norm16 values. 556 public static final int MIN_YES_YES_WITH_CC=0xfe02; 557 public static final int JAMO_VT=0xfe00; 558 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 559 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 560 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 561 562 // norm16 bit 0 is comp-boundary-after. 563 public static final int HAS_COMP_BOUNDARY_AFTER=1; 564 public static final int OFFSET_SHIFT=1; 565 566 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 567 // tccc (0, 1, >1) for quick FCC boundary-after tests. 568 public static final int DELTA_TCCC_0=0; 569 public static final int DELTA_TCCC_1=2; 570 public static final int DELTA_TCCC_GT_1=4; 571 public static final int DELTA_TCCC_MASK=6; 572 public static final int DELTA_SHIFT=3; 573 574 public static final int MAX_DELTA=0x40; 575 576 // Byte offsets from the start of the data, after the generic header. 577 public static final int IX_NORM_TRIE_OFFSET=0; 578 public static final int IX_EXTRA_DATA_OFFSET=1; 579 public static final int IX_SMALL_FCD_OFFSET=2; 580 public static final int IX_RESERVED3_OFFSET=3; 581 public static final int IX_TOTAL_SIZE=7; 582 public static final int MIN_CCC_LCCC_CP=0x300; 583 // Code point thresholds for quick check codes. 584 public static final int IX_MIN_DECOMP_NO_CP=8; 585 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 586 587 // Norm16 value thresholds for quick check combinations and types of extra data. 588 589 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 590 public static final int IX_MIN_YES_NO=10; 591 /** Mappings are comp-normalized. */ 592 public static final int IX_MIN_NO_NO=11; 593 public static final int IX_LIMIT_NO_NO=12; 594 public static final int IX_MIN_MAYBE_YES=13; 595 596 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 597 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 598 /** Mappings are not comp-normalized but have a comp boundary before. */ 599 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 600 /** Mappings do not have a comp boundary before. */ 601 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 602 /** Mappings to the empty string. */ 603 public static final int IX_MIN_NO_NO_EMPTY=17; 604 605 public static final int IX_MIN_LCCC_CP=18; 606 public static final int IX_COUNT=20; 607 608 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 609 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 610 // unused bit 0x20; 611 public static final int MAPPING_LENGTH_MASK=0x1f; 612 613 public static final int COMP_1_LAST_TUPLE=0x8000; 614 public static final int COMP_1_TRIPLE=1; 615 public static final int COMP_1_TRAIL_LIMIT=0x3400; 616 public static final int COMP_1_TRAIL_MASK=0x7ffe; 617 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 618 public static final int COMP_2_TRAIL_SHIFT=6; 619 public static final int COMP_2_TRAIL_MASK=0xffc0; 620 621 // higher-level functionality ------------------------------------------ *** 622 623 /** 624 * Decomposes s[src, limit[ and writes the result to dest. 625 * limit can be NULL if src is NUL-terminated. 626 * destLengthEstimate is the initial dest buffer capacity and can be -1. 627 */ 628 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 629 int destLengthEstimate) { 630 if(destLengthEstimate<0) { 631 destLengthEstimate=limit-src; 632 } 633 dest.setLength(0); 634 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 635 decompose(s, src, limit, buffer); 636 } 637 638 // Dual functionality: 639 // buffer!=NULL: normalize 640 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 641 public int decompose(CharSequence s, int src, int limit, 642 ReorderingBuffer buffer) { 643 int minNoCP=minDecompNoCP; 644 645 int prevSrc; 646 int c=0; 647 int norm16=0; 648 649 // only for quick check 650 int prevBoundary=src; 651 int prevCC=0; 652 653 for(;;) { 654 // count code units below the minimum or with irrelevant data for the quick check 655 for(prevSrc=src; src!=limit;) { 656 if( (c=s.charAt(src))<minNoCP || 657 isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 658 ) { 659 ++src; 660 } else if(!UTF16Plus.isLeadSurrogate(c)) { 661 break; 662 } else { 663 char c2; 664 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 665 c = Character.toCodePoint((char)c, c2); 666 norm16 = normTrie.suppGet(c); 667 if (isMostDecompYesAndZeroCC(norm16)) { 668 src += 2; 669 } else { 670 break; 671 } 672 } else { 673 ++src; // unpaired lead surrogate: inert 674 } 675 } 676 } 677 // copy these code units all at once 678 if(src!=prevSrc) { 679 if(buffer!=null) { 680 buffer.flushAndAppendZeroCC(s, prevSrc, src); 681 } else { 682 prevCC=0; 683 prevBoundary=src; 684 } 685 } 686 if(src==limit) { 687 break; 688 } 689 690 // Check one above-minimum, relevant code point. 691 src+=Character.charCount(c); 692 if(buffer!=null) { 693 decompose(c, norm16, buffer); 694 } else { 695 if(isDecompYes(norm16)) { 696 int cc=getCCFromYesOrMaybe(norm16); 697 if(prevCC<=cc || cc==0) { 698 prevCC=cc; 699 if(cc<=1) { 700 prevBoundary=src; 701 } 702 continue; 703 } 704 } 705 return prevBoundary; // "no" or cc out of order 706 } 707 } 708 return src; 709 } 710 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 711 int limit=s.length(); 712 if(limit==0) { 713 return; 714 } 715 if(doDecompose) { 716 decompose(s, 0, limit, buffer); 717 return; 718 } 719 // Just merge the strings at the boundary. 720 int c=Character.codePointAt(s, 0); 721 int src=0; 722 int firstCC, prevCC, cc; 723 firstCC=prevCC=cc=getCC(getNorm16(c)); 724 while(cc!=0) { 725 prevCC=cc; 726 src+=Character.charCount(c); 727 if(src>=limit) { 728 break; 729 } 730 c=Character.codePointAt(s, src); 731 cc=getCC(getNorm16(c)); 732 }; 733 buffer.append(s, 0, src, false, firstCC, prevCC); 734 buffer.append(s, src, limit); 735 } 736 737 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 738 // doCompose: normalize 739 // !doCompose: isNormalized (buffer must be empty and initialized) 740 public boolean compose(CharSequence s, int src, int limit, 741 boolean onlyContiguous, 742 boolean doCompose, 743 ReorderingBuffer buffer) { 744 int prevBoundary=src; 745 int minNoMaybeCP=minCompNoMaybeCP; 746 747 for (;;) { 748 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 749 // or with (compYes && ccc==0) properties. 750 int prevSrc; 751 int c = 0; 752 int norm16 = 0; 753 for (;;) { 754 if (src == limit) { 755 if (prevBoundary != limit && doCompose) { 756 buffer.append(s, prevBoundary, limit); 757 } 758 return true; 759 } 760 if( (c=s.charAt(src))<minNoMaybeCP || 761 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 762 ) { 763 ++src; 764 } else { 765 prevSrc = src++; 766 if (!UTF16Plus.isLeadSurrogate(c)) { 767 break; 768 } else { 769 char c2; 770 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 771 ++src; 772 c = Character.toCodePoint((char)c, c2); 773 norm16 = normTrie.suppGet(c); 774 if (!isCompYesAndZeroCC(norm16)) { 775 break; 776 } 777 } 778 } 779 } 780 } 781 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 782 // The current character is either a "noNo" (has a mapping) 783 // or a "maybeYes" (combines backward) 784 // or a "yesYes" with ccc!=0. 785 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 786 787 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 788 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 789 if (!doCompose) { 790 return false; 791 } 792 // Fast path for mapping a character that is immediately surrounded by boundaries. 793 // In this case, we need not decompose around the current character. 794 if (isDecompNoAlgorithmic(norm16)) { 795 // Maps to a single isCompYesAndZeroCC character 796 // which also implies hasCompBoundaryBefore. 797 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 798 hasCompBoundaryBefore(s, src, limit)) { 799 if (prevBoundary != prevSrc) { 800 buffer.append(s, prevBoundary, prevSrc); 801 } 802 buffer.append(mapAlgorithmic(c, norm16), 0); 803 prevBoundary = src; 804 continue; 805 } 806 } else if (norm16 < minNoNoCompBoundaryBefore) { 807 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 808 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 809 hasCompBoundaryBefore(s, src, limit)) { 810 if (prevBoundary != prevSrc) { 811 buffer.append(s, prevBoundary, prevSrc); 812 } 813 int mapping = norm16 >> OFFSET_SHIFT; 814 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 815 buffer.append(extraData, mapping, mapping + length); 816 prevBoundary = src; 817 continue; 818 } 819 } else if (norm16 >= minNoNoEmpty) { 820 // The current character maps to nothing. 821 // Simply omit it from the output if there is a boundary before _or_ after it. 822 // The character itself implies no boundaries. 823 if (hasCompBoundaryBefore(s, src, limit) || 824 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 825 if (prevBoundary != prevSrc) { 826 buffer.append(s, prevBoundary, prevSrc); 827 } 828 prevBoundary = src; 829 continue; 830 } 831 } 832 // Other "noNo" type, or need to examine more text around this character: 833 // Fall through to the slow path. 834 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 835 char prev=s.charAt(prevSrc-1); 836 if(c<Hangul.JAMO_T_BASE) { 837 // The current character is a Jamo Vowel, 838 // compose with previous Jamo L and following Jamo T. 839 char l = (char)(prev-Hangul.JAMO_L_BASE); 840 if(l<Hangul.JAMO_L_COUNT) { 841 if (!doCompose) { 842 return false; 843 } 844 int t; 845 if (src != limit && 846 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 847 t < Hangul.JAMO_T_COUNT) { 848 // The next character is a Jamo T. 849 ++src; 850 } else if (hasCompBoundaryBefore(s, src, limit)) { 851 // No Jamo T follows, not even via decomposition. 852 t = 0; 853 } else { 854 t = -1; 855 } 856 if (t >= 0) { 857 int syllable = Hangul.HANGUL_BASE + 858 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 859 Hangul.JAMO_T_COUNT + t; 860 --prevSrc; // Replace the Jamo L as well. 861 if (prevBoundary != prevSrc) { 862 buffer.append(s, prevBoundary, prevSrc); 863 } 864 buffer.append((char)syllable); 865 prevBoundary = src; 866 continue; 867 } 868 // If we see L+V+x where x!=T then we drop to the slow path, 869 // decompose and recompose. 870 // This is to deal with NFKC finding normal L and V but a 871 // compatibility variant of a T. 872 // We need to either fully compose that combination here 873 // (which would complicate the code and may not work with strange custom data) 874 // or use the slow path. 875 } 876 } else if (Hangul.isHangulLV(prev)) { 877 // The current character is a Jamo Trailing consonant, 878 // compose with previous Hangul LV that does not contain a Jamo T. 879 if (!doCompose) { 880 return false; 881 } 882 int syllable = prev + c - Hangul.JAMO_T_BASE; 883 --prevSrc; // Replace the Hangul LV as well. 884 if (prevBoundary != prevSrc) { 885 buffer.append(s, prevBoundary, prevSrc); 886 } 887 buffer.append((char)syllable); 888 prevBoundary = src; 889 continue; 890 } 891 // No matching context, or may need to decompose surrounding text first: 892 // Fall through to the slow path. 893 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 894 // One or more combining marks that do not combine-back: 895 // Check for canonical order, copy unchanged if ok and 896 // if followed by a character with a boundary-before. 897 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 898 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 899 // Fails FCD test, need to decompose and contiguously recompose. 900 if (!doCompose) { 901 return false; 902 } 903 } else { 904 // If !onlyContiguous (not FCC), then we ignore the tccc of 905 // the previous character which passed the quick check "yes && ccc==0" test. 906 int n16; 907 for (;;) { 908 if (src == limit) { 909 if (doCompose) { 910 buffer.append(s, prevBoundary, limit); 911 } 912 return true; 913 } 914 int prevCC = cc; 915 c = Character.codePointAt(s, src); 916 n16 = normTrie.get(c); 917 if (n16 >= MIN_YES_YES_WITH_CC) { 918 cc = getCCFromNormalYesOrMaybe(n16); 919 if (prevCC > cc) { 920 if (!doCompose) { 921 return false; 922 } 923 break; 924 } 925 } else { 926 break; 927 } 928 src += Character.charCount(c); 929 } 930 // p is after the last in-order combining mark. 931 // If there is a boundary here, then we continue with no change. 932 if (norm16HasCompBoundaryBefore(n16)) { 933 if (isCompYesAndZeroCC(n16)) { 934 src += Character.charCount(c); 935 } 936 continue; 937 } 938 // Use the slow path. There is no boundary in [prevSrc, src[. 939 } 940 } 941 942 // Slow path: Find the nearest boundaries around the current character, 943 // decompose and recompose. 944 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 945 c = Character.codePointBefore(s, prevSrc); 946 norm16 = normTrie.get(c); 947 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 948 prevSrc -= Character.charCount(c); 949 } 950 } 951 if (doCompose && prevBoundary != prevSrc) { 952 buffer.append(s, prevBoundary, prevSrc); 953 } 954 int recomposeStartIndex=buffer.length(); 955 // We know there is not a boundary here. 956 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 957 buffer); 958 // Decompose until the next boundary. 959 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 960 buffer); 961 recompose(buffer, recomposeStartIndex, onlyContiguous); 962 if(!doCompose) { 963 if(!buffer.equals(s, prevSrc, src)) { 964 return false; 965 } 966 buffer.remove(); 967 } 968 prevBoundary=src; 969 } 970 } 971 972 /** 973 * Very similar to compose(): Make the same changes in both places if relevant. 974 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 975 * !doSpan: quickCheck 976 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 977 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 978 * then the quick check result is "no" 979 */ 980 public int composeQuickCheck(CharSequence s, int src, int limit, 981 boolean onlyContiguous, boolean doSpan) { 982 int qcResult=0; 983 int prevBoundary=src; 984 int minNoMaybeCP=minCompNoMaybeCP; 985 986 for(;;) { 987 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 988 // or with (compYes && ccc==0) properties. 989 int prevSrc; 990 int c = 0; 991 int norm16 = 0; 992 for (;;) { 993 if(src==limit) { 994 return (src<<1)|qcResult; // "yes" or "maybe" 995 } 996 if( (c=s.charAt(src))<minNoMaybeCP || 997 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 998 ) { 999 ++src; 1000 } else { 1001 prevSrc = src++; 1002 if (!UTF16Plus.isLeadSurrogate(c)) { 1003 break; 1004 } else { 1005 char c2; 1006 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1007 ++src; 1008 c = Character.toCodePoint((char)c, c2); 1009 norm16 = normTrie.suppGet(c); 1010 if (!isCompYesAndZeroCC(norm16)) { 1011 break; 1012 } 1013 } 1014 } 1015 } 1016 } 1017 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1018 // The current character is either a "noNo" (has a mapping) 1019 // or a "maybeYes" (combines backward) 1020 // or a "yesYes" with ccc!=0. 1021 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1022 1023 int prevNorm16 = INERT; 1024 if (prevBoundary != prevSrc) { 1025 prevBoundary = prevSrc; 1026 if (!norm16HasCompBoundaryBefore(norm16)) { 1027 c = Character.codePointBefore(s, prevSrc); 1028 int n16 = getNorm16(c); 1029 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1030 prevBoundary -= Character.charCount(c); 1031 prevNorm16 = n16; 1032 } 1033 } 1034 } 1035 1036 if(isMaybeOrNonZeroCC(norm16)) { 1037 int cc=getCCFromYesOrMaybe(norm16); 1038 if (onlyContiguous /* FCC */ && cc != 0 && 1039 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1040 // The [prevBoundary..prevSrc[ character 1041 // passed the quick check "yes && ccc==0" test 1042 // but is out of canonical order with the current combining mark. 1043 } else { 1044 // If !onlyContiguous (not FCC), then we ignore the tccc of 1045 // the previous character which passed the quick check "yes && ccc==0" test. 1046 for (;;) { 1047 if (norm16 < MIN_YES_YES_WITH_CC) { 1048 if (!doSpan) { 1049 qcResult = 1; 1050 } else { 1051 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1052 } 1053 } 1054 if (src == limit) { 1055 return (src<<1) | qcResult; // "yes" or "maybe" 1056 } 1057 int prevCC = cc; 1058 c = Character.codePointAt(s, src); 1059 norm16 = getNorm16(c); 1060 if (isMaybeOrNonZeroCC(norm16)) { 1061 cc = getCCFromYesOrMaybe(norm16); 1062 if (!(prevCC <= cc || cc == 0)) { 1063 break; 1064 } 1065 } else { 1066 break; 1067 } 1068 src += Character.charCount(c); 1069 } 1070 // src is after the last in-order combining mark. 1071 if (isCompYesAndZeroCC(norm16)) { 1072 prevBoundary = src; 1073 src += Character.charCount(c); 1074 continue; 1075 } 1076 } 1077 } 1078 return prevBoundary<<1; // "no" 1079 } 1080 } 1081 public void composeAndAppend(CharSequence s, 1082 boolean doCompose, 1083 boolean onlyContiguous, 1084 ReorderingBuffer buffer) { 1085 int src=0, limit=s.length(); 1086 if(!buffer.isEmpty()) { 1087 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1088 if(0!=firstStarterInSrc) { 1089 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1090 buffer.length(), onlyContiguous); 1091 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1092 firstStarterInSrc+16); 1093 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1094 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1095 middle.append(s, 0, firstStarterInSrc); 1096 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1097 src=firstStarterInSrc; 1098 } 1099 } 1100 if(doCompose) { 1101 compose(s, src, limit, onlyContiguous, true, buffer); 1102 } else { 1103 buffer.append(s, src, limit); 1104 } 1105 } 1106 // Dual functionality: 1107 // buffer!=NULL: normalize 1108 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1109 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1110 // Note: In this function we use buffer->appendZeroCC() because we track 1111 // the lead and trail combining classes here, rather than leaving it to 1112 // the ReorderingBuffer. 1113 // The exception is the call to decomposeShort() which uses the buffer 1114 // in the normal way. 1115 1116 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1117 // Similar to the prevBoundary in the compose() implementation. 1118 int prevBoundary=src; 1119 int prevSrc; 1120 int c=0; 1121 int prevFCD16=0; 1122 int fcd16=0; 1123 1124 for(;;) { 1125 // count code units with lccc==0 1126 for(prevSrc=src; src!=limit;) { 1127 if((c=s.charAt(src))<minLcccCP) { 1128 prevFCD16=~c; 1129 ++src; 1130 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1131 prevFCD16=0; 1132 ++src; 1133 } else { 1134 if (UTF16Plus.isLeadSurrogate(c)) { 1135 char c2; 1136 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1137 c = Character.toCodePoint((char)c, c2); 1138 } 1139 } 1140 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1141 prevFCD16=fcd16; 1142 src+=Character.charCount(c); 1143 } else { 1144 break; 1145 } 1146 } 1147 } 1148 // copy these code units all at once 1149 if(src!=prevSrc) { 1150 if(src==limit) { 1151 if(buffer!=null) { 1152 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1153 } 1154 break; 1155 } 1156 prevBoundary=src; 1157 // We know that the previous character's lccc==0. 1158 if(prevFCD16<0) { 1159 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1160 int prev=~prevFCD16; 1161 if(prev<minDecompNoCP) { 1162 prevFCD16=0; 1163 } else { 1164 prevFCD16=getFCD16FromNormData(prev); 1165 if(prevFCD16>1) { 1166 --prevBoundary; 1167 } 1168 } 1169 } else { 1170 int p=src-1; 1171 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1172 Character.isHighSurrogate(s.charAt(p-1)) 1173 ) { 1174 --p; 1175 // Need to fetch the previous character's FCD value because 1176 // prevFCD16 was just for the trail surrogate code point. 1177 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1178 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1179 } 1180 if(prevFCD16>1) { 1181 prevBoundary=p; 1182 } 1183 } 1184 if(buffer!=null) { 1185 // The last lccc==0 character is excluded from the 1186 // flush-and-append call in case it needs to be modified. 1187 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1188 buffer.append(s, prevBoundary, src); 1189 } 1190 // The start of the current character (c). 1191 prevSrc=src; 1192 } else if(src==limit) { 1193 break; 1194 } 1195 1196 src+=Character.charCount(c); 1197 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1198 // Check for proper order, and decompose locally if necessary. 1199 if((prevFCD16&0xff)<=(fcd16>>8)) { 1200 // proper order: prev tccc <= current lccc 1201 if((fcd16&0xff)<=1) { 1202 prevBoundary=src; 1203 } 1204 if(buffer!=null) { 1205 buffer.appendZeroCC(c); 1206 } 1207 prevFCD16=fcd16; 1208 continue; 1209 } else if(buffer==null) { 1210 return prevBoundary; // quick check "no" 1211 } else { 1212 /* 1213 * Back out the part of the source that we copied or appended 1214 * already but is now going to be decomposed. 1215 * prevSrc is set to after what was copied/appended. 1216 */ 1217 buffer.removeSuffix(prevSrc-prevBoundary); 1218 /* 1219 * Find the part of the source that needs to be decomposed, 1220 * up to the next safe boundary. 1221 */ 1222 src=findNextFCDBoundary(s, src, limit); 1223 /* 1224 * The source text does not fulfill the conditions for FCD. 1225 * Decompose and reorder a limited piece of the text. 1226 */ 1227 decomposeShort(s, prevBoundary, src, false, false, buffer); 1228 prevBoundary=src; 1229 prevFCD16=0; 1230 } 1231 } 1232 return src; 1233 } 1234 1235 public boolean hasDecompBoundaryBefore(int c) { 1236 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1237 norm16HasDecompBoundaryBefore(getNorm16(c)); 1238 } 1239 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1240 if (norm16 < minNoNoCompNoMaybeCC) { 1241 return true; 1242 } 1243 if (norm16 >= limitNoNo) { 1244 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1245 } 1246 // c decomposes, get everything from the variable-length extra data 1247 int mapping=norm16>>OFFSET_SHIFT; 1248 int firstUnit=extraData.charAt(mapping); 1249 // true if leadCC==0 (hasFCDBoundaryBefore()) 1250 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1251 } 1252 public boolean hasDecompBoundaryAfter(int c) { 1253 if (c < minDecompNoCP) { 1254 return true; 1255 } 1256 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1257 return true; 1258 } 1259 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1260 } 1261 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1262 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1263 return true; 1264 } 1265 if (norm16 >= limitNoNo) { 1266 if (isMaybeOrNonZeroCC(norm16)) { 1267 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1268 } 1269 // Maps to an isCompYesAndZeroCC. 1270 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1271 } 1272 // c decomposes, get everything from the variable-length extra data 1273 int mapping=norm16>>OFFSET_SHIFT; 1274 int firstUnit=extraData.charAt(mapping); 1275 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1276 // fcd16<=1 || trailCC==0 1277 if(firstUnit>0x1ff) { 1278 return false; // trailCC>1 1279 } 1280 if(firstUnit<=0xff) { 1281 return true; // trailCC==0 1282 } 1283 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1284 // true if leadCC==0 (hasFCDBoundaryBefore()) 1285 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1286 } 1287 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1288 1289 public boolean hasCompBoundaryBefore(int c) { 1290 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1291 } 1292 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1293 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1294 } 1295 1296 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1297 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } 1298 private static boolean isInert(int norm16) { return norm16==INERT; } 1299 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } 1300 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 1301 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } 1302 private boolean isHangulLVT(int norm16) { 1303 return norm16==hangulLVT(); 1304 } 1305 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1306 // UBool isCompYes(uint16_t norm16) const { 1307 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1308 // } 1309 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1310 // return norm16<minNoNo || minMaybeYes<=norm16; 1311 // } 1312 // private boolean hasZeroCCFromDecompYes(int norm16) { 1313 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1314 // } 1315 private boolean isDecompYesAndZeroCC(int norm16) { 1316 return norm16<minYesNo || 1317 norm16==JAMO_VT || 1318 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1319 } 1320 /** 1321 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1322 * the MaybeYes which combine-forward and have ccc=0. 1323 * (Standard Unicode 10 normalization does not have such characters.) 1324 */ 1325 private boolean isMostDecompYesAndZeroCC(int norm16) { 1326 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1327 } 1328 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1329 1330 // For use with isCompYes(). 1331 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1332 // static uint8_t getCCFromYes(uint16_t norm16) { 1333 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1334 // } 1335 private int getCCFromNoNo(int norm16) { 1336 int mapping=norm16>>OFFSET_SHIFT; 1337 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1338 return extraData.charAt(mapping-1)&0xff; 1339 } else { 1340 return 0; 1341 } 1342 } 1343 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1344 if(norm16<=minYesNo) { 1345 return 0; // yesYes and Hangul LV have ccc=tccc=0 1346 } else { 1347 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1348 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1349 } 1350 } 1351 1352 // Requires algorithmic-NoNo. 1353 private int mapAlgorithmic(int c, int norm16) { 1354 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1355 } 1356 1357 // Requires minYesNo<norm16<limitNoNo. 1358 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1359 1360 /** 1361 * @return index into maybeYesCompositions, or -1 1362 */ 1363 private int getCompositionsListForDecompYes(int norm16) { 1364 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1365 return -1; 1366 } else { 1367 if((norm16-=minMaybeYes)<0) { 1368 // norm16<minMaybeYes: index into extraData which is a substring at 1369 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1370 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1371 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1372 } 1373 return norm16>>OFFSET_SHIFT; 1374 } 1375 } 1376 /** 1377 * @return index into maybeYesCompositions 1378 */ 1379 private int getCompositionsListForComposite(int norm16) { 1380 // A composite has both mapping & compositions list. 1381 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1382 int firstUnit=maybeYesCompositions.charAt(list); 1383 return list+ // mapping in maybeYesCompositions 1384 1+ // +1 to skip the first unit with the mapping length 1385 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1386 } 1387 1388 // Decompose a short piece of text which is likely to contain characters that 1389 // fail the quick check loop and/or where the quick check loop's overhead 1390 // is unlikely to be amortized. 1391 // Called by the compose() and makeFCD() implementations. 1392 // Public in Java for collation implementation code. 1393 private int decomposeShort( 1394 CharSequence s, int src, int limit, 1395 boolean stopAtCompBoundary, boolean onlyContiguous, 1396 ReorderingBuffer buffer) { 1397 while(src<limit) { 1398 int c=Character.codePointAt(s, src); 1399 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1400 return src; 1401 } 1402 int norm16 = getNorm16(c); 1403 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1404 return src; 1405 } 1406 src+=Character.charCount(c); 1407 decompose(c, norm16, buffer); 1408 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1409 return src; 1410 } 1411 } 1412 return src; 1413 } 1414 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1415 // get the decomposition and the lead and trail cc's 1416 if (norm16 >= limitNoNo) { 1417 if (isMaybeOrNonZeroCC(norm16)) { 1418 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1419 return; 1420 } 1421 // Maps to an isCompYesAndZeroCC. 1422 c=mapAlgorithmic(c, norm16); 1423 norm16=getRawNorm16(c); 1424 } 1425 if (norm16 < minYesNo) { 1426 // c does not decompose 1427 buffer.append(c, 0); 1428 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1429 // Hangul syllable: decompose algorithmically 1430 Hangul.decompose(c, buffer); 1431 } else { 1432 // c decomposes, get everything from the variable-length extra data 1433 int mapping=norm16>>OFFSET_SHIFT; 1434 int firstUnit=extraData.charAt(mapping); 1435 int length=firstUnit&MAPPING_LENGTH_MASK; 1436 int leadCC, trailCC; 1437 trailCC=firstUnit>>8; 1438 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1439 leadCC=extraData.charAt(mapping-1)>>8; 1440 } else { 1441 leadCC=0; 1442 } 1443 ++mapping; // skip over the firstUnit 1444 buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); 1445 } 1446 } 1447 1448 /** 1449 * Finds the recomposition result for 1450 * a forward-combining "lead" character, 1451 * specified with a pointer to its compositions list, 1452 * and a backward-combining "trail" character. 1453 * 1454 * <p>If the lead and trail characters combine, then this function returns 1455 * the following "compositeAndFwd" value: 1456 * <pre> 1457 * Bits 21..1 composite character 1458 * Bit 0 set if the composite is a forward-combining starter 1459 * </pre> 1460 * otherwise it returns -1. 1461 * 1462 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1463 * encoded as either pairs or triples of 16-bit units. 1464 * The last entry has the high bit of its first unit set. 1465 * 1466 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1467 * A linear search is used. 1468 * 1469 * <p>See normalizer2impl.h for a more detailed description 1470 * of the compositions list format. 1471 */ 1472 private static int combine(String compositions, int list, int trail) { 1473 int key1, firstUnit; 1474 if(trail<COMP_1_TRAIL_LIMIT) { 1475 // trail character is 0..33FF 1476 // result entry may have 2 or 3 units 1477 key1=(trail<<1); 1478 while(key1>(firstUnit=compositions.charAt(list))) { 1479 list+=2+(firstUnit&COMP_1_TRIPLE); 1480 } 1481 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1482 if((firstUnit&COMP_1_TRIPLE)!=0) { 1483 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1484 } else { 1485 return compositions.charAt(list+1); 1486 } 1487 } 1488 } else { 1489 // trail character is 3400..10FFFF 1490 // result entry has 3 units 1491 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1492 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1493 int secondUnit; 1494 for(;;) { 1495 if(key1>(firstUnit=compositions.charAt(list))) { 1496 list+=2+(firstUnit&COMP_1_TRIPLE); 1497 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1498 if(key2>(secondUnit=compositions.charAt(list+1))) { 1499 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1500 break; 1501 } else { 1502 list+=3; 1503 } 1504 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1505 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1506 } else { 1507 break; 1508 } 1509 } else { 1510 break; 1511 } 1512 } 1513 } 1514 return -1; 1515 } 1516 1517 /* 1518 * Recomposes the buffer text starting at recomposeStartIndex 1519 * (which is in NFD - decomposed and canonically ordered), 1520 * and truncates the buffer contents. 1521 * 1522 * Note that recomposition never lengthens the text: 1523 * Any character consists of either one or two code units; 1524 * a composition may contain at most one more code unit than the original starter, 1525 * while the combining mark that is removed has at least one code unit. 1526 */ 1527 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1528 boolean onlyContiguous) { 1529 StringBuilder sb=buffer.getStringBuilder(); 1530 int p=recomposeStartIndex; 1531 if(p==sb.length()) { 1532 return; 1533 } 1534 1535 int starter, pRemove; 1536 int compositionsList; 1537 int c, compositeAndFwd; 1538 int norm16; 1539 int cc, prevCC; 1540 boolean starterIsSupplementary; 1541 1542 // Some of the following variables are not used until we have a forward-combining starter 1543 // and are only initialized now to avoid compiler warnings. 1544 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1545 starter=-1; 1546 starterIsSupplementary=false; 1547 prevCC=0; 1548 1549 for(;;) { 1550 c=sb.codePointAt(p); 1551 p+=Character.charCount(c); 1552 norm16=getNorm16(c); 1553 cc=getCCFromYesOrMaybe(norm16); 1554 if( // this character combines backward and 1555 isMaybe(norm16) && 1556 // we have seen a starter that combines forward and 1557 compositionsList>=0 && 1558 // the backward-combining character is not blocked 1559 (prevCC<cc || prevCC==0) 1560 ) { 1561 if(isJamoVT(norm16)) { 1562 // c is a Jamo V/T, see if we can compose it with the previous character. 1563 if(c<Hangul.JAMO_T_BASE) { 1564 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1565 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1566 if(prev<Hangul.JAMO_L_COUNT) { 1567 pRemove=p-1; 1568 char syllable=(char) 1569 (Hangul.HANGUL_BASE+ 1570 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1571 Hangul.JAMO_T_COUNT); 1572 char t; 1573 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1574 ++p; 1575 syllable+=t; // The next character was a Jamo T. 1576 } 1577 sb.setCharAt(starter, syllable); 1578 // remove the Jamo V/T 1579 sb.delete(pRemove, p); 1580 p=pRemove; 1581 } 1582 } 1583 /* 1584 * No "else" for Jamo T: 1585 * Since the input is in NFD, there are no Hangul LV syllables that 1586 * a Jamo T could combine with. 1587 * All Jamo Ts are combined above when handling Jamo Vs. 1588 */ 1589 if(p==sb.length()) { 1590 break; 1591 } 1592 compositionsList=-1; 1593 continue; 1594 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1595 // The starter and the combining mark (c) do combine. 1596 int composite=compositeAndFwd>>1; 1597 1598 // Remove the combining mark. 1599 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1600 sb.delete(pRemove, p); 1601 p=pRemove; 1602 // Replace the starter with the composite. 1603 if(starterIsSupplementary) { 1604 if(composite>0xffff) { 1605 // both are supplementary 1606 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1607 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1608 } else { 1609 sb.setCharAt(starter, (char)c); 1610 sb.deleteCharAt(starter+1); 1611 // The composite is shorter than the starter, 1612 // move the intermediate characters forward one. 1613 starterIsSupplementary=false; 1614 --p; 1615 } 1616 } else if(composite>0xffff) { 1617 // The composite is longer than the starter, 1618 // move the intermediate characters back one. 1619 starterIsSupplementary=true; 1620 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1621 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1622 ++p; 1623 } else { 1624 // both are on the BMP 1625 sb.setCharAt(starter, (char)composite); 1626 } 1627 1628 // Keep prevCC because we removed the combining mark. 1629 1630 if(p==sb.length()) { 1631 break; 1632 } 1633 // Is the composite a starter that combines forward? 1634 if((compositeAndFwd&1)!=0) { 1635 compositionsList= 1636 getCompositionsListForComposite(getRawNorm16(composite)); 1637 } else { 1638 compositionsList=-1; 1639 } 1640 1641 // We combined; continue with looking for compositions. 1642 continue; 1643 } 1644 } 1645 1646 // no combination this time 1647 prevCC=cc; 1648 if(p==sb.length()) { 1649 break; 1650 } 1651 1652 // If c did not combine, then check if it is a starter. 1653 if(cc==0) { 1654 // Found a new starter. 1655 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 1656 // It may combine with something, prepare for it. 1657 if(c<=0xffff) { 1658 starterIsSupplementary=false; 1659 starter=p-1; 1660 } else { 1661 starterIsSupplementary=true; 1662 starter=p-2; 1663 } 1664 } 1665 } else if(onlyContiguous) { 1666 // FCC: no discontiguous compositions; any intervening character blocks. 1667 compositionsList=-1; 1668 } 1669 } 1670 buffer.flush(); 1671 } 1672 1673 /** 1674 * Does c have a composition boundary before it? 1675 * True if its decomposition begins with a character that has 1676 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1677 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1678 * (isCompYesAndZeroCC()) so we need not decompose. 1679 */ 1680 private boolean hasCompBoundaryBefore(int c, int norm16) { 1681 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 1682 } 1683 private boolean norm16HasCompBoundaryBefore(int norm16) { 1684 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 1685 } 1686 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 1687 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 1688 } 1689 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 1690 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1691 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 1692 } 1693 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 1694 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 1695 } 1696 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 1697 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 1698 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 1699 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 1700 } 1701 1702 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 1703 while(p>0) { 1704 int c=Character.codePointBefore(s, p); 1705 int norm16 = getNorm16(c); 1706 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1707 break; 1708 } 1709 p-=Character.charCount(c); 1710 if(hasCompBoundaryBefore(c, norm16)) { 1711 break; 1712 } 1713 } 1714 return p; 1715 } 1716 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 1717 while(p<limit) { 1718 int c=Character.codePointAt(s, p); 1719 int norm16=normTrie.get(c); 1720 if(hasCompBoundaryBefore(c, norm16)) { 1721 break; 1722 } 1723 p+=Character.charCount(c); 1724 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1725 break; 1726 } 1727 } 1728 return p; 1729 } 1730 1731 1732 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 1733 while(p<limit) { 1734 int c=Character.codePointAt(s, p); 1735 int norm16; 1736 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 1737 break; 1738 } 1739 p+=Character.charCount(c); 1740 if (norm16HasDecompBoundaryAfter(norm16)) { 1741 break; 1742 } 1743 } 1744 return p; 1745 } 1746 1747 /** 1748 * Get the canonical decomposition 1749 * sherman for ComposedCharIter 1750 */ 1751 public static int getDecompose(int chars[], String decomps[]) { 1752 Normalizer2 impl = Normalizer2.getNFDInstance(); 1753 1754 int length=0; 1755 int norm16 = 0; 1756 int ch = -1; 1757 int i = 0; 1758 1759 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 1760 //TBD !!!! the hack code heres save us about 50ms for startup 1761 //need a better solution/lookup 1762 if (ch == 0x30ff) 1763 ch = 0xf900; 1764 else if (ch == 0x115bc) 1765 ch = 0x1d15e; 1766 else if (ch == 0x1d1c1) 1767 ch = 0x2f800; 1768 1769 String s = impl.getDecomposition(ch); 1770 1771 if(s != null && i < chars.length) { 1772 chars[i] = ch; 1773 decomps[i++] = s; 1774 } 1775 } 1776 return i; 1777 } 1778 1779 //------------------------------------------------------ 1780 // special method for Collation (RBTableBuilder.build()) 1781 //------------------------------------------------------ 1782 private static boolean needSingleQuotation(char c) { 1783 return (c >= 0x0009 && c <= 0x000D) || 1784 (c >= 0x0020 && c <= 0x002F) || 1785 (c >= 0x003A && c <= 0x0040) || 1786 (c >= 0x005B && c <= 0x0060) || 1787 (c >= 0x007B && c <= 0x007E); 1788 } 1789 1790 public static String canonicalDecomposeWithSingleQuotation(String string) { 1791 Normalizer2 impl = Normalizer2.getNFDInstance(); 1792 char[] src = string.toCharArray(); 1793 int srcIndex = 0; 1794 int srcLimit = src.length; 1795 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 1796 int destIndex = 0; 1797 int destLimit = dest.length; 1798 1799 int prevSrc; 1800 String norm; 1801 int reorderStartIndex, length; 1802 char c1, c2; 1803 int cp; 1804 int minNoMaybe = 0x00c0; 1805 int cc, prevCC, trailCC; 1806 char[] p; 1807 int pStart; 1808 1809 // initialize 1810 reorderStartIndex = 0; 1811 prevCC = 0; 1812 norm = null; 1813 cp = 0; 1814 pStart = 0; 1815 1816 cc = trailCC = -1; // initialize to bogus value 1817 c1 = 0; 1818 for (;;) { 1819 prevSrc=srcIndex; 1820 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 1821 while (srcIndex != srcLimit && 1822 ((c1 = src[srcIndex]) < minNoMaybe || 1823 (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || 1824 (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables 1825 prevCC = 0; 1826 srcIndex += (cp < 0x10000) ? 1 : 2; 1827 } 1828 1829 // copy these code units all at once 1830 if (srcIndex != prevSrc) { 1831 length = srcIndex - prevSrc; 1832 if ((destIndex + length) <= destLimit) { 1833 System.arraycopy(src,prevSrc,dest,destIndex,length); 1834 } 1835 1836 destIndex += length; 1837 reorderStartIndex = destIndex; 1838 } 1839 1840 // end of source reached? 1841 if (srcIndex == srcLimit) { 1842 break; 1843 } 1844 1845 // cp already contains *src and norm32 is set for it, increment src 1846 srcIndex += (cp < 0x10000) ? 1 : 2; 1847 1848 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1849 c2 = 0; 1850 length = 1; 1851 1852 if (Character.isHighSurrogate(c1) 1853 || Character.isLowSurrogate(c1)) { 1854 norm = null; 1855 } 1856 } else { 1857 length = 2; 1858 c2 = src[srcIndex-1]; 1859 } 1860 1861 // get the decomposition and the lead and trail cc's 1862 if (norm == null) { 1863 // cp does not decompose 1864 cc = trailCC = UCharacter.getCombiningClass(cp); 1865 p = null; 1866 pStart = -1; 1867 } else { 1868 1869 pStart = 0; 1870 p = norm.toCharArray(); 1871 length = p.length; 1872 int cpNum = norm.codePointCount(0, length); 1873 cc= UCharacter.getCombiningClass(norm.codePointAt(0)); 1874 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); 1875 if (length == 1) { 1876 // fastpath a single code unit from decomposition 1877 c1 = p[pStart]; 1878 c2 = 0; 1879 p = null; 1880 pStart = -1; 1881 } 1882 } 1883 1884 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 1885 // buffer overflow 1886 char[] tmpBuf = new char[destLimit * 2]; 1887 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 1888 dest = tmpBuf; 1889 destLimit = dest.length; 1890 } 1891 1892 // append the decomposition to the destination buffer, assume length>0 1893 { 1894 int reorderSplit = destIndex; 1895 if (p == null) { 1896 // fastpath: single code point 1897 if (needSingleQuotation(c1)) { 1898 //if we need single quotation, no need to consider "prevCC" 1899 //and it must NOT be a supplementary pair 1900 dest[destIndex++] = '\''; 1901 dest[destIndex++] = c1; 1902 dest[destIndex++] = '\''; 1903 trailCC = 0; 1904 } else if(cc != 0 && cc < prevCC) { 1905 // (c1, c2) is out of order with respect to the preceding 1906 // text 1907 destIndex += length; 1908 trailCC = insertOrdered(dest, reorderStartIndex, 1909 reorderSplit, destIndex, c1, c2, cc); 1910 } else { 1911 // just append (c1, c2) 1912 dest[destIndex++] = c1; 1913 if(c2 != 0) { 1914 dest[destIndex++] = c2; 1915 } 1916 } 1917 } else { 1918 // general: multiple code points (ordered by themselves) 1919 // from decomposition 1920 if (needSingleQuotation(p[pStart])) { 1921 dest[destIndex++] = '\''; 1922 dest[destIndex++] = p[pStart++]; 1923 dest[destIndex++] = '\''; 1924 length--; 1925 do { 1926 dest[destIndex++] = p[pStart++]; 1927 } while(--length > 0); 1928 } else if (cc != 0 && cc < prevCC) { 1929 destIndex += length; 1930 trailCC = mergeOrdered(dest, reorderStartIndex, 1931 reorderSplit, p, pStart, 1932 pStart+length); 1933 } else { 1934 // just append the decomposition 1935 do { 1936 dest[destIndex++] = p[pStart++]; 1937 } while (--length > 0); 1938 } 1939 } 1940 } 1941 prevCC = trailCC; 1942 if(prevCC == 0) { 1943 reorderStartIndex = destIndex; 1944 } 1945 } 1946 1947 return new String(dest, 0, destIndex); 1948 } 1949 1950 /** 1951 * simpler, single-character version of mergeOrdered() - 1952 * bubble-insert one single code point into the preceding string 1953 * which is already canonically ordered 1954 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 1955 * 1956 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 1957 * 1958 * before: src[start]..src[current] is already ordered, and 1959 * src[current]..src[p] may or may not hold (c, c2) but 1960 * must be exactly the same length as (c, c2) 1961 * after: src[start]..src[p] is ordered 1962 * 1963 * @return the trailing combining class 1964 */ 1965 private static int/*unsigned byte*/ insertOrdered(char[] source, 1966 int start, 1967 int current, int p, 1968 char c1, char c2, 1969 int/*unsigned byte*/ cc) { 1970 int back, preBack; 1971 int r; 1972 int prevCC, trailCC=cc; 1973 1974 if (start<current && cc!=0) { 1975 // search for the insertion point where cc>=prevCC 1976 preBack=back=current; 1977 1978 PrevArgs prevArgs = new PrevArgs(); 1979 prevArgs.current = current; 1980 prevArgs.start = start; 1981 prevArgs.src = source; 1982 prevArgs.c1 = c1; 1983 prevArgs.c2 = c2; 1984 1985 // get the prevCC 1986 prevCC=getPrevCC(prevArgs); 1987 preBack = prevArgs.current; 1988 1989 if(cc<prevCC) { 1990 // this will be the last code point, so keep its cc 1991 trailCC=prevCC; 1992 back=preBack; 1993 while(start<preBack) { 1994 prevCC=getPrevCC(prevArgs); 1995 preBack=prevArgs.current; 1996 if(cc>=prevCC) { 1997 break; 1998 } 1999 back=preBack; 2000 } 2001 2002 // this is where we are right now with all these indicies: 2003 // [start]..[pPreBack] 0..? code points that we can ignore 2004 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 2005 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 2006 // [current]..[p] 1 code point (c, c2) with cc 2007 2008 // move the code units in between up 2009 r=p; 2010 do { 2011 source[--r]=source[--current]; 2012 } while (back!=current); 2013 } 2014 } 2015 2016 // insert (c1, c2) 2017 source[current] = c1; 2018 if (c2!=0) { 2019 source[(current+1)] = c2; 2020 } 2021 2022 // we know the cc of the last code point 2023 return trailCC; 2024 } 2025 /** 2026 * merge two UTF-16 string parts together 2027 * to canonically order (order by combining classes) their concatenation 2028 * 2029 * the two strings may already be adjacent, so that the merging is done 2030 * in-place if the two strings are not adjacent, then the buffer holding the 2031 * first one must be large enough 2032 * the second string may or may not be ordered in itself 2033 * 2034 * before: [start]..[current] is already ordered, and 2035 * [next]..[limit] may be ordered in itself, but 2036 * is not in relation to [start..current[ 2037 * after: [start..current+(limit-next)[ is ordered 2038 * 2039 * the algorithm is a simple bubble-sort that takes the characters from 2040 * src[next++] and inserts them in correct combining class order into the 2041 * preceding part of the string 2042 * 2043 * since this function is called much less often than the single-code point 2044 * insertOrdered(), it just uses that for easier maintenance 2045 * 2046 * @return the trailing combining class 2047 */ 2048 private static int /*unsigned byte*/ mergeOrdered(char[] source, 2049 int start, 2050 int current, 2051 char[] data, 2052 int next, 2053 int limit) { 2054 int r; 2055 int /*unsigned byte*/ cc, trailCC=0; 2056 boolean adjacent; 2057 2058 adjacent= current==next; 2059 NextCCArgs ncArgs = new NextCCArgs(); 2060 ncArgs.source = data; 2061 ncArgs.next = next; 2062 ncArgs.limit = limit; 2063 2064 if(start!=current) { 2065 2066 while(ncArgs.next<ncArgs.limit) { 2067 cc=getNextCC(ncArgs); 2068 if(cc==0) { 2069 // does not bubble back 2070 trailCC=0; 2071 if(adjacent) { 2072 current=ncArgs.next; 2073 } else { 2074 data[current++]=ncArgs.c1; 2075 if(ncArgs.c2!=0) { 2076 data[current++]=ncArgs.c2; 2077 } 2078 } 2079 break; 2080 } else { 2081 r=current+(ncArgs.c2==0 ? 1 : 2); 2082 trailCC=insertOrdered(source,start, current, r, 2083 ncArgs.c1, ncArgs.c2, cc); 2084 current=r; 2085 } 2086 } 2087 } 2088 2089 if(ncArgs.next==ncArgs.limit) { 2090 // we know the cc of the last code point 2091 return trailCC; 2092 } else { 2093 if(!adjacent) { 2094 // copy the second string part 2095 do { 2096 source[current++]=data[ncArgs.next++]; 2097 } while(ncArgs.next!=ncArgs.limit); 2098 ncArgs.limit=current; 2099 } 2100 PrevArgs prevArgs = new PrevArgs(); 2101 prevArgs.src = data; 2102 prevArgs.start = start; 2103 prevArgs.current = ncArgs.limit; 2104 return getPrevCC(prevArgs); 2105 } 2106 2107 } 2108 private static final class PrevArgs{ 2109 char[] src; 2110 int start; 2111 int current; 2112 char c1; 2113 char c2; 2114 } 2115 2116 private static final class NextCCArgs{ 2117 char[] source; 2118 int next; 2119 int limit; 2120 char c1; 2121 char c2; 2122 } 2123 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 2124 args.c1=args.source[args.next++]; 2125 args.c2=0; 2126 2127 if (UTF16.isTrailSurrogate(args.c1)) { 2128 /* unpaired second surrogate */ 2129 return 0; 2130 } else if (!UTF16.isLeadSurrogate(args.c1)) { 2131 return UCharacter.getCombiningClass(args.c1); 2132 } else if (args.next!=args.limit && 2133 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 2134 ++args.next; 2135 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); 2136 } else { 2137 /* unpaired first surrogate */ 2138 args.c2=0; 2139 return 0; 2140 } 2141 } 2142 private static int /*unsigned*/ getPrevCC(PrevArgs args) { 2143 args.c1=args.src[--args.current]; 2144 args.c2=0; 2145 2146 if (args.c1 < MIN_CCC_LCCC_CP) { 2147 return 0; 2148 } else if (UTF16.isLeadSurrogate(args.c1)) { 2149 /* unpaired first surrogate */ 2150 return 0; 2151 } else if (!UTF16.isTrailSurrogate(args.c1)) { 2152 return UCharacter.getCombiningClass(args.c1); 2153 } else if (args.current!=args.start && 2154 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 2155 --args.current; 2156 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); 2157 } else { 2158 /* unpaired second surrogate */ 2159 args.c2=0; 2160 return 0; 2161 } 2162 } 2163 2164 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2165 if (start == p) { 2166 return 0; 2167 } 2168 return getFCD16(Character.codePointBefore(s, p)); 2169 } 2170 2171 private VersionInfo dataVersion; 2172 2173 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2174 private int minDecompNoCP; 2175 private int minCompNoMaybeCP; 2176 private int minLcccCP; 2177 2178 // Norm16 value thresholds for quick check combinations and types of extra data. 2179 private int minYesNo; 2180 private int minYesNoMappingsOnly; 2181 private int minNoNo; 2182 private int minNoNoCompBoundaryBefore; 2183 private int minNoNoCompNoMaybeCC; 2184 private int minNoNoEmpty; 2185 private int limitNoNo; 2186 private int centerNoNoDelta; 2187 private int minMaybeYes; 2188 2189 private CodePointTrie.Fast16 normTrie; 2190 private String maybeYesCompositions; 2191 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2192 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2193 }