1 /* 2 * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 import java.io.BufferedInputStream; 40 import java.io.ByteArrayInputStream; 41 import java.io.IOException; 42 import java.io.BufferedInputStream; 43 import java.io.InputStream; 44 45 /** 46 * @author Ram Viswanadha 47 */ 48 public final class NormalizerImpl { 49 // Static block for the class to initialize its own self 50 static final NormalizerImpl IMPL; 51 52 static 53 { 54 try 55 { 56 IMPL = new NormalizerImpl(); 57 } 58 catch (Exception e) 59 { 60 throw new RuntimeException(e.getMessage()); 61 } 62 } 63 64 static final int UNSIGNED_BYTE_MASK =0xFF; 65 static final long UNSIGNED_INT_MASK = 0xffffffffL; 66 /* 67 * This new implementation of the normalization code loads its data from 68 * unorm.icu, which is generated with the gennorm tool. 69 * The format of that file is described at the end of this file. 70 */ 71 private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu"; 72 73 // norm32 value constants 74 75 // quick check flags 0..3 set mean "no" for their forms 76 public static final int QC_NFC=0x11; /* no|maybe */ 77 public static final int QC_NFKC=0x22; /* no|maybe */ 78 public static final int QC_NFD=4; /* no */ 79 public static final int QC_NFKD=8; /* no */ 80 81 public static final int QC_ANY_NO=0xf; 82 83 /* quick check flags 4..5 mean "maybe" for their forms; 84 * test flags>=QC_MAYBE 85 */ 86 public static final int QC_MAYBE=0x10; 87 public static final int QC_ANY_MAYBE=0x30; 88 89 public static final int QC_MASK=0x3f; 90 91 private static final int COMBINES_FWD=0x40; 92 private static final int COMBINES_BACK=0x80; 93 public static final int COMBINES_ANY=0xc0; 94 // UnicodeData.txt combining class in bits 15. 95 private static final int CC_SHIFT=8; 96 public static final int CC_MASK=0xff00; 97 // 16 bits for the index to UChars and other extra data 98 private static final int EXTRA_SHIFT=16; 99 100 /* norm32 value constants using >16 bits */ 101 private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; 102 private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; 103 private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; 104 // private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK; 105 private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; 106 107 108 /* indexes[] value names */ 109 /* number of bytes in normalization trie */ 110 static final int INDEX_TRIE_SIZE = 0; 111 /* number of chars in extra data */ 112 static final int INDEX_CHAR_COUNT = 1; 113 /* number of uint16_t words for combining data */ 114 static final int INDEX_COMBINE_DATA_COUNT = 2; 115 /* first code point with quick check NFC NO/MAYBE */ 116 public static final int INDEX_MIN_NFC_NO_MAYBE = 6; 117 /* first code point with quick check NFKC NO/MAYBE */ 118 public static final int INDEX_MIN_NFKC_NO_MAYBE = 7; 119 /* first code point with quick check NFD NO/MAYBE */ 120 public static final int INDEX_MIN_NFD_NO_MAYBE = 8; 121 /* first code point with quick check NFKD NO/MAYBE */ 122 public static final int INDEX_MIN_NFKD_NO_MAYBE = 9; 123 /* number of bytes in FCD trie */ 124 static final int INDEX_FCD_TRIE_SIZE = 10; 125 /* number of bytes in the auxiliary trie */ 126 static final int INDEX_AUX_TRIE_SIZE = 11; 127 /* changing this requires a new formatVersion */ 128 static final int INDEX_TOP = 32; 129 130 131 /* AUX constants */ 132 /* value constants for auxTrie */ 133 private static final int AUX_UNSAFE_SHIFT = 11; 134 private static final int AUX_COMP_EX_SHIFT = 10; 135 private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12; 136 137 private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT; 138 private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK); 139 private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK); 140 private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK); 141 private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT); 142 143 private static final int MAX_BUFFER_SIZE = 20; 144 145 /*******************************/ 146 147 /* Wrappers for Trie implementations */ 148 static final class NormTrieImpl implements Trie.DataManipulate{ 149 static IntTrie normTrie= null; 150 /** 151 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 152 * data the index array offset of the indexes for that lead surrogate. 153 * @param property data value for a surrogate from the trie, including 154 * the folding offset 155 * @return data offset or 0 if there is no data for the lead surrogate 156 */ 157 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ 158 public int getFoldingOffset(int value){ 159 return BMP_INDEX_LENGTH+ 160 ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))& 161 (0x3ff<<SURROGATE_BLOCK_BITS)); 162 } 163 164 } 165 static final class FCDTrieImpl implements Trie.DataManipulate{ 166 static CharTrie fcdTrie=null; 167 /** 168 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 169 * data the index array offset of the indexes for that lead surrogate. 170 * @param property data value for a surrogate from the trie, including 171 * the folding offset 172 * @return data offset or 0 if there is no data for the lead surrogate 173 */ 174 /* fcdTrie: the folding offset is the lead FCD value itself */ 175 public int getFoldingOffset(int value){ 176 return value; 177 } 178 } 179 180 static final class AuxTrieImpl implements Trie.DataManipulate{ 181 static CharTrie auxTrie = null; 182 /** 183 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 184 * data the index array offset of the indexes for that lead surrogate. 185 * @param property data value for a surrogate from the trie, including 186 * the folding offset 187 * @return data offset or 0 if there is no data for the lead surrogate 188 */ 189 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */ 190 public int getFoldingOffset(int value){ 191 return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS; 192 } 193 } 194 195 /****************************************************/ 196 197 198 private static FCDTrieImpl fcdTrieImpl; 199 private static NormTrieImpl normTrieImpl; 200 private static AuxTrieImpl auxTrieImpl; 201 private static int[] indexes; 202 private static char[] combiningTable; 203 private static char[] extraData; 204 205 private static boolean isDataLoaded; 206 private static boolean isFormatVersion_2_1; 207 private static boolean isFormatVersion_2_2; 208 private static byte[] unicodeVersion; 209 210 /** 211 * Default buffer size of datafile 212 */ 213 private static final int DATA_BUFFER_SIZE = 25000; 214 215 /** 216 * FCD check: everything below this code point is known to have a 0 217 * lead combining class 218 */ 219 public static final int MIN_WITH_LEAD_CC=0x300; 220 221 222 /** 223 * Bit 7 of the length byte for a decomposition string in extra data is 224 * a flag indicating whether the decomposition string is 225 * preceded by a 16-bit word with the leading and trailing cc 226 * of the decomposition (like for A-umlaut); 227 * if not, then both cc's are zero (like for compatibility ideographs). 228 */ 229 private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80; 230 /** 231 * Bits 6..0 of the length byte contain the actual length. 232 */ 233 private static final int DECOMP_LENGTH_MASK=0x7f; 234 235 /** Length of the BMP portion of the index (stage 1) array. */ 236 private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_; 237 /** Number of bits of a trail surrogate that are used in index table 238 * lookups. 239 */ 240 private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_; 241 242 243 // public utility 244 public static int getFromIndexesArr(int index){ 245 return indexes[index]; 246 } 247 248 // protected constructor --------------------------------------------- 249 250 /** 251 * Constructor 252 * @exception thrown when data reading fails or data corrupted 253 */ 254 private NormalizerImpl() throws IOException { 255 //data should be loaded only once 256 if(!isDataLoaded){ 257 258 // jar access 259 InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME); 260 BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE); 261 NormalizerDataReader reader = new NormalizerDataReader(b); 262 263 // read the indexes 264 indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP); 265 266 byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]]; 267 268 int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT]; 269 combiningTable = new char[combiningTableTop]; 270 271 int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT]; 272 extraData = new char[extraDataTop]; 273 274 byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]]; 275 byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]]; 276 277 fcdTrieImpl = new FCDTrieImpl(); 278 normTrieImpl = new NormTrieImpl(); 279 auxTrieImpl = new AuxTrieImpl(); 280 281 // load the rest of the data data and initialize the data members 282 reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable); 283 284 NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl ); 285 FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl ); 286 AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl ); 287 288 // we reached here without any exceptions so the data is fully 289 // loaded set the variable to true 290 isDataLoaded = true; 291 292 // get the data format version 293 byte[] formatVersion = reader.getDataFormatVersion(); 294 295 isFormatVersion_2_1 =( formatVersion[0]>2 296 || 297 (formatVersion[0]==2 && formatVersion[1]>=1) 298 ); 299 isFormatVersion_2_2 =( formatVersion[0]>2 300 || 301 (formatVersion[0]==2 && formatVersion[1]>=2) 302 ); 303 unicodeVersion = reader.getUnicodeVersion(); 304 b.close(); 305 } 306 } 307 308 /* ---------------------------------------------------------------------- */ 309 310 /* Korean Hangul and Jamo constants */ 311 312 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 313 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 314 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 315 316 public static final int HANGUL_BASE=0xac00; 317 318 public static final int JAMO_L_COUNT=19; 319 public static final int JAMO_V_COUNT=21; 320 public static final int JAMO_T_COUNT=28; 321 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 322 323 private static boolean isHangulWithoutJamoT(char c) { 324 c-=HANGUL_BASE; 325 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 326 } 327 328 /* norm32 helpers */ 329 330 /* is this a norm32 with a regular index? */ 331 private static boolean isNorm32Regular(long norm32) { 332 return norm32<MIN_SPECIAL; 333 } 334 335 /* is this a norm32 with a special index for a lead surrogate? */ 336 private static boolean isNorm32LeadSurrogate(long norm32) { 337 return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP; 338 } 339 340 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */ 341 private static boolean isNorm32HangulOrJamo(long norm32) { 342 return norm32>=MIN_HANGUL; 343 } 344 345 /* 346 * Given norm32 for Jamo V or T, 347 * is this a Jamo V? 348 */ 349 private static boolean isJamoVTNorm32JamoV(long norm32) { 350 return norm32<JAMO_V_TOP; 351 } 352 353 /* data access primitives ----------------------------------------------- */ 354 355 public static long/*unsigned*/ getNorm32(char c) { 356 return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c))); 357 } 358 359 public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32, 360 char c2) { 361 /* 362 * the surrogate index in norm32 stores only the number of the surrogate 363 * index block see gennorm/store.c/getFoldedNormValue() 364 */ 365 return ((UNSIGNED_INT_MASK) & 366 NormTrieImpl.normTrie.getTrailValue((int)norm32, c2)); 367 } 368 ///CLOVER:OFF 369 private static long getNorm32(int c){ 370 return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c))); 371 } 372 373 /* 374 * get a norm32 from text with complete code points 375 * (like from decompositions) 376 */ 377 private static long/*unsigned*/ getNorm32(char[] p,int start, 378 int/*unsigned*/ mask) { 379 long/*unsigned*/ norm32= getNorm32(p[start]); 380 if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) { 381 /* *p is a lead surrogate, get the real norm32 */ 382 norm32=getNorm32FromSurrogatePair(norm32, p[start+1]); 383 } 384 return norm32; 385 } 386 387 //// for StringPrep 388 public static VersionInfo getUnicodeVersion(){ 389 return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1], 390 unicodeVersion[2], unicodeVersion[3]); 391 } 392 393 public static char getFCD16(char c) { 394 return FCDTrieImpl.fcdTrie.getLeadValue(c); 395 } 396 397 public static char getFCD16FromSurrogatePair(char fcd16, char c2) { 398 /* the surrogate index in fcd16 is an absolute offset over the 399 * start of stage 1 400 * */ 401 return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2); 402 } 403 public static int getFCD16(int c) { 404 return FCDTrieImpl.fcdTrie.getCodePointValue(c); 405 } 406 407 private static int getExtraDataIndex(long norm32) { 408 return (int)(norm32>>EXTRA_SHIFT); 409 } 410 411 private static final class DecomposeArgs{ 412 int /*unsigned byte*/ cc; 413 int /*unsigned byte*/ trailCC; 414 int length; 415 } 416 /** 417 * 418 * get the canonical or compatibility decomposition for one character 419 * 420 * @return index into the extraData array 421 */ 422 private static int/*index*/ decompose(long/*unsigned*/ norm32, 423 int/*unsigned*/ qcMask, 424 DecomposeArgs args) { 425 int p= getExtraDataIndex(norm32); 426 args.length=extraData[p++]; 427 428 if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) { 429 /* use compatibility decomposition, skip canonical data */ 430 p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK); 431 args.length>>=8; 432 } 433 434 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { 435 /* get the lead and trail cc's */ 436 char bothCCs=extraData[p++]; 437 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); 438 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; 439 } else { 440 /* lead and trail cc's are both 0 */ 441 args.cc=args.trailCC=0; 442 } 443 444 args.length&=DECOMP_LENGTH_MASK; 445 return p; 446 } 447 448 449 /** 450 * get the canonical decomposition for one character 451 * @return index into the extraData array 452 */ 453 private static int decompose(long/*unsigned*/ norm32, 454 DecomposeArgs args) { 455 456 int p= getExtraDataIndex(norm32); 457 args.length=extraData[p++]; 458 459 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { 460 /* get the lead and trail cc's */ 461 char bothCCs=extraData[p++]; 462 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); 463 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; 464 } else { 465 /* lead and trail cc's are both 0 */ 466 args.cc=args.trailCC=0; 467 } 468 469 args.length&=DECOMP_LENGTH_MASK; 470 return p; 471 } 472 473 474 private static final class NextCCArgs{ 475 char[] source; 476 int next; 477 int limit; 478 char c; 479 char c2; 480 } 481 482 /* 483 * get the combining class of (c, c2)= args.source[args.next++] 484 * before: args.next<args.limit after: args.next<=args.limit 485 * if only one code unit is used, then c2==0 486 */ 487 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 488 long /*unsigned*/ norm32; 489 490 args.c=args.source[args.next++]; 491 492 norm32= getNorm32(args.c); 493 if((norm32 & CC_MASK)==0) { 494 args.c2=0; 495 return 0; 496 } else { 497 if(!isNorm32LeadSurrogate(norm32)) { 498 args.c2=0; 499 } else { 500 /* c is a lead surrogate, get the real norm32 */ 501 if(args.next!=args.limit && 502 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 503 ++args.next; 504 norm32=getNorm32FromSurrogatePair(norm32, args.c2); 505 } else { 506 args.c2=0; 507 return 0; 508 } 509 } 510 511 return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); 512 } 513 } 514 515 private static final class PrevArgs{ 516 char[] src; 517 int start; 518 int current; 519 char c; 520 char c2; 521 } 522 523 /* 524 * read backwards and get norm32 525 * return 0 if the character is <minC 526 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first 527 * surrogate but read second!) 528 */ 529 private static long /*unsigned*/ getPrevNorm32(PrevArgs args, 530 int/*unsigned*/ minC, 531 int/*unsigned*/ mask) { 532 long/*unsigned*/ norm32; 533 534 args.c=args.src[--args.current]; 535 args.c2=0; 536 537 /* check for a surrogate before getting norm32 to see if we need to 538 * predecrement further 539 */ 540 if(args.c<minC) { 541 return 0; 542 } else if(!UTF16.isSurrogate(args.c)) { 543 return getNorm32(args.c); 544 } else if(UTF16.isLeadSurrogate(args.c)) { 545 /* unpaired first surrogate */ 546 return 0; 547 } else if(args.current!=args.start && 548 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 549 --args.current; 550 norm32=getNorm32(args.c2); 551 552 if((norm32&mask)==0) { 553 /* all surrogate pairs with this lead surrogate have 554 * only irrelevant data 555 */ 556 return 0; 557 } else { 558 /* norm32 must be a surrogate special */ 559 return getNorm32FromSurrogatePair(norm32, args.c); 560 } 561 } else { 562 /* unpaired second surrogate */ 563 args.c2=0; 564 return 0; 565 } 566 } 567 568 /* 569 * get the combining class of (c, c2)=*--p 570 * before: start<p after: start<=p 571 */ 572 private static int /*unsigned byte*/ getPrevCC(PrevArgs args) { 573 574 return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC, 575 CC_MASK)>>CC_SHIFT)); 576 } 577 578 /* 579 * is this a safe boundary character for NF*D? 580 * (lead cc==0) 581 */ 582 public static boolean isNFDSafe(long/*unsigned*/ norm32, 583 int/*unsigned*/ccOrQCMask, 584 int/*unsigned*/ decompQCMask) { 585 if((norm32&ccOrQCMask)==0) { 586 return true; /* cc==0 and no decomposition: this is NF*D safe */ 587 } 588 589 /* inspect its decomposition - maybe a Hangul but not a surrogate here*/ 590 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { 591 DecomposeArgs args=new DecomposeArgs(); 592 /* decomposes, get everything from the variable-length extra data */ 593 decompose(norm32, decompQCMask, args); 594 return args.cc==0; 595 } else { 596 /* no decomposition (or Hangul), test the cc directly */ 597 return (norm32&CC_MASK)==0; 598 } 599 } 600 601 /* 602 * is this (or does its decomposition begin with) a "true starter"? 603 * (cc==0 and NF*C_YES) 604 */ 605 public static boolean isTrueStarter(long/*unsigned*/ norm32, 606 int/*unsigned*/ ccOrQCMask, 607 int/*unsigned*/ decompQCMask) { 608 if((norm32&ccOrQCMask)==0) { 609 return true; /* this is a true starter (could be Hangul or Jamo L)*/ 610 } 611 612 /* inspect its decomposition - not a Hangul or a surrogate here */ 613 if((norm32&decompQCMask)!=0) { 614 int p; /* index into extra data array */ 615 DecomposeArgs args=new DecomposeArgs(); 616 /* decomposes, get everything from the variable-length extra data */ 617 p=decompose(norm32, decompQCMask, args); 618 619 if(args.cc==0) { 620 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK; 621 622 /* does it begin with NFC_YES? */ 623 if((getNorm32(extraData,p, qcMask)&qcMask)==0) { 624 /* yes, the decomposition begins with a true starter */ 625 return true; 626 } 627 } 628 } 629 return false; 630 } 631 632 /* reorder UTF-16 in-place ---------------------------------------------- */ 633 634 /** 635 * simpler, single-character version of mergeOrdered() - 636 * bubble-insert one single code point into the preceding string 637 * which is already canonically ordered 638 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 639 * 640 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 641 * 642 * before: src[start]..src[current] is already ordered, and 643 * src[current]..src[p] may or may not hold (c, c2) but 644 * must be exactly the same length as (c, c2) 645 * after: src[start]..src[p] is ordered 646 * 647 * @return the trailing combining class 648 */ 649 private static int/*unsigned byte*/ insertOrdered(char[] source, 650 int start, 651 int current, int p, 652 char c, char c2, 653 int/*unsigned byte*/ cc) { 654 int back, preBack; 655 int r; 656 int prevCC, trailCC=cc; 657 658 if(start<current && cc!=0) { 659 // search for the insertion point where cc>=prevCC 660 preBack=back=current; 661 PrevArgs prevArgs = new PrevArgs(); 662 prevArgs.current = current; 663 prevArgs.start = start; 664 prevArgs.src = source; 665 // get the prevCC 666 prevCC=getPrevCC(prevArgs); 667 preBack = prevArgs.current; 668 669 if(cc<prevCC) { 670 // this will be the last code point, so keep its cc 671 trailCC=prevCC; 672 back=preBack; 673 while(start<preBack) { 674 prevCC=getPrevCC(prevArgs); 675 preBack=prevArgs.current; 676 if(cc>=prevCC) { 677 break; 678 } 679 back=preBack; 680 } 681 682 683 // this is where we are right now with all these indicies: 684 // [start]..[pPreBack] 0..? code points that we can ignore 685 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 686 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 687 // [current]..[p] 1 code point (c, c2) with cc 688 689 // move the code units in between up 690 r=p; 691 do { 692 source[--r]=source[--current]; 693 } while(back!=current); 694 } 695 } 696 697 // insert (c, c2) 698 source[current]=c; 699 if(c2!=0) { 700 source[(current+1)]=c2; 701 } 702 703 // we know the cc of the last code point 704 return trailCC; 705 } 706 707 /** 708 * merge two UTF-16 string parts together 709 * to canonically order (order by combining classes) their concatenation 710 * 711 * the two strings may already be adjacent, so that the merging is done 712 * in-place if the two strings are not adjacent, then the buffer holding the 713 * first one must be large enough 714 * the second string may or may not be ordered in itself 715 * 716 * before: [start]..[current] is already ordered, and 717 * [next]..[limit] may be ordered in itself, but 718 * is not in relation to [start..current[ 719 * after: [start..current+(limit-next)[ is ordered 720 * 721 * the algorithm is a simple bubble-sort that takes the characters from 722 * src[next++] and inserts them in correct combining class order into the 723 * preceding part of the string 724 * 725 * since this function is called much less often than the single-code point 726 * insertOrdered(), it just uses that for easier maintenance 727 * 728 * @return the trailing combining class 729 */ 730 private static int /*unsigned byte*/ mergeOrdered(char[] source, 731 int start, 732 int current, 733 char[] data, 734 int next, 735 int limit, 736 boolean isOrdered) { 737 int r; 738 int /*unsigned byte*/ cc, trailCC=0; 739 boolean adjacent; 740 741 adjacent= current==next; 742 NextCCArgs ncArgs = new NextCCArgs(); 743 ncArgs.source = data; 744 ncArgs.next = next; 745 ncArgs.limit = limit; 746 747 if(start!=current || !isOrdered) { 748 749 while(ncArgs.next<ncArgs.limit) { 750 cc=getNextCC(ncArgs); 751 if(cc==0) { 752 // does not bubble back 753 trailCC=0; 754 if(adjacent) { 755 current=ncArgs.next; 756 } else { 757 data[current++]=ncArgs.c; 758 if(ncArgs.c2!=0) { 759 data[current++]=ncArgs.c2; 760 } 761 } 762 if(isOrdered) { 763 break; 764 } else { 765 start=current; 766 } 767 } else { 768 r=current+(ncArgs.c2==0 ? 1 : 2); 769 trailCC=insertOrdered(source,start, current, r, 770 ncArgs.c, ncArgs.c2, cc); 771 current=r; 772 } 773 } 774 } 775 776 if(ncArgs.next==ncArgs.limit) { 777 // we know the cc of the last code point 778 return trailCC; 779 } else { 780 if(!adjacent) { 781 // copy the second string part 782 do { 783 source[current++]=data[ncArgs.next++]; 784 } while(ncArgs.next!=ncArgs.limit); 785 ncArgs.limit=current; 786 } 787 PrevArgs prevArgs = new PrevArgs(); 788 prevArgs.src = data; 789 prevArgs.start = start; 790 prevArgs.current = ncArgs.limit; 791 return getPrevCC(prevArgs); 792 } 793 794 } 795 private static int /*unsigned byte*/ mergeOrdered(char[] source, 796 int start, 797 int current, 798 char[] data, 799 final int next, 800 final int limit) { 801 return mergeOrdered(source,start,current,data,next,limit,true); 802 } 803 804 public static NormalizerBase.QuickCheckResult quickCheck(char[] src, 805 int srcStart, 806 int srcLimit, 807 int minNoMaybe, 808 int qcMask, 809 int options, 810 boolean allowMaybe, 811 UnicodeSet nx){ 812 813 int ccOrQCMask; 814 long norm32; 815 char c, c2; 816 char cc, prevCC; 817 long qcNorm32; 818 NormalizerBase.QuickCheckResult result; 819 ComposePartArgs args = new ComposePartArgs(); 820 char[] buffer ; 821 int start = srcStart; 822 823 if(!isDataLoaded) { 824 return NormalizerBase.MAYBE; 825 } 826 // initialize 827 ccOrQCMask=CC_MASK|qcMask; 828 result=NormalizerBase.YES; 829 prevCC=0; 830 831 for(;;) { 832 for(;;) { 833 if(srcStart==srcLimit) { 834 return result; 835 } else if((c=src[srcStart++])>=minNoMaybe && 836 (( norm32=getNorm32(c)) & ccOrQCMask)!=0) { 837 break; 838 } 839 prevCC=0; 840 } 841 842 843 // check one above-minimum, relevant code unit 844 if(isNorm32LeadSurrogate(norm32)) { 845 // c is a lead surrogate, get the real norm32 846 if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) { 847 ++srcStart; 848 norm32=getNorm32FromSurrogatePair(norm32,c2); 849 } else { 850 norm32=0; 851 c2=0; 852 } 853 }else{ 854 c2=0; 855 } 856 if(nx_contains(nx, c, c2)) { 857 /* excluded: norm32==0 */ 858 norm32=0; 859 } 860 861 // check the combining order 862 cc=(char)((norm32>>CC_SHIFT)&0xFF); 863 if(cc!=0 && cc<prevCC) { 864 return NormalizerBase.NO; 865 } 866 prevCC=cc; 867 868 // check for "no" or "maybe" quick check flags 869 qcNorm32 = norm32 & qcMask; 870 if((qcNorm32& QC_ANY_NO)>=1) { 871 result= NormalizerBase.NO; 872 break; 873 } else if(qcNorm32!=0) { 874 // "maybe" can only occur for NFC and NFKC 875 if(allowMaybe){ 876 result=NormalizerBase.MAYBE; 877 }else{ 878 // normalize a section around here to see if it is really 879 // normalized or not 880 int prevStarter; 881 int/*unsigned*/ decompQCMask; 882 883 decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask 884 885 // find the previous starter 886 887 // set prevStarter to the beginning of the current character 888 prevStarter=srcStart-1; 889 if(UTF16.isTrailSurrogate(src[prevStarter])) { 890 // safe because unpaired surrogates do not result 891 // in "maybe" 892 --prevStarter; 893 } 894 prevStarter=findPreviousStarter(src, start, prevStarter, 895 ccOrQCMask, decompQCMask, 896 (char)minNoMaybe); 897 898 // find the next true starter in [src..limit[ - modifies 899 // src to point to the next starter 900 srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, 901 decompQCMask,(char) minNoMaybe); 902 903 //set the args for compose part 904 args.prevCC = prevCC; 905 906 // decompose and recompose [prevStarter..src[ 907 buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx); 908 909 // compare the normalized version with the original 910 if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) { 911 result=NormalizerBase.NO; // normalization differs 912 break; 913 } 914 915 // continue after the next starter 916 } 917 } 918 } 919 return result; 920 } 921 922 923 //------------------------------------------------------ 924 // make NFD & NFKD 925 //------------------------------------------------------ 926 927 public static int decompose(char[] src,int srcStart,int srcLimit, 928 char[] dest,int destStart,int destLimit, 929 boolean compat,int[] outTrailCC, 930 UnicodeSet nx) { 931 932 char[] buffer = new char[3]; 933 int prevSrc; 934 long norm32; 935 int ccOrQCMask, qcMask; 936 int reorderStartIndex, length; 937 char c, c2, minNoMaybe; 938 int/*unsigned byte*/ cc, prevCC, trailCC; 939 char[] p; 940 int pStart; 941 int destIndex = destStart; 942 int srcIndex = srcStart; 943 if(!compat) { 944 minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; 945 qcMask=QC_NFD; 946 } else { 947 minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; 948 qcMask=QC_NFKD; 949 } 950 951 /* initialize */ 952 ccOrQCMask=CC_MASK|qcMask; 953 reorderStartIndex=0; 954 prevCC=0; 955 norm32=0; 956 c=0; 957 pStart=0; 958 959 cc=trailCC=-1;//initialize to bogus value 960 961 for(;;) { 962 /* count code units below the minimum or with irrelevant data for 963 * the quick check 964 */ 965 prevSrc=srcIndex; 966 967 while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe || 968 ((norm32=getNorm32(c))&ccOrQCMask)==0)){ 969 prevCC=0; 970 ++srcIndex; 971 } 972 973 /* copy these code units all at once */ 974 if(srcIndex!=prevSrc) { 975 length=srcIndex-prevSrc; 976 if((destIndex+length)<=destLimit) { 977 System.arraycopy(src,prevSrc,dest,destIndex,length); 978 } 979 980 destIndex+=length; 981 reorderStartIndex=destIndex; 982 } 983 984 /* end of source reached? */ 985 if(srcIndex==srcLimit) { 986 break; 987 } 988 989 /* c already contains *src and norm32 is set for it, increment src*/ 990 ++srcIndex; 991 992 /* check one above-minimum, relevant code unit */ 993 /* 994 * generally, set p and length to the decomposition string 995 * in simple cases, p==NULL and (c, c2) will hold the length code 996 * units to append in all cases, set cc to the lead and trailCC to 997 * the trail combining class 998 * 999 * the following merge-sort of the current character into the 1000 * preceding, canonically ordered result text will use the 1001 * optimized insertOrdered() 1002 * if there is only one single code point to process; 1003 * this is indicated with p==NULL, and (c, c2) is the character to 1004 * insert 1005 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate) 1006 * for a supplementary character) 1007 * otherwise, p[length] is merged in with _mergeOrdered() 1008 */ 1009 if(isNorm32HangulOrJamo(norm32)) { 1010 if(nx_contains(nx, c)) { 1011 c2=0; 1012 p=null; 1013 length=1; 1014 } else { 1015 // Hangul syllable: decompose algorithmically 1016 p=buffer; 1017 pStart=0; 1018 cc=trailCC=0; 1019 1020 c-=HANGUL_BASE; 1021 1022 c2=(char)(c%JAMO_T_COUNT); 1023 c/=JAMO_T_COUNT; 1024 if(c2>0) { 1025 buffer[2]=(char)(JAMO_T_BASE+c2); 1026 length=3; 1027 } else { 1028 length=2; 1029 } 1030 1031 buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT); 1032 buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT); 1033 } 1034 } else { 1035 if(isNorm32Regular(norm32)) { 1036 c2=0; 1037 length=1; 1038 } else { 1039 // c is a lead surrogate, get the real norm32 1040 if(srcIndex!=srcLimit && 1041 UTF16.isTrailSurrogate(c2=src[srcIndex])) { 1042 ++srcIndex; 1043 length=2; 1044 norm32=getNorm32FromSurrogatePair(norm32, c2); 1045 } else { 1046 c2=0; 1047 length=1; 1048 norm32=0; 1049 } 1050 } 1051 1052 /* get the decomposition and the lead and trail cc's */ 1053 if(nx_contains(nx, c, c2)) { 1054 /* excluded: norm32==0 */ 1055 cc=trailCC=0; 1056 p=null; 1057 } else if((norm32&qcMask)==0) { 1058 /* c does not decompose */ 1059 cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); 1060 p=null; 1061 pStart=-1; 1062 } else { 1063 DecomposeArgs arg = new DecomposeArgs(); 1064 /* c decomposes, get everything from the variable-length 1065 * extra data 1066 */ 1067 pStart=decompose(norm32, qcMask, arg); 1068 p=extraData; 1069 length=arg.length; 1070 cc=arg.cc; 1071 trailCC=arg.trailCC; 1072 if(length==1) { 1073 /* fastpath a single code unit from decomposition */ 1074 c=p[pStart]; 1075 c2=0; 1076 p=null; 1077 pStart=-1; 1078 } 1079 } 1080 } 1081 1082 /* append the decomposition to the destination buffer, assume 1083 * length>0 1084 */ 1085 if((destIndex+length)<=destLimit) { 1086 int reorderSplit=destIndex; 1087 if(p==null) { 1088 /* fastpath: single code point */ 1089 if(cc!=0 && cc<prevCC) { 1090 /* (c, c2) is out of order with respect to the preceding 1091 * text 1092 */ 1093 destIndex+=length; 1094 trailCC=insertOrdered(dest,reorderStartIndex, 1095 reorderSplit, destIndex, c, c2, cc); 1096 } else { 1097 /* just append (c, c2) */ 1098 dest[destIndex++]=c; 1099 if(c2!=0) { 1100 dest[destIndex++]=c2; 1101 } 1102 } 1103 } else { 1104 /* general: multiple code points (ordered by themselves) 1105 * from decomposition 1106 */ 1107 if(cc!=0 && cc<prevCC) { 1108 /* the decomposition is out of order with respect to the 1109 * preceding text 1110 */ 1111 destIndex+=length; 1112 trailCC=mergeOrdered(dest,reorderStartIndex, 1113 reorderSplit,p, pStart,pStart+length); 1114 } else { 1115 /* just append the decomposition */ 1116 do { 1117 dest[destIndex++]=p[pStart++]; 1118 } while(--length>0); 1119 } 1120 } 1121 } else { 1122 /* buffer overflow */ 1123 /* keep incrementing the destIndex for preflighting */ 1124 destIndex+=length; 1125 } 1126 1127 prevCC=trailCC; 1128 if(prevCC==0) { 1129 reorderStartIndex=destIndex; 1130 } 1131 } 1132 1133 outTrailCC[0]=prevCC; 1134 1135 return destIndex - destStart; 1136 } 1137 1138 /* make NFC & NFKC ------------------------------------------------------ */ 1139 private static final class NextCombiningArgs{ 1140 char[] source; 1141 int start; 1142 //int limit; 1143 char c; 1144 char c2; 1145 int/*unsigned*/ combiningIndex; 1146 char /*unsigned byte*/ cc; 1147 } 1148 1149 /* get the composition properties of the next character */ 1150 private static int /*unsigned*/ getNextCombining(NextCombiningArgs args, 1151 int limit, 1152 UnicodeSet nx) { 1153 long/*unsigned*/ norm32; 1154 int combineFlags; 1155 /* get properties */ 1156 args.c=args.source[args.start++]; 1157 norm32=getNorm32(args.c); 1158 1159 /* preset output values for most characters */ 1160 args.c2=0; 1161 args.combiningIndex=0; 1162 args.cc=0; 1163 1164 if((norm32&(CC_MASK|COMBINES_ANY))==0) { 1165 return 0; 1166 } else { 1167 if(isNorm32Regular(norm32)) { 1168 /* set cc etc. below */ 1169 } else if(isNorm32HangulOrJamo(norm32)) { 1170 /* a compatibility decomposition contained Jamos */ 1171 args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0| 1172 (norm32>>EXTRA_SHIFT))); 1173 return (int)(norm32&COMBINES_ANY); 1174 } else { 1175 /* c is a lead surrogate, get the real norm32 */ 1176 if(args.start!=limit && UTF16.isTrailSurrogate(args.c2= 1177 args.source[args.start])) { 1178 ++args.start; 1179 norm32=getNorm32FromSurrogatePair(norm32, args.c2); 1180 } else { 1181 args.c2=0; 1182 return 0; 1183 } 1184 } 1185 1186 if(nx_contains(nx, args.c, args.c2)) { 1187 return 0; /* excluded: norm32==0 */ 1188 } 1189 1190 args.cc= (char)((norm32>>CC_SHIFT)&0xff); 1191 1192 combineFlags=(int)(norm32&COMBINES_ANY); 1193 if(combineFlags!=0) { 1194 int index = getExtraDataIndex(norm32); 1195 args.combiningIndex=index>0 ? extraData[(index-1)] :0; 1196 } 1197 1198 return combineFlags; 1199 } 1200 } 1201 1202 /* 1203 * given a composition-result starter (c, c2) - which means its cc==0, 1204 * it combines forward, it has extra data, its norm32!=0, 1205 * it is not a Hangul or Jamo, 1206 * get just its combineFwdIndex 1207 * 1208 * norm32(c) is special if and only if c2!=0 1209 */ 1210 private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){ 1211 long/*unsigned*/ norm32; 1212 1213 norm32=getNorm32(c); 1214 if(c2!=0) { 1215 norm32=getNorm32FromSurrogatePair(norm32, c2); 1216 } 1217 return extraData[(getExtraDataIndex(norm32)-1)]; 1218 } 1219 1220 /* 1221 * Find the recomposition result for 1222 * a forward-combining character 1223 * (specified with a pointer to its part of the combiningTable[]) 1224 * and a backward-combining character 1225 * (specified with its combineBackIndex). 1226 * 1227 * If these two characters combine, then set (value, value2) 1228 * with the code unit(s) of the composition character. 1229 * 1230 * Return value: 1231 * 0 do not combine 1232 * 1 combine 1233 * >1 combine, and the composition is a forward-combining starter 1234 * 1235 * See unormimp.h for a description of the composition table format. 1236 */ 1237 private static int/*unsigned*/ combine(char[]table,int tableStart, 1238 int/*unsinged*/ combineBackIndex, 1239 int[] outValues) { 1240 int/*unsigned*/ key; 1241 int value,value2; 1242 1243 if(outValues.length<2){ 1244 throw new IllegalArgumentException(); 1245 } 1246 1247 /* search in the starter's composition table */ 1248 for(;;) { 1249 key=table[tableStart++]; 1250 if(key>=combineBackIndex) { 1251 break; 1252 } 1253 tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1; 1254 } 1255 1256 /* mask off bit 15, the last-entry-in-the-list flag */ 1257 if((key&0x7fff)==combineBackIndex) { 1258 /* found! combine! */ 1259 value=table[tableStart]; 1260 1261 /* is the composition a starter that combines forward? */ 1262 key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1)); 1263 1264 /* get the composition result code point from the variable-length 1265 * result value 1266 */ 1267 if((value&0x8000) != 0) { 1268 if((value&0x4000) != 0) { 1269 /* surrogate pair composition result */ 1270 value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800)); 1271 value2=table[tableStart+1]; 1272 } else { 1273 /* BMP composition result U+2000..U+ffff */ 1274 value=table[tableStart+1]; 1275 value2=0; 1276 } 1277 } else { 1278 /* BMP composition result U+0000..U+1fff */ 1279 value&=0x1fff; 1280 value2=0; 1281 } 1282 outValues[0]=value; 1283 outValues[1]=value2; 1284 return key; 1285 } else { 1286 /* not found */ 1287 return 0; 1288 } 1289 } 1290 1291 1292 private static final class RecomposeArgs{ 1293 char[] source; 1294 int start; 1295 int limit; 1296 } 1297 /* 1298 * recompose the characters in [p..limit[ 1299 * (which is in NFD - decomposed and canonically ordered), 1300 * adjust limit, and return the trailing cc 1301 * 1302 * since for NFKC we may get Jamos in decompositions, we need to 1303 * recompose those too 1304 * 1305 * note that recomposition never lengthens the text: 1306 * any character consists of either one or two code units; 1307 * a composition may contain at most one more code unit than the original 1308 * starter, while the combining mark that is removed has at least one code 1309 * unit 1310 */ 1311 private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) { 1312 int remove, q, r; 1313 int /*unsigned*/ combineFlags; 1314 int /*unsigned*/ combineFwdIndex, combineBackIndex; 1315 int /*unsigned*/ result, value=0, value2=0; 1316 int /*unsigned byte*/ prevCC; 1317 boolean starterIsSupplementary; 1318 int starter; 1319 int[] outValues = new int[2]; 1320 starter=-1; /* no starter */ 1321 combineFwdIndex=0; /* will not be used until starter!=NULL */ 1322 starterIsSupplementary=false; /* will not be used until starter!=NULL */ 1323 prevCC=0; 1324 1325 NextCombiningArgs ncArg = new NextCombiningArgs(); 1326 ncArg.source = args.source; 1327 1328 ncArg.cc =0; 1329 ncArg.c2 =0; 1330 1331 for(;;) { 1332 ncArg.start = args.start; 1333 combineFlags=getNextCombining(ncArg,args.limit,nx); 1334 combineBackIndex=ncArg.combiningIndex; 1335 args.start = ncArg.start; 1336 1337 if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) { 1338 if((combineBackIndex&0x8000)!=0) { 1339 /* c is a Jamo V/T, see if we can compose it with the 1340 * previous character 1341 */ 1342 /* for the PRI #29 fix, check that there is no intervening combining mark */ 1343 if((options&BEFORE_PRI_29)!=0 || prevCC==0) { 1344 remove=-1; /* NULL while no Hangul composition */ 1345 combineFlags=0; 1346 ncArg.c2=args.source[starter]; 1347 if(combineBackIndex==0xfff2) { 1348 /* Jamo V, compose with previous Jamo L and following 1349 * Jamo T 1350 */ 1351 ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE); 1352 if(ncArg.c2<JAMO_L_COUNT) { 1353 remove=args.start-1; 1354 ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+ 1355 (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT); 1356 if(args.start!=args.limit && 1357 (ncArg.c2=(char)(args.source[args.start] 1358 -JAMO_T_BASE))<JAMO_T_COUNT) { 1359 ++args.start; 1360 ncArg.c+=ncArg.c2; 1361 } else { 1362 /* the result is an LV syllable, which is a starter (unlike LVT) */ 1363 combineFlags=COMBINES_FWD; 1364 } 1365 if(!nx_contains(nx, ncArg.c)) { 1366 args.source[starter]=ncArg.c; 1367 } else { 1368 /* excluded */ 1369 if(!isHangulWithoutJamoT(ncArg.c)) { 1370 --args.start; /* undo the ++args.start from reading the Jamo T */ 1371 } 1372 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */ 1373 remove=args.start; 1374 } 1375 } 1376 1377 /* 1378 * Normally, the following can not occur: 1379 * Since the input is in NFD, there are no Hangul LV syllables that 1380 * a Jamo T could combine with. 1381 * All Jamo Ts are combined above when handling Jamo Vs. 1382 * 1383 * However, before the PRI #29 fix, this can occur due to 1384 * an intervening combining mark between the Hangul LV and the Jamo T. 1385 */ 1386 } else { 1387 /* Jamo T, compose with previous Hangul that does not have a Jamo T */ 1388 if(isHangulWithoutJamoT(ncArg.c2)) { 1389 ncArg.c2+=ncArg.c-JAMO_T_BASE; 1390 if(!nx_contains(nx, ncArg.c2)) { 1391 remove=args.start-1; 1392 args.source[starter]=ncArg.c2; 1393 } 1394 } 1395 } 1396 1397 if(remove!=-1) { 1398 /* remove the Jamo(s) */ 1399 q=remove; 1400 r=args.start; 1401 while(r<args.limit) { 1402 args.source[q++]=args.source[r++]; 1403 } 1404 args.start=remove; 1405 args.limit=q; 1406 } 1407 1408 ncArg.c2=0; /* c2 held *starter temporarily */ 1409 1410 if(combineFlags!=0) { 1411 /* 1412 * not starter=NULL because the composition is a Hangul LV syllable 1413 * and might combine once more (but only before the PRI #29 fix) 1414 */ 1415 1416 /* done? */ 1417 if(args.start==args.limit) { 1418 return (char)prevCC; 1419 } 1420 1421 /* the composition is a Hangul LV syllable which is a starter that combines forward */ 1422 combineFwdIndex=0xfff0; 1423 1424 /* we combined; continue with looking for compositions */ 1425 continue; 1426 } 1427 } 1428 1429 /* 1430 * now: cc==0 and the combining index does not include 1431 * "forward" -> the rest of the loop body will reset starter 1432 * to NULL; technically, a composed Hangul syllable is a 1433 * starter, but it does not combine forward now that we have 1434 * consumed all eligible Jamos; for Jamo V/T, combineFlags 1435 * does not contain _NORM_COMBINES_FWD 1436 */ 1437 1438 } else if( 1439 /* the starter is not a Hangul LV or Jamo V/T and */ 1440 !((combineFwdIndex&0x8000)!=0) && 1441 /* the combining mark is not blocked and */ 1442 ((options&BEFORE_PRI_29)!=0 ? 1443 (prevCC!=ncArg.cc || prevCC==0) : 1444 (prevCC<ncArg.cc || prevCC==0)) && 1445 /* the starter and the combining mark (c, c2) do combine */ 1446 0!=(result=combine(combiningTable,combineFwdIndex, 1447 combineBackIndex, outValues)) && 1448 /* the composition result is not excluded */ 1449 !nx_contains(nx, (char)value, (char)value2) 1450 ) { 1451 value=outValues[0]; 1452 value2=outValues[1]; 1453 /* replace the starter with the composition, remove the 1454 * combining mark 1455 */ 1456 remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */ 1457 1458 /* replace the starter with the composition */ 1459 args.source[starter]=(char)value; 1460 if(starterIsSupplementary) { 1461 if(value2!=0) { 1462 /* both are supplementary */ 1463 args.source[starter+1]=(char)value2; 1464 } else { 1465 /* the composition is shorter than the starter, 1466 * move the intermediate characters forward one */ 1467 starterIsSupplementary=false; 1468 q=starter+1; 1469 r=q+1; 1470 while(r<remove) { 1471 args.source[q++]=args.source[r++]; 1472 } 1473 --remove; 1474 } 1475 } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB 1476 starterIsSupplementary=true; 1477 args.source[starter+1]=(char)value2; 1478 /* } else { both are on the BMP, nothing more to do */ 1479 } 1480 1481 /* remove the combining mark by moving the following text 1482 * over it */ 1483 if(remove<args.start) { 1484 q=remove; 1485 r=args.start; 1486 while(r<args.limit) { 1487 args.source[q++]=args.source[r++]; 1488 } 1489 args.start=remove; 1490 args.limit=q; 1491 } 1492 1493 /* keep prevCC because we removed the combining mark */ 1494 1495 /* done? */ 1496 if(args.start==args.limit) { 1497 return (char)prevCC; 1498 } 1499 1500 /* is the composition a starter that combines forward? */ 1501 if(result>1) { 1502 combineFwdIndex=getCombiningIndexFromStarter((char)value, 1503 (char)value2); 1504 } else { 1505 starter=-1; 1506 } 1507 1508 /* we combined; continue with looking for compositions */ 1509 continue; 1510 } 1511 } 1512 1513 /* no combination this time */ 1514 prevCC=ncArg.cc; 1515 if(args.start==args.limit) { 1516 return (char)prevCC; 1517 } 1518 1519 /* if (c, c2) did not combine, then check if it is a starter */ 1520 if(ncArg.cc==0) { 1521 /* found a new starter; combineFlags==0 if (c, c2) is excluded */ 1522 if((combineFlags&COMBINES_FWD)!=0) { 1523 /* it may combine with something, prepare for it */ 1524 if(ncArg.c2==0) { 1525 starterIsSupplementary=false; 1526 starter=args.start-1; 1527 } else { 1528 starterIsSupplementary=false; 1529 starter=args.start-2; 1530 } 1531 combineFwdIndex=combineBackIndex; 1532 } else { 1533 /* it will not combine with anything */ 1534 starter=-1; 1535 } 1536 } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) { 1537 /* FCC: no discontiguous compositions; any intervening character blocks */ 1538 starter=-1; 1539 } 1540 } 1541 } 1542 1543 // find the last true starter between src[start]....src[current] going 1544 // backwards and return its index 1545 private static int findPreviousStarter(char[]src, int srcStart, int current, 1546 int/*unsigned*/ ccOrQCMask, 1547 int/*unsigned*/ decompQCMask, 1548 char minNoMaybe) { 1549 long norm32; 1550 PrevArgs args = new PrevArgs(); 1551 args.src = src; 1552 args.start = srcStart; 1553 args.current = current; 1554 1555 while(args.start<args.current) { 1556 norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask); 1557 if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { 1558 break; 1559 } 1560 } 1561 return args.current; 1562 } 1563 1564 /* find the first true starter in [src..limit[ and return the 1565 * pointer to it 1566 */ 1567 private static int/*index*/ findNextStarter(char[] src,int start,int limit, 1568 int/*unsigned*/ qcMask, 1569 int/*unsigned*/ decompQCMask, 1570 char minNoMaybe) { 1571 int p; 1572 long/*unsigned*/ norm32; 1573 int ccOrQCMask; 1574 char c, c2; 1575 1576 ccOrQCMask=CC_MASK|qcMask; 1577 1578 DecomposeArgs decompArgs = new DecomposeArgs(); 1579 1580 for(;;) { 1581 if(start==limit) { 1582 break; /* end of string */ 1583 } 1584 c=src[start]; 1585 if(c<minNoMaybe) { 1586 break; /* catches NUL terminater, too */ 1587 } 1588 1589 norm32=getNorm32(c); 1590 if((norm32&ccOrQCMask)==0) { 1591 break; /* true starter */ 1592 } 1593 1594 if(isNorm32LeadSurrogate(norm32)) { 1595 /* c is a lead surrogate, get the real norm32 */ 1596 if((start+1)==limit || 1597 !UTF16.isTrailSurrogate(c2=(src[start+1]))){ 1598 /* unmatched first surrogate: counts as a true starter */ 1599 break; 1600 } 1601 norm32=getNorm32FromSurrogatePair(norm32, c2); 1602 1603 if((norm32&ccOrQCMask)==0) { 1604 break; /* true starter */ 1605 } 1606 } else { 1607 c2=0; 1608 } 1609 1610 /* (c, c2) is not a true starter but its decomposition may be */ 1611 if((norm32&decompQCMask)!=0) { 1612 /* (c, c2) decomposes, get everything from the variable-length 1613 * extra data */ 1614 p=decompose(norm32, decompQCMask, decompArgs); 1615 1616 /* get the first character's norm32 to check if it is a true 1617 * starter */ 1618 if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) { 1619 break; /* true starter */ 1620 } 1621 } 1622 1623 start+= c2==0 ? 1 : 2; /* not a true starter, continue */ 1624 } 1625 1626 return start; 1627 } 1628 1629 1630 private static final class ComposePartArgs{ 1631 int prevCC; 1632 int length; /* length of decomposed part */ 1633 } 1634 1635 /* decompose and recompose [prevStarter..src[ */ 1636 private static char[] composePart(ComposePartArgs args, 1637 int prevStarter, 1638 char[] src, int start, int limit, 1639 int options, 1640 UnicodeSet nx) { 1641 int recomposeLimit; 1642 boolean compat =((options&OPTIONS_COMPAT)!=0); 1643 1644 /* decompose [prevStarter..src[ */ 1645 int[] outTrailCC = new int[1]; 1646 char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE]; 1647 1648 for(;;){ 1649 args.length=decompose(src,prevStarter,(start), 1650 buffer,0,buffer.length, 1651 compat,outTrailCC,nx); 1652 if(args.length<=buffer.length){ 1653 break; 1654 }else{ 1655 buffer = new char[args.length]; 1656 } 1657 } 1658 1659 /* recompose the decomposition */ 1660 recomposeLimit=args.length; 1661 1662 if(args.length>=2) { 1663 RecomposeArgs rcArgs = new RecomposeArgs(); 1664 rcArgs.source = buffer; 1665 rcArgs.start = 0; 1666 rcArgs.limit = recomposeLimit; 1667 args.prevCC=recompose(rcArgs, options, nx); 1668 recomposeLimit = rcArgs.limit; 1669 } 1670 1671 /* return with a pointer to the recomposition and its length */ 1672 args.length=recomposeLimit; 1673 return buffer; 1674 } 1675 1676 private static boolean composeHangul(char prev, char c, 1677 long/*unsigned*/ norm32, 1678 char[] src,int[] srcIndex, int limit, 1679 boolean compat, 1680 char[] dest,int destIndex, 1681 UnicodeSet nx) { 1682 int start=srcIndex[0]; 1683 if(isJamoVTNorm32JamoV(norm32)) { 1684 /* c is a Jamo V, compose with previous Jamo L and 1685 * following Jamo T */ 1686 prev=(char)(prev-JAMO_L_BASE); 1687 if(prev<JAMO_L_COUNT) { 1688 c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+ 1689 (c-JAMO_V_BASE))*JAMO_T_COUNT); 1690 1691 /* check if the next character is a Jamo T (normal or 1692 * compatibility) */ 1693 if(start!=limit) { 1694 char next, t; 1695 1696 next=src[start]; 1697 if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) { 1698 /* normal Jamo T */ 1699 ++start; 1700 c+=t; 1701 } else if(compat) { 1702 /* if NFKC, then check for compatibility Jamo T 1703 * (BMP only) */ 1704 norm32=getNorm32(next); 1705 if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) { 1706 int p /*index into extra data array*/; 1707 DecomposeArgs dcArgs = new DecomposeArgs(); 1708 p=decompose(norm32, QC_NFKD, dcArgs); 1709 if(dcArgs.length==1 && 1710 (t=(char)(extraData[p]-JAMO_T_BASE)) 1711 <JAMO_T_COUNT) { 1712 /* compatibility Jamo T */ 1713 ++start; 1714 c+=t; 1715 } 1716 } 1717 } 1718 } 1719 if(nx_contains(nx, c)) { 1720 if(!isHangulWithoutJamoT(c)) { 1721 --start; /* undo ++start from reading the Jamo T */ 1722 } 1723 return false; 1724 } 1725 dest[destIndex]=c; 1726 srcIndex[0]=start; 1727 return true; 1728 } 1729 } else if(isHangulWithoutJamoT(prev)) { 1730 /* c is a Jamo T, compose with previous Hangul LV that does not 1731 * contain a Jamo T */ 1732 c=(char)(prev+(c-JAMO_T_BASE)); 1733 if(nx_contains(nx, c)) { 1734 return false; 1735 } 1736 dest[destIndex]=c; 1737 srcIndex[0]=start; 1738 return true; 1739 } 1740 return false; 1741 } 1742 /* 1743 public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){ 1744 return compose(src,0,src.length,dest,0,dest.length,compat, nx); 1745 } 1746 */ 1747 1748 public static int compose(char[] src, int srcStart, int srcLimit, 1749 char[] dest,int destStart,int destLimit, 1750 int options,UnicodeSet nx) { 1751 1752 int prevSrc, prevStarter; 1753 long/*unsigned*/ norm32; 1754 int ccOrQCMask, qcMask; 1755 int reorderStartIndex, length; 1756 char c, c2, minNoMaybe; 1757 int/*unsigned byte*/ cc, prevCC; 1758 int[] ioIndex = new int[1]; 1759 int destIndex = destStart; 1760 int srcIndex = srcStart; 1761 1762 if((options&OPTIONS_COMPAT)!=0) { 1763 minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; 1764 qcMask=QC_NFKC; 1765 } else { 1766 minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; 1767 qcMask=QC_NFC; 1768 } 1769 1770 /* 1771 * prevStarter points to the last character before the current one 1772 * that is a "true" starter with cc==0 and quick check "yes". 1773 * 1774 * prevStarter will be used instead of looking for a true starter 1775 * while incrementally decomposing [prevStarter..prevSrc[ 1776 * in _composePart(). Having a good prevStarter allows to just decompose 1777 * the entire [prevStarter..prevSrc[. 1778 * 1779 * When _composePart() backs out from prevSrc back to prevStarter, 1780 * then it also backs out destIndex by the same amount. 1781 * Therefore, at all times, the (prevSrc-prevStarter) source units 1782 * must correspond 1:1 to destination units counted with destIndex, 1783 * except for reordering. 1784 * This is true for the qc "yes" characters copied in the fast loop, 1785 * and for pure reordering. 1786 * prevStarter must be set forward to src when this is not true: 1787 * In _composePart() and after composing a Hangul syllable. 1788 * 1789 * This mechanism relies on the assumption that the decomposition of a 1790 * true starter also begins with a true starter. gennorm/store.c checks 1791 * for this. 1792 */ 1793 prevStarter=srcIndex; 1794 1795 ccOrQCMask=CC_MASK|qcMask; 1796 /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/ 1797 prevCC=0; 1798 1799 /* avoid compiler warnings */ 1800 norm32=0; 1801 c=0; 1802 1803 for(;;) { 1804 /* count code units below the minimum or with irrelevant data for 1805 * the quick check */ 1806 prevSrc=srcIndex; 1807 1808 while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe || 1809 ((norm32=getNorm32(c))&ccOrQCMask)==0)) { 1810 prevCC=0; 1811 ++srcIndex; 1812 } 1813 1814 1815 /* copy these code units all at once */ 1816 if(srcIndex!=prevSrc) { 1817 length=srcIndex-prevSrc; 1818 if((destIndex+length)<=destLimit) { 1819 System.arraycopy(src,prevSrc,dest,destIndex,length); 1820 } 1821 destIndex+=length; 1822 reorderStartIndex=destIndex; 1823 1824 /* set prevStarter to the last character in the quick check 1825 * loop */ 1826 prevStarter=srcIndex-1; 1827 if(UTF16.isTrailSurrogate(src[prevStarter]) && 1828 prevSrc<prevStarter && 1829 UTF16.isLeadSurrogate(src[(prevStarter-1)])) { 1830 --prevStarter; 1831 } 1832 1833 prevSrc=srcIndex; 1834 } 1835 1836 /* end of source reached? */ 1837 if(srcIndex==srcLimit) { 1838 break; 1839 } 1840 1841 /* c already contains *src and norm32 is set for it, increment src*/ 1842 ++srcIndex; 1843 1844 /* 1845 * source buffer pointers: 1846 * 1847 * all done quick check current char not yet 1848 * "yes" but (c, c2) processed 1849 * may combine 1850 * forward 1851 * [-------------[-------------[-------------[-------------[ 1852 * | | | | | 1853 * start prevStarter prevSrc src limit 1854 * 1855 * 1856 * destination buffer pointers and indexes: 1857 * 1858 * all done might take not filled yet 1859 * characters for 1860 * reordering 1861 * [-------------[-------------[-------------[ 1862 * | | | | 1863 * dest reorderStartIndex destIndex destCapacity 1864 */ 1865 1866 /* check one above-minimum, relevant code unit */ 1867 /* 1868 * norm32 is for c=*(src-1), and the quick check flag is "no" or 1869 * "maybe", and/or cc!=0 1870 * check for Jamo V/T, then for surrogates and regular characters 1871 * c is not a Hangul syllable or Jamo L because 1872 * they are not marked with no/maybe for NFC & NFKC(and their cc==0) 1873 */ 1874 if(isNorm32HangulOrJamo(norm32)) { 1875 /* 1876 * c is a Jamo V/T: 1877 * try to compose with the previous character, Jamo V also with 1878 * a following Jamo T, and set values here right now in case we 1879 * just continue with the main loop 1880 */ 1881 prevCC=cc=0; 1882 reorderStartIndex=destIndex; 1883 ioIndex[0]=srcIndex; 1884 if( 1885 destIndex>0 && 1886 composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex, 1887 srcLimit, (options&OPTIONS_COMPAT)!=0, dest, 1888 destIndex<=destLimit ? destIndex-1: 0, 1889 nx) 1890 ) { 1891 srcIndex=ioIndex[0]; 1892 prevStarter=srcIndex; 1893 continue; 1894 } 1895 1896 srcIndex = ioIndex[0]; 1897 1898 /* the Jamo V/T did not compose into a Hangul syllable, just 1899 * append to dest */ 1900 c2=0; 1901 length=1; 1902 prevStarter=prevSrc; 1903 } else { 1904 if(isNorm32Regular(norm32)) { 1905 c2=0; 1906 length=1; 1907 } else { 1908 /* c is a lead surrogate, get the real norm32 */ 1909 if(srcIndex!=srcLimit && 1910 UTF16.isTrailSurrogate(c2=src[srcIndex])) { 1911 ++srcIndex; 1912 length=2; 1913 norm32=getNorm32FromSurrogatePair(norm32, c2); 1914 } else { 1915 /* c is an unpaired lead surrogate, nothing to do */ 1916 c2=0; 1917 length=1; 1918 norm32=0; 1919 } 1920 } 1921 ComposePartArgs args =new ComposePartArgs(); 1922 1923 /* we are looking at the character (c, c2) at [prevSrc..src[ */ 1924 if(nx_contains(nx, c, c2)) { 1925 /* excluded: norm32==0 */ 1926 cc=0; 1927 } else if((norm32&qcMask)==0) { 1928 cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT)); 1929 } else { 1930 char[] p; 1931 1932 /* 1933 * find appropriate boundaries around this character, 1934 * decompose the source text from between the boundaries, 1935 * and recompose it 1936 * 1937 * this puts the intermediate text into the side buffer because 1938 * it might be longer than the recomposition end result, 1939 * or the destination buffer may be too short or missing 1940 * 1941 * note that destIndex may be adjusted backwards to account 1942 * for source text that passed the quick check but needed to 1943 * take part in the recomposition 1944 */ 1945 int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ 1946 /* 1947 * find the last true starter in [prevStarter..src[ 1948 * it is either the decomposition of the current character (at prevSrc), 1949 * or prevStarter 1950 */ 1951 if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) { 1952 prevStarter=prevSrc; 1953 } else { 1954 /* adjust destIndex: back out what had been copied with qc "yes" */ 1955 destIndex-=prevSrc-prevStarter; 1956 } 1957 1958 /* find the next true starter in [src..limit[ */ 1959 srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, 1960 decompQCMask, minNoMaybe); 1961 //args.prevStarter = prevStarter; 1962 args.prevCC = prevCC; 1963 //args.destIndex = destIndex; 1964 args.length = length; 1965 p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx); 1966 1967 if(p==null) { 1968 /* an error occurred (out of memory) */ 1969 break; 1970 } 1971 1972 prevCC = args.prevCC; 1973 length = args.length; 1974 1975 /* append the recomposed buffer contents to the destination 1976 * buffer */ 1977 if((destIndex+args.length)<=destLimit) { 1978 int i=0; 1979 while(i<args.length) { 1980 dest[destIndex++]=p[i++]; 1981 --length; 1982 } 1983 } else { 1984 /* buffer overflow */ 1985 /* keep incrementing the destIndex for preflighting */ 1986 destIndex+=length; 1987 } 1988 1989 prevStarter=srcIndex; 1990 continue; 1991 } 1992 } 1993 1994 /* append the single code point (c, c2) to the destination buffer */ 1995 if((destIndex+length)<=destLimit) { 1996 if(cc!=0 && cc<prevCC) { 1997 /* (c, c2) is out of order with respect to the preceding 1998 * text */ 1999 int reorderSplit= destIndex; 2000 destIndex+=length; 2001 prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit, 2002 destIndex, c, c2, cc); 2003 } else { 2004 /* just append (c, c2) */ 2005 dest[destIndex++]=c; 2006 if(c2!=0) { 2007 dest[destIndex++]=c2; 2008 } 2009 prevCC=cc; 2010 } 2011 } else { 2012 /* buffer overflow */ 2013 /* keep incrementing the destIndex for preflighting */ 2014 destIndex+=length; 2015 prevCC=cc; 2016 } 2017 } 2018 2019 return destIndex - destStart; 2020 } 2021 2022 public static int getCombiningClass(int c) { 2023 long norm32; 2024 norm32=getNorm32(c); 2025 return (int)((norm32>>CC_SHIFT)&0xFF); 2026 } 2027 2028 public static boolean isFullCompositionExclusion(int c) { 2029 if(isFormatVersion_2_1) { 2030 int aux =AuxTrieImpl.auxTrie.getCodePointValue(c); 2031 return (aux & AUX_COMP_EX_MASK)!=0; 2032 } else { 2033 return false; 2034 } 2035 } 2036 2037 public static boolean isCanonSafeStart(int c) { 2038 if(isFormatVersion_2_1) { 2039 int aux = AuxTrieImpl.auxTrie.getCodePointValue(c); 2040 return (aux & AUX_UNSAFE_MASK)==0; 2041 } else { 2042 return false; 2043 } 2044 } 2045 2046 /* Is c an NF<mode>-skippable code point? See unormimp.h. */ 2047 public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) { 2048 long /*unsigned int*/ norm32; 2049 mask = mask & UNSIGNED_INT_MASK; 2050 char aux; 2051 2052 /* check conditions (a)..(e), see unormimp.h */ 2053 norm32 = getNorm32(c); 2054 2055 if((norm32&mask)!=0) { 2056 return false; /* fails (a)..(e), not skippable */ 2057 } 2058 2059 if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){ 2060 return true; /* NF*D, passed (a)..(c), is skippable */ 2061 } 2062 /* check conditions (a)..(e), see unormimp.h */ 2063 2064 /* NF*C/FCC, passed (a)..(e) */ 2065 if((norm32& QC_NFD)==0) { 2066 return true; /* no canonical decomposition, is skippable */ 2067 } 2068 2069 /* check Hangul syllables algorithmically */ 2070 if(isNorm32HangulOrJamo(norm32)) { 2071 /* Jamo passed (a)..(e) above, must be Hangul */ 2072 return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */ 2073 } 2074 2075 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ 2076 /* NF*C, test (f) flag */ 2077 if(!isFormatVersion_2_2) { 2078 return false; /* no (f) data, say not skippable to be safe */ 2079 } 2080 2081 2082 aux = AuxTrieImpl.auxTrie.getCodePointValue(c); 2083 return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */ 2084 2085 /* } else { FCC, test fcd<=1 instead of the above } */ 2086 } 2087 2088 public static UnicodeSet addPropertyStarts(UnicodeSet set) { 2089 int c; 2090 2091 /* add the start code point of each same-value range of each trie */ 2092 //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); 2093 TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie); 2094 RangeValueIterator.Element normResult = new RangeValueIterator.Element(); 2095 2096 while(normIter.next(normResult)){ 2097 set.add(normResult.start); 2098 } 2099 2100 //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); 2101 TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie); 2102 RangeValueIterator.Element fcdResult = new RangeValueIterator.Element(); 2103 2104 while(fcdIter.next(fcdResult)){ 2105 set.add(fcdResult.start); 2106 } 2107 2108 if(isFormatVersion_2_1){ 2109 //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set); 2110 TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie); 2111 RangeValueIterator.Element auxResult = new RangeValueIterator.Element(); 2112 while(auxIter.next(auxResult)){ 2113 set.add(auxResult.start); 2114 } 2115 } 2116 /* add Hangul LV syllables and LV+1 because of skippables */ 2117 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) { 2118 set.add(c); 2119 set.add(c+1); 2120 } 2121 set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ 2122 return set; // for chaining 2123 } 2124 2125 /** 2126 * Internal API, used in UCharacter.getIntPropertyValue(). 2127 * @internal 2128 * @param c code point 2129 * @param modeValue numeric value compatible with Mode 2130 * @return numeric value compatible with QuickCheck 2131 */ 2132 public static final int quickCheck(int c, int modeValue) { 2133 final int qcMask[/*UNORM_MODE_COUNT*/]={ 2134 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC 2135 }; 2136 2137 int norm32=(int)getNorm32(c)&qcMask[modeValue]; 2138 2139 if(norm32==0) { 2140 return 1; // YES 2141 } else if((norm32&QC_ANY_NO)!=0) { 2142 return 0; // NO 2143 } else /* _NORM_QC_ANY_MAYBE */ { 2144 return 2; // MAYBE; 2145 } 2146 } 2147 2148 private static int strCompare(char[] s1, int s1Start, int s1Limit, 2149 char[] s2, int s2Start, int s2Limit, 2150 boolean codePointOrder) { 2151 2152 int start1, start2, limit1, limit2; 2153 2154 char c1, c2; 2155 2156 /* setup for fix-up */ 2157 start1=s1Start; 2158 start2=s2Start; 2159 2160 int length1, length2; 2161 2162 length1 = s1Limit - s1Start; 2163 length2 = s2Limit - s2Start; 2164 2165 int lengthResult; 2166 2167 if(length1<length2) { 2168 lengthResult=-1; 2169 limit1=start1+length1; 2170 } else if(length1==length2) { 2171 lengthResult=0; 2172 limit1=start1+length1; 2173 } else /* length1>length2 */ { 2174 lengthResult=1; 2175 limit1=start1+length2; 2176 } 2177 2178 if(s1==s2) { 2179 return lengthResult; 2180 } 2181 2182 for(;;) { 2183 /* check pseudo-limit */ 2184 if(s1Start==limit1) { 2185 return lengthResult; 2186 } 2187 2188 c1=s1[s1Start]; 2189 c2=s2[s2Start]; 2190 if(c1!=c2) { 2191 break; 2192 } 2193 ++s1Start; 2194 ++s2Start; 2195 } 2196 2197 /* setup for fix-up */ 2198 limit1=start1+length1; 2199 limit2=start2+length2; 2200 2201 2202 /* if both values are in or above the surrogate range, fix them up */ 2203 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 2204 /* subtract 0x2800 from BMP code points to make them smaller than 2205 * supplementary ones */ 2206 if( 2207 ( c1<=0xdbff && (s1Start+1)!=limit1 && 2208 UTF16.isTrailSurrogate(s1[(s1Start+1)]) 2209 ) || 2210 ( UTF16.isTrailSurrogate(c1) && start1!=s1Start && 2211 UTF16.isLeadSurrogate(s1[(s1Start-1)]) 2212 ) 2213 ) { 2214 /* part of a surrogate pair, leave >=d800 */ 2215 } else { 2216 /* BMP code point - may be surrogate code point - make <d800 */ 2217 c1-=0x2800; 2218 } 2219 2220 if( 2221 ( c2<=0xdbff && (s2Start+1)!=limit2 && 2222 UTF16.isTrailSurrogate(s2[(s2Start+1)]) 2223 ) || 2224 ( UTF16.isTrailSurrogate(c2) && start2!=s2Start && 2225 UTF16.isLeadSurrogate(s2[(s2Start-1)]) 2226 ) 2227 ) { 2228 /* part of a surrogate pair, leave >=d800 */ 2229 } else { 2230 /* BMP code point - may be surrogate code point - make <d800 */ 2231 c2-=0x2800; 2232 } 2233 } 2234 2235 /* now c1 and c2 are in UTF-32-compatible order */ 2236 return (int)c1-(int)c2; 2237 } 2238 2239 2240 /* 2241 * Status of tailored normalization 2242 * 2243 * This was done initially for investigation on Unicode public review issue 7 2244 * (http://www.unicode.org/review/). See Jitterbug 2481. 2245 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is 2246 * a permanent feature in ICU 2.6 in support of IDNA which requires true 2247 * Unicode 3.2 normalization. 2248 * (NormalizationCorrections are rolled into IDNA mapping tables.) 2249 * 2250 * Tailored normalization as implemented here allows to "normalize less" 2251 * than full Unicode normalization would. 2252 * Based internally on a UnicodeSet of code points that are 2253 * "excluded from normalization", the normalization functions leave those 2254 * code points alone ("inert"). This means that tailored normalization 2255 * still transforms text into a canonically equivalent form. 2256 * It does not add decompositions to code points that do not have any or 2257 * change decomposition results. 2258 * 2259 * Any function that searches for a safe boundary has not been touched, 2260 * which means that these functions will be over-pessimistic when 2261 * exclusions are applied. 2262 * This should not matter because subsequent checks and normalizations 2263 * do apply the exclusions; only a little more of the text may be processed 2264 * than necessary under exclusions. 2265 * 2266 * Normalization exclusions have the following effect on excluded code points c: 2267 * - c is not decomposed 2268 * - c is not a composition target 2269 * - c does not combine forward or backward for composition 2270 * except that this is not implemented for Jamo 2271 * - c is treated as having a combining class of 0 2272 */ 2273 2274 /* 2275 * Constants for the bit fields in the options bit set parameter. 2276 * These need not be public. 2277 * A user only needs to know the currently assigned values. 2278 * The number and positions of reserved bits per field can remain private. 2279 */ 2280 private static final int OPTIONS_NX_MASK=0x1f; 2281 private static final int OPTIONS_UNICODE_MASK=0xe0; 2282 public static final int OPTIONS_SETS_MASK=0xff; 2283 // private static final int OPTIONS_UNICODE_SHIFT=5; 2284 private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1]; 2285 2286 /* Constants for options flags for normalization.*/ 2287 2288 /** 2289 * Options bit 0, do not decompose Hangul syllables. 2290 * @draft ICU 2.6 2291 */ 2292 private static final int NX_HANGUL = 1; 2293 /** 2294 * Options bit 1, do not decompose CJK compatibility characters. 2295 * @draft ICU 2.6 2296 */ 2297 private static final int NX_CJK_COMPAT=2; 2298 /** 2299 * Options bit 8, use buggy recomposition described in 2300 * Unicode Public Review Issue #29 2301 * at http://www.unicode.org/review/resolved-pri.html#pri29 2302 * 2303 * Used in IDNA implementation according to strict interpretation 2304 * of IDNA definition based on Unicode 3.2 which predates PRI #29. 2305 * 2306 * See ICU4C unormimp.h 2307 * 2308 * @draft ICU 3.2 2309 */ 2310 public static final int BEFORE_PRI_29=0x100; 2311 2312 /* 2313 * The following options are used only in some composition functions. 2314 * They use bits 12 and up to preserve lower bits for the available options 2315 * space in unorm_compare() - 2316 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. 2317 */ 2318 2319 /** Options bit 12, for compatibility vs. canonical decomposition. */ 2320 public static final int OPTIONS_COMPAT=0x1000; 2321 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ 2322 public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000; 2323 2324 /* normalization exclusion sets --------------------------------------------- */ 2325 2326 /* 2327 * Normalization exclusion UnicodeSets are used for tailored normalization; 2328 * see the comment near the beginning of this file. 2329 * 2330 * By specifying one or several sets of code points, 2331 * those code points become inert for normalization. 2332 */ 2333 private static final synchronized UnicodeSet internalGetNXHangul() { 2334 /* internal function, does not check for incoming U_FAILURE */ 2335 2336 if(nxCache[NX_HANGUL]==null) { 2337 nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3); 2338 } 2339 return nxCache[NX_HANGUL]; 2340 } 2341 2342 private static final synchronized UnicodeSet internalGetNXCJKCompat() { 2343 /* internal function, does not check for incoming U_FAILURE */ 2344 2345 if(nxCache[NX_CJK_COMPAT]==null) { 2346 2347 /* build a set from [CJK Ideographs]&[has canonical decomposition] */ 2348 UnicodeSet set, hasDecomp; 2349 2350 set=new UnicodeSet("[:Ideographic:]"); 2351 2352 /* start with an empty set for [has canonical decomposition] */ 2353 hasDecomp=new UnicodeSet(); 2354 2355 /* iterate over all ideographs and remember which canonically decompose */ 2356 UnicodeSetIterator it = new UnicodeSetIterator(set); 2357 int start, end; 2358 long norm32; 2359 2360 while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) { 2361 start=it.codepoint; 2362 end=it.codepointEnd; 2363 while(start<=end) { 2364 norm32 = getNorm32(start); 2365 if((norm32 & QC_NFD)>0) { 2366 hasDecomp.add(start); 2367 } 2368 ++start; 2369 } 2370 } 2371 2372 /* hasDecomp now contains all ideographs that decompose canonically */ 2373 nxCache[NX_CJK_COMPAT]=hasDecomp; 2374 2375 } 2376 2377 return nxCache[NX_CJK_COMPAT]; 2378 } 2379 2380 private static final synchronized UnicodeSet internalGetNXUnicode(int options) { 2381 options &= OPTIONS_UNICODE_MASK; 2382 if(options==0) { 2383 return null; 2384 } 2385 2386 if(nxCache[options]==null) { 2387 /* build a set with all code points that were not designated by the specified Unicode version */ 2388 UnicodeSet set = new UnicodeSet(); 2389 2390 switch(options) { 2391 case NormalizerBase.UNICODE_3_2: 2392 set.applyPattern("[:^Age=3.2:]"); 2393 break; 2394 default: 2395 return null; 2396 } 2397 2398 nxCache[options]=set; 2399 } 2400 2401 return nxCache[options]; 2402 } 2403 2404 /* Get a decomposition exclusion set. The data must be loaded. */ 2405 private static final synchronized UnicodeSet internalGetNX(int options) { 2406 options&=OPTIONS_SETS_MASK; 2407 2408 if(nxCache[options]==null) { 2409 /* return basic sets */ 2410 if(options==NX_HANGUL) { 2411 return internalGetNXHangul(); 2412 } 2413 if(options==NX_CJK_COMPAT) { 2414 return internalGetNXCJKCompat(); 2415 } 2416 if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) { 2417 return internalGetNXUnicode(options); 2418 } 2419 2420 /* build a set from multiple subsets */ 2421 UnicodeSet set; 2422 UnicodeSet other; 2423 2424 set=new UnicodeSet(); 2425 2426 2427 if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) { 2428 set.addAll(other); 2429 } 2430 if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) { 2431 set.addAll(other); 2432 } 2433 if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) { 2434 set.addAll(other); 2435 } 2436 2437 nxCache[options]=set; 2438 } 2439 return nxCache[options]; 2440 } 2441 2442 public static final UnicodeSet getNX(int options) { 2443 if((options&=OPTIONS_SETS_MASK)==0) { 2444 /* incoming failure, or no decomposition exclusions requested */ 2445 return null; 2446 } else { 2447 return internalGetNX(options); 2448 } 2449 } 2450 2451 private static final boolean nx_contains(UnicodeSet nx, int c) { 2452 return nx!=null && nx.contains(c); 2453 } 2454 2455 private static final boolean nx_contains(UnicodeSet nx, char c, char c2) { 2456 return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2)); 2457 } 2458 2459 /*****************************************************************************/ 2460 2461 /** 2462 * Get the canonical decomposition 2463 * sherman for ComposedCharIter 2464 */ 2465 2466 public static int getDecompose(int chars[], String decomps[]) { 2467 DecomposeArgs args = new DecomposeArgs(); 2468 int length=0; 2469 long norm32 = 0; 2470 int ch = -1; 2471 int index = 0; 2472 int i = 0; 2473 2474 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 2475 //TBD !!!! the hack code heres save us about 50ms for startup 2476 //need a better solution/lookup 2477 if (ch == 0x30ff) 2478 ch = 0xf900; 2479 else if (ch == 0x10000) 2480 ch = 0x1d15e; 2481 else if (ch == 0x1d1c1) 2482 ch = 0x2f800; 2483 2484 norm32 = NormalizerImpl.getNorm32(ch); 2485 if((norm32 & QC_NFD)!=0 && i < chars.length) { 2486 chars[i] = ch; 2487 index = decompose(norm32, args); 2488 decomps[i++] = new String(extraData,index, args.length); 2489 } 2490 } 2491 return i; 2492 } 2493 2494 //------------------------------------------------------ 2495 // special method for Collation 2496 //------------------------------------------------------ 2497 private static boolean needSingleQuotation(char c) { 2498 return (c >= 0x0009 && c <= 0x000D) || 2499 (c >= 0x0020 && c <= 0x002F) || 2500 (c >= 0x003A && c <= 0x0040) || 2501 (c >= 0x005B && c <= 0x0060) || 2502 (c >= 0x007B && c <= 0x007E); 2503 } 2504 2505 public static String canonicalDecomposeWithSingleQuotation(String string) { 2506 char[] src = string.toCharArray(); 2507 int srcIndex = 0; 2508 int srcLimit = src.length; 2509 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 2510 int destIndex = 0; 2511 int destLimit = dest.length; 2512 2513 char[] buffer = new char[3]; 2514 int prevSrc; 2515 long norm32; 2516 int ccOrQCMask; 2517 int qcMask = QC_NFD; 2518 int reorderStartIndex, length; 2519 char c, c2; 2520 char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE]; 2521 int cc, prevCC, trailCC; 2522 char[] p; 2523 int pStart; 2524 2525 2526 // initialize 2527 ccOrQCMask = CC_MASK | qcMask; 2528 reorderStartIndex = 0; 2529 prevCC = 0; 2530 norm32 = 0; 2531 c = 0; 2532 pStart = 0; 2533 2534 cc = trailCC = -1; // initialize to bogus value 2535 for(;;) { 2536 prevSrc=srcIndex; 2537 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 2538 while (srcIndex != srcLimit && 2539 (( c = src[srcIndex]) < minNoMaybe || 2540 ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || 2541 ( c >= '\uac00' && c <= '\ud7a3'))){ 2542 2543 prevCC = 0; 2544 ++srcIndex; 2545 } 2546 2547 // copy these code units all at once 2548 if (srcIndex != prevSrc) { 2549 length = srcIndex - prevSrc; 2550 if ((destIndex + length) <= destLimit) { 2551 System.arraycopy(src,prevSrc,dest,destIndex,length); 2552 } 2553 2554 destIndex += length; 2555 reorderStartIndex = destIndex; 2556 } 2557 2558 // end of source reached? 2559 if(srcIndex == srcLimit) { 2560 break; 2561 } 2562 // c already contains *src and norm32 is set for it, increment src 2563 ++srcIndex; 2564 2565 if(isNorm32Regular(norm32)) { 2566 c2 = 0; 2567 length = 1; 2568 } else { 2569 // c is a lead surrogate, get the real norm32 2570 if(srcIndex != srcLimit && 2571 Character.isLowSurrogate(c2 = src[srcIndex])) { 2572 ++srcIndex; 2573 length = 2; 2574 norm32 = getNorm32FromSurrogatePair(norm32, c2); 2575 } else { 2576 c2 = 0; 2577 length = 1; 2578 norm32 = 0; 2579 } 2580 } 2581 2582 // get the decomposition and the lead and trail cc's 2583 if((norm32 & qcMask) == 0) { 2584 // c does not decompose 2585 cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT)); 2586 p = null; 2587 pStart = -1; 2588 } else { 2589 DecomposeArgs arg = new DecomposeArgs(); 2590 // c decomposes, get everything from the variable-length 2591 // extra data 2592 pStart = decompose(norm32, qcMask, arg); 2593 p = extraData; 2594 length = arg.length; 2595 cc = arg.cc; 2596 trailCC = arg.trailCC; 2597 if(length == 1) { 2598 // fastpath a single code unit from decomposition 2599 c = p[pStart]; 2600 c2 = 0; 2601 p = null; 2602 pStart = -1; 2603 } 2604 } 2605 2606 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 2607 // buffer overflow 2608 char[] tmpBuf = new char[destLimit * 2]; 2609 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 2610 dest = tmpBuf; 2611 destLimit = dest.length; 2612 } 2613 // append the decomposition to the destination buffer, assume length>0 2614 { 2615 int reorderSplit = destIndex; 2616 if(p == null) { 2617 // fastpath: single code point 2618 if (needSingleQuotation(c)) { 2619 //if we need single quotation, no need to consider "prevCC" 2620 //and it must NOT be a supplementary pair 2621 dest[destIndex++] = '\''; 2622 dest[destIndex++] = c; 2623 dest[destIndex++] = '\''; 2624 trailCC = 0; 2625 } else if(cc != 0 && cc < prevCC) { 2626 // (c, c2) is out of order with respect to the preceding 2627 // text 2628 destIndex += length; 2629 trailCC = insertOrdered(dest,reorderStartIndex, 2630 reorderSplit, destIndex, c, c2, cc); 2631 } else { 2632 // just append (c, c2) 2633 dest[destIndex++] = c; 2634 if(c2 != 0) { 2635 dest[destIndex++] = c2; 2636 } 2637 } 2638 } else { 2639 // general: multiple code points (ordered by themselves) 2640 // from decomposition 2641 if (needSingleQuotation(p[pStart])) { 2642 dest[destIndex++] = '\''; 2643 dest[destIndex++] = p[pStart++]; 2644 dest[destIndex++] = '\''; 2645 length--; 2646 do { 2647 dest[destIndex++] = p[pStart++]; 2648 } while(--length > 0); 2649 } else 2650 if(cc != 0 && cc < prevCC) { 2651 destIndex += length; 2652 trailCC = mergeOrdered(dest,reorderStartIndex, 2653 reorderSplit,p, pStart,pStart+length); 2654 } else { 2655 // just append the decomposition 2656 do { 2657 dest[destIndex++] = p[pStart++]; 2658 } while(--length > 0); 2659 } 2660 } 2661 } 2662 prevCC = trailCC; 2663 if(prevCC == 0) { 2664 reorderStartIndex = destIndex; 2665 } 2666 } 2667 return new String(dest, 0, destIndex); 2668 } 2669 2670 //------------------------------------------------------ 2671 // mapping method for IDNA/StringPrep 2672 //------------------------------------------------------ 2673 2674 /* 2675 * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode 2676 * 3.2 normalization with Corrigendum 4 corrections. However, normalization 2677 * without the corrections is necessary for IDNA/StringPrep support. 2678 * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option 2679 * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five 2680 * characters in Corrigendum 4 before normalization in order to avoid 2681 * incorrect normalization. 2682 * For the Corrigendum 4 issue, refer 2683 * http://www.unicode.org/versions/corrigendum4.html 2684 */ 2685 2686 /* 2687 * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL. 2688 */ 2689 public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000; 2690 2691 private static final char[][] corrigendum4MappingTable = { 2692 {'\uD844', '\uDF6A'}, // 0x2F868 2693 {'\u5F33'}, // 0x2F874 2694 {'\u43AB'}, // 0x2F91F 2695 {'\u7AAE'}, // 0x2F95F 2696 {'\u4D57'}}; // 0x2F9BF 2697 2698 /* 2699 * Removing Corrigendum 4 fix 2700 * @return normalized text 2701 */ 2702 public static String convert(String str) { 2703 if (str == null) { 2704 return null; 2705 } 2706 2707 int ch = UCharacterIterator.DONE; 2708 StringBuffer dest = new StringBuffer(); 2709 UCharacterIterator iter = UCharacterIterator.getInstance(str); 2710 2711 while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 2712 switch (ch) { 2713 case 0x2F868: 2714 dest.append(corrigendum4MappingTable[0]); 2715 break; 2716 case 0x2F874: 2717 dest.append(corrigendum4MappingTable[1]); 2718 break; 2719 case 0x2F91F: 2720 dest.append(corrigendum4MappingTable[2]); 2721 break; 2722 case 0x2F95F: 2723 dest.append(corrigendum4MappingTable[3]); 2724 break; 2725 case 0x2F9BF: 2726 dest.append(corrigendum4MappingTable[4]); 2727 break; 2728 default: 2729 UTF16.append(dest,ch); 2730 break; 2731 } 2732 } 2733 2734 return dest.toString(); 2735 } 2736 }