1 /*
2 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
128 public int length() { return str.length(); }
129 public int getLastCC() { return lastCC; }
130
131 public StringBuilder getStringBuilder() { return str; }
132
133 public boolean equals(CharSequence s, int start, int limit) {
134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
135 }
136
137 public void append(int c, int cc) {
138 if(lastCC<=cc || cc==0) {
139 str.appendCodePoint(c);
140 lastCC=cc;
141 if(cc<=1) {
142 reorderStart=str.length();
143 }
144 } else {
145 insert(c, cc);
146 }
147 }
148 // s must be in NFD, otherwise change the implementation.
149 public void append(CharSequence s, int start, int limit,
150 int leadCC, int trailCC) {
151 if(start==limit) {
152 return;
153 }
154 if(lastCC<=leadCC || leadCC==0) {
155 if(trailCC<=1) {
156 reorderStart=str.length()+(limit-start);
157 } else if(leadCC<=1) {
158 reorderStart=str.length()+1; // Ok if not a code point boundary.
159 }
160 str.append(s, start, limit);
161 lastCC=trailCC;
162 } else {
163 int c=Character.codePointAt(s, start);
164 start+=Character.charCount(c);
165 insert(c, leadCC); // insert first code point
166 while(start<limit) {
167 c=Character.codePointAt(s, start);
168 start+=Character.charCount(c);
169 if(start<limit) {
170 // s must be in NFD, otherwise we need to use getCC().
171 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
172 } else {
173 leadCC=trailCC;
174 }
175 append(c, leadCC);
176 }
177 }
178 }
179 // The following append() methods work like C++ appendZeroCC().
180 // They assume that the cc or trailCC of their input is 0.
181 // Most of them implement Appendable interface methods.
182 @Override
183 public ReorderingBuffer append(char c) {
184 str.append(c);
185 lastCC=0;
186 reorderStart=str.length();
187 return this;
188 }
189 public void appendZeroCC(int c) {
190 str.appendCodePoint(c);
191 lastCC=0;
294 codePointLimit=codePointStart;
295 codePointStart=str.offsetByCodePoints(codePointStart, -1);
296 }
297 private int previousCC() { // Returns 0 if there is no previous character.
298 codePointLimit=codePointStart;
299 if(reorderStart>=codePointStart) {
300 return 0;
301 }
302 int c=str.codePointBefore(codePointStart);
303 codePointStart-=Character.charCount(c);
304 return impl.getCCFromYesOrMaybeCP(c);
305 }
306 private int codePointStart, codePointLimit;
307 }
308
309 // TODO: Propose as public API on the UTF16 class.
310 // TODO: Propose widening UTF16 methods that take char to take int.
311 // TODO: Propose widening UTF16 methods that take String to take CharSequence.
312 public static final class UTF16Plus {
313 /**
314 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
315 * is it a lead surrogate?
316 * @param c code unit or code point
317 * @return true or false
318 */
319 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
320
321 /**
322 * Compares two CharSequence subsequences for binary equality.
323 * @param s1 first sequence
324 * @param start1 start offset in first sequence
325 * @param limit1 limit offset in first sequence
326 * @param s2 second sequence
327 * @param start2 start offset in second sequence
328 * @param limit2 limit offset in second sequence
329 * @return true if s1.subSequence(start1, limit1) contains the same text
330 * as s2.subSequence(start2, limit2)
331 */
332 public static boolean equal(CharSequence s1, int start1, int limit1,
333 CharSequence s2, int start2, int limit2) {
334 if((limit1-start1)!=(limit2-start2)) {
335 return false;
336 }
337 if(s1==s2 && start1==start2) {
338 return true;
339 }
340 while(start1<limit1) {
341 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
342 return false;
343 }
344 }
345 return true;
346 }
347 }
348
349 public NormalizerImpl() {}
350
351 private static final class IsAcceptable implements ICUBinary.Authenticate {
352 public boolean isDataVersionAcceptable(byte version[]) {
353 return version[0]==3;
354 }
355 }
356 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
357 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
358
359 public NormalizerImpl load(ByteBuffer bytes) {
360 try {
361 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
362 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
363 if(indexesLength<=IX_MIN_LCCC_CP) {
364 throw new InternalError("Normalizer2 data: not enough indexes");
365 }
366 int[] inIndexes=new int[indexesLength];
367 inIndexes[0]=indexesLength*4;
368 for(int i=1; i<indexesLength; ++i) {
369 inIndexes[i]=bytes.getInt();
370 }
371
372 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
373 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
374 minLcccCP=inIndexes[IX_MIN_LCCC_CP];
375
376 minYesNo=inIndexes[IX_MIN_YES_NO];
377 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
378 minNoNo=inIndexes[IX_MIN_NO_NO];
379 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
380 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
381 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
382 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
383 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
384 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields
385 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
386
387 // Read the normTrie.
388 int offset=inIndexes[IX_NORM_TRIE_OFFSET];
389 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
390 normTrie=Trie2_16.createFromSerialized(bytes);
391 int trieLength=normTrie.getSerializedLength();
392 if(trieLength>(nextOffset-offset)) {
393 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
394 }
395 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
396
397 // Read the composition and mapping data.
398 offset=nextOffset;
399 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
400 int numChars=(nextOffset-offset)/2;
401 char[] chars;
402 if(numChars!=0) {
403 chars=new char[numChars];
404 for(int i=0; i<numChars; ++i) {
405 chars[i]=bytes.getChar();
406 }
407 maybeYesCompositions=new String(chars);
408 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
409 }
410
411 // smallFCD: new in formatVersion 2
412 offset=nextOffset;
413 smallFCD=new byte[0x100];
414 bytes.get(smallFCD);
415
416 return this;
417 } catch(IOException e) {
418 throw new InternalError(e);
419 }
420 }
421 public NormalizerImpl load(String name) {
422 return load(ICUBinary.getRequiredData(name));
423 }
424
425
426 public int getNorm16(int c) { return normTrie.get(c); }
427 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
428 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
429 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
430
431 public int getCC(int norm16) {
432 if(norm16>=MIN_NORMAL_MAYBE_YES) {
433 return getCCFromNormalYesOrMaybe(norm16);
434 }
435 if(norm16<minNoNo || limitNoNo<=norm16) {
436 return 0;
437 }
438 return getCCFromNoNo(norm16);
439 }
440 public static int getCCFromNormalYesOrMaybe(int norm16) {
441 return (norm16 >> OFFSET_SHIFT) & 0xff;
442 }
443 public static int getCCFromYesOrMaybe(int norm16) {
444 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
445 }
446 public int getCCFromYesOrMaybeCP(int c) {
469 return ((bits>>((lead>>5)&7))&1)!=0;
470 }
471
472 /** Gets the FCD value from the regular normalization data. */
473 public int getFCD16FromNormData(int c) {
474 int norm16=getNorm16(c);
475 if (norm16 >= limitNoNo) {
476 if(norm16>=MIN_NORMAL_MAYBE_YES) {
477 // combining mark
478 norm16=getCCFromNormalYesOrMaybe(norm16);
479 return norm16|(norm16<<8);
480 } else if(norm16>=minMaybeYes) {
481 return 0;
482 } else { // isDecompNoAlgorithmic(norm16)
483 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
484 if (deltaTrailCC <= DELTA_TCCC_1) {
485 return deltaTrailCC >> OFFSET_SHIFT;
486 }
487 // Maps to an isCompYesAndZeroCC.
488 c=mapAlgorithmic(c, norm16);
489 norm16=getNorm16(c);
490 }
491 }
492 if(norm16<=minYesNo || isHangulLVT(norm16)) {
493 // no decomposition or Hangul syllable, all zeros
494 return 0;
495 }
496 // c decomposes, get everything from the variable-length extra data
497 int mapping=norm16>>OFFSET_SHIFT;
498 int firstUnit=extraData.charAt(mapping);
499 int fcd16=firstUnit>>8; // tccc
500 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
501 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc
502 }
503 return fcd16;
504 }
505
506 /**
507 * Gets the decomposition for one code point.
508 * @param c code point
509 * @return c's decomposition, if it has one; returns null if it does not have a decomposition
510 */
511 public String getDecomposition(int c) {
512 int norm16;
513 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
514 // c does not decompose
515 return null;
516 }
517 int decomp = -1;
518 if(isDecompNoAlgorithmic(norm16)) {
519 // Maps to an isCompYesAndZeroCC.
520 decomp=c=mapAlgorithmic(c, norm16);
521 // The mapping might decompose further.
522 norm16 = getNorm16(c);
523 }
524 if (norm16 < minYesNo) {
525 if(decomp<0) {
526 return null;
527 } else {
528 return UTF16.valueOf(decomp);
529 }
530 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
531 // Hangul syllable: decompose algorithmically
532 StringBuilder buffer=new StringBuilder();
533 Hangul.decompose(c, buffer);
534 return buffer.toString();
535 }
536 // c decomposes, get everything from the variable-length extra data
537 int mapping=norm16>>OFFSET_SHIFT;
538 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
539 return extraData.substring(mapping, mapping+length);
540 }
541
542 // Fixed norm16 values.
624
625 // Dual functionality:
626 // buffer!=NULL: normalize
627 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
628 public int decompose(CharSequence s, int src, int limit,
629 ReorderingBuffer buffer) {
630 int minNoCP=minDecompNoCP;
631
632 int prevSrc;
633 int c=0;
634 int norm16=0;
635
636 // only for quick check
637 int prevBoundary=src;
638 int prevCC=0;
639
640 for(;;) {
641 // count code units below the minimum or with irrelevant data for the quick check
642 for(prevSrc=src; src!=limit;) {
643 if( (c=s.charAt(src))<minNoCP ||
644 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
645 ) {
646 ++src;
647 } else if(!UTF16.isSurrogate((char)c)) {
648 break;
649 } else {
650 char c2;
651 if(UTF16Plus.isSurrogateLead(c)) {
652 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
653 c=Character.toCodePoint((char)c, c2);
654 }
655 } else /* trail surrogate */ {
656 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
657 --src;
658 c=Character.toCodePoint(c2, (char)c);
659 }
660 }
661 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
662 src+=Character.charCount(c);
663 } else {
664 break;
665 }
666 }
667 }
668 // copy these code units all at once
669 if(src!=prevSrc) {
670 if(buffer!=null) {
671 buffer.flushAndAppendZeroCC(s, prevSrc, src);
672 } else {
673 prevCC=0;
674 prevBoundary=src;
675 }
676 }
677 if(src==limit) {
678 break;
679 }
680
681 // Check one above-minimum, relevant code point.
682 src+=Character.charCount(c);
683 if(buffer!=null) {
684 decompose(c, norm16, buffer);
685 } else {
704 return;
705 }
706 if(doDecompose) {
707 decompose(s, 0, limit, buffer);
708 return;
709 }
710 // Just merge the strings at the boundary.
711 int c=Character.codePointAt(s, 0);
712 int src=0;
713 int firstCC, prevCC, cc;
714 firstCC=prevCC=cc=getCC(getNorm16(c));
715 while(cc!=0) {
716 prevCC=cc;
717 src+=Character.charCount(c);
718 if(src>=limit) {
719 break;
720 }
721 c=Character.codePointAt(s, src);
722 cc=getCC(getNorm16(c));
723 };
724 buffer.append(s, 0, src, firstCC, prevCC);
725 buffer.append(s, src, limit);
726 }
727
728 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
729 // doCompose: normalize
730 // !doCompose: isNormalized (buffer must be empty and initialized)
731 public boolean compose(CharSequence s, int src, int limit,
732 boolean onlyContiguous,
733 boolean doCompose,
734 ReorderingBuffer buffer) {
735 int prevBoundary=src;
736 int minNoMaybeCP=minCompNoMaybeCP;
737
738 for (;;) {
739 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
740 // or with (compYes && ccc==0) properties.
741 int prevSrc;
742 int c = 0;
743 int norm16 = 0;
744 for (;;) {
745 if (src == limit) {
746 if (prevBoundary != limit && doCompose) {
747 buffer.append(s, prevBoundary, limit);
748 }
749 return true;
750 }
751 if( (c=s.charAt(src))<minNoMaybeCP ||
752 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
753 ) {
754 ++src;
755 } else {
756 prevSrc = src++;
757 if(!UTF16.isSurrogate((char)c)) {
758 break;
759 } else {
760 char c2;
761 if(UTF16Plus.isSurrogateLead(c)) {
762 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
763 ++src;
764 c=Character.toCodePoint((char)c, c2);
765 }
766 } else /* trail surrogate */ {
767 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
768 --prevSrc;
769 c=Character.toCodePoint(c2, (char)c);
770 }
771 }
772 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
773 break;
774 }
775 }
776 }
777 }
778 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
779 // The current character is either a "noNo" (has a mapping)
780 // or a "maybeYes" (combines backward)
781 // or a "yesYes" with ccc!=0.
782 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
783
784 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
785 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes
786 if (!doCompose) {
787 return false;
788 }
789 // Fast path for mapping a character that is immediately surrounded by boundaries.
790 // In this case, we need not decompose around the current character.
791 if (isDecompNoAlgorithmic(norm16)) {
792 // Maps to a single isCompYesAndZeroCC character
793 // which also implies hasCompBoundaryBefore.
794 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
795 hasCompBoundaryBefore(s, src, limit)) {
796 if (prevBoundary != prevSrc) {
797 buffer.append(s, prevBoundary, prevSrc);
974 * bit 0: set if "maybe"; otherwise, if the span length<s.length()
975 * then the quick check result is "no"
976 */
977 public int composeQuickCheck(CharSequence s, int src, int limit,
978 boolean onlyContiguous, boolean doSpan) {
979 int qcResult=0;
980 int prevBoundary=src;
981 int minNoMaybeCP=minCompNoMaybeCP;
982
983 for(;;) {
984 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
985 // or with (compYes && ccc==0) properties.
986 int prevSrc;
987 int c = 0;
988 int norm16 = 0;
989 for (;;) {
990 if(src==limit) {
991 return (src<<1)|qcResult; // "yes" or "maybe"
992 }
993 if( (c=s.charAt(src))<minNoMaybeCP ||
994 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
995 ) {
996 ++src;
997 } else {
998 prevSrc = src++;
999 if(!UTF16.isSurrogate((char)c)) {
1000 break;
1001 } else {
1002 char c2;
1003 if(UTF16Plus.isSurrogateLead(c)) {
1004 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005 ++src;
1006 c=Character.toCodePoint((char)c, c2);
1007 }
1008 } else /* trail surrogate */ {
1009 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010 --prevSrc;
1011 c=Character.toCodePoint(c2, (char)c);
1012 }
1013 }
1014 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1015 break;
1016 }
1017 }
1018 }
1019 }
1020 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021 // The current character is either a "noNo" (has a mapping)
1022 // or a "maybeYes" (combines backward)
1023 // or a "yesYes" with ccc!=0.
1024 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025
1026 int prevNorm16 = INERT;
1027 if (prevBoundary != prevSrc) {
1028 prevBoundary = prevSrc;
1029 if (!norm16HasCompBoundaryBefore(norm16)) {
1030 c = Character.codePointBefore(s, prevSrc);
1031 int n16 = getNorm16(c);
1032 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033 prevBoundary -= Character.charCount(c);
1034 prevNorm16 = n16;
1035 }
1036 }
1037 }
1038
1039 if(isMaybeOrNonZeroCC(norm16)) {
1117 // in the normal way.
1118
1119 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120 // Similar to the prevBoundary in the compose() implementation.
1121 int prevBoundary=src;
1122 int prevSrc;
1123 int c=0;
1124 int prevFCD16=0;
1125 int fcd16=0;
1126
1127 for(;;) {
1128 // count code units with lccc==0
1129 for(prevSrc=src; src!=limit;) {
1130 if((c=s.charAt(src))<minLcccCP) {
1131 prevFCD16=~c;
1132 ++src;
1133 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134 prevFCD16=0;
1135 ++src;
1136 } else {
1137 if(UTF16.isSurrogate((char)c)) {
1138 char c2;
1139 if(UTF16Plus.isSurrogateLead(c)) {
1140 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141 c=Character.toCodePoint((char)c, c2);
1142 }
1143 } else /* trail surrogate */ {
1144 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145 --src;
1146 c=Character.toCodePoint(c2, (char)c);
1147 }
1148 }
1149 }
1150 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151 prevFCD16=fcd16;
1152 src+=Character.charCount(c);
1153 } else {
1154 break;
1155 }
1156 }
1157 }
1158 // copy these code units all at once
1159 if(src!=prevSrc) {
1160 if(src==limit) {
1161 if(buffer!=null) {
1162 buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163 }
1164 break;
1165 }
1166 prevBoundary=src;
1167 // We know that the previous character's lccc==0.
1413 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414 return src;
1415 }
1416 src+=Character.charCount(c);
1417 decompose(c, norm16, buffer);
1418 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419 return src;
1420 }
1421 }
1422 return src;
1423 }
1424 private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1425 // get the decomposition and the lead and trail cc's
1426 if (norm16 >= limitNoNo) {
1427 if (isMaybeOrNonZeroCC(norm16)) {
1428 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429 return;
1430 }
1431 // Maps to an isCompYesAndZeroCC.
1432 c=mapAlgorithmic(c, norm16);
1433 norm16=getNorm16(c);
1434 }
1435 if (norm16 < minYesNo) {
1436 // c does not decompose
1437 buffer.append(c, 0);
1438 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439 // Hangul syllable: decompose algorithmically
1440 Hangul.decompose(c, buffer);
1441 } else {
1442 // c decomposes, get everything from the variable-length extra data
1443 int mapping=norm16>>OFFSET_SHIFT;
1444 int firstUnit=extraData.charAt(mapping);
1445 int length=firstUnit&MAPPING_LENGTH_MASK;
1446 int leadCC, trailCC;
1447 trailCC=firstUnit>>8;
1448 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449 leadCC=extraData.charAt(mapping-1)>>8;
1450 } else {
1451 leadCC=0;
1452 }
1453 ++mapping; // skip over the firstUnit
1454 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1455 }
1456 }
1457
1458 /**
1459 * Finds the recomposition result for
1460 * a forward-combining "lead" character,
1461 * specified with a pointer to its compositions list,
1462 * and a backward-combining "trail" character.
1463 *
1464 * <p>If the lead and trail characters combine, then this function returns
1465 * the following "compositeAndFwd" value:
1466 * <pre>
1467 * Bits 21..1 composite character
1468 * Bit 0 set if the composite is a forward-combining starter
1469 * </pre>
1470 * otherwise it returns -1.
1471 *
1472 * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473 * encoded as either pairs or triples of 16-bit units.
1474 * The last entry has the high bit of its first unit set.
1626 } else if(composite>0xffff) {
1627 // The composite is longer than the starter,
1628 // move the intermediate characters back one.
1629 starterIsSupplementary=true;
1630 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1631 sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1632 ++p;
1633 } else {
1634 // both are on the BMP
1635 sb.setCharAt(starter, (char)composite);
1636 }
1637
1638 // Keep prevCC because we removed the combining mark.
1639
1640 if(p==sb.length()) {
1641 break;
1642 }
1643 // Is the composite a starter that combines forward?
1644 if((compositeAndFwd&1)!=0) {
1645 compositionsList=
1646 getCompositionsListForComposite(getNorm16(composite));
1647 } else {
1648 compositionsList=-1;
1649 }
1650
1651 // We combined; continue with looking for compositions.
1652 continue;
1653 }
1654 }
1655
1656 // no combination this time
1657 prevCC=cc;
1658 if(p==sb.length()) {
1659 break;
1660 }
1661
1662 // If c did not combine, then check if it is a starter.
1663 if(cc==0) {
1664 // Found a new starter.
1665 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1666 // It may combine with something, prepare for it.
2179 }
2180
2181 private VersionInfo dataVersion;
2182
2183 // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2184 private int minDecompNoCP;
2185 private int minCompNoMaybeCP;
2186 private int minLcccCP;
2187
2188 // Norm16 value thresholds for quick check combinations and types of extra data.
2189 private int minYesNo;
2190 private int minYesNoMappingsOnly;
2191 private int minNoNo;
2192 private int minNoNoCompBoundaryBefore;
2193 private int minNoNoCompNoMaybeCC;
2194 private int minNoNoEmpty;
2195 private int limitNoNo;
2196 private int centerNoNoDelta;
2197 private int minMaybeYes;
2198
2199 private Trie2_16 normTrie;
2200 private String maybeYesCompositions;
2201 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
2202 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2203
2204 }
|
1 /*
2 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
128 public int length() { return str.length(); }
129 public int getLastCC() { return lastCC; }
130
131 public StringBuilder getStringBuilder() { return str; }
132
133 public boolean equals(CharSequence s, int start, int limit) {
134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
135 }
136
137 public void append(int c, int cc) {
138 if(lastCC<=cc || cc==0) {
139 str.appendCodePoint(c);
140 lastCC=cc;
141 if(cc<=1) {
142 reorderStart=str.length();
143 }
144 } else {
145 insert(c, cc);
146 }
147 }
148 public void append(CharSequence s, int start, int limit, boolean isNFD,
149 int leadCC, int trailCC) {
150 if(start==limit) {
151 return;
152 }
153 if(lastCC<=leadCC || leadCC==0) {
154 if(trailCC<=1) {
155 reorderStart=str.length()+(limit-start);
156 } else if(leadCC<=1) {
157 reorderStart=str.length()+1; // Ok if not a code point boundary.
158 }
159 str.append(s, start, limit);
160 lastCC=trailCC;
161 } else {
162 int c=Character.codePointAt(s, start);
163 start+=Character.charCount(c);
164 insert(c, leadCC); // insert first code point
165 while(start<limit) {
166 c=Character.codePointAt(s, start);
167 start+=Character.charCount(c);
168 if(start<limit) {
169 if (isNFD) {
170 leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
171 } else {
172 leadCC = impl.getCC(impl.getNorm16(c));
173 }
174 } else {
175 leadCC=trailCC;
176 }
177 append(c, leadCC);
178 }
179 }
180 }
181 // The following append() methods work like C++ appendZeroCC().
182 // They assume that the cc or trailCC of their input is 0.
183 // Most of them implement Appendable interface methods.
184 @Override
185 public ReorderingBuffer append(char c) {
186 str.append(c);
187 lastCC=0;
188 reorderStart=str.length();
189 return this;
190 }
191 public void appendZeroCC(int c) {
192 str.appendCodePoint(c);
193 lastCC=0;
296 codePointLimit=codePointStart;
297 codePointStart=str.offsetByCodePoints(codePointStart, -1);
298 }
299 private int previousCC() { // Returns 0 if there is no previous character.
300 codePointLimit=codePointStart;
301 if(reorderStart>=codePointStart) {
302 return 0;
303 }
304 int c=str.codePointBefore(codePointStart);
305 codePointStart-=Character.charCount(c);
306 return impl.getCCFromYesOrMaybeCP(c);
307 }
308 private int codePointStart, codePointLimit;
309 }
310
311 // TODO: Propose as public API on the UTF16 class.
312 // TODO: Propose widening UTF16 methods that take char to take int.
313 // TODO: Propose widening UTF16 methods that take String to take CharSequence.
314 public static final class UTF16Plus {
315 /**
316 * Is this code point a lead surrogate (U+d800..U+dbff)?
317 * @param c code unit or code point
318 * @return true or false
319 */
320 public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
321 /**
322 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
323 * is it a lead surrogate?
324 * @param c code unit or code point
325 * @return true or false
326 */
327 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
328
329 /**
330 * Compares two CharSequence subsequences for binary equality.
331 * @param s1 first sequence
332 * @param start1 start offset in first sequence
333 * @param limit1 limit offset in first sequence
334 * @param s2 second sequence
335 * @param start2 start offset in second sequence
336 * @param limit2 limit offset in second sequence
337 * @return true if s1.subSequence(start1, limit1) contains the same text
338 * as s2.subSequence(start2, limit2)
339 */
340 public static boolean equal(CharSequence s1, int start1, int limit1,
341 CharSequence s2, int start2, int limit2) {
342 if((limit1-start1)!=(limit2-start2)) {
343 return false;
344 }
345 if(s1==s2 && start1==start2) {
346 return true;
347 }
348 while(start1<limit1) {
349 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
350 return false;
351 }
352 }
353 return true;
354 }
355 }
356
357 public NormalizerImpl() {}
358
359 private static final class IsAcceptable implements ICUBinary.Authenticate {
360 public boolean isDataVersionAcceptable(byte version[]) {
361 return version[0]==4;
362 }
363 }
364 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
365 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
366
367 public NormalizerImpl load(ByteBuffer bytes) {
368 try {
369 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
370 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
371 if(indexesLength<=IX_MIN_LCCC_CP) {
372 throw new InternalError("Normalizer2 data: not enough indexes");
373 }
374 int[] inIndexes=new int[indexesLength];
375 inIndexes[0]=indexesLength*4;
376 for(int i=1; i<indexesLength; ++i) {
377 inIndexes[i]=bytes.getInt();
378 }
379
380 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
381 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
382 minLcccCP=inIndexes[IX_MIN_LCCC_CP];
383
384 minYesNo=inIndexes[IX_MIN_YES_NO];
385 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
386 minNoNo=inIndexes[IX_MIN_NO_NO];
387 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
388 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
389 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
390 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
391 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
392 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields
393 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
394
395 // Read the normTrie.
396 int offset=inIndexes[IX_NORM_TRIE_OFFSET];
397 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
398 int triePosition = bytes.position();
399 normTrie = CodePointTrie.Fast16.fromBinary(bytes);
400 int trieLength = bytes.position() - triePosition;
401 if(trieLength>(nextOffset-offset)) {
402 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
403 }
404 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
405
406 // Read the composition and mapping data.
407 offset=nextOffset;
408 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
409 int numChars=(nextOffset-offset)/2;
410 if(numChars!=0) {
411 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
412 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
413 }
414
415 // smallFCD: new in formatVersion 2
416 offset=nextOffset;
417 smallFCD=new byte[0x100];
418 bytes.get(smallFCD);
419
420 return this;
421 } catch(IOException e) {
422 throw new InternalError(e);
423 }
424 }
425 public NormalizerImpl load(String name) {
426 return load(ICUBinary.getRequiredData(name));
427 }
428
429 // The trie stores values for lead surrogate code *units*.
430 // Surrogate code *points* are inert.
431 public int getNorm16(int c) {
432 return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
433 }
434 public int getRawNorm16(int c) { return normTrie.get(c); }
435 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
436 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
437 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
438
439 public int getCC(int norm16) {
440 if(norm16>=MIN_NORMAL_MAYBE_YES) {
441 return getCCFromNormalYesOrMaybe(norm16);
442 }
443 if(norm16<minNoNo || limitNoNo<=norm16) {
444 return 0;
445 }
446 return getCCFromNoNo(norm16);
447 }
448 public static int getCCFromNormalYesOrMaybe(int norm16) {
449 return (norm16 >> OFFSET_SHIFT) & 0xff;
450 }
451 public static int getCCFromYesOrMaybe(int norm16) {
452 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
453 }
454 public int getCCFromYesOrMaybeCP(int c) {
477 return ((bits>>((lead>>5)&7))&1)!=0;
478 }
479
480 /** Gets the FCD value from the regular normalization data. */
481 public int getFCD16FromNormData(int c) {
482 int norm16=getNorm16(c);
483 if (norm16 >= limitNoNo) {
484 if(norm16>=MIN_NORMAL_MAYBE_YES) {
485 // combining mark
486 norm16=getCCFromNormalYesOrMaybe(norm16);
487 return norm16|(norm16<<8);
488 } else if(norm16>=minMaybeYes) {
489 return 0;
490 } else { // isDecompNoAlgorithmic(norm16)
491 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
492 if (deltaTrailCC <= DELTA_TCCC_1) {
493 return deltaTrailCC >> OFFSET_SHIFT;
494 }
495 // Maps to an isCompYesAndZeroCC.
496 c=mapAlgorithmic(c, norm16);
497 norm16=getRawNorm16(c);
498 }
499 }
500 if(norm16<=minYesNo || isHangulLVT(norm16)) {
501 // no decomposition or Hangul syllable, all zeros
502 return 0;
503 }
504 // c decomposes, get everything from the variable-length extra data
505 int mapping=norm16>>OFFSET_SHIFT;
506 int firstUnit=extraData.charAt(mapping);
507 int fcd16=firstUnit>>8; // tccc
508 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
509 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc
510 }
511 return fcd16;
512 }
513
514 /**
515 * Gets the decomposition for one code point.
516 * @param c code point
517 * @return c's decomposition, if it has one; returns null if it does not have a decomposition
518 */
519 public String getDecomposition(int c) {
520 int norm16;
521 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
522 // c does not decompose
523 return null;
524 }
525 int decomp = -1;
526 if(isDecompNoAlgorithmic(norm16)) {
527 // Maps to an isCompYesAndZeroCC.
528 decomp=c=mapAlgorithmic(c, norm16);
529 // The mapping might decompose further.
530 norm16 = getRawNorm16(c);
531 }
532 if (norm16 < minYesNo) {
533 if(decomp<0) {
534 return null;
535 } else {
536 return UTF16.valueOf(decomp);
537 }
538 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
539 // Hangul syllable: decompose algorithmically
540 StringBuilder buffer=new StringBuilder();
541 Hangul.decompose(c, buffer);
542 return buffer.toString();
543 }
544 // c decomposes, get everything from the variable-length extra data
545 int mapping=norm16>>OFFSET_SHIFT;
546 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
547 return extraData.substring(mapping, mapping+length);
548 }
549
550 // Fixed norm16 values.
632
633 // Dual functionality:
634 // buffer!=NULL: normalize
635 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
636 public int decompose(CharSequence s, int src, int limit,
637 ReorderingBuffer buffer) {
638 int minNoCP=minDecompNoCP;
639
640 int prevSrc;
641 int c=0;
642 int norm16=0;
643
644 // only for quick check
645 int prevBoundary=src;
646 int prevCC=0;
647
648 for(;;) {
649 // count code units below the minimum or with irrelevant data for the quick check
650 for(prevSrc=src; src!=limit;) {
651 if( (c=s.charAt(src))<minNoCP ||
652 isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
653 ) {
654 ++src;
655 } else if(!UTF16Plus.isLeadSurrogate(c)) {
656 break;
657 } else {
658 char c2;
659 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
660 c = Character.toCodePoint((char)c, c2);
661 norm16 = normTrie.suppGet(c);
662 if (isMostDecompYesAndZeroCC(norm16)) {
663 src += 2;
664 } else {
665 break;
666 }
667 } else {
668 ++src; // unpaired lead surrogate: inert
669 }
670 }
671 }
672 // copy these code units all at once
673 if(src!=prevSrc) {
674 if(buffer!=null) {
675 buffer.flushAndAppendZeroCC(s, prevSrc, src);
676 } else {
677 prevCC=0;
678 prevBoundary=src;
679 }
680 }
681 if(src==limit) {
682 break;
683 }
684
685 // Check one above-minimum, relevant code point.
686 src+=Character.charCount(c);
687 if(buffer!=null) {
688 decompose(c, norm16, buffer);
689 } else {
708 return;
709 }
710 if(doDecompose) {
711 decompose(s, 0, limit, buffer);
712 return;
713 }
714 // Just merge the strings at the boundary.
715 int c=Character.codePointAt(s, 0);
716 int src=0;
717 int firstCC, prevCC, cc;
718 firstCC=prevCC=cc=getCC(getNorm16(c));
719 while(cc!=0) {
720 prevCC=cc;
721 src+=Character.charCount(c);
722 if(src>=limit) {
723 break;
724 }
725 c=Character.codePointAt(s, src);
726 cc=getCC(getNorm16(c));
727 };
728 buffer.append(s, 0, src, false, firstCC, prevCC);
729 buffer.append(s, src, limit);
730 }
731
732 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
733 // doCompose: normalize
734 // !doCompose: isNormalized (buffer must be empty and initialized)
735 public boolean compose(CharSequence s, int src, int limit,
736 boolean onlyContiguous,
737 boolean doCompose,
738 ReorderingBuffer buffer) {
739 int prevBoundary=src;
740 int minNoMaybeCP=minCompNoMaybeCP;
741
742 for (;;) {
743 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
744 // or with (compYes && ccc==0) properties.
745 int prevSrc;
746 int c = 0;
747 int norm16 = 0;
748 for (;;) {
749 if (src == limit) {
750 if (prevBoundary != limit && doCompose) {
751 buffer.append(s, prevBoundary, limit);
752 }
753 return true;
754 }
755 if( (c=s.charAt(src))<minNoMaybeCP ||
756 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
757 ) {
758 ++src;
759 } else {
760 prevSrc = src++;
761 if (!UTF16Plus.isLeadSurrogate(c)) {
762 break;
763 } else {
764 char c2;
765 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
766 ++src;
767 c = Character.toCodePoint((char)c, c2);
768 norm16 = normTrie.suppGet(c);
769 if (!isCompYesAndZeroCC(norm16)) {
770 break;
771 }
772 }
773 }
774 }
775 }
776 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
777 // The current character is either a "noNo" (has a mapping)
778 // or a "maybeYes" (combines backward)
779 // or a "yesYes" with ccc!=0.
780 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
781
782 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
783 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes
784 if (!doCompose) {
785 return false;
786 }
787 // Fast path for mapping a character that is immediately surrounded by boundaries.
788 // In this case, we need not decompose around the current character.
789 if (isDecompNoAlgorithmic(norm16)) {
790 // Maps to a single isCompYesAndZeroCC character
791 // which also implies hasCompBoundaryBefore.
792 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
793 hasCompBoundaryBefore(s, src, limit)) {
794 if (prevBoundary != prevSrc) {
795 buffer.append(s, prevBoundary, prevSrc);
972 * bit 0: set if "maybe"; otherwise, if the span length<s.length()
973 * then the quick check result is "no"
974 */
975 public int composeQuickCheck(CharSequence s, int src, int limit,
976 boolean onlyContiguous, boolean doSpan) {
977 int qcResult=0;
978 int prevBoundary=src;
979 int minNoMaybeCP=minCompNoMaybeCP;
980
981 for(;;) {
982 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
983 // or with (compYes && ccc==0) properties.
984 int prevSrc;
985 int c = 0;
986 int norm16 = 0;
987 for (;;) {
988 if(src==limit) {
989 return (src<<1)|qcResult; // "yes" or "maybe"
990 }
991 if( (c=s.charAt(src))<minNoMaybeCP ||
992 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
993 ) {
994 ++src;
995 } else {
996 prevSrc = src++;
997 if (!UTF16Plus.isLeadSurrogate(c)) {
998 break;
999 } else {
1000 char c2;
1001 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
1002 ++src;
1003 c = Character.toCodePoint((char)c, c2);
1004 norm16 = normTrie.suppGet(c);
1005 if (!isCompYesAndZeroCC(norm16)) {
1006 break;
1007 }
1008 }
1009 }
1010 }
1011 }
1012 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1013 // The current character is either a "noNo" (has a mapping)
1014 // or a "maybeYes" (combines backward)
1015 // or a "yesYes" with ccc!=0.
1016 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1017
1018 int prevNorm16 = INERT;
1019 if (prevBoundary != prevSrc) {
1020 prevBoundary = prevSrc;
1021 if (!norm16HasCompBoundaryBefore(norm16)) {
1022 c = Character.codePointBefore(s, prevSrc);
1023 int n16 = getNorm16(c);
1024 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1025 prevBoundary -= Character.charCount(c);
1026 prevNorm16 = n16;
1027 }
1028 }
1029 }
1030
1031 if(isMaybeOrNonZeroCC(norm16)) {
1109 // in the normal way.
1110
1111 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1112 // Similar to the prevBoundary in the compose() implementation.
1113 int prevBoundary=src;
1114 int prevSrc;
1115 int c=0;
1116 int prevFCD16=0;
1117 int fcd16=0;
1118
1119 for(;;) {
1120 // count code units with lccc==0
1121 for(prevSrc=src; src!=limit;) {
1122 if((c=s.charAt(src))<minLcccCP) {
1123 prevFCD16=~c;
1124 ++src;
1125 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1126 prevFCD16=0;
1127 ++src;
1128 } else {
1129 if (UTF16Plus.isLeadSurrogate(c)) {
1130 char c2;
1131 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1132 c = Character.toCodePoint((char)c, c2);
1133 }
1134 }
1135 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1136 prevFCD16=fcd16;
1137 src+=Character.charCount(c);
1138 } else {
1139 break;
1140 }
1141 }
1142 }
1143 // copy these code units all at once
1144 if(src!=prevSrc) {
1145 if(src==limit) {
1146 if(buffer!=null) {
1147 buffer.flushAndAppendZeroCC(s, prevSrc, src);
1148 }
1149 break;
1150 }
1151 prevBoundary=src;
1152 // We know that the previous character's lccc==0.
1398 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1399 return src;
1400 }
1401 src+=Character.charCount(c);
1402 decompose(c, norm16, buffer);
1403 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1404 return src;
1405 }
1406 }
1407 return src;
1408 }
1409 private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1410 // get the decomposition and the lead and trail cc's
1411 if (norm16 >= limitNoNo) {
1412 if (isMaybeOrNonZeroCC(norm16)) {
1413 buffer.append(c, getCCFromYesOrMaybe(norm16));
1414 return;
1415 }
1416 // Maps to an isCompYesAndZeroCC.
1417 c=mapAlgorithmic(c, norm16);
1418 norm16=getRawNorm16(c);
1419 }
1420 if (norm16 < minYesNo) {
1421 // c does not decompose
1422 buffer.append(c, 0);
1423 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1424 // Hangul syllable: decompose algorithmically
1425 Hangul.decompose(c, buffer);
1426 } else {
1427 // c decomposes, get everything from the variable-length extra data
1428 int mapping=norm16>>OFFSET_SHIFT;
1429 int firstUnit=extraData.charAt(mapping);
1430 int length=firstUnit&MAPPING_LENGTH_MASK;
1431 int leadCC, trailCC;
1432 trailCC=firstUnit>>8;
1433 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1434 leadCC=extraData.charAt(mapping-1)>>8;
1435 } else {
1436 leadCC=0;
1437 }
1438 ++mapping; // skip over the firstUnit
1439 buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1440 }
1441 }
1442
1443 /**
1444 * Finds the recomposition result for
1445 * a forward-combining "lead" character,
1446 * specified with a pointer to its compositions list,
1447 * and a backward-combining "trail" character.
1448 *
1449 * <p>If the lead and trail characters combine, then this function returns
1450 * the following "compositeAndFwd" value:
1451 * <pre>
1452 * Bits 21..1 composite character
1453 * Bit 0 set if the composite is a forward-combining starter
1454 * </pre>
1455 * otherwise it returns -1.
1456 *
1457 * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1458 * encoded as either pairs or triples of 16-bit units.
1459 * The last entry has the high bit of its first unit set.
1611 } else if(composite>0xffff) {
1612 // The composite is longer than the starter,
1613 // move the intermediate characters back one.
1614 starterIsSupplementary=true;
1615 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1616 sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1617 ++p;
1618 } else {
1619 // both are on the BMP
1620 sb.setCharAt(starter, (char)composite);
1621 }
1622
1623 // Keep prevCC because we removed the combining mark.
1624
1625 if(p==sb.length()) {
1626 break;
1627 }
1628 // Is the composite a starter that combines forward?
1629 if((compositeAndFwd&1)!=0) {
1630 compositionsList=
1631 getCompositionsListForComposite(getRawNorm16(composite));
1632 } else {
1633 compositionsList=-1;
1634 }
1635
1636 // We combined; continue with looking for compositions.
1637 continue;
1638 }
1639 }
1640
1641 // no combination this time
1642 prevCC=cc;
1643 if(p==sb.length()) {
1644 break;
1645 }
1646
1647 // If c did not combine, then check if it is a starter.
1648 if(cc==0) {
1649 // Found a new starter.
1650 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1651 // It may combine with something, prepare for it.
2164 }
2165
2166 private VersionInfo dataVersion;
2167
2168 // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2169 private int minDecompNoCP;
2170 private int minCompNoMaybeCP;
2171 private int minLcccCP;
2172
2173 // Norm16 value thresholds for quick check combinations and types of extra data.
2174 private int minYesNo;
2175 private int minYesNoMappingsOnly;
2176 private int minNoNo;
2177 private int minNoNoCompBoundaryBefore;
2178 private int minNoNoCompNoMaybeCC;
2179 private int minNoNoEmpty;
2180 private int limitNoNo;
2181 private int centerNoNoDelta;
2182 private int minMaybeYes;
2183
2184 private CodePointTrie.Fast16 normTrie;
2185 private String maybeYesCompositions;
2186 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
2187 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2188 }
|