1 /*
2 * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 *******************************************************************************
28 * Copyright (C) 2009-2014, International Business Machines
29 * Corporation and others. All Rights Reserved.
30 *******************************************************************************
31 */
32
33 package sun.text.normalizer;
34
35 import java.io.IOException;
36 import java.nio.ByteBuffer;
37 import java.text.Normalizer;
38
39 // Original filename in ICU4J: Normalizer2Impl.java
40 public final class NormalizerImpl {
41
42 public static final class Hangul {
43 /* Korean Hangul and Jamo constants */
44 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
45 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
46 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
47
48 public static final int HANGUL_BASE=0xac00;
49 public static final int HANGUL_END=0xd7a3;
50
51 public static final int JAMO_L_COUNT=19;
52 public static final int JAMO_V_COUNT=21;
53 public static final int JAMO_T_COUNT=28;
54
55 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
56 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
57
58 public static boolean isHangul(int c) {
59 return HANGUL_BASE<=c && c<HANGUL_LIMIT;
60 }
61
62 public static boolean isHangulWithoutJamoT(char c) {
63 c-=HANGUL_BASE;
64 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
65 }
66
67 /**
68 * Decomposes c, which must be a Hangul syllable, into buffer
69 * and returns the length of the decomposition (2 or 3).
70 */
71 public static int decompose(int c, Appendable buffer) {
72 try {
73 c-=HANGUL_BASE;
74 int c2=c%JAMO_T_COUNT;
75 c/=JAMO_T_COUNT;
76 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
77 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
78 if(c2==0) {
79 return 2;
80 } else {
81 buffer.append((char)(JAMO_T_BASE+c2));
82 return 3;
83 }
84 } catch(IOException e) {
85 throw new InternalError(e);
86 }
87 }
88 }
89
90 /**
91 * Writable buffer that takes care of canonical ordering.
92 * Its Appendable methods behave like the C++ implementation's
93 * appendZeroCC() methods.
94 * <p>
95 * If dest is a StringBuilder, then the buffer writes directly to it.
96 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
97 * until no further changes are necessary and whole segments are appended.
98 * append() methods that take combining-class values always write to the StringBuilder.
99 * Other append() methods flush and append to the Appendable.
100 */
101 public static final class ReorderingBuffer implements Appendable {
/**
 * Creates a buffer that writes to dest.
 * If dest is a StringBuilder it is used directly as the working text
 * (and destCapacity is applied to it); otherwise a private StringBuilder
 * collects text until it is flushed to dest.
 *
 * @param ni           normalization data used to look up combining classes
 * @param dest         the destination
 * @param destCapacity minimum capacity requested when dest is a StringBuilder
 */
public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
    impl=ni;
    app=dest;
    if(!(dest instanceof StringBuilder)) {
        appIsStringBuilder=false;
        str=new StringBuilder();
        reorderStart=0;
        lastCC=0;
    } else {
        appIsStringBuilder=true;
        str=(StringBuilder)dest;
        // In Java, the constructor subsumes public void init(int destCapacity)
        str.ensureCapacity(destCapacity);
        // previousCC() reads reorderStart, so it must be 0 before iterating backward.
        reorderStart=0;
        if(str.length()!=0) {
            setIterator();
            lastCC=previousCC();
            // Set reorderStart after the last code point with cc<=1 if there is one.
            if(lastCC>1) {
                int cc;
                do {
                    cc=previousCC();
                } while(cc>1);
            }
            reorderStart=codePointLimit;
        } else {
            lastCC=0;
        }
    }
}
129
130 public boolean isEmpty() { return str.length()==0; }
131 public int length() { return str.length(); }
132 public int getLastCC() { return lastCC; }
133
134 public StringBuilder getStringBuilder() { return str; }
135
136 public boolean equals(CharSequence s, int start, int limit) {
137 return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
138 }
139
140 // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
141 public void setLastChar(char c) {
142 str.setCharAt(str.length()-1, c);
143 }
144
145 public void append(int c, int cc) {
146 if(lastCC<=cc || cc==0) {
147 str.appendCodePoint(c);
148 lastCC=cc;
149 if(cc<=1) {
150 reorderStart=str.length();
151 }
152 } else {
153 insert(c, cc);
154 }
155 }
156
157 // s must be in NFD, otherwise change the implementation.
158 public void append(CharSequence s, int start, int limit,
159 int leadCC, int trailCC) {
160 if(start==limit) {
161 return;
162 }
163 if(lastCC<=leadCC || leadCC==0) {
164 if(trailCC<=1) {
165 reorderStart=str.length()+(limit-start);
166 } else if(leadCC<=1) {
167 reorderStart=str.length()+1; // Ok if not a code point boundary.
168 }
169 str.append(s, start, limit);
170 lastCC=trailCC;
171 } else {
172 int c=Character.codePointAt(s, start);
173 start+=Character.charCount(c);
174 insert(c, leadCC); // insert first code point
175 while(start<limit) {
176 c=Character.codePointAt(s, start);
177 start+=Character.charCount(c);
178 if(start<limit) {
179 // s must be in NFD, otherwise we need to use getCC().
180 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
181 } else {
182 leadCC=trailCC;
183 }
184 append(c, leadCC);
185 }
186 }
187 }
188
189 // The following append() methods work like C++ appendZeroCC().
190 // They assume that the cc or trailCC of their input is 0.
191 // Most of them implement Appendable interface methods.
192 // @Override when we switch to Java 6
193 public ReorderingBuffer append(char c) {
194 str.append(c);
195 lastCC=0;
196 reorderStart=str.length();
197 return this;
198 }
199
200 public void appendZeroCC(int c) {
201 str.appendCodePoint(c);
202 lastCC=0;
203 reorderStart=str.length();
204 }
205
206 // @Override when we switch to Java 6
207 public ReorderingBuffer append(CharSequence s) {
208 if(s.length()!=0) {
209 str.append(s);
210 lastCC=0;
211 reorderStart=str.length();
212 }
213 return this;
214 }
215
216 // @Override when we switch to Java 6
217 public ReorderingBuffer append(CharSequence s, int start, int limit) {
218 if(start!=limit) {
219 str.append(s, start, limit);
220 lastCC=0;
221 reorderStart=str.length();
222 }
223 return this;
224 }
225
226 /**
227 * Flushes from the intermediate StringBuilder to the Appendable,
228 * if they are different objects.
229 * Used after recomposition.
230 * Must be called at the end when writing to a non-StringBuilder Appendable.
231 */
232 public void flush() {
233 if(appIsStringBuilder) {
234 reorderStart=str.length();
235 } else {
236 try {
237 app.append(str);
238 str.setLength(0);
239 reorderStart=0;
240 } catch(IOException e) {
241 throw new InternalError(e); // Avoid declaring "throws IOException".
242 }
243 }
244 lastCC=0;
245 }
246
247 /**
248 * Flushes from the intermediate StringBuilder to the Appendable,
249 * if they are different objects.
250 * Then appends the new text to the Appendable or StringBuilder.
251 * Normally used after quick check loops find a non-empty sequence.
252 */
253 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
254 if(appIsStringBuilder) {
255 str.append(s, start, limit);
256 reorderStart=str.length();
257 } else {
258 try {
259 app.append(str).append(s, start, limit);
260 str.setLength(0);
261 reorderStart=0;
262 } catch(IOException e) {
263 throw new InternalError(e); // Avoid declaring "throws IOException".
264 }
265 }
266 lastCC=0;
267 return this;
268 }
269
270 public void remove() {
271 str.setLength(0);
272 lastCC=0;
273 reorderStart=0;
274 }
275
276 public void removeSuffix(int suffixLength) {
277 int oldLength=str.length();
278 str.delete(oldLength-suffixLength, oldLength);
279 lastCC=0;
280 reorderStart=str.length();
281 }
282
283 // Inserts c somewhere before the last character.
284 // Requires 0<cc<lastCC which implies reorderStart<limit.
285 private void insert(int c, int cc) {
286 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
287 // insert c at codePointLimit, after the character with prevCC<=cc
288 if(c<=0xffff) {
289 str.insert(codePointLimit, (char)c);
290 if(cc<=1) {
291 reorderStart=codePointLimit+1;
292 }
293 } else {
294 str.insert(codePointLimit, Character.toChars(c));
295 if(cc<=1) {
// Normalization data; consulted via getNorm16() to find combining classes.
301 private final NormalizerImpl impl;
// The destination handed to the constructor.
302 private final Appendable app;
// The working text: app itself when app is a StringBuilder,
// otherwise a private intermediate builder that flush() drains into app.
303 private final StringBuilder str;
// True when str and app are the same StringBuilder object.
304 private final boolean appIsStringBuilder;
// Index into str; previousCC() reports 0 once the iterator reaches it.
305 private int reorderStart;
// Combining class recorded by the most recent append; 0 after a zero-cc append, flush() or remove().
306 private int lastCC;
307
308 // private backward iterator
309 private void setIterator() { codePointStart=str.length(); }
310 private void skipPrevious() { // Requires 0<codePointStart.
311 codePointLimit=codePointStart;
312 codePointStart=str.offsetByCodePoints(codePointStart, -1);
313 }
314 private int previousCC() { // Returns 0 if there is no previous character.
315 codePointLimit=codePointStart;
316 if(reorderStart>=codePointStart) {
317 return 0;
318 }
319 int c=str.codePointBefore(codePointStart);
320 codePointStart-=Character.charCount(c);
321 if(c<MIN_CCC_LCCC_CP) {
322 return 0;
323 }
324 return getCCFromYesOrMaybe(impl.getNorm16(c));
325 }
326
// Backward-iterator state: after skipPrevious() or a successful previousCC(),
// str[codePointStart, codePointLimit[ is the code point just stepped over.
// When previousCC() stops at reorderStart, both hold the same index.
327 private int codePointStart, codePointLimit;
328 }
329
330 // TODO: Propose as public API on the UTF16 class.
331 // TODO: Propose widening UTF16 methods that take char to take int.
332 // TODO: Propose widening UTF16 methods that take String to take CharSequence.
333 public static final class UTF16Plus {
334 /**
335 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
336 * is it a lead surrogate?
337 * @param c code unit or code point
338 * @return true or false
339 */
340 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
341
342 /**
343 * Compares two CharSequence subsequences for binary equality.
344 * @param s1 first sequence
345 * @param start1 start offset in first sequence
346 * @param limit1 limit offset in first sequence
353 public static boolean equal(CharSequence s1, int start1, int limit1,
354 CharSequence s2, int start2, int limit2) {
355 if((limit1-start1)!=(limit2-start2)) {
356 return false;
357 }
358 if(s1==s2 && start1==start2) {
359 return true;
360 }
361 while(start1<limit1) {
362 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
363 return false;
364 }
365 }
366 return true;
367 }
368 }
369
370 public NormalizerImpl() {}
371
372 private static final class IsAcceptable implements ICUBinary.Authenticate {
373 // @Override when we switch to Java 6
374 public boolean isDataVersionAcceptable(byte version[]) {
375 return version[0]==2;
376 }
377 }
378
// Shared version checker passed to ICUBinary.readHeaderAndDataVersion() in load().
379 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
// Data format identifier passed to ICUBinary.readHeaderAndDataVersion() in load().
380 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
381
382 public NormalizerImpl load(ByteBuffer bytes) {
383 try {
384 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
385 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
386 if(indexesLength<=IX_MIN_MAYBE_YES) {
387 throw new IOException("Normalizer2 data: not enough indexes");
388 }
389 int[] inIndexes=new int[indexesLength];
390 inIndexes[0]=indexesLength*4;
391 for(int i=1; i<indexesLength; ++i) {
392 inIndexes[i]=bytes.getInt();
393 }
394
395 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
396 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
397
398 minYesNo=inIndexes[IX_MIN_YES_NO];
399 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
400 minNoNo=inIndexes[IX_MIN_NO_NO];
401 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
402 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
403
404 // Read the normTrie.
405 int offset=inIndexes[IX_NORM_TRIE_OFFSET];
406 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
407 normTrie=Trie2_16.createFromSerialized(bytes);
408 int trieLength=normTrie.getSerializedLength();
409 if(trieLength>(nextOffset-offset)) {
410 throw new IOException("Normalizer2 data: not enough bytes for normTrie");
411 }
412 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
413
414 // Read the composition and mapping data.
415 offset=nextOffset;
416 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
417 int numChars=(nextOffset-offset)/2;
418 char[] chars;
419 if(numChars!=0) {
420 chars=new char[numChars];
421 for(int i=0; i<numChars; ++i) {
422 chars[i]=bytes.getChar();
423 }
424 maybeYesCompositions=new String(chars);
425 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
426 }
427
428 // smallFCD: new in formatVersion 2
429 offset=nextOffset;
430 smallFCD=new byte[0x100];
431 for(int i=0; i<0x100; ++i) {
432 smallFCD[i]=bytes.get();
433 }
434
435 // Build tccc180[].
436 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
437 tccc180=new int[0x180];
438 int bits=0;
439 for(int c=0; c<0x180; bits>>=1) {
440 if((c&0xff)==0) {
441 bits=smallFCD[c>>8]; // one byte per 0x100 code points
442 }
443 if((bits&1)!=0) {
444 for(int i=0; i<0x20; ++i, ++c) {
445 tccc180[c]=getFCD16FromNormData(c)&0xff;
446 }
447 } else {
448 c+=0x20;
449 }
450 }
451
452 return this;
453 } catch(IOException e) {
454 throw new InternalError(e);
455 }
456 }
457
458 public NormalizerImpl load(String name) {
459 return load(ICUBinary.getRequiredData(name));
460 }
461
462 public int getNorm16(int c) {
463 return normTrie.get(c);
464 }
465
466 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
467
468 public int getCC(int norm16) {
469 if(norm16>=MIN_NORMAL_MAYBE_YES) {
470 return norm16&0xff;
471 }
472 if(norm16<minNoNo || limitNoNo<=norm16) {
473 return 0;
474 }
475 return getCCFromNoNo(norm16);
476 }
477
478 public static int getCCFromYesOrMaybe(int norm16) {
479 return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
480 }
481
482 /**
483 * Returns the FCD data for code point c.
484 * @param c A Unicode code point.
485 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
486 */
487 public int getFCD16(int c) {
488 if(c<0) {
489 return 0;
490 } else if(c<0x180) {
491 return tccc180[c];
492 } else if(c<=0xffff) {
493 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
494 }
495 return getFCD16FromNormData(c);
496 }
497
498 /** Returns the FCD data for U+0000<=c<U+0180. */
499 public int getFCD16FromBelow180(int c) { return tccc180[c]; }
500 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
501 public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
502 // 0<=lead<=0xffff
503 byte bits=smallFCD[lead>>8];
504 if(bits==0) { return false; }
505 return ((bits>>((lead>>5)&7))&1)!=0;
506 }
507
508 /** Gets the FCD value from the regular normalization data. */
509 public int getFCD16FromNormData(int c) {
510 // Only loops for 1:1 algorithmic mappings.
511 for(;;) {
512 int norm16=getNorm16(c);
513 if(norm16<=minYesNo) {
514 // no decomposition or Hangul syllable, all zeros
515 return 0;
516 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
517 // combining mark
518 norm16&=0xff;
519 return norm16|(norm16<<8);
520 } else if(norm16>=minMaybeYes) {
521 return 0;
522 } else if(isDecompNoAlgorithmic(norm16)) {
523 c=mapAlgorithmic(c, norm16);
524 } else {
525 // c decomposes, get everything from the variable-length extra data
526 int firstUnit=extraData.charAt(norm16);
527 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
528 // A character that is deleted (maps to an empty string) must
529 // get the worst-case lccc and tccc values because arbitrary
530 // characters on both sides will become adjacent.
531 return 0x1ff;
532 } else {
533 int fcd16=firstUnit>>8; // tccc
534 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
535 fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc
536 }
537 return fcd16;
538 }
539 }
540 }
541 }
542
543 /**
544 * Gets the decomposition for one code point.
545 * @param c code point
546 * @return c's decomposition, if it has one; returns null if it does not have a decomposition
547 */
548 public String getDecomposition(int c) {
549 int decomp=-1;
550 int norm16;
551 for(;;) {
552 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
553 // c does not decompose
554 } else if(isHangul(norm16)) {
555 // Hangul syllable: decompose algorithmically
556 StringBuilder buffer=new StringBuilder();
557 Hangul.decompose(c, buffer);
558 return buffer.toString();
559 } else if(isDecompNoAlgorithmic(norm16)) {
560 decomp=c=mapAlgorithmic(c, norm16);
561 continue;
562 } else {
563 // c decomposes, get everything from the variable-length extra data
564 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
565 return extraData.substring(norm16, norm16+length);
566 }
567 if(decomp<0) {
568 return null;
569 } else {
570 return UTF16.valueOf(decomp);
571 }
572 }
573 }
574
// Code points below this value are treated as having combining class 0
// without a data lookup (see previousCC()).
575 public static final int MIN_CCC_LCCC_CP=0x300;
576
// Fixed norm16 thresholds.
// norm16>=MIN_YES_YES_WITH_CC: compose() reads a non-zero cc from the low byte.
// norm16>=MIN_NORMAL_MAYBE_YES: the low byte is the combining class (getCCFromYesOrMaybe()).
// NOTE(review): JAMO_VT is presumably the value tested by isJamoVT(), and MAX_DELTA
// presumably belongs to the algorithmic mappings; neither use is visible here.
577 public static final int MIN_YES_YES_WITH_CC=0xff01;
578 public static final int JAMO_VT=0xff00;
579 public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
580 public static final int MAX_DELTA=0x40;
581
// The IX_ constants below are positions in the int indexes array read by load().
582 // Byte offsets from the start of the data, after the generic header.
583 public static final int IX_NORM_TRIE_OFFSET=0;
584 public static final int IX_EXTRA_DATA_OFFSET=1;
585 public static final int IX_SMALL_FCD_OFFSET=2;
586
587 // Code point thresholds for quick check codes.
588 public static final int IX_MIN_DECOMP_NO_CP=8;
589 public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
590
591 // Norm16 value thresholds for quick check combinations and types of extra data.
592 // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
593 public static final int IX_MIN_YES_NO=10;
594 public static final int IX_MIN_NO_NO=11;
595 public static final int IX_LIMIT_NO_NO=12;
596 public static final int IX_MIN_MAYBE_YES=13;
597
598 // Mappings only in [minYesNoMappingsOnly..minNoNo[.
// This is the highest index position that load() reads.
599 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
600
// Bits of the first extraData unit of a mapping:
// MAPPING_LENGTH_MASK selects the mapping length; MAPPING_HAS_CCC_LCCC_WORD says
// the preceding unit carries lccc in its high byte (see getFCD16FromNormData()).
601 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
602 public static final int MAPPING_LENGTH_MASK=0x1f;
603
// NOTE(review): the COMP_ constants are presumably the bit layout of the
// composition tuples used by recompose(); their use is not visible here.
604 public static final int COMP_1_LAST_TUPLE=0x8000;
605 public static final int COMP_1_TRIPLE=1;
606 public static final int COMP_1_TRAIL_LIMIT=0x3400;
607 public static final int COMP_1_TRAIL_MASK=0x7ffe;
608 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
609 public static final int COMP_2_TRAIL_SHIFT=6;
610 public static final int COMP_2_TRAIL_MASK=0xffc0;
611
612 // higher-level functionality ------------------------------------------ ***
613
614 /**
615 * Decomposes s[src, limit[ and writes the result to dest.
616 * limit can be NULL if src is NUL-terminated.
617 * destLengthEstimate is the initial dest buffer capacity and can be -1.
618 */
619 public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
620 int destLengthEstimate) {
621 if(destLengthEstimate<0) {
685 // Check one above-minimum, relevant code point.
686 src+=Character.charCount(c);
687 if(buffer!=null) {
688 decompose(c, norm16, buffer);
689 } else {
690 if(isDecompYes(norm16)) {
691 int cc=getCCFromYesOrMaybe(norm16);
692 if(prevCC<=cc || cc==0) {
693 prevCC=cc;
694 if(cc<=1) {
695 prevBoundary=src;
696 }
697 continue;
698 }
699 }
700 return prevBoundary; // "no" or cc out of order
701 }
702 }
703 return src;
704 }
705
706 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
707 int limit=s.length();
708 if(limit==0) {
709 return;
710 }
711 if(doDecompose) {
712 decompose(s, 0, limit, buffer);
713 return;
714 }
715 // Just merge the strings at the boundary.
716 int c=Character.codePointAt(s, 0);
717 int src=0;
718 int firstCC, prevCC, cc;
719 firstCC=prevCC=cc=getCC(getNorm16(c));
720 while(cc!=0) {
721 prevCC=cc;
722 src+=Character.charCount(c);
723 if(src>=limit) {
724 break;
725 }
726 c=Character.codePointAt(s, src);
727 cc=getCC(getNorm16(c));
728 };
729 buffer.append(s, 0, src, firstCC, prevCC);
730 buffer.append(s, src, limit);
731 }
732
733 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
734 // doCompose: normalize
735 // !doCompose: isNormalized (buffer must be empty and initialized)
/**
 * Composes s[src, limit[ into buffer (doCompose), or checks whether that
 * range is already composed (!doCompose).
 * Each pass of the outer loop first skips a run of characters that need no
 * work, then handles the single character that stopped the run: Hangul
 * Jamo V/T, a character with non-zero combining class, or a character that
 * needs the surrounding text decomposed and recomposed.
 *
 * @param s              the source text
 * @param src            start offset in s
 * @param limit          limit offset in s
 * @param onlyContiguous enables the extra "FCC" ordering check below
 * @param doCompose      true to write normalized text to buffer,
 *                       false to only test for normalization
 * @param buffer         destination; with !doCompose it is used as scratch space
 * @return always true with doCompose; with !doCompose, false as soon as a
 *         difference from the composed form is found
 */
736 public boolean compose(CharSequence s, int src, int limit,
737 boolean onlyContiguous,
738 boolean doCompose,
739 ReorderingBuffer buffer) {
740 int minNoMaybeCP=minCompNoMaybeCP;
741
742 /*
743 * prevBoundary points to the last character before the current one
744 * that has a composition boundary before it with ccc==0 and quick check "yes".
745 * Keeping track of prevBoundary saves us looking for a composition boundary
746 * when we find a "no" or "maybe".
747 *
748 * When we back out from prevSrc back to prevBoundary,
749 * then we also remove those same characters (which had been simply copied
750 * or canonically-order-inserted) from the ReorderingBuffer.
751 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
752 * must correspond 1:1 to destination units at the end of the destination buffer.
753 */
754 int prevBoundary=src;
755 int prevSrc;
756 int c=0;
757 int norm16=0;
758
759 // only for isNormalized
760 int prevCC=0;
761
762 for(;;) {
763 // count code units below the minimum or with irrelevant data for the quick check
764 for(prevSrc=src; src!=limit;) {
765 if( (c=s.charAt(src))<minNoMaybeCP ||
766 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
767 ) {
768 ++src;
769 } else if(!UTF16.isSurrogate((char)c)) {
770 break;
771 } else {
// c is a surrogate code unit: try to pair it with its neighbor and
// look up the supplementary code point instead.
772 char c2;
773 if(UTF16Plus.isSurrogateLead(c)) {
774 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
775 c=Character.toCodePoint((char)c, c2);
776 }
777 } else /* trail surrogate */ {
778 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
779 --src;
780 c=Character.toCodePoint(c2, (char)c);
781 }
782 }
783 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
784 src+=Character.charCount(c);
785 } else {
786 break;
787 }
788 }
789 }
790 // copy these code units all at once
791 if(src!=prevSrc) {
792 if(src==limit) {
793 if(doCompose) {
794 buffer.flushAndAppendZeroCC(s, prevSrc, src);
795 }
796 break;
797 }
798 // Set prevBoundary to the last character in the quick check loop.
799 prevBoundary=src-1;
800 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
801 Character.isHighSurrogate(s.charAt(prevBoundary-1))
802 ) {
803 --prevBoundary;
804 }
805 if(doCompose) {
806 // The last "quick check yes" character is excluded from the
807 // flush-and-append call in case it needs to be modified.
808 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
809 buffer.append(s, prevBoundary, src);
810 } else {
811 prevCC=0;
812 }
813 // The start of the current character (c).
814 prevSrc=src;
815 } else if(src==limit) {
816 break;
817 }
818
// Step past c; prevSrc still marks where c starts.
819 src+=Character.charCount(c);
820 /*
821 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
822 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
823 * or has ccc!=0.
824 * Check for Jamo V/T, then for regular characters.
825 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
826 */
827 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
828 char prev=s.charAt(prevSrc-1);
829 boolean needToDecompose=false;
830 if(c<Hangul.JAMO_T_BASE) {
831 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// prev is a char, so a value below JAMO_L_BASE wraps around and fails the test.
832 prev-=Hangul.JAMO_L_BASE;
833 if(prev<Hangul.JAMO_L_COUNT) {
834 if(!doCompose) {
835 return false;
836 }
837 char syllable=(char)
838 (Hangul.HANGUL_BASE+
839 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
840 Hangul.JAMO_T_COUNT);
841 char t;
842 if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
843 ++src;
844 syllable+=t; // The next character was a Jamo T.
845 prevBoundary=src;
846 buffer.setLastChar(syllable);
847 continue;
848 }
849 // If we see L+V+x where x!=T then we drop to the slow path,
850 // decompose and recompose.
851 // This is to deal with NFKC finding normal L and V but a
852 // compatibility variant of a T. We need to either fully compose that
853 // combination here (which would complicate the code and may not work
854 // with strange custom data) or use the slow path -- or else our replacing
855 // two input characters (L+V) with one output character (LV syllable)
856 // would violate the invariant that [prevBoundary..prevSrc[ has the same
857 // length as what we appended to the buffer since prevBoundary.
858 needToDecompose=true;
859 }
860 } else if(Hangul.isHangulWithoutJamoT(prev)) {
861 // c is a Jamo Trailing consonant,
862 // compose with previous Hangul LV that does not contain a Jamo T.
863 if(!doCompose) {
864 return false;
865 }
866 buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
867 prevBoundary=src;
868 continue;
869 }
870 if(!needToDecompose) {
871 // The Jamo V/T did not compose into a Hangul syllable.
872 if(doCompose) {
873 buffer.append((char)c);
874 } else {
875 prevCC=0;
876 }
877 continue;
878 }
879 }
880 /*
881 * Source buffer pointers:
882 *
883 * all done quick check current char not yet
884 * "yes" but (c) processed
885 * may combine
886 * forward
887 * [-------------[-------------[-------------[-------------[
888 * | | | | |
889 * orig. src prevBoundary prevSrc src limit
890 *
891 *
892 * Destination buffer pointers inside the ReorderingBuffer:
893 *
894 * all done might take not filled yet
895 * characters for
896 * reordering
897 * [-------------[-------------[-------------[
898 * | | | |
899 * start reorderStart limit |
900 * +remainingCap.+
901 */
902 if(norm16>=MIN_YES_YES_WITH_CC) {
903 int cc=norm16&0xff; // cc!=0
904 if( onlyContiguous && // FCC
905 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
906 prevBoundary<prevSrc &&
907 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
908 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
909 // passed the quick check "yes && ccc==0" test.
910 // Check whether the last character was a "yesYes" or a "yesNo".
911 // If a "yesNo", then we get its trailing ccc from its
912 // mapping and check for canonical order.
913 // All other cases are ok.
914 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
915 ) {
916 // Fails FCD test, need to decompose and contiguously recompose.
917 if(!doCompose) {
918 return false;
919 }
920 } else if(doCompose) {
921 buffer.append(c, cc);
922 continue;
923 } else if(prevCC<=cc) {
924 prevCC=cc;
925 continue;
926 } else {
927 return false;
928 }
929 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
930 return false;
931 }
932
933 /*
934 * Find appropriate boundaries around this character,
935 * decompose the source text from between the boundaries,
936 * and recompose it.
937 *
938 * We may need to remove the last few characters from the ReorderingBuffer
939 * to account for source text that was copied or appended
940 * but needs to take part in the recomposition.
941 */
942
943 /*
944 * Find the last composition boundary in [prevBoundary..src[.
945 * It is either the decomposition of the current character (at prevSrc),
946 * or prevBoundary.
947 */
948 if(hasCompBoundaryBefore(c, norm16)) {
949 prevBoundary=prevSrc;
950 } else if(doCompose) {
951 buffer.removeSuffix(prevSrc-prevBoundary);
952 }
953
954 // Find the next composition boundary in [src..limit[ -
955 // modifies src to point to the next starter.
956 src=findNextCompBoundary(s, src, limit);
957
958 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
959 int recomposeStartIndex=buffer.length();
960 decomposeShort(s, prevBoundary, src, buffer);
961 recompose(buffer, recomposeStartIndex, onlyContiguous);
962 if(!doCompose) {
// The buffer now holds only the recomposed segment; the source is
// normalized here only if it already equals that text.
963 if(!buffer.equals(s, prevBoundary, src)) {
964 return false;
965 }
966 buffer.remove();
967 prevCC=0;
968 }
969
970 // Move to the next starter. We never need to look back before this point again.
971 prevBoundary=src;
972 }
973 return true;
974 }
975
976 /**
977 * Very similar to compose(): Make the same changes in both places if relevant.
978 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
979 * !doSpan: quickCheck
 * @param s the source text
 * @param src start offset in s
 * @param limit limit offset in s
 * @param onlyContiguous enables the extra "FCC" ordering check
 * @param doSpan true to stop at the first "maybe", false to record it in bit 0
980 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
981 * bit 0: set if "maybe"; otherwise, if the span length<s.length()
982 * then the quick check result is "no"
983 */
984 public int composeQuickCheck(CharSequence s, int src, int limit,
985 boolean onlyContiguous, boolean doSpan) {
986 int qcResult=0;
987 int minNoMaybeCP=minCompNoMaybeCP;
988
989 /*
990 * prevBoundary points to the last character before the current one
991 * that has a composition boundary before it with ccc==0 and quick check "yes".
992 */
993 int prevBoundary=src;
994 int prevSrc;
995 int c=0;
996 int norm16=0;
997 int prevCC=0;
998
999 for(;;) {
1000 // count code units below the minimum or with irrelevant data for the quick check
1001 for(prevSrc=src;;) {
1002 if(src==limit) {
// Reached the end: report the full length plus the "maybe" bit collected so far.
1003 return (src<<1)|qcResult; // "yes" or "maybe"
1004 }
1005 if( (c=s.charAt(src))<minNoMaybeCP ||
1006 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1007 ) {
1008 ++src;
1009 } else if(!UTF16.isSurrogate((char)c)) {
1010 break;
1011 } else {
// c is a surrogate code unit: try to pair it with its neighbor and
// look up the supplementary code point instead.
1012 char c2;
1013 if(UTF16Plus.isSurrogateLead(c)) {
1014 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1015 c=Character.toCodePoint((char)c, c2);
1016 }
1017 } else /* trail surrogate */ {
1018 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1019 --src;
1020 c=Character.toCodePoint(c2, (char)c);
1021 }
1022 }
1023 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1024 src+=Character.charCount(c);
1025 } else {
1026 break;
1027 }
1028 }
1029 }
1030 if(src!=prevSrc) {
1031 // Set prevBoundary to the last character in the quick check loop.
1032 prevBoundary=src-1;
1033 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1034 Character.isHighSurrogate(s.charAt(prevBoundary-1))
1035 ) {
1036 --prevBoundary;
1037 }
1038 prevCC=0;
1039 // The start of the current character (c).
1040 prevSrc=src;
1041 }
1042
// Step past c; prevSrc still marks where c starts.
1043 src+=Character.charCount(c);
1044 /*
1045 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1046 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1047 * or has ccc!=0.
1048 */
1049 if(isMaybeOrNonZeroCC(norm16)) {
1050 int cc=getCCFromYesOrMaybe(norm16);
1051 if( onlyContiguous && // FCC
1052 cc!=0 &&
1053 prevCC==0 &&
1054 prevBoundary<prevSrc &&
1055 // prevCC==0 && prevBoundary<prevSrc tell us that
1056 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1057 // passed the quick check "yes && ccc==0" test.
1058 // Check whether the last character was a "yesYes" or a "yesNo".
1059 // If a "yesNo", then we get its trailing ccc from its
1060 // mapping and check for canonical order.
1061 // All other cases are ok.
1062 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1063 ) {
1064 // Fails FCD test.
// Falls through to the "no" return at the bottom of the loop.
1065 } else if(prevCC<=cc || cc==0) {
1066 prevCC=cc;
// norm16 below MIN_YES_YES_WITH_CC is handled here as a "maybe".
1067 if(norm16<MIN_YES_YES_WITH_CC) {
1068 if(!doSpan) {
1069 qcResult=1;
1070 } else {
1071 return prevBoundary<<1; // spanYes does not care to know it's "maybe"
1072 }
1073 }
1074 continue;
1075 }
1076 }
1077 return prevBoundary<<1; // "no"
1078 }
1079 }
1080
1081 public void composeAndAppend(CharSequence s,
1082 boolean doCompose,
1083 boolean onlyContiguous,
1084 ReorderingBuffer buffer) {
1085 int src=0, limit=s.length();
1086 if(!buffer.isEmpty()) {
1087 int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1088 if(0!=firstStarterInSrc) {
1089 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1090 buffer.length());
1091 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1092 firstStarterInSrc+16);
1093 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1094 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1095 middle.append(s, 0, firstStarterInSrc);
1096 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1097 src=firstStarterInSrc;
1098 }
1099 }
1100 if(doCompose) {
1101 compose(s, src, limit, onlyContiguous, true, buffer);
1102 } else {
1103 buffer.append(s, src, limit);
1104 }
1105 }
1106
1107 // Dual functionality:
1108 // buffer!=NULL: normalize
1109 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1110 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1111 // Note: In this function we use buffer->appendZeroCC() because we track
1112 // the lead and trail combining classes here, rather than leaving it to
1113 // the ReorderingBuffer.
1114 // The exception is the call to decomposeShort() which uses the buffer
1115 // in the normal way.
1116
1117 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1118 // Similar to the prevBoundary in the compose() implementation.
1119 int prevBoundary=src;
1120 int prevSrc;
1121 int c=0;
1122 int prevFCD16=0;
1123 int fcd16=0;
1124
1125 for(;;) {
1126 // count code units with lccc==0
1127 for(prevSrc=src; src!=limit;) {
1128 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
1129 prevFCD16=~c;
1130 ++src;
1131 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1132 prevFCD16=0;
1133 ++src;
1134 } else {
1135 if(UTF16.isSurrogate((char)c)) {
1136 char c2;
1137 if(UTF16Plus.isSurrogateLead(c)) {
1138 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1139 c=Character.toCodePoint((char)c, c2);
1140 }
1141 } else /* trail surrogate */ {
1142 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1143 --src;
1144 c=Character.toCodePoint(c2, (char)c);
1145 }
1146 }
1147 }
1148 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1149 prevFCD16=fcd16;
1150 src+=Character.charCount(c);
1151 } else {
1152 break;
1153 }
1154 }
1155 }
1156 // copy these code units all at once
1157 if(src!=prevSrc) {
1158 if(src==limit) {
1159 if(buffer!=null) {
1160 buffer.flushAndAppendZeroCC(s, prevSrc, src);
1161 }
1162 break;
1163 }
1164 prevBoundary=src;
1165 // We know that the previous character's lccc==0.
1166 if(prevFCD16<0) {
1167 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1168 int prev=~prevFCD16;
1169 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1170 if(prevFCD16>1) {
1171 --prevBoundary;
1172 }
1173 } else {
1174 int p=src-1;
1175 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1176 Character.isHighSurrogate(s.charAt(p-1))
1177 ) {
1178 --p;
1179 // Need to fetch the previous character's FCD value because
1180 // prevFCD16 was just for the trail surrogate code point.
1181 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1182 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1183 }
1184 if(prevFCD16>1) {
1185 prevBoundary=p;
1186 }
1187 }
1188 if(buffer!=null) {
1189 // The last lccc==0 character is excluded from the
1190 // flush-and-append call in case it needs to be modified.
1191 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1192 buffer.append(s, prevBoundary, src);
1211 prevFCD16=fcd16;
1212 continue;
1213 } else if(buffer==null) {
1214 return prevBoundary; // quick check "no"
1215 } else {
1216 /*
1217 * Back out the part of the source that we copied or appended
1218 * already but is now going to be decomposed.
1219 * prevSrc is set to after what was copied/appended.
1220 */
1221 buffer.removeSuffix(prevSrc-prevBoundary);
1222 /*
1223 * Find the part of the source that needs to be decomposed,
1224 * up to the next safe boundary.
1225 */
1226 src=findNextFCDBoundary(s, src, limit);
1227 /*
1228 * The source text does not fulfill the conditions for FCD.
1229 * Decompose and reorder a limited piece of the text.
1230 */
1231 decomposeShort(s, prevBoundary, src, buffer);
1232 prevBoundary=src;
1233 prevFCD16=0;
1234 }
1235 }
1236 return src;
1237 }
1238
1239 // Note: hasDecompBoundary() could be implemented as aliases to
1240 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1241 // at the cost of building the FCD trie for a decomposition normalizer.
1242 public boolean hasDecompBoundary(int c, boolean before) {
1243 for(;;) {
1244 if(c<minDecompNoCP) {
1245 return true;
1246 }
1247 int norm16=getNorm16(c);
1248 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1249 return true;
1250 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1251 return false; // ccc!=0
1252 } else if(isDecompNoAlgorithmic(norm16)) {
1253 c=mapAlgorithmic(c, norm16);
1254 } else {
1255 // c decomposes, get everything from the variable-length extra data
1256 int firstUnit=extraData.charAt(norm16);
1257 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1258 return false;
1259 }
1260 if(!before) {
1261 // decomp after-boundary: same as hasFCDBoundaryAfter(),
1262 // fcd16<=1 || trailCC==0
1263 if(firstUnit>0x1ff) {
1264 return false; // trailCC>1
1265 }
1266 if(firstUnit<=0xff) {
1267 return true; // trailCC==0
1268 }
1269 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1270 }
1271 // true if leadCC==0 (hasFCDBoundaryBefore())
1272 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1273 }
1274 }
1275 }
1276
1277 public boolean hasCompBoundaryBefore(int c) {
1278 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
1279 }
1280
1281 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1282 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1283 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1284 private boolean isHangul(int norm16) { return norm16==minYesNo; }
1285 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1286
1287 // UBool isCompYes(uint16_t norm16) const {
1288 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1289 // }
1290 // UBool isCompYesOrMaybe(uint16_t norm16) const {
1291 // return norm16<minNoNo || minMaybeYes<=norm16;
1292 // }
1293 // private boolean hasZeroCCFromDecompYes(int norm16) {
1294 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1295 // }
1296 private boolean isDecompYesAndZeroCC(int norm16) {
1297 return norm16<minYesNo ||
1298 norm16==JAMO_VT ||
1299 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1300 }
1301
1302 /**
1303 * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1304 * the MaybeYes which combine-forward and have ccc=0.
1305 * (Standard Unicode 5.2 normalization does not have such characters.)
1306 */
1307 private boolean isMostDecompYesAndZeroCC(int norm16) {
1308 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1309 }
1310
1311 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1312
1313 // For use with isCompYes().
1314 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1315 // static uint8_t getCCFromYes(uint16_t norm16) {
1316 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1317 // }
1318 private int getCCFromNoNo(int norm16) {
1319 if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1320 return extraData.charAt(norm16-1)&0xff;
1321 } else {
1322 return 0;
1323 }
1324 }
1325
1326 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
1327 int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1328 int c;
1329 if(cpStart==(cpLimit-1)) {
1330 c=s.charAt(cpStart);
1331 } else {
1332 c=Character.codePointAt(s, cpStart);
1333 }
1334 int prevNorm16=getNorm16(c);
1335 if(prevNorm16<=minYesNo) {
1336 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
1337 } else {
1338 return extraData.charAt(prevNorm16)>>8; // tccc from yesNo
1339 }
1340 }
1341
1342 // Requires algorithmic-NoNo.
1343 private int mapAlgorithmic(int c, int norm16) {
1344 return c+norm16-(minMaybeYes-MAX_DELTA-1);
1345 }
1346
1347 // Requires minYesNo<norm16<limitNoNo.
1348 // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1349
1350 /**
1351 * @return index into maybeYesCompositions, or -1
1352 */
1353 private int getCompositionsListForDecompYes(int norm16) {
1354 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1355 return -1;
1356 } else {
1357 if((norm16-=minMaybeYes)<0) {
1358 // norm16<minMaybeYes: index into extraData which is a substring at
1359 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1360 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1361 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list
1362 }
1363 return norm16;
1364 }
1365 }
1366
1367 /**
1368 * @return index into maybeYesCompositions
1369 */
1370 private int getCompositionsListForComposite(int norm16) {
1371 // composite has both mapping & compositions list
1372 int firstUnit=extraData.charAt(norm16);
1373 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions
1374 1+ // +1 to skip the first unit with the mapping lenth
1375 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length
1376 }
1377
1378 // Decompose a short piece of text which is likely to contain characters that
1379 // fail the quick check loop and/or where the quick check loop's overhead
1380 // is unlikely to be amortized.
1381 // Called by the compose() and makeFCD() implementations.
1382 // Public in Java for collation implementation code.
1383 public void decomposeShort(CharSequence s, int src, int limit,
1384 ReorderingBuffer buffer) {
1385 while(src<limit) {
1386 int c=Character.codePointAt(s, src);
1387 src+=Character.charCount(c);
1388 decompose(c, getNorm16(c), buffer);
1389 }
1390 }
1391
1392 private void decompose(int c, int norm16,
1393 ReorderingBuffer buffer) {
1394 // Only loops for 1:1 algorithmic mappings.
1395 for(;;) {
1396 // get the decomposition and the lead and trail cc's
1397 if(isDecompYes(norm16)) {
1398 // c does not decompose
1399 buffer.append(c, getCCFromYesOrMaybe(norm16));
1400 } else if(isHangul(norm16)) {
1401 // Hangul syllable: decompose algorithmically
1402 Hangul.decompose(c, buffer);
1403 } else if(isDecompNoAlgorithmic(norm16)) {
1404 c=mapAlgorithmic(c, norm16);
1405 norm16=getNorm16(c);
1406 continue;
1407 } else {
1408 // c decomposes, get everything from the variable-length extra data
1409 int firstUnit=extraData.charAt(norm16);
1410 int length=firstUnit&MAPPING_LENGTH_MASK;
1411 int leadCC, trailCC;
1412 trailCC=firstUnit>>8;
1413 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1414 leadCC=extraData.charAt(norm16-1)>>8;
1415 } else {
1416 leadCC=0;
1417 }
1418 ++norm16; // skip over the firstUnit
1419 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1420 }
1421 return;
1422 }
1423 }
1424
1425 /**
1426 * Finds the recomposition result for
1427 * a forward-combining "lead" character,
1428 * specified with a pointer to its compositions list,
1429 * and a backward-combining "trail" character.
1430 *
1431 * <p>If the lead and trail characters combine, then this function returns
1432 * the following "compositeAndFwd" value:
1433 * <pre>
1434 * Bits 21..1 composite character
1435 * Bit 0 set if the composite is a forward-combining starter
1436 * </pre>
1437 * otherwise it returns -1.
1438 *
1439 * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1440 * encoded as either pairs or triples of 16-bit units.
1441 * The last entry has the high bit of its first unit set.
1442 *
1443 * <p>The list is sorted by ascending trail characters (there are no duplicates).
1444 * A linear search is used.
1445 *
1446 * <p>See normalizer2impl.h for a more detailed description
1447 * of the compositions list format.
1448 */
1449 private static int combine(String compositions, int list, int trail) {
1450 int key1, firstUnit;
1451 if(trail<COMP_1_TRAIL_LIMIT) {
1452 // trail character is 0..33FF
1453 // result entry may have 2 or 3 units
1454 key1=(trail<<1);
1455 while(key1>(firstUnit=compositions.charAt(list))) {
1456 list+=2+(firstUnit&COMP_1_TRIPLE);
1457 }
1458 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1459 if((firstUnit&COMP_1_TRIPLE)!=0) {
1460 return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1461 } else {
1462 return compositions.charAt(list+1);
1463 }
1464 }
1465 } else {
1466 // trail character is 3400..10FFFF
1467 // result entry has 3 units
1468 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1469 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1470 int secondUnit;
1471 for(;;) {
1472 if(key1>(firstUnit=compositions.charAt(list))) {
1473 list+=2+(firstUnit&COMP_1_TRIPLE);
1474 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1475 if(key2>(secondUnit=compositions.charAt(list+1))) {
1476 if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1477 break;
1478 } else {
1479 list+=3;
1480 }
1516 int cc, prevCC;
1517 boolean starterIsSupplementary;
1518
1519 // Some of the following variables are not used until we have a forward-combining starter
1520 // and are only initialized now to avoid compiler warnings.
1521 compositionsList=-1; // used as indicator for whether we have a forward-combining starter
1522 starter=-1;
1523 starterIsSupplementary=false;
1524 prevCC=0;
1525
1526 for(;;) {
1527 c=sb.codePointAt(p);
1528 p+=Character.charCount(c);
1529 norm16=getNorm16(c);
1530 cc=getCCFromYesOrMaybe(norm16);
1531 if( // this character combines backward and
1532 isMaybe(norm16) &&
1533 // we have seen a starter that combines forward and
1534 compositionsList>=0 &&
1535 // the backward-combining character is not blocked
1536 (prevCC<cc || prevCC==0)) {
1537 if(isJamoVT(norm16)) {
1538 // c is a Jamo V/T, see if we can compose it with the previous character.
1539 if(c<Hangul.JAMO_T_BASE) {
1540 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1541 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1542 if(prev<Hangul.JAMO_L_COUNT) {
1543 pRemove=p-1;
1544 char syllable=(char)
1545 (Hangul.HANGUL_BASE+
1546 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1547 Hangul.JAMO_T_COUNT);
1548 char t;
1549 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1550 ++p;
1551 syllable+=t; // The next character was a Jamo T.
1552 }
1553 sb.setCharAt(starter, syllable);
1554 // remove the Jamo V/T
1555 sb.delete(pRemove, p);
1556 p=pRemove;
1637 starterIsSupplementary=true;
1638 starter=p-2;
1639 }
1640 }
1641 } else if(onlyContiguous) {
1642 // FCC: no discontiguous compositions; any intervening character blocks.
1643 compositionsList=-1;
1644 }
1645 }
1646 buffer.flush();
1647 }
1648
1649 /**
1650 * Does c have a composition boundary before it?
1651 * True if its decomposition begins with a character that has
1652 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1653 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1654 * (isCompYesAndZeroCC()) so we need not decompose.
1655 */
1656 private boolean hasCompBoundaryBefore(int c, int norm16) {
1657 for(;;) {
1658 if(isCompYesAndZeroCC(norm16)) {
1659 return true;
1660 } else if(isMaybeOrNonZeroCC(norm16)) {
1661 return false;
1662 } else if(isDecompNoAlgorithmic(norm16)) {
1663 c=mapAlgorithmic(c, norm16);
1664 norm16=getNorm16(c);
1665 } else {
1666 // c decomposes, get everything from the variable-length extra data
1667 int firstUnit=extraData.charAt(norm16);
1668 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1669 return false;
1670 }
1671 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
1672 return false; // non-zero leadCC
1673 }
1674 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
1675 }
1676 }
1677 }
1678
1679 private int findPreviousCompBoundary(CharSequence s, int p) {
1680 while(p>0) {
1681 int c=Character.codePointBefore(s, p);
1682 p-=Character.charCount(c);
1683 if(hasCompBoundaryBefore(c)) {
1684 break;
1685 }
1686 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1687 // but that's probably not worth the extra cost.
1688 }
1689 return p;
1690 }
1691
1692 private int findNextCompBoundary(CharSequence s, int p, int limit) {
1693 while(p<limit) {
1694 int c=Character.codePointAt(s, p);
1695 int norm16=normTrie.get(c);
1696 if(hasCompBoundaryBefore(c, norm16)) {
1697 break;
1698 }
1699 p+=Character.charCount(c);
1700 }
1701 return p;
1702 }
1703
1704 private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1705 while(p<limit) {
1706 int c=Character.codePointAt(s, p);
1707 if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
1708 break;
1709 }
1710 p+=Character.charCount(c);
1711 }
1712 return p;
1713 }
1714
1715 /**
1716 * Get the canonical decomposition
1717 * sherman for ComposedCharIter
1718 */
1719 public static int getDecompose(int chars[], String decomps[]) {
1720 Normalizer2 impl = Normalizer2.getNFDInstance();
1721
1722 int length=0;
1723 int norm16 = 0;
1724 int ch = -1;
1725 int i = 0;
1726
1727 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
1728 //TBD !!!! the hack code heres save us about 50ms for startup
1729 //need a better solution/lookup
1730 if (ch == 0x30ff)
1973 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
1974 // [current]..[p] 1 code point (c, c2) with cc
1975
1976 // move the code units in between up
1977 r=p;
1978 do {
1979 source[--r]=source[--current];
1980 } while (back!=current);
1981 }
1982 }
1983
1984 // insert (c1, c2)
1985 source[current] = c1;
1986 if (c2!=0) {
1987 source[(current+1)] = c2;
1988 }
1989
1990 // we know the cc of the last code point
1991 return trailCC;
1992 }
1993
1994 /**
1995 * merge two UTF-16 string parts together
1996 * to canonically order (order by combining classes) their concatenation
1997 *
1998 * the two strings may already be adjacent, so that the merging is done
1999 * in-place if the two strings are not adjacent, then the buffer holding the
2000 * first one must be large enough
2001 * the second string may or may not be ordered in itself
2002 *
2003 * before: [start]..[current] is already ordered, and
2004 * [next]..[limit] may be ordered in itself, but
2005 * is not in relation to [start..current[
2006 * after: [start..current+(limit-next)[ is ordered
2007 *
2008 * the algorithm is a simple bubble-sort that takes the characters from
2009 * src[next++] and inserts them in correct combining class order into the
2010 * preceding part of the string
2011 *
2012 * since this function is called much less often than the single-code point
2013 * insertOrdered(), it just uses that for easier maintenance
2057
2058 if(ncArgs.next==ncArgs.limit) {
2059 // we know the cc of the last code point
2060 return trailCC;
2061 } else {
2062 if(!adjacent) {
2063 // copy the second string part
2064 do {
2065 source[current++]=data[ncArgs.next++];
2066 } while(ncArgs.next!=ncArgs.limit);
2067 ncArgs.limit=current;
2068 }
2069 PrevArgs prevArgs = new PrevArgs();
2070 prevArgs.src = data;
2071 prevArgs.start = start;
2072 prevArgs.current = ncArgs.limit;
2073 return getPrevCC(prevArgs);
2074 }
2075
2076 }
2077
2078 private static final class PrevArgs{
2079 char[] src;
2080 int start;
2081 int current;
2082 char c1;
2083 char c2;
2084 }
2085
2086 private static final class NextCCArgs{
2087 char[] source;
2088 int next;
2089 int limit;
2090 char c1;
2091 char c2;
2092 }
2093
2094 private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2095 args.c1=args.src[--args.current];
2096 args.c2=0;
2097
2098 if (args.c1 < MIN_CCC_LCCC_CP) {
2099 return 0;
2100 } else if (UTF16.isLeadSurrogate(args.c1)) {
2101 /* unpaired first surrogate */
2102 return 0;
2103 } else if (!UTF16.isTrailSurrogate(args.c1)) {
2104 return UCharacter.getCombiningClass(args.c1);
2105 } else if (args.current!=args.start &&
2106 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2107 --args.current;
2108 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2109 } else {
2110 /* unpaired second surrogate */
2111 args.c2=0;
2112 return 0;
2113 }
2114 }
2115
2116 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2117 args.c1=args.source[args.next++];
2118 args.c2=0;
2119
2120 if (UTF16.isTrailSurrogate(args.c1)) {
2121 /* unpaired second surrogate */
2122 return 0;
2123 } else if (!UTF16.isLeadSurrogate(args.c1)) {
2124 return UCharacter.getCombiningClass(args.c1);
2125 } else if (args.next!=args.limit &&
2126 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2127 ++args.next;
2128 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2129 } else {
2130 /* unpaired first surrogate */
2131 args.c2=0;
2132 return 0;
2133 }
2134 }
2135
2136 private VersionInfo dataVersion;
2137
2138 // Code point thresholds for quick check codes.
2139 private int minDecompNoCP;
2140 private int minCompNoMaybeCP;
2141
2142 // Norm16 value thresholds for quick check combinations and types of extra data.
2143 private int minYesNo;
2144 private int minYesNoMappingsOnly;
2145 private int minNoNo;
2146 private int limitNoNo;
2147 private int minMaybeYes;
2148
2149 private Trie2_16 normTrie;
2150 private String maybeYesCompositions;
2151 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
2152 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2153 private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
2154
2155 }
|
1 /*
2 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 *******************************************************************************
28 * Copyright (C) 2009-2014, International Business Machines
29 * Corporation and others. All Rights Reserved.
30 *******************************************************************************
31 */
32 package sun.text.normalizer;
33
34 import java.io.IOException;
35 import java.nio.ByteBuffer;
36 import java.text.Normalizer;
37
38 // Original filename in ICU4J: Normalizer2Impl.java
39 public final class NormalizerImpl {
40 public static final class Hangul {
41 /* Korean Hangul and Jamo constants */
42 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
43 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
44 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
45
46 public static final int HANGUL_BASE=0xac00;
47 public static final int HANGUL_END=0xd7a3;
48
49 public static final int JAMO_L_COUNT=19;
50 public static final int JAMO_V_COUNT=21;
51 public static final int JAMO_T_COUNT=28;
52
53 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
54 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
55
56 public static boolean isHangul(int c) {
57 return HANGUL_BASE<=c && c<HANGUL_LIMIT;
58 }
59 public static boolean isHangulLV(int c) {
60 c-=HANGUL_BASE;
61 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
62 }
63
64 /**
65 * Decomposes c, which must be a Hangul syllable, into buffer
66 * and returns the length of the decomposition (2 or 3).
67 */
68 public static int decompose(int c, Appendable buffer) {
69 try {
70 c-=HANGUL_BASE;
71 int c2=c%JAMO_T_COUNT;
72 c/=JAMO_T_COUNT;
73 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
74 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
75 if(c2==0) {
76 return 2;
77 } else {
78 buffer.append((char)(JAMO_T_BASE+c2));
79 return 3;
80 }
81 } catch(IOException e) {
82 throw new InternalError(e);
83 }
84 }
85 }
86
87 /**
88 * Writable buffer that takes care of canonical ordering.
89 * Its Appendable methods behave like the C++ implementation's
90 * appendZeroCC() methods.
91 * <p>
92 * If dest is a StringBuilder, then the buffer writes directly to it.
93 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
94 * until no further changes are necessary and whole segments are appended.
95 * append() methods that take combining-class values always write to the StringBuilder.
96 * Other append() methods flush and append to the Appendable.
97 */
98 public static final class ReorderingBuffer implements Appendable {
/**
 * Creates a buffer that delivers its output to dest.
 *
 * <p>If dest is a StringBuilder it is used directly: its capacity is
 * ensured, and if it already contains text, lastCC and reorderStart are
 * initialized from the end of that text. Otherwise a private StringBuilder
 * is created for intermediate text.
 *
 * @param ni           the normalizer data used to look up combining classes
 * @param dest         the destination
 * @param destCapacity capacity to ensure when dest is a StringBuilder
 */
public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
    impl=ni;
    app=dest;
    if(!(dest instanceof StringBuilder)) {
        appIsStringBuilder=false;
        str=new StringBuilder();
        reorderStart=0;
        lastCC=0;
    } else {
        appIsStringBuilder=true;
        str=(StringBuilder)dest;
        // In Java, the constructor subsumes public void init(int destCapacity) {
        str.ensureCapacity(destCapacity);
        reorderStart=0;  // must be set before previousCC() is used below
        if(str.length()!=0) {
            setIterator();
            lastCC=previousCC();
            // Set reorderStart after the last code point with cc<=1 if there is one.
            if(lastCC>1) {
                while(previousCC()>1) {}
            }
            reorderStart=codePointLimit;
        } else {
            lastCC=0;
        }
    }
}
126
127 public boolean isEmpty() { return str.length()==0; }
128 public int length() { return str.length(); }
129 public int getLastCC() { return lastCC; }
130
131 public StringBuilder getStringBuilder() { return str; }
132
133 public boolean equals(CharSequence s, int start, int limit) {
134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
135 }
136
137 public void append(int c, int cc) {
138 if(lastCC<=cc || cc==0) {
139 str.appendCodePoint(c);
140 lastCC=cc;
141 if(cc<=1) {
142 reorderStart=str.length();
143 }
144 } else {
145 insert(c, cc);
146 }
147 }
148 // s must be in NFD, otherwise change the implementation.
149 public void append(CharSequence s, int start, int limit,
150 int leadCC, int trailCC) {
151 if(start==limit) {
152 return;
153 }
154 if(lastCC<=leadCC || leadCC==0) {
155 if(trailCC<=1) {
156 reorderStart=str.length()+(limit-start);
157 } else if(leadCC<=1) {
158 reorderStart=str.length()+1; // Ok if not a code point boundary.
159 }
160 str.append(s, start, limit);
161 lastCC=trailCC;
162 } else {
163 int c=Character.codePointAt(s, start);
164 start+=Character.charCount(c);
165 insert(c, leadCC); // insert first code point
166 while(start<limit) {
167 c=Character.codePointAt(s, start);
168 start+=Character.charCount(c);
169 if(start<limit) {
170 // s must be in NFD, otherwise we need to use getCC().
171 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
172 } else {
173 leadCC=trailCC;
174 }
175 append(c, leadCC);
176 }
177 }
178 }
179 // The following append() methods work like C++ appendZeroCC().
180 // They assume that the cc or trailCC of their input is 0.
181 // Most of them implement Appendable interface methods.
182 @Override
183 public ReorderingBuffer append(char c) {
184 str.append(c);
185 lastCC=0;
186 reorderStart=str.length();
187 return this;
188 }
189 public void appendZeroCC(int c) {
190 str.appendCodePoint(c);
191 lastCC=0;
192 reorderStart=str.length();
193 }
194 @Override
195 public ReorderingBuffer append(CharSequence s) {
196 if(s.length()!=0) {
197 str.append(s);
198 lastCC=0;
199 reorderStart=str.length();
200 }
201 return this;
202 }
203 @Override
204 public ReorderingBuffer append(CharSequence s, int start, int limit) {
205 if(start!=limit) {
206 str.append(s, start, limit);
207 lastCC=0;
208 reorderStart=str.length();
209 }
210 return this;
211 }
212 /**
213 * Flushes from the intermediate StringBuilder to the Appendable,
214 * if they are different objects.
215 * Used after recomposition.
216 * Must be called at the end when writing to a non-StringBuilder Appendable.
217 */
218 public void flush() {
219 if(appIsStringBuilder) {
220 reorderStart=str.length();
221 } else {
222 try {
223 app.append(str);
224 str.setLength(0);
225 reorderStart=0;
226 } catch(IOException e) {
227 throw new InternalError(e); // Avoid declaring "throws IOException".
228 }
229 }
230 lastCC=0;
231 }
232 /**
233 * Flushes from the intermediate StringBuilder to the Appendable,
234 * if they are different objects.
235 * Then appends the new text to the Appendable or StringBuilder.
236 * Normally used after quick check loops find a non-empty sequence.
237 */
238 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
239 if(appIsStringBuilder) {
240 str.append(s, start, limit);
241 reorderStart=str.length();
242 } else {
243 try {
244 app.append(str).append(s, start, limit);
245 str.setLength(0);
246 reorderStart=0;
247 } catch(IOException e) {
248 throw new InternalError(e); // Avoid declaring "throws IOException".
249 }
250 }
251 lastCC=0;
252 return this;
253 }
254 public void remove() {
255 str.setLength(0);
256 lastCC=0;
257 reorderStart=0;
258 }
259 public void removeSuffix(int suffixLength) {
260 int oldLength=str.length();
261 str.delete(oldLength-suffixLength, oldLength);
262 lastCC=0;
263 reorderStart=str.length();
264 }
265
266 // Inserts c somewhere before the last character.
267 // Requires 0<cc<lastCC which implies reorderStart<limit.
268 private void insert(int c, int cc) {
269 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
270 // insert c at codePointLimit, after the character with prevCC<=cc
271 if(c<=0xffff) {
272 str.insert(codePointLimit, (char)c);
273 if(cc<=1) {
274 reorderStart=codePointLimit+1;
275 }
276 } else {
277 str.insert(codePointLimit, Character.toChars(c));
278 if(cc<=1) {
284 private final NormalizerImpl impl;
285 private final Appendable app;
286 private final StringBuilder str;
287 private final boolean appIsStringBuilder;
288 private int reorderStart;
289 private int lastCC;
290
291 // private backward iterator
292 private void setIterator() { codePointStart=str.length(); }
293 private void skipPrevious() { // Requires 0<codePointStart.
294 codePointLimit=codePointStart;
295 codePointStart=str.offsetByCodePoints(codePointStart, -1);
296 }
297 private int previousCC() { // Returns 0 if there is no previous character.
298 codePointLimit=codePointStart;
299 if(reorderStart>=codePointStart) {
300 return 0;
301 }
302 int c=str.codePointBefore(codePointStart);
303 codePointStart-=Character.charCount(c);
304 return impl.getCCFromYesOrMaybeCP(c);
305 }
306 private int codePointStart, codePointLimit;
307 }
308
309 // TODO: Propose as public API on the UTF16 class.
310 // TODO: Propose widening UTF16 methods that take char to take int.
311 // TODO: Propose widening UTF16 methods that take String to take CharSequence.
312 public static final class UTF16Plus {
313 /**
314 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
315 * is it a lead surrogate?
316 * @param c code unit or code point
317 * @return true or false
318 */
319 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
320
321 /**
322 * Compares two CharSequence subsequences for binary equality.
323 * @param s1 first sequence
324 * @param start1 start offset in first sequence
325 * @param limit1 limit offset in first sequence
332 public static boolean equal(CharSequence s1, int start1, int limit1,
333 CharSequence s2, int start2, int limit2) {
334 if((limit1-start1)!=(limit2-start2)) {
335 return false;
336 }
337 if(s1==s2 && start1==start2) {
338 return true;
339 }
340 while(start1<limit1) {
341 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
342 return false;
343 }
344 }
345 return true;
346 }
347 }
348
349 public NormalizerImpl() {}
350
351 private static final class IsAcceptable implements ICUBinary.Authenticate {
352 public boolean isDataVersionAcceptable(byte version[]) {
353 return version[0]==3;
354 }
355 }
356 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
357 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
358
359 public NormalizerImpl load(ByteBuffer bytes) {
360 try {
361 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
362 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
363 if(indexesLength<=IX_MIN_LCCC_CP) {
364 throw new InternalError("Normalizer2 data: not enough indexes");
365 }
366 int[] inIndexes=new int[indexesLength];
367 inIndexes[0]=indexesLength*4;
368 for(int i=1; i<indexesLength; ++i) {
369 inIndexes[i]=bytes.getInt();
370 }
371
372 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
373 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
374 minLcccCP=inIndexes[IX_MIN_LCCC_CP];
375
376 minYesNo=inIndexes[IX_MIN_YES_NO];
377 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
378 minNoNo=inIndexes[IX_MIN_NO_NO];
379 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
380 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
381 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
382 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
383 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
384 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields
385 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
386
387 // Read the normTrie.
388 int offset=inIndexes[IX_NORM_TRIE_OFFSET];
389 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
390 normTrie=Trie2_16.createFromSerialized(bytes);
391 int trieLength=normTrie.getSerializedLength();
392 if(trieLength>(nextOffset-offset)) {
393 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
394 }
395 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
396
397 // Read the composition and mapping data.
398 offset=nextOffset;
399 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
400 int numChars=(nextOffset-offset)/2;
401 char[] chars;
402 if(numChars!=0) {
403 chars=new char[numChars];
404 for(int i=0; i<numChars; ++i) {
405 chars[i]=bytes.getChar();
406 }
407 maybeYesCompositions=new String(chars);
408 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
409 }
410
411 // smallFCD: new in formatVersion 2
412 offset=nextOffset;
413 smallFCD=new byte[0x100];
414 bytes.get(smallFCD);
415
416 return this;
417 } catch(IOException e) {
418 throw new InternalError(e);
419 }
420 }
421 public NormalizerImpl load(String name) {
422 return load(ICUBinary.getRequiredData(name));
423 }
424
425
426 public int getNorm16(int c) { return normTrie.get(c); }
427 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
428 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
429 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
430
431 public int getCC(int norm16) {
432 if(norm16>=MIN_NORMAL_MAYBE_YES) {
433 return getCCFromNormalYesOrMaybe(norm16);
434 }
435 if(norm16<minNoNo || limitNoNo<=norm16) {
436 return 0;
437 }
438 return getCCFromNoNo(norm16);
439 }
440 public static int getCCFromNormalYesOrMaybe(int norm16) {
441 return (norm16 >> OFFSET_SHIFT) & 0xff;
442 }
443 public static int getCCFromYesOrMaybe(int norm16) {
444 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
445 }
446 public int getCCFromYesOrMaybeCP(int c) {
447 if (c < minCompNoMaybeCP) { return 0; }
448 return getCCFromYesOrMaybe(getNorm16(c));
449 }
450
451 /**
452 * Returns the FCD data for code point c.
453 * @param c A Unicode code point.
454 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
455 */
456 public int getFCD16(int c) {
457 if(c<minDecompNoCP) {
458 return 0;
459 } else if(c<=0xffff) {
460 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
461 }
462 return getFCD16FromNormData(c);
463 }
464 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
465 public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
466 // 0<=lead<=0xffff
467 byte bits=smallFCD[lead>>8];
468 if(bits==0) { return false; }
469 return ((bits>>((lead>>5)&7))&1)!=0;
470 }
471
472 /** Gets the FCD value from the regular normalization data. */
473 public int getFCD16FromNormData(int c) {
474 int norm16=getNorm16(c);
475 if (norm16 >= limitNoNo) {
476 if(norm16>=MIN_NORMAL_MAYBE_YES) {
477 // combining mark
478 norm16=getCCFromNormalYesOrMaybe(norm16);
479 return norm16|(norm16<<8);
480 } else if(norm16>=minMaybeYes) {
481 return 0;
482 } else { // isDecompNoAlgorithmic(norm16)
483 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
484 if (deltaTrailCC <= DELTA_TCCC_1) {
485 return deltaTrailCC >> OFFSET_SHIFT;
486 }
487 // Maps to an isCompYesAndZeroCC.
488 c=mapAlgorithmic(c, norm16);
489 norm16=getNorm16(c);
490 }
491 }
492 if(norm16<=minYesNo || isHangulLVT(norm16)) {
493 // no decomposition or Hangul syllable, all zeros
494 return 0;
495 }
496 // c decomposes, get everything from the variable-length extra data
497 int mapping=norm16>>OFFSET_SHIFT;
498 int firstUnit=extraData.charAt(mapping);
499 int fcd16=firstUnit>>8; // tccc
500 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
501 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc
502 }
503 return fcd16;
504 }
505
506 /**
507 * Gets the decomposition for one code point.
508 * @param c code point
509 * @return c's decomposition, if it has one; returns null if it does not have a decomposition
510 */
511 public String getDecomposition(int c) {
512 int norm16;
513 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
514 // c does not decompose
515 return null;
516 }
517 int decomp = -1;
518 if(isDecompNoAlgorithmic(norm16)) {
519 // Maps to an isCompYesAndZeroCC.
520 decomp=c=mapAlgorithmic(c, norm16);
521 // The mapping might decompose further.
522 norm16 = getNorm16(c);
523 }
524 if (norm16 < minYesNo) {
525 if(decomp<0) {
526 return null;
527 } else {
528 return UTF16.valueOf(decomp);
529 }
530 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
531 // Hangul syllable: decompose algorithmically
532 StringBuilder buffer=new StringBuilder();
533 Hangul.decompose(c, buffer);
534 return buffer.toString();
535 }
536 // c decomposes, get everything from the variable-length extra data
537 int mapping=norm16>>OFFSET_SHIFT;
538 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
539 return extraData.substring(mapping, mapping+length);
540 }
541
542 // Fixed norm16 values.
543 public static final int MIN_YES_YES_WITH_CC=0xfe02;
544 public static final int JAMO_VT=0xfe00;
// norm16 values at or above this one carry their combining class inline
// (see getCCFromNormalYesOrMaybe()).
545 public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
546 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE
547 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE
548
549 // norm16 bit 0 is comp-boundary-after.
550 public static final int HAS_COMP_BOUNDARY_AFTER=1;
// norm16>>OFFSET_SHIFT is used both as an index into extraData and,
// for values >= MIN_NORMAL_MAYBE_YES, as the source of the combining class.
551 public static final int OFFSET_SHIFT=1;
552
553 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
554 // tccc (0, 1, >1) for quick FCC boundary-after tests.
555 public static final int DELTA_TCCC_0=0;
556 public static final int DELTA_TCCC_1=2;
557 public static final int DELTA_TCCC_GT_1=4;
558 public static final int DELTA_TCCC_MASK=6;
// mapAlgorithmic() adds (norm16>>DELTA_SHIFT)-centerNoNoDelta to the code point.
559 public static final int DELTA_SHIFT=3;
560
// load() derives centerNoNoDelta as (minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1.
561 public static final int MAX_DELTA=0x40;
562
// The IX_ constants below are positions in the int indexes array that load() reads
// right after the data header.
563 // Byte offsets from the start of the data, after the generic header.
564 public static final int IX_NORM_TRIE_OFFSET=0;
565 public static final int IX_EXTRA_DATA_OFFSET=1;
566 public static final int IX_SMALL_FCD_OFFSET=2;
567 public static final int IX_RESERVED3_OFFSET=3;
568 public static final int IX_TOTAL_SIZE=7;
// Note: not an IX_ index. NOTE(review): presumably the lowest code point with ccc!=0 or
// lccc!=0 in standard data; it is not referenced in this part of the file.
569 public static final int MIN_CCC_LCCC_CP=0x300;
570 // Code point thresholds for quick check codes.
571 public static final int IX_MIN_DECOMP_NO_CP=8;
572 public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
573
574 // Norm16 value thresholds for quick check combinations and types of extra data.
575
576 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
577 public static final int IX_MIN_YES_NO=10;
578 /** Mappings are comp-normalized. */
579 public static final int IX_MIN_NO_NO=11;
580 public static final int IX_LIMIT_NO_NO=12;
581 public static final int IX_MIN_MAYBE_YES=13;
582
583 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
584 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
585 /** Mappings are not comp-normalized but have a comp boundary before. */
586 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
587 /** Mappings do not have a comp boundary before. */
588 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
589 /** Mappings to the empty string. */
590 public static final int IX_MIN_NO_NO_EMPTY=17;
591
// load() rejects data whose indexes array is not longer than this position.
592 public static final int IX_MIN_LCCC_CP=18;
593 public static final int IX_COUNT=20;
594
// Flags and length field in the first unit of a mapping in extraData.
// When MAPPING_HAS_CCC_LCCC_WORD is set, the unit before the first unit holds
// ccc in its low byte and lccc in its high byte (see getCCFromNoNo(), getFCD16FromNormData()).
595 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
596 public static final int MAPPING_HAS_RAW_MAPPING=0x40;
597 // unused bit 0x20;
// Masks the mapping length out of the first unit (see getDecomposition()).
598 public static final int MAPPING_LENGTH_MASK=0x1f;
599
600 public static final int COMP_1_LAST_TUPLE=0x8000;
601 public static final int COMP_1_TRIPLE=1;
602 public static final int COMP_1_TRAIL_LIMIT=0x3400;
603 public static final int COMP_1_TRAIL_MASK=0x7ffe;
604 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
605 public static final int COMP_2_TRAIL_SHIFT=6;
606 public static final int COMP_2_TRAIL_MASK=0xffc0;
607
608 // higher-level functionality ------------------------------------------ ***
609
610 /**
611 * Decomposes s[src, limit[ and writes the result to dest.
612 * limit can be NULL if src is NUL-terminated.
613 * destLengthEstimate is the initial dest buffer capacity and can be -1.
614 */
615 public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
616 int destLengthEstimate) {
617 if(destLengthEstimate<0) {
681 // Check one above-minimum, relevant code point.
682 src+=Character.charCount(c);
683 if(buffer!=null) {
684 decompose(c, norm16, buffer);
685 } else {
686 if(isDecompYes(norm16)) {
687 int cc=getCCFromYesOrMaybe(norm16);
688 if(prevCC<=cc || cc==0) {
689 prevCC=cc;
690 if(cc<=1) {
691 prevBoundary=src;
692 }
693 continue;
694 }
695 }
696 return prevBoundary; // "no" or cc out of order
697 }
698 }
699 return src;
700 }
701 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
702 int limit=s.length();
703 if(limit==0) {
704 return;
705 }
706 if(doDecompose) {
707 decompose(s, 0, limit, buffer);
708 return;
709 }
710 // Just merge the strings at the boundary.
711 int c=Character.codePointAt(s, 0);
712 int src=0;
713 int firstCC, prevCC, cc;
714 firstCC=prevCC=cc=getCC(getNorm16(c));
715 while(cc!=0) {
716 prevCC=cc;
717 src+=Character.charCount(c);
718 if(src>=limit) {
719 break;
720 }
721 c=Character.codePointAt(s, src);
722 cc=getCC(getNorm16(c));
723 };
724 buffer.append(s, 0, src, firstCC, prevCC);
725 buffer.append(s, src, limit);
726 }
727
728 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
729 // doCompose: normalize
730 // !doCompose: isNormalized (buffer must be empty and initialized)
//
// Composes s[src, limit[ or checks whether it is already composed.
//
// Each pass of the outer loop works in three stages:
//   1. fast path: skip code units below minCompNoMaybeCP or with
//      isCompYesAndZeroCC() data;
//   2. medium-fast paths: handle the stopping character locally where the text
//      around it provides a boundary (simple mappings, Hangul Jamo, in-order
//      combining marks);
//   3. slow path: decompose the text between the nearest boundaries into buffer
//      and recompose it there.
// prevBoundary is the start of the source text that has not been appended yet.
//
// Parameters:
//   s, src, limit    the text and the range to process
//   onlyContiguous   marked "FCC" in the comments below; passed through to the
//                    boundary tests and to recompose()
//   doCompose        true: append the composed text to buffer;
//                    false: only check, buffer is scratch space for the slow path
//   buffer           destination (doCompose) or scratch space (!doCompose)
// Returns true when src reaches limit. Returns false only if !doCompose and text
// was found that composing would change; on a false return from the slow path
// the buffer still holds the recomposed piece.
731 public boolean compose(CharSequence s, int src, int limit,
732 boolean onlyContiguous,
733 boolean doCompose,
734 ReorderingBuffer buffer) {
735 int prevBoundary=src;
736 int minNoMaybeCP=minCompNoMaybeCP;
737
738 for (;;) {
739 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
740 // or with (compYes && ccc==0) properties.
741 int prevSrc;
742 int c = 0;
743 int norm16 = 0;
744 for (;;) {
745 if (src == limit) {
746 if (prevBoundary != limit && doCompose) {
747 buffer.append(s, prevBoundary, limit);
748 }
749 return true;
750 }
751 if( (c=s.charAt(src))<minNoMaybeCP ||
752 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
753 ) {
754 ++src;
755 } else {
756 prevSrc = src++;
757 if(!UTF16.isSurrogate((char)c)) {
758 break;
759 } else {
// Assemble the supplementary code point from the surrogate pair, if there is
// one, and redo the lookup with the full code point.
760 char c2;
761 if(UTF16Plus.isSurrogateLead(c)) {
762 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
763 ++src;
764 c=Character.toCodePoint((char)c, c2);
765 }
766 } else /* trail surrogate */ {
767 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
768 --prevSrc;
769 c=Character.toCodePoint(c2, (char)c);
770 }
771 }
772 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
773 break;
774 }
775 }
776 }
777 }
// Here s[prevSrc, src[ is the character c that stopped the fast path.
778 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
779 // The current character is either a "noNo" (has a mapping)
780 // or a "maybeYes" (combines backward)
781 // or a "yesYes" with ccc!=0.
782 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
783
784 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
785 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes
786 if (!doCompose) {
787 return false;
788 }
789 // Fast path for mapping a character that is immediately surrounded by boundaries.
790 // In this case, we need not decompose around the current character.
791 if (isDecompNoAlgorithmic(norm16)) {
792 // Maps to a single isCompYesAndZeroCC character
793 // which also implies hasCompBoundaryBefore.
794 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
795 hasCompBoundaryBefore(s, src, limit)) {
796 if (prevBoundary != prevSrc) {
797 buffer.append(s, prevBoundary, prevSrc);
798 }
799 buffer.append(mapAlgorithmic(c, norm16), 0);
800 prevBoundary = src;
801 continue;
802 }
803 } else if (norm16 < minNoNoCompBoundaryBefore) {
804 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
805 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
806 hasCompBoundaryBefore(s, src, limit)) {
807 if (prevBoundary != prevSrc) {
808 buffer.append(s, prevBoundary, prevSrc);
809 }
810 int mapping = norm16 >> OFFSET_SHIFT;
811 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
812 buffer.append(extraData, mapping, mapping + length);
813 prevBoundary = src;
814 continue;
815 }
816 } else if (norm16 >= minNoNoEmpty) {
817 // The current character maps to nothing.
818 // Simply omit it from the output if there is a boundary before _or_ after it.
819 // The character itself implies no boundaries.
820 if (hasCompBoundaryBefore(s, src, limit) ||
821 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
822 if (prevBoundary != prevSrc) {
823 buffer.append(s, prevBoundary, prevSrc);
824 }
825 prevBoundary = src;
826 continue;
827 }
828 }
829 // Other "noNo" type, or need to examine more text around this character:
830 // Fall through to the slow path.
831 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
832 char prev=s.charAt(prevSrc-1);
833 if(c<Hangul.JAMO_T_BASE) {
834 // The current character is a Jamo Vowel,
835 // compose with previous Jamo L and following Jamo T.
// l is a char, so a prev below JAMO_L_BASE wraps to a large value and fails the range test.
836 char l = (char)(prev-Hangul.JAMO_L_BASE);
837 if(l<Hangul.JAMO_L_COUNT) {
838 if (!doCompose) {
839 return false;
840 }
841 int t;
842 if (src != limit &&
843 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
844 t < Hangul.JAMO_T_COUNT) {
845 // The next character is a Jamo T.
846 ++src;
847 } else if (hasCompBoundaryBefore(s, src, limit)) {
848 // No Jamo T follows, not even via decomposition.
849 t = 0;
850 } else {
851 t = -1;
852 }
853 if (t >= 0) {
854 int syllable = Hangul.HANGUL_BASE +
855 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
856 Hangul.JAMO_T_COUNT + t;
857 --prevSrc; // Replace the Jamo L as well.
858 if (prevBoundary != prevSrc) {
859 buffer.append(s, prevBoundary, prevSrc);
860 }
861 buffer.append((char)syllable);
862 prevBoundary = src;
863 continue;
864 }
865 // If we see L+V+x where x!=T then we drop to the slow path,
866 // decompose and recompose.
867 // This is to deal with NFKC finding normal L and V but a
868 // compatibility variant of a T.
869 // We need to either fully compose that combination here
870 // (which would complicate the code and may not work with strange custom data)
871 // or use the slow path.
872 }
873 } else if (Hangul.isHangulLV(prev)) {
874 // The current character is a Jamo Trailing consonant,
875 // compose with previous Hangul LV that does not contain a Jamo T.
876 if (!doCompose) {
877 return false;
878 }
879 int syllable = prev + c - Hangul.JAMO_T_BASE;
880 --prevSrc; // Replace the Hangul LV as well.
881 if (prevBoundary != prevSrc) {
882 buffer.append(s, prevBoundary, prevSrc);
883 }
884 buffer.append((char)syllable);
885 prevBoundary = src;
886 continue;
887 }
888 // No matching context, or may need to decompose surrounding text first:
889 // Fall through to the slow path.
890 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC
891 // One or more combining marks that do not combine-back:
892 // Check for canonical order, copy unchanged if ok and
893 // if followed by a character with a boundary-before.
894 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0
895 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
896 // Fails FCD test, need to decompose and contiguously recompose.
897 if (!doCompose) {
898 return false;
899 }
900 } else {
901 // If !onlyContiguous (not FCC), then we ignore the tccc of
902 // the previous character which passed the quick check "yes && ccc==0" test.
903 int n16;
904 for (;;) {
905 if (src == limit) {
906 if (doCompose) {
907 buffer.append(s, prevBoundary, limit);
908 }
909 return true;
910 }
911 int prevCC = cc;
912 c = Character.codePointAt(s, src);
913 n16 = normTrie.get(c);
914 if (n16 >= MIN_YES_YES_WITH_CC) {
915 cc = getCCFromNormalYesOrMaybe(n16);
916 if (prevCC > cc) {
917 if (!doCompose) {
918 return false;
919 }
920 break;
921 }
922 } else {
923 break;
924 }
925 src += Character.charCount(c);
926 }
927 // src is after the last in-order combining mark.
928 // If there is a boundary here, then we continue with no change.
929 if (norm16HasCompBoundaryBefore(n16)) {
930 if (isCompYesAndZeroCC(n16)) {
931 src += Character.charCount(c);
932 }
933 continue;
934 }
935 // Use the slow path. There is no boundary in [prevSrc, src[.
936 }
937 }
938
939 // Slow path: Find the nearest boundaries around the current character,
940 // decompose and recompose.
// If neither the current character nor the one before it provides a boundary,
// the previous character is pulled into the piece that gets decomposed.
941 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
942 c = Character.codePointBefore(s, prevSrc);
943 norm16 = normTrie.get(c);
944 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
945 prevSrc -= Character.charCount(c);
946 }
947 }
948 if (doCompose && prevBoundary != prevSrc) {
949 buffer.append(s, prevBoundary, prevSrc);
950 }
951 int recomposeStartIndex=buffer.length();
952 // We know there is not a boundary here.
953 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
954 buffer);
955 // Decompose until the next boundary.
956 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
957 buffer);
958 recompose(buffer, recomposeStartIndex, onlyContiguous);
959 if(!doCompose) {
// Check-only mode: the piece is normalized only if recomposing reproduced
// the source text; the scratch buffer is emptied before moving on.
960 if(!buffer.equals(s, prevSrc, src)) {
961 return false;
962 }
963 buffer.remove();
964 }
965 prevBoundary=src;
966 }
967 }
968
969 /**
970 * Very similar to compose(): Make the same changes in both places if relevant.
971 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
972 * !doSpan: quickCheck
 *
 * Scans s[src, limit[ without modifying anything. A "no" is reported as soon as
 * a character with a mapping (norm16 in [minNoNo, minMaybeYes[) or combining marks
 * out of canonical order are found; a character with norm16 below MIN_YES_YES_WITH_CC
 * in the maybe/combining-mark branch sets the "maybe" bit (or ends the span if doSpan).
 *
 * @param s the text
 * @param src start index of the range to check
 * @param limit end index (exclusive) of the range to check
 * @param onlyContiguous marked "FCC" below; enables the trailing-cc order test against
 *        the previous character and is passed to the boundary-after test
 * @param doSpan true to stop at the first "maybe" and return the span so far
973 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
974 * bit 0: set if "maybe"; otherwise, if the span length<s.length()
975 * then the quick check result is "no"
976 */
977 public int composeQuickCheck(CharSequence s, int src, int limit,
978 boolean onlyContiguous, boolean doSpan) {
979 int qcResult=0;
980 int prevBoundary=src;
981 int minNoMaybeCP=minCompNoMaybeCP;
982
983 for(;;) {
984 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
985 // or with (compYes && ccc==0) properties.
986 int prevSrc;
987 int c = 0;
988 int norm16 = 0;
989 for (;;) {
990 if(src==limit) {
991 return (src<<1)|qcResult; // "yes" or "maybe"
992 }
993 if( (c=s.charAt(src))<minNoMaybeCP ||
994 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
995 ) {
996 ++src;
997 } else {
998 prevSrc = src++;
999 if(!UTF16.isSurrogate((char)c)) {
1000 break;
1001 } else {
// Assemble the supplementary code point from the surrogate pair, if there is
// one, and redo the lookup with the full code point.
1002 char c2;
1003 if(UTF16Plus.isSurrogateLead(c)) {
1004 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005 ++src;
1006 c=Character.toCodePoint((char)c, c2);
1007 }
1008 } else /* trail surrogate */ {
1009 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010 --prevSrc;
1011 c=Character.toCodePoint(c2, (char)c);
1012 }
1013 }
1014 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1015 break;
1016 }
1017 }
1018 }
1019 }
1020 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021 // The current character is either a "noNo" (has a mapping)
1022 // or a "maybeYes" (combines backward)
1023 // or a "yesYes" with ccc!=0.
1024 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025
// Move prevBoundary up to the current character, or to the character before it
// when neither of the two provides a composition boundary. prevNorm16 stays INERT
// unless that previous character was pulled in.
1026 int prevNorm16 = INERT;
1027 if (prevBoundary != prevSrc) {
1028 prevBoundary = prevSrc;
1029 if (!norm16HasCompBoundaryBefore(norm16)) {
1030 c = Character.codePointBefore(s, prevSrc);
1031 int n16 = getNorm16(c);
1032 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033 prevBoundary -= Character.charCount(c);
1034 prevNorm16 = n16;
1035 }
1036 }
1037 }
1038
1039 if(isMaybeOrNonZeroCC(norm16)) {
1040 int cc=getCCFromYesOrMaybe(norm16);
1041 if (onlyContiguous /* FCC */ && cc != 0 &&
1042 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1043 // The [prevBoundary..prevSrc[ character
1044 // passed the quick check "yes && ccc==0" test
1045 // but is out of canonical order with the current combining mark.
// Nothing to do in this branch: control falls through to the "no" return below.
1046 } else {
1047 // If !onlyContiguous (not FCC), then we ignore the tccc of
1048 // the previous character which passed the quick check "yes && ccc==0" test.
1049 for (;;) {
1050 if (norm16 < MIN_YES_YES_WITH_CC) {
1051 if (!doSpan) {
1052 qcResult = 1;
1053 } else {
1054 return prevBoundary << 1; // spanYes does not care to know it's "maybe"
1055 }
1056 }
1057 if (src == limit) {
1058 return (src<<1) | qcResult; // "yes" or "maybe"
1059 }
1060 int prevCC = cc;
1061 c = Character.codePointAt(s, src);
1062 norm16 = getNorm16(c);
1063 if (isMaybeOrNonZeroCC(norm16)) {
1064 cc = getCCFromYesOrMaybe(norm16);
1065 if (!(prevCC <= cc || cc == 0)) {
1066 break;
1067 }
1068 } else {
1069 break;
1070 }
1071 src += Character.charCount(c);
1072 }
1073 // src is after the last in-order combining mark.
1074 if (isCompYesAndZeroCC(norm16)) {
1075 prevBoundary = src;
1076 src += Character.charCount(c);
1077 continue;
1078 }
1079 }
1080 }
1081 return prevBoundary<<1; // "no"
1082 }
1083 }
1084 public void composeAndAppend(CharSequence s,
1085 boolean doCompose,
1086 boolean onlyContiguous,
1087 ReorderingBuffer buffer) {
1088 int src=0, limit=s.length();
1089 if(!buffer.isEmpty()) {
1090 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1091 if(0!=firstStarterInSrc) {
1092 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1093 buffer.length(), onlyContiguous);
1094 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1095 firstStarterInSrc+16);
1096 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1097 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1098 middle.append(s, 0, firstStarterInSrc);
1099 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1100 src=firstStarterInSrc;
1101 }
1102 }
1103 if(doCompose) {
1104 compose(s, src, limit, onlyContiguous, true, buffer);
1105 } else {
1106 buffer.append(s, src, limit);
1107 }
1108 }
1109 // Dual functionality:
1110 // buffer!=null: normalize
1111 // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
1112 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1113 // Note: In this function we use buffer->appendZeroCC() because we track
1114 // the lead and trail combining classes here, rather than leaving it to
1115 // the ReorderingBuffer.
1116 // The exception is the call to decomposeShort() which uses the buffer
1117 // in the normal way.
1118
1119 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120 // Similar to the prevBoundary in the compose() implementation.
1121 int prevBoundary=src;
1122 int prevSrc;
1123 int c=0;
1124 int prevFCD16=0;
1125 int fcd16=0;
1126
1127 for(;;) {
1128 // count code units with lccc==0
1129 for(prevSrc=src; src!=limit;) {
1130 if((c=s.charAt(src))<minLcccCP) {
1131 prevFCD16=~c;
1132 ++src;
1133 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134 prevFCD16=0;
1135 ++src;
1136 } else {
1137 if(UTF16.isSurrogate((char)c)) {
1138 char c2;
1139 if(UTF16Plus.isSurrogateLead(c)) {
1140 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141 c=Character.toCodePoint((char)c, c2);
1142 }
1143 } else /* trail surrogate */ {
1144 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145 --src;
1146 c=Character.toCodePoint(c2, (char)c);
1147 }
1148 }
1149 }
1150 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151 prevFCD16=fcd16;
1152 src+=Character.charCount(c);
1153 } else {
1154 break;
1155 }
1156 }
1157 }
1158 // copy these code units all at once
1159 if(src!=prevSrc) {
1160 if(src==limit) {
1161 if(buffer!=null) {
1162 buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163 }
1164 break;
1165 }
1166 prevBoundary=src;
1167 // We know that the previous character's lccc==0.
1168 if(prevFCD16<0) {
1169 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1170 int prev=~prevFCD16;
1171 if(prev<minDecompNoCP) {
1172 prevFCD16=0;
1173 } else {
1174 prevFCD16=getFCD16FromNormData(prev);
1175 if(prevFCD16>1) {
1176 --prevBoundary;
1177 }
1178 }
1179 } else {
1180 int p=src-1;
1181 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1182 Character.isHighSurrogate(s.charAt(p-1))
1183 ) {
1184 --p;
1185 // Need to fetch the previous character's FCD value because
1186 // prevFCD16 was just for the trail surrogate code point.
1187 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1188 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1189 }
1190 if(prevFCD16>1) {
1191 prevBoundary=p;
1192 }
1193 }
1194 if(buffer!=null) {
1195 // The last lccc==0 character is excluded from the
1196 // flush-and-append call in case it needs to be modified.
1197 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1198 buffer.append(s, prevBoundary, src);
1217 prevFCD16=fcd16;
1218 continue;
1219 } else if(buffer==null) {
1220 return prevBoundary; // quick check "no"
1221 } else {
1222 /*
1223 * Back out the part of the source that we copied or appended
1224 * already but is now going to be decomposed.
1225 * prevSrc is set to after what was copied/appended.
1226 */
1227 buffer.removeSuffix(prevSrc-prevBoundary);
1228 /*
1229 * Find the part of the source that needs to be decomposed,
1230 * up to the next safe boundary.
1231 */
1232 src=findNextFCDBoundary(s, src, limit);
1233 /*
1234 * The source text does not fulfill the conditions for FCD.
1235 * Decompose and reorder a limited piece of the text.
1236 */
1237 decomposeShort(s, prevBoundary, src, false, false, buffer);
1238 prevBoundary=src;
1239 prevFCD16=0;
1240 }
1241 }
1242 return src;
1243 }
1244
1245 public boolean hasDecompBoundaryBefore(int c) {
1246 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1247 norm16HasDecompBoundaryBefore(getNorm16(c));
1248 }
1249 public boolean norm16HasDecompBoundaryBefore(int norm16) {
1250 if (norm16 < minNoNoCompNoMaybeCC) {
1251 return true;
1252 }
1253 if (norm16 >= limitNoNo) {
1254 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1255 }
1256 // c decomposes, get everything from the variable-length extra data
1257 int mapping=norm16>>OFFSET_SHIFT;
1258 int firstUnit=extraData.charAt(mapping);
1259 // true if leadCC==0 (hasFCDBoundaryBefore())
1260 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1261 }
1262 public boolean hasDecompBoundaryAfter(int c) {
1263 if (c < minDecompNoCP) {
1264 return true;
1265 }
1266 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1267 return true;
1268 }
1269 return norm16HasDecompBoundaryAfter(getNorm16(c));
1270 }
1271 public boolean norm16HasDecompBoundaryAfter(int norm16) {
1272 if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1273 return true;
1274 }
1275 if (norm16 >= limitNoNo) {
1276 if (isMaybeOrNonZeroCC(norm16)) {
1277 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1278 }
1279 // Maps to an isCompYesAndZeroCC.
1280 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1281 }
1282 // c decomposes, get everything from the variable-length extra data
1283 int mapping=norm16>>OFFSET_SHIFT;
1284 int firstUnit=extraData.charAt(mapping);
1285 // decomp after-boundary: same as hasFCDBoundaryAfter(),
1286 // fcd16<=1 || trailCC==0
1287 if(firstUnit>0x1ff) {
1288 return false; // trailCC>1
1289 }
1290 if(firstUnit<=0xff) {
1291 return true; // trailCC==0
1292 }
1293 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1294 // true if leadCC==0 (hasFCDBoundaryBefore())
1295 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1296 }
1297 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1298
1299 public boolean hasCompBoundaryBefore(int c) {
1300 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1301 }
1302 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1303 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1304 }
1305
1306 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
1307 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
1308 private static boolean isInert(int norm16) { return norm16==INERT; }
1309 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
1310 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
1311 private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
1312 private boolean isHangulLVT(int norm16) {
1313 return norm16==hangulLVT();
1314 }
1315 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1316 // UBool isCompYes(uint16_t norm16) const {
1317 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1318 // }
1319 // UBool isCompYesOrMaybe(uint16_t norm16) const {
1320 // return norm16<minNoNo || minMaybeYes<=norm16;
1321 // }
1322 // private boolean hasZeroCCFromDecompYes(int norm16) {
1323 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1324 // }
1325 private boolean isDecompYesAndZeroCC(int norm16) {
1326 return norm16<minYesNo ||
1327 norm16==JAMO_VT ||
1328 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1329 }
1330 /**
1331 * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1332 * the MaybeYes which combine-forward and have ccc=0.
1333 * (Standard Unicode 10 normalization does not have such characters.)
1334 */
1335 private boolean isMostDecompYesAndZeroCC(int norm16) {
1336 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1337 }
1338 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1339
1340 // For use with isCompYes().
1341 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1342 // static uint8_t getCCFromYes(uint16_t norm16) {
1343 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1344 // }
1345 private int getCCFromNoNo(int norm16) {
1346 int mapping=norm16>>OFFSET_SHIFT;
1347 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1348 return extraData.charAt(mapping-1)&0xff;
1349 } else {
1350 return 0;
1351 }
1352 }
1353 int getTrailCCFromCompYesAndZeroCC(int norm16) {
1354 if(norm16<=minYesNo) {
1355 return 0; // yesYes and Hangul LV have ccc=tccc=0
1356 } else {
1357 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1358 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo
1359 }
1360 }
1361
1362 // Requires algorithmic-NoNo.
1363 private int mapAlgorithmic(int c, int norm16) {
1364 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1365 }
1366
1367 // Requires minYesNo<norm16<limitNoNo.
1368 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1369
1370 /**
1371 * @return index into maybeYesCompositions, or -1
1372 */
1373 private int getCompositionsListForDecompYes(int norm16) {
1374 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1375 return -1;
1376 } else {
1377 if((norm16-=minMaybeYes)<0) {
1378 // norm16<minMaybeYes: index into extraData which is a substring at
1379 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1380 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1381 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list
1382 }
1383 return norm16>>OFFSET_SHIFT;
1384 }
1385 }
1386 /**
1387 * @return index into maybeYesCompositions
1388 */
1389 private int getCompositionsListForComposite(int norm16) {
1390 // A composite has both mapping & compositions list.
1391 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1392 int firstUnit=maybeYesCompositions.charAt(list);
1393 return list+ // mapping in maybeYesCompositions
1394 1+ // +1 to skip the first unit with the mapping length
1395 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length
1396 }
1397
1398 // Decompose a short piece of text which is likely to contain characters that
1399 // fail the quick check loop and/or where the quick check loop's overhead
1400 // is unlikely to be amortized.
1401 // Called by the compose() and makeFCD() implementations.
1402 // Public in Java for collation implementation code.
1403 private int decomposeShort(
1404 CharSequence s, int src, int limit,
1405 boolean stopAtCompBoundary, boolean onlyContiguous,
1406 ReorderingBuffer buffer) {
1407 while(src<limit) {
1408 int c=Character.codePointAt(s, src);
1409 if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1410 return src;
1411 }
1412 int norm16 = getNorm16(c);
1413 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414 return src;
1415 }
1416 src+=Character.charCount(c);
1417 decompose(c, norm16, buffer);
1418 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419 return src;
1420 }
1421 }
1422 return src;
1423 }
1424 private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1425 // get the decomposition and the lead and trail cc's
1426 if (norm16 >= limitNoNo) {
1427 if (isMaybeOrNonZeroCC(norm16)) {
1428 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429 return;
1430 }
1431 // Maps to an isCompYesAndZeroCC.
1432 c=mapAlgorithmic(c, norm16);
1433 norm16=getNorm16(c);
1434 }
1435 if (norm16 < minYesNo) {
1436 // c does not decompose
1437 buffer.append(c, 0);
1438 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439 // Hangul syllable: decompose algorithmically
1440 Hangul.decompose(c, buffer);
1441 } else {
1442 // c decomposes, get everything from the variable-length extra data
1443 int mapping=norm16>>OFFSET_SHIFT;
1444 int firstUnit=extraData.charAt(mapping);
1445 int length=firstUnit&MAPPING_LENGTH_MASK;
1446 int leadCC, trailCC;
1447 trailCC=firstUnit>>8;
1448 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449 leadCC=extraData.charAt(mapping-1)>>8;
1450 } else {
1451 leadCC=0;
1452 }
1453 ++mapping; // skip over the firstUnit
1454 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1455 }
1456 }
1457
1458 /**
1459 * Finds the recomposition result for
1460 * a forward-combining "lead" character,
1461 * specified with a pointer to its compositions list,
1462 * and a backward-combining "trail" character.
1463 *
1464 * <p>If the lead and trail characters combine, then this function returns
1465 * the following "compositeAndFwd" value:
1466 * <pre>
1467 * Bits 21..1 composite character
1468 * Bit 0 set if the composite is a forward-combining starter
1469 * </pre>
1470 * otherwise it returns -1.
1471 *
1472 * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473 * encoded as either pairs or triples of 16-bit units.
1474 * The last entry has the high bit of its first unit set.
1475 *
1476 * <p>The list is sorted by ascending trail characters (there are no duplicates).
1477 * A linear search is used.
1478 *
1479 * <p>See normalizer2impl.h for a more detailed description
1480 * of the compositions list format.
1481 */
1482 private static int combine(String compositions, int list, int trail) {
1483 int key1, firstUnit;
1484 if(trail<COMP_1_TRAIL_LIMIT) {
1485 // trail character is 0..33FF
1486 // result entry may have 2 or 3 units
1487 key1=(trail<<1);
1488 while(key1>(firstUnit=compositions.charAt(list))) {
1489 list+=2+(firstUnit&COMP_1_TRIPLE);
1490 }
1491 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1492 if((firstUnit&COMP_1_TRIPLE)!=0) {
1493 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1494 } else {
1495 return compositions.charAt(list+1);
1496 }
1497 }
1498 } else {
1499 // trail character is 3400..10FFFF
1500 // result entry has 3 units
1501 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1502 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1503 int secondUnit;
1504 for(;;) {
1505 if(key1>(firstUnit=compositions.charAt(list))) {
1506 list+=2+(firstUnit&COMP_1_TRIPLE);
1507 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1508 if(key2>(secondUnit=compositions.charAt(list+1))) {
1509 if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1510 break;
1511 } else {
1512 list+=3;
1513 }
1549 int cc, prevCC;
1550 boolean starterIsSupplementary;
1551
1552 // Some of the following variables are not used until we have a forward-combining starter
1553 // and are only initialized now to avoid compiler warnings.
1554 compositionsList=-1; // used as indicator for whether we have a forward-combining starter
1555 starter=-1;
1556 starterIsSupplementary=false;
1557 prevCC=0;
1558
1559 for(;;) {
1560 c=sb.codePointAt(p);
1561 p+=Character.charCount(c);
1562 norm16=getNorm16(c);
1563 cc=getCCFromYesOrMaybe(norm16);
1564 if( // this character combines backward and
1565 isMaybe(norm16) &&
1566 // we have seen a starter that combines forward and
1567 compositionsList>=0 &&
1568 // the backward-combining character is not blocked
1569 (prevCC<cc || prevCC==0)
1570 ) {
1571 if(isJamoVT(norm16)) {
1572 // c is a Jamo V/T, see if we can compose it with the previous character.
1573 if(c<Hangul.JAMO_T_BASE) {
1574 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1575 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1576 if(prev<Hangul.JAMO_L_COUNT) {
1577 pRemove=p-1;
1578 char syllable=(char)
1579 (Hangul.HANGUL_BASE+
1580 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1581 Hangul.JAMO_T_COUNT);
1582 char t;
1583 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1584 ++p;
1585 syllable+=t; // The next character was a Jamo T.
1586 }
1587 sb.setCharAt(starter, syllable);
1588 // remove the Jamo V/T
1589 sb.delete(pRemove, p);
1590 p=pRemove;
1671 starterIsSupplementary=true;
1672 starter=p-2;
1673 }
1674 }
1675 } else if(onlyContiguous) {
1676 // FCC: no discontiguous compositions; any intervening character blocks.
1677 compositionsList=-1;
1678 }
1679 }
1680 buffer.flush();
1681 }
1682
1683 /**
1684 * Does c have a composition boundary before it?
1685 * True if its decomposition begins with a character that has
1686 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1687 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1688 * (isCompYesAndZeroCC()) so we need not decompose.
1689 */
1690 private boolean hasCompBoundaryBefore(int c, int norm16) {
1691 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
1692 }
1693 private boolean norm16HasCompBoundaryBefore(int norm16) {
1694 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1695 }
1696 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1697 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1698 }
1699 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1700 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1701 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1702 }
1703 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1704 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1705 }
1706 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
1707 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1708 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1709 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1710 }
1711
1712 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1713 while(p>0) {
1714 int c=Character.codePointBefore(s, p);
1715 int norm16 = getNorm16(c);
1716 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1717 break;
1718 }
1719 p-=Character.charCount(c);
1720 if(hasCompBoundaryBefore(c, norm16)) {
1721 break;
1722 }
1723 }
1724 return p;
1725 }
1726 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
1727 while(p<limit) {
1728 int c=Character.codePointAt(s, p);
1729 int norm16=normTrie.get(c);
1730 if(hasCompBoundaryBefore(c, norm16)) {
1731 break;
1732 }
1733 p+=Character.charCount(c);
1734 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1735 break;
1736 }
1737 }
1738 return p;
1739 }
1740
1741
1742 private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1743 while(p<limit) {
1744 int c=Character.codePointAt(s, p);
1745 int norm16;
1746 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1747 break;
1748 }
1749 p+=Character.charCount(c);
1750 if (norm16HasDecompBoundaryAfter(norm16)) {
1751 break;
1752 }
1753 }
1754 return p;
1755 }
1756
1757 /**
1758 * Get the canonical decomposition
1759 * sherman for ComposedCharIter
1760 */
1761 public static int getDecompose(int chars[], String decomps[]) {
1762 Normalizer2 impl = Normalizer2.getNFDInstance();
1763
1764 int length=0;
1765 int norm16 = 0;
1766 int ch = -1;
1767 int i = 0;
1768
1769 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
1770 //TBD !!!! the hack code heres save us about 50ms for startup
1771 //need a better solution/lookup
1772 if (ch == 0x30ff)
2015 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2016 // [current]..[p] 1 code point (c, c2) with cc
2017
2018 // move the code units in between up
2019 r=p;
2020 do {
2021 source[--r]=source[--current];
2022 } while (back!=current);
2023 }
2024 }
2025
2026 // insert (c1, c2)
2027 source[current] = c1;
2028 if (c2!=0) {
2029 source[(current+1)] = c2;
2030 }
2031
2032 // we know the cc of the last code point
2033 return trailCC;
2034 }
2035 /**
2036 * merge two UTF-16 string parts together
2037 * to canonically order (order by combining classes) their concatenation
2038 *
2039 * the two strings may already be adjacent, so that the merging is done
2040 * in-place if the two strings are not adjacent, then the buffer holding the
2041 * first one must be large enough
2042 * the second string may or may not be ordered in itself
2043 *
2044 * before: [start]..[current] is already ordered, and
2045 * [next]..[limit] may be ordered in itself, but
2046 * is not in relation to [start..current[
2047 * after: [start..current+(limit-next)[ is ordered
2048 *
2049 * the algorithm is a simple bubble-sort that takes the characters from
2050 * src[next++] and inserts them in correct combining class order into the
2051 * preceding part of the string
2052 *
2053 * since this function is called much less often than the single-code point
2054 * insertOrdered(), it just uses that for easier maintenance
2098
2099 if(ncArgs.next==ncArgs.limit) {
2100 // we know the cc of the last code point
2101 return trailCC;
2102 } else {
2103 if(!adjacent) {
2104 // copy the second string part
2105 do {
2106 source[current++]=data[ncArgs.next++];
2107 } while(ncArgs.next!=ncArgs.limit);
2108 ncArgs.limit=current;
2109 }
2110 PrevArgs prevArgs = new PrevArgs();
2111 prevArgs.src = data;
2112 prevArgs.start = start;
2113 prevArgs.current = ncArgs.limit;
2114 return getPrevCC(prevArgs);
2115 }
2116
2117 }
2118 private static final class PrevArgs{
2119 char[] src;
2120 int start;
2121 int current;
2122 char c1;
2123 char c2;
2124 }
2125
2126 private static final class NextCCArgs{
2127 char[] source;
2128 int next;
2129 int limit;
2130 char c1;
2131 char c2;
2132 }
2133 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2134 args.c1=args.source[args.next++];
2135 args.c2=0;
2136
2137 if (UTF16.isTrailSurrogate(args.c1)) {
2138 /* unpaired second surrogate */
2139 return 0;
2140 } else if (!UTF16.isLeadSurrogate(args.c1)) {
2141 return UCharacter.getCombiningClass(args.c1);
2142 } else if (args.next!=args.limit &&
2143 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2144 ++args.next;
2145 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2146 } else {
2147 /* unpaired first surrogate */
2148 args.c2=0;
2149 return 0;
2150 }
2151 }
2152 private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2153 args.c1=args.src[--args.current];
2154 args.c2=0;
2155
2156 if (args.c1 < MIN_CCC_LCCC_CP) {
2157 return 0;
2158 } else if (UTF16.isLeadSurrogate(args.c1)) {
2159 /* unpaired first surrogate */
2160 return 0;
2161 } else if (!UTF16.isTrailSurrogate(args.c1)) {
2162 return UCharacter.getCombiningClass(args.c1);
2163 } else if (args.current!=args.start &&
2164 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2165 --args.current;
2166 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2167 } else {
2168 /* unpaired second surrogate */
2169 args.c2=0;
2170 return 0;
2171 }
2172 }
2173
2174 private int getPreviousTrailCC(CharSequence s, int start, int p) {
2175 if (start == p) {
2176 return 0;
2177 }
2178 return getFCD16(Character.codePointBefore(s, p));
2179 }
2180
// Version of the normalization data (presumably set when the data is loaded — TODO confirm).
private VersionInfo dataVersion;

// BMP code point thresholds for quick check loops looking at single UTF-16 code units.
private int minDecompNoCP;
// Code points below this value have a composition boundary before them.
private int minCompNoMaybeCP;
// Code points below this value stop the FCD boundary scan in findNextFCDBoundary().
private int minLcccCP;

// Norm16 value thresholds for quick check combinations and types of extra data.
// norm16 values below minYesNo do not decompose; minYesNo itself is tested for Hangul LV.
private int minYesNo;
// Combined with HAS_COMP_BOUNDARY_AFTER this gives the Hangul LVT value.
private int minYesNoMappingsOnly;
// norm16 values below minNoNo satisfy isCompYesAndZeroCC().
private int minNoNo;
private int minNoNoCompBoundaryBefore;
// norm16 values below this have a composition boundary before them.
private int minNoNoCompNoMaybeCC;
private int minNoNoEmpty;
// norm16 values at or above limitNoNo satisfy isDecompNoAlgorithmic().
private int limitNoNo;
// Bias subtracted from the delta stored in a norm16 value by mapAlgorithmic().
private int centerNoNoDelta;
// norm16 values at or above minMaybeYes satisfy isMaybeOrNonZeroCC().
private int minMaybeYes;

// Trie queried for norm16 values by code point.
private Trie2_16 normTrie;
// Compositions lists; extraData is a substring of this at
// index MIN_NORMAL_MAYBE_YES-minMaybeYes.
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2203
2204 }
|