< prev index next >

src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

Print this page
rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:

*** 1,7 **** /* ! * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 143,154 **** } } else { insert(c, cc); } } ! // s must be in NFD, otherwise change the implementation. ! public void append(CharSequence s, int start, int limit, int leadCC, int trailCC) { if(start==limit) { return; } if(lastCC<=leadCC || leadCC==0) { --- 143,153 ---- } } else { insert(c, cc); } } ! public void append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC) { if(start==limit) { return; } if(lastCC<=leadCC || leadCC==0) {
*** 165,176 **** insert(c, leadCC); // insert first code point while(start<limit) { c=Character.codePointAt(s, start); start+=Character.charCount(c); if(start<limit) { ! // s must be in NFD, otherwise we need to use getCC(). ! leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); } else { leadCC=trailCC; } append(c, leadCC); } --- 164,178 ---- insert(c, leadCC); // insert first code point while(start<limit) { c=Character.codePointAt(s, start); start+=Character.charCount(c); if(start<limit) { ! if (isNFD) { ! leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); ! } else { ! leadCC = impl.getCC(impl.getNorm16(c)); ! } } else { leadCC=trailCC; } append(c, leadCC); }
*** 309,318 **** --- 311,326 ---- // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int. // TODO: Propose widening UTF16 methods that take String to take CharSequence. public static final class UTF16Plus { /** + * Is this code point a lead surrogate (U+d800..U+dbff)? + * @param c code unit or code point + * @return true or false + */ + public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } + /** * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), * is it a lead surrogate? * @param c code unit or code point * @return true or false */
*** 348,358 **** public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { public boolean isDataVersionAcceptable(byte version[]) { ! return version[0]==3; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" --- 356,366 ---- public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { public boolean isDataVersionAcceptable(byte version[]) { ! return version[0]==4; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
*** 385,412 **** centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; ! normTrie=Trie2_16.createFromSerialized(bytes); ! int trieLength=normTrie.getSerializedLength(); if(trieLength>(nextOffset-offset)) { throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; - char[] chars; if(numChars!=0) { ! chars=new char[numChars]; ! for(int i=0; i<numChars; ++i) { ! chars[i]=bytes.getChar(); ! } ! maybeYesCompositions=new String(chars); extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); } // smallFCD: new in formatVersion 2 offset=nextOffset; --- 393,416 ---- centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; ! int triePosition = bytes.position(); ! normTrie = CodePointTrie.Fast16.fromBinary(bytes); ! int trieLength = bytes.position() - triePosition; if(trieLength>(nextOffset-offset)) { throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; if(numChars!=0) { ! maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); } // smallFCD: new in formatVersion 2 offset=nextOffset;
*** 420,431 **** } public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } ! ! public int getNorm16(int c) { return normTrie.get(c); } public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) { --- 424,439 ---- } public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } ! // The trie stores values for lead surrogate code *units*. ! // Surrogate code *points* are inert. ! public int getNorm16(int c) { ! return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); ! } ! public int getRawNorm16(int c) { return normTrie.get(c); } public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) {
*** 484,494 **** if (deltaTrailCC <= DELTA_TCCC_1) { return deltaTrailCC >> OFFSET_SHIFT; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); } } if(norm16<=minYesNo || isHangulLVT(norm16)) { // no decomposition or Hangul syllable, all zeros return 0; --- 492,502 ---- if (deltaTrailCC <= DELTA_TCCC_1) { return deltaTrailCC >> OFFSET_SHIFT; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); ! norm16=getRawNorm16(c); } } if(norm16<=minYesNo || isHangulLVT(norm16)) { // no decomposition or Hangul syllable, all zeros return 0;
*** 517,527 **** int decomp = -1; if(isDecompNoAlgorithmic(norm16)) { // Maps to an isCompYesAndZeroCC. decomp=c=mapAlgorithmic(c, norm16); // The mapping might decompose further. ! norm16 = getNorm16(c); } if (norm16 < minYesNo) { if(decomp<0) { return null; } else { --- 525,535 ---- int decomp = -1; if(isDecompNoAlgorithmic(norm16)) { // Maps to an isCompYesAndZeroCC. decomp=c=mapAlgorithmic(c, norm16); // The mapping might decompose further. ! norm16 = getRawNorm16(c); } if (norm16 < minYesNo) { if(decomp<0) { return null; } else {
*** 639,670 **** for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoCP || ! isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; ! } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); ! } ! } ! if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { ! src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(buffer!=null) { --- 647,674 ---- for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoCP || ! isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) ) { ++src; ! } else if(!UTF16Plus.isLeadSurrogate(c)) { break; } else { char c2; ! if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { ! c = Character.toCodePoint((char)c, c2); ! norm16 = normTrie.suppGet(c); ! if (isMostDecompYesAndZeroCC(norm16)) { ! src += 2; } else { break; } + } else { + ++src; // unpaired lead surrogate: inert + } } } // copy these code units all at once if(src!=prevSrc) { if(buffer!=null) {
*** 719,729 **** break; } c=Character.codePointAt(s, src); cc=getCC(getNorm16(c)); }; ! buffer.append(s, 0, src, firstCC, prevCC); buffer.append(s, src, limit); } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize --- 723,733 ---- break; } c=Character.codePointAt(s, src); cc=getCC(getNorm16(c)); }; ! buffer.append(s, 0, src, false, firstCC, prevCC); buffer.append(s, src, limit); } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize
*** 747,782 **** buffer.append(s, prevBoundary, limit); } return true; } if( (c=s.charAt(src))<minNoMaybeCP || ! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else { prevSrc = src++; ! if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ++src; ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { ! --prevSrc; ! c=Character.toCodePoint(c2, (char)c); ! } ! } ! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. --- 751,780 ---- buffer.append(s, prevBoundary, limit); } return true; } if( (c=s.charAt(src))<minNoMaybeCP || ! isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) ) { ++src; } else { prevSrc = src++; ! if (!UTF16Plus.isLeadSurrogate(c)) { break; } else { char c2; ! if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { ++src; ! c = Character.toCodePoint((char)c, c2); ! norm16 = normTrie.suppGet(c); ! if (!isCompYesAndZeroCC(norm16)) { break; } } } } + } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 989,1024 **** for (;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || ! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else { prevSrc = src++; ! if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ++src; ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { ! --prevSrc; ! c=Character.toCodePoint(c2, (char)c); ! } ! } ! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. --- 987,1016 ---- for (;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || ! isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) ) { ++src; } else { prevSrc = src++; ! if (!UTF16Plus.isLeadSurrogate(c)) { break; } else { char c2; ! if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { ++src; ! c = Character.toCodePoint((char)c, c2); ! norm16 = normTrie.suppGet(c); ! if (!isCompYesAndZeroCC(norm16)) { break; } } } } + } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 1132,1152 **** ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src; } else { ! if(UTF16.isSurrogate((char)c)) { char c2; ! if(UTF16Plus.isSurrogateLead(c)) { ! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { ! c=Character.toCodePoint((char)c, c2); ! } ! } else /* trail surrogate */ { ! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { ! --src; ! c=Character.toCodePoint(c2, (char)c); ! } } } if((fcd16=getFCD16FromNormData(c))<=0xff) { prevFCD16=fcd16; src+=Character.charCount(c); --- 1124,1137 ---- ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src; } else { ! if (UTF16Plus.isLeadSurrogate(c)) { char c2; ! if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { ! c = Character.toCodePoint((char)c, c2); } } if((fcd16=getFCD16FromNormData(c))<=0xff) { prevFCD16=fcd16; src+=Character.charCount(c);
*** 1428,1438 **** buffer.append(c, getCCFromYesOrMaybe(norm16)); return; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); ! norm16=getNorm16(c); } if (norm16 < minYesNo) { // c does not decompose buffer.append(c, 0); } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { --- 1413,1423 ---- buffer.append(c, getCCFromYesOrMaybe(norm16)); return; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); ! norm16=getRawNorm16(c); } if (norm16 < minYesNo) { // c does not decompose buffer.append(c, 0); } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
*** 1449,1459 **** leadCC=extraData.charAt(mapping-1)>>8; } else { leadCC=0; } ++mapping; // skip over the firstUnit ! buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); } } /** * Finds the recomposition result for --- 1434,1444 ---- leadCC=extraData.charAt(mapping-1)>>8; } else { leadCC=0; } ++mapping; // skip over the firstUnit ! buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); } } /** * Finds the recomposition result for
*** 1641,1651 **** break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= ! getCompositionsListForComposite(getNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions. --- 1626,1636 ---- break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= ! getCompositionsListForComposite(getRawNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions.
*** 2194,2204 **** private int minNoNoEmpty; private int limitNoNo; private int centerNoNoDelta; private int minMaybeYes; ! private Trie2_16 normTrie; private String maybeYesCompositions; private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 ! ! } --- 2179,2188 ---- private int minNoNoEmpty; private int limitNoNo; private int centerNoNoDelta; private int minMaybeYes; ! private CodePointTrie.Fast16 normTrie; private String maybeYesCompositions; private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 ! }
< prev index next >