< prev index next >
src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java
Print this page
rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:
*** 1,7 ****
/*
! * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
--- 1,7 ----
/*
! * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
*** 143,154 ****
}
} else {
insert(c, cc);
}
}
! // s must be in NFD, otherwise change the implementation.
! public void append(CharSequence s, int start, int limit,
int leadCC, int trailCC) {
if(start==limit) {
return;
}
if(lastCC<=leadCC || leadCC==0) {
--- 143,153 ----
}
} else {
insert(c, cc);
}
}
! public void append(CharSequence s, int start, int limit, boolean isNFD,
int leadCC, int trailCC) {
if(start==limit) {
return;
}
if(lastCC<=leadCC || leadCC==0) {
*** 165,176 ****
insert(c, leadCC); // insert first code point
while(start<limit) {
c=Character.codePointAt(s, start);
start+=Character.charCount(c);
if(start<limit) {
! // s must be in NFD, otherwise we need to use getCC().
! leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
} else {
leadCC=trailCC;
}
append(c, leadCC);
}
--- 164,178 ----
insert(c, leadCC); // insert first code point
while(start<limit) {
c=Character.codePointAt(s, start);
start+=Character.charCount(c);
if(start<limit) {
! if (isNFD) {
! leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
! } else {
! leadCC = impl.getCC(impl.getNorm16(c));
! }
} else {
leadCC=trailCC;
}
append(c, leadCC);
}
*** 309,318 ****
--- 311,326 ----
// TODO: Propose as public API on the UTF16 class.
// TODO: Propose widening UTF16 methods that take char to take int.
// TODO: Propose widening UTF16 methods that take String to take CharSequence.
public static final class UTF16Plus {
/**
+ * Is this code point a lead surrogate (U+d800..U+dbff)?
+ * The test clears the low 10 bits of c and compares what remains with 0xd800,
+ * so any other int value, including a negative one, yields false.
+ * @param c code unit or code point
+ * @return true if c is in the range 0xd800..0xdbff, otherwise false
+ */
+ public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
+ /**
* Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
* is it a lead surrogate?
* @param c code unit or code point
* @return true or false
*/
*** 348,358 ****
/** No-argument constructor; its body is empty. */
public NormalizerImpl() {}
/** {@link ICUBinary.Authenticate} implementation that looks only at the first byte of the data version. */
private static final class IsAcceptable implements ICUBinary.Authenticate {
/**
 * @param version data version bytes; only {@code version[0]} is examined
 * @return true when {@code version[0]} equals 3
 */
public boolean isDataVersionAcceptable(byte version[]) {
! return version[0]==3;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
--- 356,366 ----
/** No-argument constructor; its body is empty. */
public NormalizerImpl() {}
/** {@link ICUBinary.Authenticate} implementation that looks only at the first byte of the data version. */
private static final class IsAcceptable implements ICUBinary.Authenticate {
/**
 * @param version data version bytes; only {@code version[0]} is examined
 * @return true when {@code version[0]} equals 4
 */
public boolean isDataVersionAcceptable(byte version[]) {
! return version[0]==4;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
*** 385,412 ****
centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
// Read the normTrie.
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
! normTrie=Trie2_16.createFromSerialized(bytes);
! int trieLength=normTrie.getSerializedLength();
if(trieLength>(nextOffset-offset)) {
throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
}
ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
// Read the composition and mapping data.
offset=nextOffset;
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
int numChars=(nextOffset-offset)/2;
- char[] chars;
if(numChars!=0) {
! chars=new char[numChars];
! for(int i=0; i<numChars; ++i) {
! chars[i]=bytes.getChar();
! }
! maybeYesCompositions=new String(chars);
extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
}
// smallFCD: new in formatVersion 2
offset=nextOffset;
--- 393,416 ----
centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
// Read the normTrie.
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
! int triePosition = bytes.position();
! normTrie = CodePointTrie.Fast16.fromBinary(bytes);
! int trieLength = bytes.position() - triePosition;
if(trieLength>(nextOffset-offset)) {
throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
}
ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
// Read the composition and mapping data.
offset=nextOffset;
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
int numChars=(nextOffset-offset)/2;
if(numChars!=0) {
! maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
}
// smallFCD: new in formatVersion 2
offset=nextOffset;
*** 420,431 ****
}
/**
 * Loads data identified by a name.
 * Passes the result of {@code ICUBinary.getRequiredData(name)} to another {@code load} overload.
 * @param name identifier handed to {@code ICUBinary.getRequiredData}
 * @return whatever the delegated {@code load} call returns
 */
public NormalizerImpl load(String name) {
return load(ICUBinary.getRequiredData(name));
}
!
! public int getNorm16(int c) { return normTrie.get(c); } // direct normTrie lookup for c
// True when limitNoNo <= norm16 < minMaybeYes.
public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
// True when minNoNo <= norm16 < minMaybeYes.
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
// True when norm16 is below minYesNo, or at or above minMaybeYes.
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
public int getCC(int norm16) {
--- 424,439 ----
}
/**
 * Loads data identified by a name.
 * Passes the result of {@code ICUBinary.getRequiredData(name)} to another {@code load} overload.
 * @param name identifier handed to {@code ICUBinary.getRequiredData}
 * @return whatever the delegated {@code load} call returns
 */
public NormalizerImpl load(String name) {
return load(ICUBinary.getRequiredData(name));
}
! // The trie stores values for lead surrogate code *units*.
! // Surrogate code *points* are inert: a lead surrogate passed as c yields INERT without a trie lookup.
! public int getNorm16(int c) {
! return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); // every other c: value stored in normTrie
! }
! public int getRawNorm16(int c) { return normTrie.get(c); } // trie value as stored, without the lead-surrogate check made in getNorm16
// True when limitNoNo <= norm16 < minMaybeYes.
public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
// True when minNoNo <= norm16 < minMaybeYes.
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
// True when norm16 is below minYesNo, or at or above minMaybeYes.
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
public int getCC(int norm16) {
*** 484,494 ****
if (deltaTrailCC <= DELTA_TCCC_1) {
return deltaTrailCC >> OFFSET_SHIFT;
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
! norm16=getNorm16(c);
}
}
if(norm16<=minYesNo || isHangulLVT(norm16)) {
// no decomposition or Hangul syllable, all zeros
return 0;
--- 492,502 ----
if (deltaTrailCC <= DELTA_TCCC_1) {
return deltaTrailCC >> OFFSET_SHIFT;
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
! norm16=getRawNorm16(c);
}
}
if(norm16<=minYesNo || isHangulLVT(norm16)) {
// no decomposition or Hangul syllable, all zeros
return 0;
*** 517,527 ****
int decomp = -1;
if(isDecompNoAlgorithmic(norm16)) {
// Maps to an isCompYesAndZeroCC.
decomp=c=mapAlgorithmic(c, norm16);
// The mapping might decompose further.
! norm16 = getNorm16(c);
}
if (norm16 < minYesNo) {
if(decomp<0) {
return null;
} else {
--- 525,535 ----
int decomp = -1;
if(isDecompNoAlgorithmic(norm16)) {
// Maps to an isCompYesAndZeroCC.
decomp=c=mapAlgorithmic(c, norm16);
// The mapping might decompose further.
! norm16 = getRawNorm16(c);
}
if (norm16 < minYesNo) {
if(decomp<0) {
return null;
} else {
*** 639,670 ****
for(;;) {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=s.charAt(src))<minNoCP ||
! isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
) {
++src;
! } else if(!UTF16.isSurrogate((char)c)) {
break;
} else {
char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src;
! c=Character.toCodePoint(c2, (char)c);
! }
! }
! if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
! src+=Character.charCount(c);
} else {
break;
}
}
}
// copy these code units all at once
if(src!=prevSrc) {
if(buffer!=null) {
--- 647,674 ----
for(;;) {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=s.charAt(src))<minNoCP ||
! isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
! } else if(!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
! if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
! c = Character.toCodePoint((char)c, c2);
! norm16 = normTrie.suppGet(c);
! if (isMostDecompYesAndZeroCC(norm16)) {
! src += 2;
} else {
break;
}
+ } else {
+ ++src; // unpaired lead surrogate: inert
+ }
}
}
// copy these code units all at once
if(src!=prevSrc) {
if(buffer!=null) {
*** 719,729 ****
break;
}
c=Character.codePointAt(s, src);
cc=getCC(getNorm16(c));
};
! buffer.append(s, 0, src, firstCC, prevCC);
buffer.append(s, src, limit);
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
--- 723,733 ----
break;
}
c=Character.codePointAt(s, src);
cc=getCC(getNorm16(c));
};
! buffer.append(s, 0, src, false, firstCC, prevCC);
buffer.append(s, src, limit);
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
*** 747,782 ****
buffer.append(s, prevBoundary, limit);
}
return true;
}
if( (c=s.charAt(src))<minNoMaybeCP ||
! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
) {
++src;
} else {
prevSrc = src++;
! if(!UTF16.isSurrogate((char)c)) {
break;
} else {
char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
++src;
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
! --prevSrc;
! c=Character.toCodePoint(c2, (char)c);
! }
! }
! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
break;
}
}
}
}
// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
// The current character is either a "noNo" (has a mapping)
// or a "maybeYes" (combines backward)
// or a "yesYes" with ccc!=0.
// It is not a Hangul syllable or Jamo L because those have "yes" properties.
--- 751,780 ----
buffer.append(s, prevBoundary, limit);
}
return true;
}
if( (c=s.charAt(src))<minNoMaybeCP ||
! isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
} else {
prevSrc = src++;
! if (!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
! if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
++src;
! c = Character.toCodePoint((char)c, c2);
! norm16 = normTrie.suppGet(c);
! if (!isCompYesAndZeroCC(norm16)) {
break;
}
}
}
}
+ }
// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
// The current character is either a "noNo" (has a mapping)
// or a "maybeYes" (combines backward)
// or a "yesYes" with ccc!=0.
// It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 989,1024 ****
for (;;) {
if(src==limit) {
return (src<<1)|qcResult; // "yes" or "maybe"
}
if( (c=s.charAt(src))<minNoMaybeCP ||
! isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
) {
++src;
} else {
prevSrc = src++;
! if(!UTF16.isSurrogate((char)c)) {
break;
} else {
char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
++src;
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
! --prevSrc;
! c=Character.toCodePoint(c2, (char)c);
! }
! }
! if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
break;
}
}
}
}
// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
// The current character is either a "noNo" (has a mapping)
// or a "maybeYes" (combines backward)
// or a "yesYes" with ccc!=0.
// It is not a Hangul syllable or Jamo L because those have "yes" properties.
--- 987,1016 ----
for (;;) {
if(src==limit) {
return (src<<1)|qcResult; // "yes" or "maybe"
}
if( (c=s.charAt(src))<minNoMaybeCP ||
! isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
} else {
prevSrc = src++;
! if (!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
! if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
++src;
! c = Character.toCodePoint((char)c, c2);
! norm16 = normTrie.suppGet(c);
! if (!isCompYesAndZeroCC(norm16)) {
break;
}
}
}
}
+ }
// isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
// The current character is either a "noNo" (has a mapping)
// or a "maybeYes" (combines backward)
// or a "yesYes" with ccc!=0.
// It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 1132,1152 ****
++src;
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
prevFCD16=0;
++src;
} else {
! if(UTF16.isSurrogate((char)c)) {
char c2;
! if(UTF16Plus.isSurrogateLead(c)) {
! if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
! c=Character.toCodePoint((char)c, c2);
! }
! } else /* trail surrogate */ {
! if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
! --src;
! c=Character.toCodePoint(c2, (char)c);
! }
}
}
if((fcd16=getFCD16FromNormData(c))<=0xff) {
prevFCD16=fcd16;
src+=Character.charCount(c);
--- 1124,1137 ----
++src;
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
prevFCD16=0;
++src;
} else {
! if (UTF16Plus.isLeadSurrogate(c)) {
char c2;
! if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
! c = Character.toCodePoint((char)c, c2);
}
}
if((fcd16=getFCD16FromNormData(c))<=0xff) {
prevFCD16=fcd16;
src+=Character.charCount(c);
*** 1428,1438 ****
buffer.append(c, getCCFromYesOrMaybe(norm16));
return;
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
! norm16=getNorm16(c);
}
if (norm16 < minYesNo) {
// c does not decompose
buffer.append(c, 0);
} else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
--- 1413,1423 ----
buffer.append(c, getCCFromYesOrMaybe(norm16));
return;
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
! norm16=getRawNorm16(c);
}
if (norm16 < minYesNo) {
// c does not decompose
buffer.append(c, 0);
} else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
*** 1449,1459 ****
leadCC=extraData.charAt(mapping-1)>>8;
} else {
leadCC=0;
}
++mapping; // skip over the firstUnit
! buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
}
}
/**
* Finds the recomposition result for
--- 1434,1444 ----
leadCC=extraData.charAt(mapping-1)>>8;
} else {
leadCC=0;
}
++mapping; // skip over the firstUnit
! buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
}
}
/**
* Finds the recomposition result for
*** 1641,1651 ****
break;
}
// Is the composite a starter that combines forward?
if((compositeAndFwd&1)!=0) {
compositionsList=
! getCompositionsListForComposite(getNorm16(composite));
} else {
compositionsList=-1;
}
// We combined; continue with looking for compositions.
--- 1626,1636 ----
break;
}
// Is the composite a starter that combines forward?
if((compositeAndFwd&1)!=0) {
compositionsList=
! getCompositionsListForComposite(getRawNorm16(composite));
} else {
compositionsList=-1;
}
// We combined; continue with looking for compositions.
*** 2194,2204 ****
private int minNoNoEmpty;
private int limitNoNo;
private int centerNoNoDelta;
private int minMaybeYes;
! private Trie2_16 normTrie;
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
!
! }
--- 2179,2188 ----
private int minNoNoEmpty;
private int limitNoNo;
private int centerNoNoDelta;
private int minMaybeYes;
! private CodePointTrie.Fast16 normTrie;
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
! }
< prev index next >