open Cdiff src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java

rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:


*** 1,7 ****
  /*
!  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.  Oracle designates this
--- 1,7 ----
  /*
!  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.  Oracle designates this
*** 143,154 ****
                  }
              } else {
                  insert(c, cc);
              }
          }
!         // s must be in NFD, otherwise change the implementation.
!         public void append(CharSequence s, int start, int limit,
                             int leadCC, int trailCC) {
              if(start==limit) {
                  return;
              }
              if(lastCC<=leadCC || leadCC==0) {
--- 143,153 ----
                  }
              } else {
                  insert(c, cc);
              }
          }
!         public void append(CharSequence s, int start, int limit, boolean isNFD,
                             int leadCC, int trailCC) {
              if(start==limit) {
                  return;
              }
              if(lastCC<=leadCC || leadCC==0) {
*** 165,176 ****
                  insert(c, leadCC);  // insert first code point
                  while(start<limit) {
                      c=Character.codePointAt(s, start);
                      start+=Character.charCount(c);
                      if(start<limit) {
!                         // s must be in NFD, otherwise we need to use getCC().
!                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
                      } else {
                          leadCC=trailCC;
                      }
                      append(c, leadCC);
                  }
--- 164,178 ----
                  insert(c, leadCC);  // insert first code point
                  while(start<limit) {
                      c=Character.codePointAt(s, start);
                      start+=Character.charCount(c);
                      if(start<limit) {
!                         if (isNFD) {
!                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
!                         } else {
!                             leadCC = impl.getCC(impl.getNorm16(c));
!                         }
                      } else {
                          leadCC=trailCC;
                      }
                      append(c, leadCC);
                  }
*** 309,318 ****
--- 311,326 ----
      // TODO: Propose as public API on the UTF16 class.
      // TODO: Propose widening UTF16 methods that take char to take int.
      // TODO: Propose widening UTF16 methods that take String to take CharSequence.
      public static final class UTF16Plus {
          /**
+          * Is c a lead surrogate, i.e. a value in the range U+d800..U+dbff?
+          * @param c code unit or code point; any int value is accepted
+          * @return true only if {@code (c & 0xfffffc00) == 0xd800}; values above U+ffff and negative values yield false
+          */
+         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
+         /**
           * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
           * is it a lead surrogate?
           * @param c code unit or code point
           * @return true or false
           */
*** 348,358 ****
  
      public NormalizerImpl() {}
  
      private static final class IsAcceptable implements ICUBinary.Authenticate {
          public boolean isDataVersionAcceptable(byte version[]) {
!             return version[0]==3;
          }
      }
      private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
      private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
  
--- 356,366 ----
  
      public NormalizerImpl() {}
  
      private static final class IsAcceptable implements ICUBinary.Authenticate {
          public boolean isDataVersionAcceptable(byte version[]) {
!             return version[0]==4;  // only the first version byte is examined; it must equal 4
          }
      }
      private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
      private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
  
*** 385,412 ****
              centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
  
              // Read the normTrie.
              int offset=inIndexes[IX_NORM_TRIE_OFFSET];
              int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
!             normTrie=Trie2_16.createFromSerialized(bytes);
!             int trieLength=normTrie.getSerializedLength();
              if(trieLength>(nextOffset-offset)) {
                  throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
              }
              ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
  
              // Read the composition and mapping data.
              offset=nextOffset;
              nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
              int numChars=(nextOffset-offset)/2;
-             char[] chars;
              if(numChars!=0) {
!                 chars=new char[numChars];
!                 for(int i=0; i<numChars; ++i) {
!                     chars[i]=bytes.getChar();
!                 }
!                 maybeYesCompositions=new String(chars);
                  extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
              }
  
              // smallFCD: new in formatVersion 2
              offset=nextOffset;
--- 393,416 ----
              centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
  
              // Read the normTrie.
              int offset=inIndexes[IX_NORM_TRIE_OFFSET];
              int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
!             int triePosition = bytes.position();
!             normTrie = CodePointTrie.Fast16.fromBinary(bytes);
!             int trieLength = bytes.position() - triePosition;
              if(trieLength>(nextOffset-offset)) {
                  throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
              }
              ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
  
              // Read the composition and mapping data.
              offset=nextOffset;
              nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
              int numChars=(nextOffset-offset)/2;
              if(numChars!=0) {
!                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
                  extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
              }
  
              // smallFCD: new in formatVersion 2
              offset=nextOffset;
*** 420,431 ****
      }
      public NormalizerImpl load(String name) {
          return load(ICUBinary.getRequiredData(name));
      }
  
! 
!     public int getNorm16(int c) { return normTrie.get(c); }
      public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
      public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
      public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
  
      public int getCC(int norm16) {
--- 424,439 ----
      }
      public NormalizerImpl load(String name) {
          return load(ICUBinary.getRequiredData(name));
      }
  
!     // The trie stores values for lead surrogate code *units*, but here c is treated as a code *point*:
!     // a lead surrogate c returns INERT instead of the trie's value; every other c is looked up in normTrie.
!     public int getNorm16(int c) {
!         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
!     }
!     public int getRawNorm16(int c) { return normTrie.get(c); }  // direct trie lookup: unlike getNorm16(), no INERT override for lead surrogates
      public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
      public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
      public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
  
      public int getCC(int norm16) {
*** 484,494 ****
                  if (deltaTrailCC <= DELTA_TCCC_1) {
                      return deltaTrailCC >> OFFSET_SHIFT;
                  }
                  // Maps to an isCompYesAndZeroCC.
                  c=mapAlgorithmic(c, norm16);
!                 norm16=getNorm16(c);
              }
          }
          if(norm16<=minYesNo || isHangulLVT(norm16)) {
              // no decomposition or Hangul syllable, all zeros
              return 0;
--- 492,502 ----
                  if (deltaTrailCC <= DELTA_TCCC_1) {
                      return deltaTrailCC >> OFFSET_SHIFT;
                  }
                  // Maps to an isCompYesAndZeroCC.
                  c=mapAlgorithmic(c, norm16);
!                 norm16=getRawNorm16(c);
              }
          }
          if(norm16<=minYesNo || isHangulLVT(norm16)) {
              // no decomposition or Hangul syllable, all zeros
              return 0;
*** 517,527 ****
          int decomp = -1;
          if(isDecompNoAlgorithmic(norm16)) {
              // Maps to an isCompYesAndZeroCC.
              decomp=c=mapAlgorithmic(c, norm16);
              // The mapping might decompose further.
!             norm16 = getNorm16(c);
          }
          if (norm16 < minYesNo) {
              if(decomp<0) {
                  return null;
              } else {
--- 525,535 ----
          int decomp = -1;
          if(isDecompNoAlgorithmic(norm16)) {
              // Maps to an isCompYesAndZeroCC.
              decomp=c=mapAlgorithmic(c, norm16);
              // The mapping might decompose further.
!             norm16 = getRawNorm16(c);
          }
          if (norm16 < minYesNo) {
              if(decomp<0) {
                  return null;
              } else {
*** 639,670 ****
  
          for(;;) {
              // count code units below the minimum or with irrelevant data for the quick check
              for(prevSrc=src; src!=limit;) {
                  if( (c=s.charAt(src))<minNoCP ||
!                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                  ) {
                      ++src;
!                 } else if(!UTF16.isSurrogate((char)c)) {
                      break;
                  } else {
                      char c2;
!                     if(UTF16Plus.isSurrogateLead(c)) {
!                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
!                             c=Character.toCodePoint((char)c, c2);
!                         }
!                     } else /* trail surrogate */ {
!                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
!                             --src;
!                             c=Character.toCodePoint(c2, (char)c);
!                         }
!                     }
!                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
!                         src+=Character.charCount(c);
                      } else {
                          break;
                      }
                  }
              }
              // copy these code units all at once
              if(src!=prevSrc) {
                  if(buffer!=null) {
--- 647,674 ----
  
          for(;;) {
              // count code units below the minimum or with irrelevant data for the quick check
              for(prevSrc=src; src!=limit;) {
                  if( (c=s.charAt(src))<minNoCP ||
!                     isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                  ) {
                      ++src;
!                 } else if(!UTF16Plus.isLeadSurrogate(c)) {
                      break;
                  } else {
                      char c2;
!                     if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
!                         c = Character.toCodePoint((char)c, c2);
!                         norm16 = normTrie.suppGet(c);
!                         if (isMostDecompYesAndZeroCC(norm16)) {
!                             src += 2;
                          } else {
                              break;
                          }
+                     } else {
+                         ++src;  // unpaired lead surrogate: inert
+                     }
                  }
              }
              // copy these code units all at once
              if(src!=prevSrc) {
                  if(buffer!=null) {
*** 719,729 ****
                  break;
              }
              c=Character.codePointAt(s, src);
              cc=getCC(getNorm16(c));
          };
!         buffer.append(s, 0, src, firstCC, prevCC);
          buffer.append(s, src, limit);
      }
  
      // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
      // doCompose: normalize
--- 723,733 ----
                  break;
              }
              c=Character.codePointAt(s, src);
              cc=getCC(getNorm16(c));
          };
!         buffer.append(s, 0, src, false, firstCC, prevCC);
          buffer.append(s, src, limit);
      }
  
      // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
      // doCompose: normalize
*** 747,782 ****
                          buffer.append(s, prevBoundary, limit);
                      }
                      return true;
                  }
                  if( (c=s.charAt(src))<minNoMaybeCP ||
!                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                  ) {
                      ++src;
                  } else {
                      prevSrc = src++;
!                     if(!UTF16.isSurrogate((char)c)) {
                          break;
                      } else {
                          char c2;
!                         if(UTF16Plus.isSurrogateLead(c)) {
!                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
                                  ++src;
!                                 c=Character.toCodePoint((char)c, c2);
!                             }
!                         } else /* trail surrogate */ {
!                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
!                                 --prevSrc;
!                                 c=Character.toCodePoint(c2, (char)c);
!                             }
!                         }
!                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
                              break;
                          }
                      }
                  }
              }
              // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
              // The current character is either a "noNo" (has a mapping)
              // or a "maybeYes" (combines backward)
              // or a "yesYes" with ccc!=0.
              // It is not a Hangul syllable or Jamo L because those have "yes" properties.
--- 751,780 ----
                          buffer.append(s, prevBoundary, limit);
                      }
                      return true;
                  }
                  if( (c=s.charAt(src))<minNoMaybeCP ||
!                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                  ) {
                      ++src;
                  } else {
                      prevSrc = src++;
!                     if (!UTF16Plus.isLeadSurrogate(c)) {
                          break;
                      } else {
                          char c2;
!                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                              ++src;
!                             c = Character.toCodePoint((char)c, c2);
!                             norm16 = normTrie.suppGet(c);
!                             if (!isCompYesAndZeroCC(norm16)) {
                                  break;
                              }
                          }
                      }
                  }
+             }
              // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
              // The current character is either a "noNo" (has a mapping)
              // or a "maybeYes" (combines backward)
              // or a "yesYes" with ccc!=0.
              // It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 989,1024 ****
              for (;;) {
                  if(src==limit) {
                      return (src<<1)|qcResult;  // "yes" or "maybe"
                  }
                  if( (c=s.charAt(src))<minNoMaybeCP ||
!                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                  ) {
                      ++src;
                  } else {
                      prevSrc = src++;
!                     if(!UTF16.isSurrogate((char)c)) {
                          break;
                      } else {
                          char c2;
!                         if(UTF16Plus.isSurrogateLead(c)) {
!                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
                                  ++src;
!                                 c=Character.toCodePoint((char)c, c2);
!                             }
!                         } else /* trail surrogate */ {
!                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
!                                 --prevSrc;
!                                 c=Character.toCodePoint(c2, (char)c);
!                             }
!                         }
!                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
                              break;
                          }
                      }
                  }
              }
              // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
              // The current character is either a "noNo" (has a mapping)
              // or a "maybeYes" (combines backward)
              // or a "yesYes" with ccc!=0.
              // It is not a Hangul syllable or Jamo L because those have "yes" properties.
--- 987,1016 ----
              for (;;) {
                  if(src==limit) {
                      return (src<<1)|qcResult;  // "yes" or "maybe"
                  }
                  if( (c=s.charAt(src))<minNoMaybeCP ||
!                     isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                  ) {
                      ++src;
                  } else {
                      prevSrc = src++;
!                     if (!UTF16Plus.isLeadSurrogate(c)) {
                          break;
                      } else {
                          char c2;
!                         if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                              ++src;
!                             c = Character.toCodePoint((char)c, c2);
!                             norm16 = normTrie.suppGet(c);
!                             if (!isCompYesAndZeroCC(norm16)) {
                                  break;
                              }
                          }
                      }
                  }
+             }
              // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
              // The current character is either a "noNo" (has a mapping)
              // or a "maybeYes" (combines backward)
              // or a "yesYes" with ccc!=0.
              // It is not a Hangul syllable or Jamo L because those have "yes" properties.
*** 1132,1152 ****
                      ++src;
                  } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                      prevFCD16=0;
                      ++src;
                  } else {
!                     if(UTF16.isSurrogate((char)c)) {
                          char c2;
!                         if(UTF16Plus.isSurrogateLead(c)) {
!                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
!                                 c=Character.toCodePoint((char)c, c2);
!                             }
!                         } else /* trail surrogate */ {
!                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
!                                 --src;
!                                 c=Character.toCodePoint(c2, (char)c);
!                             }
                          }
                      }
                      if((fcd16=getFCD16FromNormData(c))<=0xff) {
                          prevFCD16=fcd16;
                          src+=Character.charCount(c);
--- 1124,1137 ----
                      ++src;
                  } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                      prevFCD16=0;
                      ++src;
                  } else {
!                     if (UTF16Plus.isLeadSurrogate(c)) {
                          char c2;
!                         if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
!                             c = Character.toCodePoint((char)c, c2);
                          }
                      }
                      if((fcd16=getFCD16FromNormData(c))<=0xff) {
                          prevFCD16=fcd16;
                          src+=Character.charCount(c);
*** 1428,1438 ****
                  buffer.append(c, getCCFromYesOrMaybe(norm16));
                  return;
              }
              // Maps to an isCompYesAndZeroCC.
              c=mapAlgorithmic(c, norm16);
!             norm16=getNorm16(c);
          }
          if (norm16 < minYesNo) {
              // c does not decompose
              buffer.append(c, 0);
          } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
--- 1413,1423 ----
                  buffer.append(c, getCCFromYesOrMaybe(norm16));
                  return;
              }
              // Maps to an isCompYesAndZeroCC.
              c=mapAlgorithmic(c, norm16);
!             norm16=getRawNorm16(c);
          }
          if (norm16 < minYesNo) {
              // c does not decompose
              buffer.append(c, 0);
          } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
*** 1449,1459 ****
                  leadCC=extraData.charAt(mapping-1)>>8;
              } else {
                  leadCC=0;
              }
              ++mapping;  // skip over the firstUnit
!             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
          }
      }
  
      /**
       * Finds the recomposition result for
--- 1434,1444 ----
                  leadCC=extraData.charAt(mapping-1)>>8;
              } else {
                  leadCC=0;
              }
              ++mapping;  // skip over the firstUnit
!             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
          }
      }
  
      /**
       * Finds the recomposition result for
*** 1641,1651 ****
                          break;
                      }
                      // Is the composite a starter that combines forward?
                      if((compositeAndFwd&1)!=0) {
                          compositionsList=
!                             getCompositionsListForComposite(getNorm16(composite));
                      } else {
                          compositionsList=-1;
                      }
  
                      // We combined; continue with looking for compositions.
--- 1626,1636 ----
                          break;
                      }
                      // Is the composite a starter that combines forward?
                      if((compositeAndFwd&1)!=0) {
                          compositionsList=
!                             getCompositionsListForComposite(getRawNorm16(composite));
                      } else {
                          compositionsList=-1;
                      }
  
                      // We combined; continue with looking for compositions.
*** 2194,2204 ****
      private int minNoNoEmpty;
      private int limitNoNo;
      private int centerNoNoDelta;
      private int minMaybeYes;
  
!     private Trie2_16 normTrie;
      private String maybeYesCompositions;
      private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
      private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
! 
!    }
--- 2179,2188 ----
      private int minNoNoEmpty;
      private int limitNoNo;
      private int centerNoNoDelta;
      private int minMaybeYes;
  
!     private CodePointTrie.Fast16 normTrie;
      private String maybeYesCompositions;
      private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
      private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
! }

< prev index next >