< prev index next >

src/java.base/share/classes/java/util/regex/Pattern.java

Print this page
rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:

@@ -538,11 +538,11 @@
  * <h2> Unicode support </h2>
  *
  * <p> This class is in conformance with Level 1 of <a
  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
  * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
- * Canonical Equivalents.
+ * Canonical Equivalents and RL2.2 Extended Grapheme Clusters.
  * <p>
  * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code
  * are processed as described in section 3.3 of
  * <cite>The Java&trade; Language Specification</cite>.
  * Such escape sequences are also implemented directly by the regular-expression

@@ -1499,19 +1499,12 @@
             if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
                 dst.append((char)ch0);
                 off++;
                 continue;
             }
-            int j = off + Character.charCount(ch0);
+            int j = Grapheme.nextBoundary(src, off, limit);
             int ch1;
-            while (j < limit) {
-                ch1 = src.codePointAt(j);
-                if (Grapheme.isBoundary(ch0, ch1))
-                    break;
-                ch0 = ch1;
-                j += Character.charCount(ch1);
-            }
             String seq = src.substring(off, j);
             String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
             off = j;
             if (nfd.length() > 1) {
                 ch0 = nfd.codePointAt(0);

@@ -3973,18 +3966,11 @@
 
         boolean match(Matcher matcher, int i, CharSequence seq) {
             if (i < matcher.to) {
                 int ch0 = Character.codePointAt(seq, i);
                 int n = Character.charCount(ch0);
-                int j = i + n;
-                while (j < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, j);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    j += Character.charCount(ch1);
-                }
+                int j = Grapheme.nextBoundary(seq, i, matcher.to);
                 if (i + n == j) {    // single, assume nfc cp
                     if (predicate.is(ch0))
                         return next.match(matcher, j, seq);
                 } else {
                     while (i + n < j) {

@@ -4019,19 +4005,11 @@
      * Node class that matches a Unicode extended grapheme cluster
      */
     static class XGrapheme extends Node {
         boolean match(Matcher matcher, int i, CharSequence seq) {
             if (i < matcher.to) {
-                int ch0 = Character.codePointAt(seq, i);
-                    i += Character.charCount(ch0);
-                while (i < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, i);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    i += Character.charCount(ch1);
-                }
+                i = Grapheme.nextBoundary(seq, i, matcher.to);
                 return next.match(matcher, i, seq);
             }
             matcher.hitEnd = true;
             return false;
         }

@@ -4057,12 +4035,13 @@
             if (i == startIndex) {
                 return next.match(matcher, i, seq);
             }
             if (i < endIndex) {
                 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
-                    !Grapheme.isBoundary(Character.codePointBefore(seq, i),
-                                         Character.codePointAt(seq, i))) {
+                    Grapheme.nextBoundary(seq,
+                        i - Character.charCount(Character.codePointBefore(seq, i)),
+                        i + Character.charCount(Character.codePointAt(seq, i))) > i) {
                     return false;
                 }
             } else {
                 matcher.hitEnd = true;
                 matcher.requireEnd = true;
< prev index next >