< prev index next >

src/java.base/share/classes/java/util/regex/Grapheme.java

Print this page
rev 55125 : 8225061: Performance regression in Regex
Reviewed-by: TBD

@@ -28,10 +28,23 @@
 import java.util.Objects;
 
 final class Grapheme {
 
     /**
+     * Determines if there is an extended  grapheme cluster boundary between two
+     * continuing characters {@code cp1} and {@code cp2}.
+     * <p>
+     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
+     * for the extended grapheme cluster boundary rules
+     * <p>
+     * Note: this method does not take care of stateful breaking.
+     */
+    static boolean isBoundary(int cp1, int cp2) {
+        return rules[getType(cp1)][getType(cp2)];
+    }
+
+    /**
      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
      * the start of the char sequence is a boundary.
      * <p>
      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
      * for the extended grapheme cluster boundary rules. The following implementation

@@ -48,32 +61,33 @@
 
         int ch0 = Character.codePointAt(src, 0);
         int ret = Character.charCount(ch0);
         int ch1;
         // indicates whether gb11 or gb12 is underway
-        boolean gb11 = EmojiData.isExtendedPictographic(ch0);
-        int riCount = getType(ch0) == RI ? 1 : 0;
+        int t0 = getGraphemeType(ch0);
+        int riCount = t0 == RI ? 1 : 0;
+        boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
         while (ret < limit) {
             ch1 = Character.codePointAt(src, ret);
-            int t0 = getType(ch0);
-            int t1 = getType(ch1);
+            int t1 = getGraphemeType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                 gb11 = false;
             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
                 // continue for gb12
             } else if (rules[t0][t1]) {
                 if (ret > off) {
                     break;
                 } else {
-                    gb11 = EmojiData.isExtendedPictographic(ch1);
+                    gb11 = t1 == EXTENDED_PICTOGRAPHIC;
                     riCount = 0;
                 }
             }
 
-            riCount += getType(ch1) == RI ? 1 : 0;
-            ch0 = ch1;
+            riCount += (t1 == RI) ? 1 : 0;
+            t0 = t1;
+
             ret += Character.charCount(ch1);
         }
         return ret;
     }
 

@@ -161,31 +175,40 @@
                cp >= 0x109A && cp <= 0x109C ||
                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
                cp == 0xAA7B || cp == 0xAA7D;
     }
 
+    /**
+     * Returns the grapheme break type of {@code cp}, short-circuiting the
+     * low code points so they never reach the more general {@code getType}.
+     * <p>
+     * Code points below 0x20 are typed as {@code CR}, {@code LF} or
+     * {@code CONTROL}; code points from 0x20 up to (but not including) 0x7F
+     * are typed as {@code OTHER}. Everything from 0x7F upwards is delegated
+     * to {@code getType}.
+     */
+    private static int getGraphemeType(int cp) {
+        // 0x7F (DEL) and above take the general classification path.
+        if (cp >= 0x007F) {
+            return getType(cp);
+        }
+        // 0x20..0x7E
+        if (cp >= 0x0020) {
+            return OTHER;
+        }
+        // Everything below 0x20: only CR and LF get a type of their own.
+        switch (cp) {
+            case 0x000D:
+                return CR;
+            case 0x000A:
+                return LF;
+            default:
+                return CONTROL;
+        }
+    }
+
     @SuppressWarnings("fallthrough")
     private static int getType(int cp) {
         if (EmojiData.isExtendedPictographic(cp)) {
             return EXTENDED_PICTOGRAPHIC;
         }
 
         int type = Character.getType(cp);
         switch(type) {
-        case Character.CONTROL:
-            if (cp == 0x000D)
-                return CR;
-            if (cp == 0x000A)
-                return LF;
-            return CONTROL;
         case Character.UNASSIGNED:
             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
             // so type it as "Other" to make the test happy
             if (cp == 0x0378)
                 return OTHER;
 
+        case Character.CONTROL:
         case Character.LINE_SEPARATOR:
         case Character.PARAGRAPH_SEPARATOR:
         case Character.SURROGATE:
             return CONTROL;
         case Character.FORMAT:
< prev index next >