< prev index next >

src/java.base/share/classes/java/util/regex/Grapheme.java

Print this page
rev 55125 : 8225061: Performance regression in Regex
Reviewed-by: TBD

*** 28,37 **** --- 28,50 ---- import java.util.Objects; final class Grapheme { /** + * Determines if there is an extended grapheme cluster boundary between two + * continuing characters {@code cp1} and {@code cp2}. + * <p> + * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification + * for the extended grapheme cluster boundary rules. + * <p> + * Note: this method does not take care of stateful breaking. + */ + static boolean isBoundary(int cp1, int cp2) { + return rules[getType(cp1)][getType(cp2)]; + } + + /** * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes * the start of the char sequence is a boundary. * <p> * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification * for the extended grapheme cluster boundary rules. The following implementation
*** 48,79 **** int ch0 = Character.codePointAt(src, 0); int ret = Character.charCount(ch0); int ch1; // indicates whether gb11 or gb12 is underway ! boolean gb11 = EmojiData.isExtendedPictographic(ch0); ! int riCount = getType(ch0) == RI ? 1 : 0; while (ret < limit) { ch1 = Character.codePointAt(src, ret); ! int t0 = getType(ch0); ! int t1 = getType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { gb11 = false; } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { // continue for gb12 } else if (rules[t0][t1]) { if (ret > off) { break; } else { ! gb11 = EmojiData.isExtendedPictographic(ch1); riCount = 0; } } ! riCount += getType(ch1) == RI ? 1 : 0; ! ch0 = ch1; ret += Character.charCount(ch1); } return ret; } --- 61,93 ---- int ch0 = Character.codePointAt(src, 0); int ret = Character.charCount(ch0); int ch1; // indicates whether gb11 or gb12 is underway ! int t0 = getGraphemeType(ch0); ! int riCount = t0 == RI ? 1 : 0; ! boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; while (ret < limit) { ch1 = Character.codePointAt(src, ret); ! int t1 = getGraphemeType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { gb11 = false; } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { // continue for gb12 } else if (rules[t0][t1]) { if (ret > off) { break; } else { ! gb11 = t1 == EXTENDED_PICTOGRAPHIC; riCount = 0; } } ! riCount += (t1 == RI) ? 1 : 0; ! t0 = t1; ! ret += Character.charCount(ch1); } return ret; }
*** 161,191 **** cp >= 0x109A && cp <= 0x109C || cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || cp == 0xAA7B || cp == 0xAA7D; } @SuppressWarnings("fallthrough") private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { return EXTENDED_PICTOGRAPHIC; } int type = Character.getType(cp); switch(type) { - case Character.CONTROL: - if (cp == 0x000D) - return CR; - if (cp == 0x000A) - return LF; - return CONTROL; case Character.UNASSIGNED: // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" // so type it as "Other" to make the test happy if (cp == 0x0378) return OTHER; case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.SURROGATE: return CONTROL; case Character.FORMAT: --- 175,214 ---- cp >= 0x109A && cp <= 0x109C || cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || cp == 0xAA7B || cp == 0xAA7D; } + private static int getGraphemeType(int cp) { + if (cp < 0x007F) { // ASCII + if (cp < 32) { // Control characters + if (cp == 0x000D) + return CR; + if (cp == 0x000A) + return LF; + return CONTROL; + } + return OTHER; + } + return getType(cp); + } + @SuppressWarnings("fallthrough") private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { return EXTENDED_PICTOGRAPHIC; } int type = Character.getType(cp); switch(type) { case Character.UNASSIGNED: // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" // so type it as "Other" to make the test happy if (cp == 0x0378) return OTHER; + case Character.CONTROL: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.SURROGATE: return CONTROL; case Character.FORMAT:
< prev index next >