# HG changeset patch # User redestad # Date 1559326067 -7200 # Fri May 31 20:07:47 2019 +0200 # Node ID b85260846fd026008649ee2d5588ab2e2ead0f6d # Parent f91999057a5a28dcd2be51b832c20a9dabfd5445 8225061: Performance regression in Regex Reviewed-by: TBD diff --git a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java --- a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java +++ b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java @@ -67,27 +67,28 @@ }, ArrayList::addAll); + // make the code point conditions - String extPictCodePoints = extPictRanges.stream() - .map(r -> { - if (r.start == r.last) { - return (" ".repeat(12) + "cp == 0x" + toHexString(r.start)); - } else if (r.start == r.last - 1) { - return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" + - " ".repeat(12) + "cp == 0x" + toHexString(r.last); - } else { - return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) + - " && cp <= 0x" + toHexString(r.last) + ")"; - } - }) - .collect(Collectors.joining(" ||\n")) + ";\n"; + // only very few codepoints below 0x2000 are "emojis", so separate them + // out to generate a fast-path check that can be efficiently inlined + String lowExtPictCodePoints = extPictRanges.stream() + .takeWhile(r -> r.last < 0x2000) + .map(r -> rangeToString(r)) + .collect(Collectors.joining(" ||\n", "", ";\n")); + + String highExtPictCodePoints = extPictRanges.stream() + .dropWhile(r -> r.last < 0x2000) + .map(r -> rangeToString(r)) + .collect(Collectors.joining(" ||\n", "", ";\n")); // Generate EmojiData.java file Files.write(Paths.get(args[2]), Files.lines(Paths.get(args[0])) .flatMap(l -> { - if (l.equals("%%%EXTPICT%%%")) { - return Stream.of(extPictCodePoints); + if (l.equals("%%%EXTPICT_LOW%%%")) { + return Stream.of(lowExtPictCodePoints); + } else if (l.equals("%%%EXTPICT_HIGH%%%")) { + return Stream.of(highExtPictCodePoints); } else { return Stream.of(l); } @@ -99,6 +100,18 @@ } } + static String rangeToString(Range r) { + if (r.start == r.last) { + return (" ".repeat(16) + "cp == 0x" + toHexString(r.start)); + } else if (r.start == r.last - 1) { + return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" + + " ".repeat(16) + "cp == 0x" + toHexString(r.last); + } else { + return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) + + " && cp <= 0x" + toHexString(r.last) + ")"; + } + } + static int toInt(String hexStr) { return Integer.parseUnsignedInt(hexStr, 16); } diff --git a/src/java.base/share/classes/java/util/regex/EmojiData.java.template b/src/java.base/share/classes/java/util/regex/EmojiData.java.template --- a/src/java.base/share/classes/java/util/regex/EmojiData.java.template +++ b/src/java.base/share/classes/java/util/regex/EmojiData.java.template @@ -40,7 +40,16 @@ * @return true if {@code cp} is an extended pictographic */ static boolean isExtendedPictographic(int cp) { + if (cp < 0x2000) { + return +%%%EXTPICT_LOW%%% + } else { + return isHigh(cp); + } + } + + private static boolean isHigh(int cp) { return -%%%EXTPICT%%% +%%%EXTPICT_HIGH%%% } } diff --git a/src/java.base/share/classes/java/util/regex/Grapheme.java b/src/java.base/share/classes/java/util/regex/Grapheme.java --- a/src/java.base/share/classes/java/util/regex/Grapheme.java +++ b/src/java.base/share/classes/java/util/regex/Grapheme.java @@ -30,6 +30,19 @@ final class Grapheme { /** + * Determines if there is an extended grapheme cluster boundary between two + * continuing characters {@code cp1} and {@code cp2}. + *

+ * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification + * for the extended grapheme cluster boundary rules + *

+ * Note: this method does not take care of stateful breaking. + */ + static boolean isBoundary(int cp1, int cp2) { + return rules[getType(cp1)][getType(cp2)]; + } + + /** * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes * the start of the char sequence is a boundary. *

@@ -50,12 +63,12 @@ int ret = Character.charCount(ch0); int ch1; // indicates whether gb11 or gb12 is underway - boolean gb11 = EmojiData.isExtendedPictographic(ch0); - int riCount = getType(ch0) == RI ? 1 : 0; + int t0 = getGraphemeType(ch0); + int riCount = t0 == RI ? 1 : 0; + boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; while (ret < limit) { ch1 = Character.codePointAt(src, ret); - int t0 = getType(ch0); - int t1 = getType(ch1); + int t1 = getGraphemeType(ch1); if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { gb11 = false; @@ -65,13 +78,14 @@ if (ret > off) { break; } else { - gb11 = EmojiData.isExtendedPictographic(ch1); + gb11 = t1 == EXTENDED_PICTOGRAPHIC; riCount = 0; } } - riCount += getType(ch1) == RI ? 1 : 0; - ch0 = ch1; + riCount += (t1 == RI) ? 1 : 0; + t0 = t1; + ret += Character.charCount(ch1); } return ret; @@ -163,6 +177,20 @@ cp == 0xAA7B || cp == 0xAA7D; } + private static int getGraphemeType(int cp) { + if (cp < 0x007F) { // ASCII + if (cp < 32) { // Control characters + if (cp == 0x000D) + return CR; + if (cp == 0x000A) + return LF; + return CONTROL; + } + return OTHER; + } + return getType(cp); + } + @SuppressWarnings("fallthrough") private static int getType(int cp) { if (EmojiData.isExtendedPictographic(cp)) { @@ -171,12 +199,6 @@ int type = Character.getType(cp); switch(type) { - case Character.CONTROL: - if (cp == 0x000D) - return CR; - if (cp == 0x000A) - return LF; - return CONTROL; case Character.UNASSIGNED: // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" @@ -184,6 +206,7 @@ if (cp == 0x0378) return OTHER; + case Character.CONTROL: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.SURROGATE: diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -3973,7 +3973,16 @@ if (i < matcher.to) { int ch0 = Character.codePointAt(seq, i); int n = Character.charCount(ch0); - int j = Grapheme.nextBoundary(seq, i, matcher.to); + int j = i + n; + // Fast check if it's necessary to call Normalizer; + // testing Grapheme.isBoundary is enough for this case + while (j < matcher.to) { + int ch1 = Character.codePointAt(seq, j); + if (Grapheme.isBoundary(ch0, ch1)) + break; + ch0 = ch1; + j += Character.charCount(ch1); + } if (i + n == j) { // single, assume nfc cp if (predicate.is(ch0)) return next.match(matcher, j, seq); diff --git a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java new file mode 100644 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.util.regex; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +public class PatternBench { + + public String fileTestString; + public String flagsString; + + + public Pattern graphemePattern; + public Pattern jmodPattern; + public Pattern jmodCanonicalPattern; + + public Pattern pattern; + + @Setup + public void setup() { + flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7"; + fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode"; + graphemePattern = Pattern.compile("\\b{g}"); + jmodCanonicalPattern = Pattern.compile("^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$", Pattern.CANON_EQ); + jmodPattern = Pattern.compile("^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$"); + } + + @Benchmark + @Warmup(iterations = 3) + @Measurement(iterations = 3) + public int splitFlags() { + return graphemePattern.split(flagsString).length; + } + + @Benchmark + @Warmup(iterations = 3) + @Measurement(iterations = 3) + public boolean canonicalJmodMatch() { + return jmodCanonicalPattern.matcher(fileTestString).matches(); + } + + @Benchmark + @Warmup(iterations = 3) + @Measurement(iterations = 3) + public boolean normalJmodMatch() { + return jmodPattern.matcher(fileTestString).matches(); + } +}