# HG changeset patch
# User igerasim
# Date 1584773576 25200
#      Fri Mar 20 23:52:56 2020 -0700
# Node ID 316d0555b15050331d7f2c3c4fcdd743c911b0ef
# Parent  84215fa115fc4e156cd372174bd29e4015196d81
[mq]: 8237599-Greedy-matching-against-supplementary-chars-does-not-respect-the-region

diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -4340,14 +4340,22 @@
             this.cmin = cmin;
         }
         boolean match(Matcher matcher, int i, CharSequence seq) {
+            int starti = i; // backing off below never moves i before this index
             int n = 0;
             int to = matcher.to;
             // greedy, all the way down
             while (i < to) {
                 int ch = Character.codePointAt(seq, i);
+                int len = Character.charCount(ch);
+                if (i + len > to) {
+                    // the region cut off the high half of a surrogate pair
+                    matcher.hitEnd = true;
+                    ch = seq.charAt(i); // test only the char that is inside the region
+                    len = 1;
+                }
                 if (!predicate.is(ch))
-                   break;
-                i += Character.charCount(ch);
+                    break;
+                i += len;
                 n++;
             }
             if (i >= to) {
@@ -4358,9 +4366,10 @@
                     return true;
                 if (n == cmin)
                     return false;
-                 // backing off if match fails
+                // backing off if match fails
                 int ch = Character.codePointBefore(seq, i);
-                i -= Character.charCount(ch);
+                // stay at or after starti if the region cut off the low half of a surrogate pair
+                i = Math.max(starti, i - Character.charCount(ch));
                 n--;
             }
             return false;
diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java
--- a/test/jdk/java/util/regex/RegExTest.java
+++ b/test/jdk/java/util/regex/RegExTest.java
@@ -36,7 +36,7 @@
  * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
- * 8216332 8214245
+ * 8216332 8214245 8237599
  *
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
@@ -195,6 +195,7 @@
         surrogatePairWithCanonEq();
         lineBreakWithQuantifier();
         caseInsensitivePMatch();
+        surrogatePairOverlapRegion();
 
         if (failure) {
             throw new
@@ -5155,4 +5156,45 @@
         }
         report("caseInsensitivePMatch");
     }
+
+    // This test is for 8237599: matcher regions that split a surrogate pair in half
+    private static void surrogatePairOverlapRegion() {
+        String input = "\ud801\udc37"; // one surrogate pair (U+10437)
+
+        Pattern p = Pattern.compile(".+");
+        Matcher m = p.matcher(input);
+        m.region(0, 1); // region holds only the high surrogate
+
+        boolean ok = m.find();
+        if (!ok || !m.group(0).equals(input.substring(0, 1)))
+        {
+            failCount++;
+            System.out.println("Input \"" + input + "\".substr(0, 1)" +
+                    " expected to match pattern \"" + p + "\"");
+            if (ok) {
+                System.out.println("group(0): \"" + m.group(0) + "\"");
+            }
+        } else if (!m.hitEnd()) {
+            failCount++;
+            System.out.println("Expected m.hitEnd() == true");
+        }
+
+        p = Pattern.compile(".*(.)");
+        m = p.matcher(input);
+        m.region(1, 2); // region holds only the low surrogate
+
+        ok = m.find();
+        if (!ok || !m.group(0).equals(input.substring(1, 2))
+                || !m.group(1).equals(input.substring(1, 2)))
+        {
+            failCount++;
+            System.out.println("Input \"" + input + "\".substr(1, 2)" +
+                    " expected to match pattern \"" + p + "\"");
+            if (ok) {
+                System.out.println("group(0): \"" + m.group(0) + "\"");
+                System.out.println("group(1): \"" + m.group(1) + "\"");
+            }
+        }
+        report("surrogatePairOverlapRegion");
+    }
 }