# HG changeset patch # User igerasim # Date 1580951601 28800 # Wed Feb 05 17:13:21 2020 -0800 # Node ID db7acc29e004fb4bbbf876b7d80b27f4fd5083a1 # Parent 62b5bfef8d618e08e6f3a56cf1fb0e67e89e9cc2 [mq]: 8235812-Unicode-linebreak-with-quantifier-does-not-match-valid-input diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -2064,7 +2064,7 @@ Node prev = null; Node firstTail = null; Branch branch = null; - Node branchConn = null; + BranchConn branchConn = null; for (;;) { Node node = sequence(end); @@ -2212,7 +2212,24 @@ break; } - node = closure(node); + if (node instanceof LineEnding) { + LineEnding le = (LineEnding)node; + node = closureOfLineEnding(le); + + if (node != le) { + // LineEnding was replaced with an anonymous group + if (head == null) + head = node; + else + tail.next = node; + // Double return: Tail was returned in root + tail = root; + continue; + } + } else { + node = closure(node); + } + /* save the top dot-greedy nodes (.*, .+) as well if (node instanceof GreedyCharProperty && ((GreedyCharProperty)node).cp instanceof Dot) { @@ -3079,18 +3096,31 @@ if (saveTCNCount < topClosureNodes.size()) topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); + return groupWithClosure(node, head, tail, capturingGroup); + } + + /** + * Transforms a Group with quantifiers into some special constructs + * (such as Branch or Loop/GroupCurly), if necessary. + * + * This method is applied either to actual groups or to the Unicode + * linebreak (aka \\R) represented as an anonymous group. + */ + private Node groupWithClosure(Node node, Node head, Node tail, + boolean capturingGroup) + { if (node instanceof Ques) { Ques ques = (Ques) node; if (ques.type == Qtype.POSSESSIVE) { root = node; return node; } - tail.next = new BranchConn(); - tail = tail.next; + BranchConn branchConn = new BranchConn(); + tail = tail.next = branchConn; if (ques.type == Qtype.GREEDY) { - head = new Branch(head, null, tail); + head = new Branch(head, null, branchConn); } else { // Reluctant quantifier - head = new Branch(null, head, tail); + head = new Branch(null, head, branchConn); } root = tail; return head; @@ -3268,6 +3298,31 @@ } /** + * Processing repetition of a Unicode linebreak \\R. + */ + private Node closureOfLineEnding(LineEnding le) { + int ch = peek(); + if (ch != '?' && ch != '*' && ch != '+' && ch != '{') { + return le; + } + + // Replace the LineEnding with an anonymous group + // (?:\\u000D\\u000A|[\\u000A\\u000B\\u000C\\u000D\\u0085\\u2028\\u2029]) + Node grHead = createGroup(true); + Node grTail = root; + BranchConn branchConn = new BranchConn(); + branchConn.next = grTail; + Node slice = new Slice(new int[] {0x0D, 0x0A}); + slice.next = branchConn; + Node chClass = newCharProperty(x -> x == 0x0A || x == 0x0B || + x == 0x0C || x == 0x0D || x == 0x85 || x == 0x2028 || + x == 0x2029); + chClass.next = branchConn; + grHead.next = new Branch(slice, chClass, branchConn); + return groupWithClosure(closure(grHead), grHead, grTail, false); + } + + /** * Processes repetition. If the next character peeked is a quantifier * then new nodes must be appended to handle the repetition. * Prev could be a single or a group, so it could be a chain of nodes. @@ -4723,8 +4778,8 @@ static final class Branch extends Node { Node[] atoms = new Node[2]; int size = 2; - Node conn; - Branch(Node first, Node second, Node branchConn) { + BranchConn conn; + Branch(Node first, Node second, BranchConn branchConn) { conn = branchConn; atoms[0] = first; atoms[1] = second; @@ -4732,9 +4787,10 @@ void add(Node node) { if (size >= atoms.length) { - Node[] tmp = new Node[atoms.length*2]; - System.arraycopy(atoms, 0, tmp, 0, atoms.length); - atoms = tmp; + int len = ArraysSupport.newLength(size, + 1, /* minimum growth */ + size /* preferred growth */); + atoms = Arrays.copyOf(atoms, len); } atoms[size++] = node; } diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java --- a/test/jdk/java/util/regex/RegExTest.java +++ b/test/jdk/java/util/regex/RegExTest.java @@ -35,7 +35,7 @@ * 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819 * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 - * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 + * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 * * @library /test/lib * @library /lib/testlibrary/java/lang @@ -57,7 +57,9 @@ import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Random; import java.util.Scanner; import java.util.function.Function; @@ -186,6 +188,7 @@ invalidGroupName(); illegalRepetitionRange(); surrogatePairWithCanonEq(); + lineBreakWithQuantifier(); if (failure) { throw new @@ -5000,4 +5003,81 @@ } report("surrogatePairWithCanonEq"); } + + // This test is for 8235812 + private static void lineBreakWithQuantifier() { + // key: pattern + // value: lengths of input that must match the pattern + Map> cases = Map.ofEntries( + Map.entry("\\R?", List.of(0, 1)), + Map.entry("\\R*", List.of(0, 1, 2, 3)), + Map.entry("\\R+", List.of(1, 2, 3)), + Map.entry("\\R{0}", List.of(0)), + Map.entry("\\R{1}", List.of(1)), + Map.entry("\\R{2}", List.of(2)), + Map.entry("\\R{3}", List.of(3)), + Map.entry("\\R{0,}", List.of(0, 1, 2, 3)), + Map.entry("\\R{1,}", List.of(1, 2, 3)), + Map.entry("\\R{2,}", List.of(2, 3)), + Map.entry("\\R{3,}", List.of(3)), + Map.entry("\\R{0,0}", List.of(0)), + Map.entry("\\R{0,1}", List.of(0, 1)), + Map.entry("\\R{0,2}", List.of(0, 1, 2)), + Map.entry("\\R{0,3}", List.of(0, 1, 2, 3)), + Map.entry("\\R{1,1}", List.of(1)), + Map.entry("\\R{1,2}", List.of(1, 2)), + Map.entry("\\R{1,3}", List.of(1, 2, 3)), + Map.entry("\\R{2,2}", List.of(2)), + Map.entry("\\R{2,3}", List.of(2, 3)), + Map.entry("\\R{3,3}", List.of(3)), + Map.entry("\\R", List.of(1)), + Map.entry("\\R\\R", List.of(2)), + Map.entry("\\R\\R\\R", List.of(3)) + ); + + // key: length of input + // value: all possible inputs of given length + Map> inputs = new HashMap<>(); + String[] Rs = { "\r\n", "\r", "\n", + "\u000B", "\u000C", "\u0085", "\u2028", "\u2029" }; + StringBuilder sb = new StringBuilder(); + for (int len = 0; len <= 3; ++len) { + int[] idx = new int[len + 1]; + do { + sb.setLength(0); + for (int j = 0; j < len; ++j) + sb.append(Rs[idx[j]]); + inputs.computeIfAbsent(len, ArrayList::new).add(sb.toString()); + idx[0]++; + for (int j = 0; j < len; ++j) { + if (idx[j] < Rs.length) + break; + idx[j] = 0; + idx[j+1]++; + } + } while (idx[len] == 0); + } + + // exhaustive testing + for (String patStr : cases.keySet()) { + Pattern[] pats = patStr.endsWith("R") + ? new Pattern[] { Pattern.compile(patStr) } // no quantifiers + : new Pattern[] { Pattern.compile(patStr), // greedy + Pattern.compile(patStr + "?") }; // reluctant + Matcher m = pats[0].matcher(""); + for (Pattern p : pats) { + m.usePattern(p); + for (int len : cases.get(patStr)) { + for (String in : inputs.get(len)) { + if (!m.reset(in).matches()) { + failCount++; + System.out.println("Expected to match '" + + in + "' =~ /" + p + "/"); + } + } + } + } + } + report("lineBreakWithQuantifier"); + } }