< prev index next >

src/java.base/share/classes/java/util/regex/Pattern.java

Print this page
rev 57941 : [mq]: 8235812-Unicode-linebreak-with-quantifier-does-not-match-valid-input

@@ -2062,11 +2062,11 @@
      */
     private Node expr(Node end) {
         Node prev = null;
         Node firstTail = null;
         Branch branch = null;
-        Node branchConn = null;
+        BranchConn branchConn = null;
 
         for (;;) {
             Node node = sequence(end);
             Node nodeTail = root;      //double return
             if (prev == null) {

@@ -2210,11 +2210,28 @@
             default:
                 node = atom();
                 break;
             }
 
+            if (node instanceof LineEnding) {
+                LineEnding le = (LineEnding)node;
+                node = closureOfLineEnding(le);
+
+                if (node != le) {
+                    // LineEnding was replaced with an anonymous group
+                    if (head == null)
+                        head = node;
+                    else
+                        tail.next = node;
+                    // Double return: Tail was returned in root
+                    tail = root;
+                    continue;
+                }
+            } else {
             node = closure(node);
+            }
+
             /* save the top dot-greedy nodes (.*, .+) as well
             if (node instanceof GreedyCharProperty &&
                 ((GreedyCharProperty)node).cp instanceof Dot) {
                 topClosureNodes.add(node);
             }

@@ -3077,22 +3094,35 @@
         // have group closure, clear all inner closure nodes from the
         // top list (no backtracking stopper optimization for inner
         if (saveTCNCount < topClosureNodes.size())
             topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
 
+        return groupWithClosure(node, head, tail, capturingGroup);
+    }
+
+    /**
+     * Transforms a Group with quantifiers into some special constructs
+     * (such as Branch or Loop/GroupCurly), if necessary.
+     *
+     * This method is applied either to actual groups or to the Unicode
+     * linebreak (aka \\R) represented as an anonymous group.
+     */
+    private Node groupWithClosure(Node node, Node head, Node tail,
+                                  boolean capturingGroup)
+    {
         if (node instanceof Ques) {
             Ques ques = (Ques) node;
             if (ques.type == Qtype.POSSESSIVE) {
                 root = node;
                 return node;
             }
-            tail.next = new BranchConn();
-            tail = tail.next;
+            BranchConn branchConn = new BranchConn();
+            tail = tail.next = branchConn;
             if (ques.type == Qtype.GREEDY) {
-                head = new Branch(head, null, tail);
+                head = new Branch(head, null, branchConn);
             } else { // Reluctant quantifier
-                head = new Branch(null, head, tail);
+                head = new Branch(null, head, branchConn);
             }
             root = tail;
             return head;
         } else if (node instanceof Curly) {
             Curly curly = (Curly) node;

@@ -3266,10 +3296,35 @@
         }
         return new Curly(prev, cmin, MAX_REPS, qtype);
     }
 
     /**
+     * Processes repetition of \\R; returns {@code le} itself if unquantified.
+     */
+    private Node closureOfLineEnding(LineEnding le) {
+        int ch = peek();
+        if (ch != '?' && ch != '*' && ch != '+' && ch != '{') {
+            return le;    // not quantified: keep the LineEnding node
+        }
+
+        // Replace the LineEnding with an anonymous, non-capturing group
+        // (?:\\u000D\\u000A|[\\u000A\\u000B\\u000C\\u000D\\u0085\\u2028\\u2029])
+        Node grHead = createGroup(true);  // true = anonymous? TODO confirm
+        Node grTail = root;  // presumably set by createGroup; TODO confirm
+        BranchConn branchConn = new BranchConn();
+        branchConn.next = grTail;  // both alternatives rejoin at the tail
+        Node slice = new Slice(new int[] {0x0D, 0x0A});  // CR LF pair
+        slice.next = branchConn;
+        Node chClass = newCharProperty(x -> x == 0x0A || x == 0x0B ||
+                x == 0x0C || x == 0x0D || x == 0x85 || x == 0x2028 ||
+                x == 0x2029);
+        chClass.next = branchConn;
+        grHead.next = new Branch(slice, chClass, branchConn);
+        return groupWithClosure(closure(grHead), grHead, grTail, false);
+    }
+
+    /**
      * Processes repetition. If the next character peeked is a quantifier
      * then new nodes must be appended to handle the repetition.
      * Prev could be a single or a group, so it could be a chain of nodes.
      */
     private Node closure(Node prev) {

@@ -4721,22 +4776,23 @@
      * and where it does not occur.
      */
     static final class Branch extends Node {
         Node[] atoms = new Node[2];
         int size = 2;
-        Node conn;
-        Branch(Node first, Node second, Node branchConn) {
+        BranchConn conn;
+        Branch(Node first, Node second, BranchConn branchConn) {
             conn = branchConn;
             atoms[0] = first;
             atoms[1] = second;
         }
 
         void add(Node node) {
             if (size >= atoms.length) {
-                Node[] tmp = new Node[atoms.length*2];
-                System.arraycopy(atoms, 0, tmp, 0, atoms.length);
-                atoms = tmp;
+                int len = ArraysSupport.newLength(size,
+                        1,    /* minimum growth */
+                        size  /* preferred growth */);
+                atoms = Arrays.copyOf(atoms, len);  // keeps existing atoms
             }
             atoms[size++] = node;
         }
 
         boolean match(Matcher matcher, int i, CharSequence seq) {
< prev index next >