< prev index next >

src/java.base/share/classes/java/util/regex/Pattern.java

Print this page
rev 57941 : [mq]: 8235812-Unicode-linebreak-with-quantifier-does-not-match-valid-input


2047      */
2048     private static final boolean isSupplementary(int ch) {
2049         return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
2050                Character.isSurrogate((char)ch);
2051     }
2052 
2053     /**
2054      *  The following methods handle the main parsing. They are sorted
2055      *  according to their precedence order, the lowest one first.
2056      */
2057 
2058     /**
2059      * The expression is parsed with branch nodes added for alternations.
2060      * This may be called recursively to parse sub expressions that may
2061      * contain alternations.
2062      */
2063     private Node expr(Node end) {
2064         Node prev = null;
2065         Node firstTail = null;
2066         Branch branch = null;
2067         Node branchConn = null;
2068 
2069         for (;;) {
2070             Node node = sequence(end);
2071             Node nodeTail = root;      //double return
2072             if (prev == null) {
2073                 prev = node;
2074                 firstTail = nodeTail;
2075             } else {
2076                 // Branch
2077                 if (branchConn == null) {
2078                     branchConn = new BranchConn();
2079                     branchConn.next = end;
2080                 }
2081                 if (node == end) {
2082                     // if the node returned from sequence() is "end"
2083                     // we have an empty expr, set a null atom into
2084                     // the branch to indicate to go "next" directly.
2085                     node = null;
2086                 } else {
2087                     // the "tail.next" of each atom goes to branchConn


2195                 break LOOP;
2196             case ']': // Now interpreting dangling ] and } as literals
2197             case '}':
2198                 node = atom();
2199                 break;
2200             case '?':
2201             case '*':
2202             case '+':
2203                 next();
2204                 throw error("Dangling meta character '" + ((char)ch) + "'");
2205             case 0:
2206                 if (cursor >= patternLength) {
2207                     break LOOP;
2208                 }
2209                 // Fall through
2210             default:
2211                 node = atom();
2212                 break;
2213             }
2214 















2215             node = closure(node);


2216             /* save the top dot-greedy nodes (.*, .+) as well
2217             if (node instanceof GreedyCharProperty &&
2218                 ((GreedyCharProperty)node).cp instanceof Dot) {
2219                 topClosureNodes.add(node);
2220             }
2221             */
2222             if (head == null) {
2223                 head = tail = node;
2224             } else {
2225                 tail.next = node;
2226                 tail = node;
2227             }
2228         }
2229         if (head == null) {
2230             return end;
2231         }
2232         tail.next = end;
2233         root = tail;      //double return
2234         return head;
2235     }


3062 
3063         accept(')', "Unclosed group");
3064         flags0 = save;
3065 
3066         // Check for quantifiers
3067         Node node = closure(head);
3068         if (node == head) { // No closure
3069             root = tail;
3070             return node;    // Dual return
3071         }
3072         if (head == tail) { // Zero length assertion
3073             root = node;
3074             return node;    // Dual return
3075         }
3076 
3077         // have group closure, clear all inner closure nodes from the
3078         // top list (no backtracking stopper optimization for inner
3079         if (saveTCNCount < topClosureNodes.size())
3080             topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3081 













3082         if (node instanceof Ques) {
3083             Ques ques = (Ques) node;
3084             if (ques.type == Qtype.POSSESSIVE) {
3085                 root = node;
3086                 return node;
3087             }
3088             tail.next = new BranchConn();
3089             tail = tail.next;
3090             if (ques.type == Qtype.GREEDY) {
3091                 head = new Branch(head, null, tail);
3092             } else { // Reluctant quantifier
3093                 head = new Branch(null, head, tail);
3094             }
3095             root = tail;
3096             return head;
3097         } else if (node instanceof Curly) {
3098             Curly curly = (Curly) node;
3099             if (curly.type == Qtype.POSSESSIVE) {
3100                 root = node;
3101                 return node;
3102             }
3103             // Discover if the group is deterministic
3104             TreeInfo info = new TreeInfo();
3105             if (head.study(info)) { // Deterministic
3106                 GroupTail temp = (GroupTail) tail;
3107                 head = root = new GroupCurly(head.next, curly.cmin,
3108                                    curly.cmax, curly.type,
3109                                    ((GroupTail)tail).localIndex,
3110                                    ((GroupTail)tail).groupIndex,
3111                                              capturingGroup);
3112                 return head;
3113             } else { // Non-deterministic


3251         } else if (ch == '+') {
3252             next();
3253             return Qtype.POSSESSIVE;
3254         }
3255         return Qtype.GREEDY;
3256     }
3257 
3258     private Node curly(Node prev, int cmin) {
3259         Qtype qtype = qtype();
3260         if (qtype == Qtype.GREEDY) {
3261             if (prev instanceof BmpCharProperty) {
3262                 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3263             } else if (prev instanceof CharProperty) {
3264                 return new CharPropertyGreedy((CharProperty)prev, cmin);
3265             }
3266         }
3267         return new Curly(prev, cmin, MAX_REPS, qtype);
3268     }
3269 
3270     /**

























3271      * Processes repetition. If the next character peeked is a quantifier
3272      * then new nodes must be appended to handle the repetition.
3273      * Prev could be a single or a group, so it could be a chain of nodes.
3274      */
3275     private Node closure(Node prev) {
3276         int ch = peek();
3277         switch (ch) {
3278         case '?':
3279             return new Ques(prev, qtype());
3280         case '*':
3281             return curly(prev, 0);
3282         case '+':
3283             return curly(prev, 1);
3284         case '{':
3285             ch = skip();
3286             if (ASCII.isDigit(ch)) {
3287                 int cmin = 0, cmax;
3288                 try {
3289                     do {
3290                         cmin = Math.addExact(Math.multiplyExact(cmin, 10),


4706      * "next".
4707      */
4708     static final class BranchConn extends Node {
4709         BranchConn() {}
4710         boolean match(Matcher matcher, int i, CharSequence seq) {
4711             return next.match(matcher, i, seq);
4712         }
4713         boolean study(TreeInfo info) {
4714             return info.deterministic;
4715         }
4716     }
4717 
4718     /**
4719      * Handles the branching of alternations. Note this is also used for
4720      * the ? quantifier to branch between the case where it matches once
4721      * and where it does not occur.
4722      */
4723     static final class Branch extends Node {
4724         Node[] atoms = new Node[2];
4725         int size = 2;
4726         Node conn;
4727         Branch(Node first, Node second, Node branchConn) {
4728             conn = branchConn;
4729             atoms[0] = first;
4730             atoms[1] = second;
4731         }
4732 
4733         void add(Node node) {
4734             if (size >= atoms.length) {
4735                 Node[] tmp = new Node[atoms.length*2];
4736                 System.arraycopy(atoms, 0, tmp, 0, atoms.length);
4737                 atoms = tmp;

4738             }
4739             atoms[size++] = node;
4740         }
4741 
4742         boolean match(Matcher matcher, int i, CharSequence seq) {
4743             for (int n = 0; n < size; n++) {
4744                 if (atoms[n] == null) {
4745                     if (conn.next.match(matcher, i, seq))
4746                         return true;
4747                 } else if (atoms[n].match(matcher, i, seq)) {
4748                     return true;
4749                 }
4750             }
4751             return false;
4752         }
4753 
4754         boolean study(TreeInfo info) {
4755             int minL = info.minLength;
4756             int maxL = info.maxLength;
4757             boolean maxV = info.maxValid;




2047      */
2048     private static final boolean isSupplementary(int ch) {
2049         return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
2050                Character.isSurrogate((char)ch);
2051     }
2052 
2053     /**
2054      *  The following methods handle the main parsing. They are sorted
2055      *  according to their precedence order, the lowest one first.
2056      */
2057 
2058     /**
2059      * The expression is parsed with branch nodes added for alternations.
2060      * This may be called recursively to parse sub expressions that may
2061      * contain alternations.
2062      */
2063     private Node expr(Node end) {
2064         Node prev = null;
2065         Node firstTail = null;
2066         Branch branch = null;
2067         BranchConn branchConn = null;
2068 
2069         for (;;) {
2070             Node node = sequence(end);
2071             Node nodeTail = root;      //double return
2072             if (prev == null) {
2073                 prev = node;
2074                 firstTail = nodeTail;
2075             } else {
2076                 // Branch
2077                 if (branchConn == null) {
2078                     branchConn = new BranchConn();
2079                     branchConn.next = end;
2080                 }
2081                 if (node == end) {
2082                     // if the node returned from sequence() is "end"
2083                     // we have an empty expr, set a null atom into
2084                     // the branch to indicate to go "next" directly.
2085                     node = null;
2086                 } else {
2087                     // the "tail.next" of each atom goes to branchConn


2195                 break LOOP;
2196             case ']': // Now interpreting dangling ] and } as literals
2197             case '}':
2198                 node = atom();
2199                 break;
2200             case '?':
2201             case '*':
2202             case '+':
2203                 next();
2204                 throw error("Dangling meta character '" + ((char)ch) + "'");
2205             case 0:
2206                 if (cursor >= patternLength) {
2207                     break LOOP;
2208                 }
2209                 // Fall through
2210             default:
2211                 node = atom();
2212                 break;
2213             }
2214 
2215             if (node instanceof LineEnding) {
2216                 LineEnding le = (LineEnding)node;
2217                 node = closureOfLineEnding(le);
2218 
2219                 if (node != le) {
2220                     // LineEnding was replaced with an anonymous group
2221                     if (head == null)
2222                         head = node;
2223                     else
2224                         tail.next = node;
2225                     // Double return: Tail was returned in root
2226                     tail = root;
2227                     continue;
2228                 }
2229             } else {
2230                 node = closure(node);
2231             }
2232 
2233             /* save the top dot-greedy nodes (.*, .+) as well
2234             if (node instanceof GreedyCharProperty &&
2235                 ((GreedyCharProperty)node).cp instanceof Dot) {
2236                 topClosureNodes.add(node);
2237             }
2238             */
2239             if (head == null) {
2240                 head = tail = node;
2241             } else {
2242                 tail.next = node;
2243                 tail = node;
2244             }
2245         }
2246         if (head == null) {
2247             return end;
2248         }
2249         tail.next = end;
2250         root = tail;      //double return
2251         return head;
2252     }


3079 
3080         accept(')', "Unclosed group");
3081         flags0 = save;
3082 
3083         // Check for quantifiers
3084         Node node = closure(head);
3085         if (node == head) { // No closure
3086             root = tail;
3087             return node;    // Dual return
3088         }
3089         if (head == tail) { // Zero length assertion
3090             root = node;
3091             return node;    // Dual return
3092         }
3093 
3094         // have group closure, clear all inner closure nodes from the
3095         // top list (no backtracking stopper optimization for inner
3096         if (saveTCNCount < topClosureNodes.size())
3097             topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3098 
3099         return groupWithClosure(node, head, tail, capturingGroup);
3100     }
3101 
3102     /**
3103      * Transforms a Group with quantifiers into some special constructs
3104      * (such as Branch or Loop/GroupCurly), if necessary.
3105      *
3106      * This method is applied either to actual groups or to the Unicode
3107      * linebreak (aka \\R) represented as an anonymous group.
3108      */
3109     private Node groupWithClosure(Node node, Node head, Node tail,
3110                                   boolean capturingGroup)
3111     {
3112         if (node instanceof Ques) {
3113             Ques ques = (Ques) node;
3114             if (ques.type == Qtype.POSSESSIVE) {
3115                 root = node;
3116                 return node;
3117             }
3118             BranchConn branchConn = new BranchConn();
3119             tail = tail.next = branchConn;
3120             if (ques.type == Qtype.GREEDY) {
3121                 head = new Branch(head, null, branchConn);
3122             } else { // Reluctant quantifier
3123                 head = new Branch(null, head, branchConn);
3124             }
3125             root = tail;
3126             return head;
3127         } else if (node instanceof Curly) {
3128             Curly curly = (Curly) node;
3129             if (curly.type == Qtype.POSSESSIVE) {
3130                 root = node;
3131                 return node;
3132             }
3133             // Discover if the group is deterministic
3134             TreeInfo info = new TreeInfo();
3135             if (head.study(info)) { // Deterministic
3136                 GroupTail temp = (GroupTail) tail;
3137                 head = root = new GroupCurly(head.next, curly.cmin,
3138                                    curly.cmax, curly.type,
3139                                    ((GroupTail)tail).localIndex,
3140                                    ((GroupTail)tail).groupIndex,
3141                                              capturingGroup);
3142                 return head;
3143             } else { // Non-deterministic


3281         } else if (ch == '+') {
3282             next();
3283             return Qtype.POSSESSIVE;
3284         }
3285         return Qtype.GREEDY;
3286     }
3287 
3288     private Node curly(Node prev, int cmin) {
3289         Qtype qtype = qtype();
3290         if (qtype == Qtype.GREEDY) {
3291             if (prev instanceof BmpCharProperty) {
3292                 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3293             } else if (prev instanceof CharProperty) {
3294                 return new CharPropertyGreedy((CharProperty)prev, cmin);
3295             }
3296         }
3297         return new Curly(prev, cmin, MAX_REPS, qtype);
3298     }
3299 
3300     /**
3301      * Processing repetition of a Unicode linebreak \\R.
3302      */
3303     private Node closureOfLineEnding(LineEnding le) {
3304         int ch = peek();
3305         if (ch != '?' && ch != '*' && ch != '+' && ch != '{') {
3306             return le;
3307         }
3308 
3309         // Replace the LineEnding with an anonymous group
3310         // (?:\\u000D\\u000A|[\\u000A\\u000B\\u000C\\u000D\\u0085\\u2028\\u2029])
3311         Node grHead = createGroup(true);
3312         Node grTail = root;
3313         BranchConn branchConn = new BranchConn();
3314         branchConn.next = grTail;
3315         Node slice = new Slice(new int[] {0x0D, 0x0A});
3316         slice.next = branchConn;
3317         Node chClass = newCharProperty(x -> x == 0x0A || x == 0x0B ||
3318                 x == 0x0C || x == 0x0D || x == 0x85 || x == 0x2028 ||
3319                 x == 0x2029);
3320         chClass.next = branchConn;
3321         grHead.next = new Branch(slice, chClass, branchConn);
3322         return groupWithClosure(closure(grHead), grHead, grTail, false);
3323     }
3324 
3325     /**
3326      * Processes repetition. If the next character peeked is a quantifier
3327      * then new nodes must be appended to handle the repetition.
3328      * Prev could be a single or a group, so it could be a chain of nodes.
3329      */
3330     private Node closure(Node prev) {
3331         int ch = peek();
3332         switch (ch) {
3333         case '?':
3334             return new Ques(prev, qtype());
3335         case '*':
3336             return curly(prev, 0);
3337         case '+':
3338             return curly(prev, 1);
3339         case '{':
3340             ch = skip();
3341             if (ASCII.isDigit(ch)) {
3342                 int cmin = 0, cmax;
3343                 try {
3344                     do {
3345                         cmin = Math.addExact(Math.multiplyExact(cmin, 10),


4761      * "next".
4762      */
4763     static final class BranchConn extends Node {
4764         BranchConn() {}
4765         boolean match(Matcher matcher, int i, CharSequence seq) {
4766             return next.match(matcher, i, seq);
4767         }
4768         boolean study(TreeInfo info) {
4769             return info.deterministic;
4770         }
4771     }
4772 
4773     /**
4774      * Handles the branching of alternations. Note this is also used for
4775      * the ? quantifier to branch between the case where it matches once
4776      * and where it does not occur.
4777      */
4778     static final class Branch extends Node {
4779         Node[] atoms = new Node[2];
4780         int size = 2;
4781         BranchConn conn;
4782         Branch(Node first, Node second, BranchConn branchConn) {
4783             conn = branchConn;
4784             atoms[0] = first;
4785             atoms[1] = second;
4786         }
4787 
4788         void add(Node node) {
4789             if (size >= atoms.length) {
4790                 int len = ArraysSupport.newLength(size,
4791                         1,    /* minimum growth */
4792                         size  /* preferred growth */);
4793                 atoms = Arrays.copyOf(atoms, len);
4794             }
4795             atoms[size++] = node;
4796         }
4797 
4798         boolean match(Matcher matcher, int i, CharSequence seq) {
4799             for (int n = 0; n < size; n++) {
4800                 if (atoms[n] == null) {
4801                     if (conn.next.match(matcher, i, seq))
4802                         return true;
4803                 } else if (atoms[n].match(matcher, i, seq)) {
4804                     return true;
4805                 }
4806             }
4807             return false;
4808         }
4809 
4810         boolean study(TreeInfo info) {
4811             int minL = info.minLength;
4812             int maxL = info.maxLength;
4813             boolean maxV = info.maxValid;


< prev index next >