2047 */
2048 private static final boolean isSupplementary(int ch) {
2049 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
2050 Character.isSurrogate((char)ch);
2051 }
2052
2053 /**
2054 * The following methods handle the main parsing. They are sorted
2055 * according to their precedence order, the lowest one first.
2056 */
2057
2058 /**
2059 * The expression is parsed with branch nodes added for alternations.
2060 * This may be called recursively to parse sub expressions that may
2061 * contain alternations.
2062 */
2063 private Node expr(Node end) {
2064 Node prev = null;
2065 Node firstTail = null;
2066 Branch branch = null;
2067 Node branchConn = null;
2068
2069 for (;;) {
2070 Node node = sequence(end);
2071 Node nodeTail = root; //double return
2072 if (prev == null) {
2073 prev = node;
2074 firstTail = nodeTail;
2075 } else {
2076 // Branch
2077 if (branchConn == null) {
2078 branchConn = new BranchConn();
2079 branchConn.next = end;
2080 }
2081 if (node == end) {
2082 // if the node returned from sequence() is "end"
2083 // we have an empty expr, set a null atom into
2084 // the branch to indicate to go "next" directly.
2085 node = null;
2086 } else {
2087 // the "tail.next" of each atom goes to branchConn
2195 break LOOP;
2196 case ']': // Now interpreting dangling ] and } as literals
2197 case '}':
2198 node = atom();
2199 break;
2200 case '?':
2201 case '*':
2202 case '+':
2203 next();
2204 throw error("Dangling meta character '" + ((char)ch) + "'");
2205 case 0:
2206 if (cursor >= patternLength) {
2207 break LOOP;
2208 }
2209 // Fall through
2210 default:
2211 node = atom();
2212 break;
2213 }
2214
2215 node = closure(node);
2216 /* save the top dot-greedy nodes (.*, .+) as well
2217 if (node instanceof GreedyCharProperty &&
2218 ((GreedyCharProperty)node).cp instanceof Dot) {
2219 topClosureNodes.add(node);
2220 }
2221 */
2222 if (head == null) {
2223 head = tail = node;
2224 } else {
2225 tail.next = node;
2226 tail = node;
2227 }
2228 }
2229 if (head == null) {
2230 return end;
2231 }
2232 tail.next = end;
2233 root = tail; //double return
2234 return head;
2235 }
3062
3063 accept(')', "Unclosed group");
3064 flags0 = save;
3065
3066 // Check for quantifiers
3067 Node node = closure(head);
3068 if (node == head) { // No closure
3069 root = tail;
3070 return node; // Dual return
3071 }
3072 if (head == tail) { // Zero length assertion
3073 root = node;
3074 return node; // Dual return
3075 }
3076
3077 // have group closure, clear all inner closure nodes from the
3078 // top list (no backtracking stopper optimization for inner
3079 if (saveTCNCount < topClosureNodes.size())
3080 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3081
3082 if (node instanceof Ques) {
3083 Ques ques = (Ques) node;
3084 if (ques.type == Qtype.POSSESSIVE) {
3085 root = node;
3086 return node;
3087 }
3088 tail.next = new BranchConn();
3089 tail = tail.next;
3090 if (ques.type == Qtype.GREEDY) {
3091 head = new Branch(head, null, tail);
3092 } else { // Reluctant quantifier
3093 head = new Branch(null, head, tail);
3094 }
3095 root = tail;
3096 return head;
3097 } else if (node instanceof Curly) {
3098 Curly curly = (Curly) node;
3099 if (curly.type == Qtype.POSSESSIVE) {
3100 root = node;
3101 return node;
3102 }
3103 // Discover if the group is deterministic
3104 TreeInfo info = new TreeInfo();
3105 if (head.study(info)) { // Deterministic
3106 GroupTail temp = (GroupTail) tail;
3107 head = root = new GroupCurly(head.next, curly.cmin,
3108 curly.cmax, curly.type,
3109 ((GroupTail)tail).localIndex,
3110 ((GroupTail)tail).groupIndex,
3111 capturingGroup);
3112 return head;
3113 } else { // Non-deterministic
3251 } else if (ch == '+') {
3252 next();
3253 return Qtype.POSSESSIVE;
3254 }
3255 return Qtype.GREEDY;
3256 }
3257
3258 private Node curly(Node prev, int cmin) {
3259 Qtype qtype = qtype();
3260 if (qtype == Qtype.GREEDY) {
3261 if (prev instanceof BmpCharProperty) {
3262 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3263 } else if (prev instanceof CharProperty) {
3264 return new CharPropertyGreedy((CharProperty)prev, cmin);
3265 }
3266 }
3267 return new Curly(prev, cmin, MAX_REPS, qtype);
3268 }
3269
3270 /**
3271 * Processes repetition. If the next character peeked is a quantifier
3272 * then new nodes must be appended to handle the repetition.
3273 * Prev could be a single or a group, so it could be a chain of nodes.
3274 */
3275 private Node closure(Node prev) {
3276 int ch = peek();
3277 switch (ch) {
3278 case '?':
3279 return new Ques(prev, qtype());
3280 case '*':
3281 return curly(prev, 0);
3282 case '+':
3283 return curly(prev, 1);
3284 case '{':
3285 ch = skip();
3286 if (ASCII.isDigit(ch)) {
3287 int cmin = 0, cmax;
3288 try {
3289 do {
3290 cmin = Math.addExact(Math.multiplyExact(cmin, 10),
4706 * "next".
4707 */
4708 static final class BranchConn extends Node {
4709 BranchConn() {}
4710 boolean match(Matcher matcher, int i, CharSequence seq) {
4711 return next.match(matcher, i, seq);
4712 }
4713 boolean study(TreeInfo info) {
4714 return info.deterministic;
4715 }
4716 }
4717
4718 /**
4719 * Handles the branching of alternations. Note this is also used for
4720 * the ? quantifier to branch between the case where it matches once
4721 * and where it does not occur.
4722 */
4723 static final class Branch extends Node {
4724 Node[] atoms = new Node[2];
4725 int size = 2;
4726 Node conn;
4727 Branch(Node first, Node second, Node branchConn) {
4728 conn = branchConn;
4729 atoms[0] = first;
4730 atoms[1] = second;
4731 }
4732
4733 void add(Node node) {
4734 if (size >= atoms.length) {
4735 Node[] tmp = new Node[atoms.length*2];
4736 System.arraycopy(atoms, 0, tmp, 0, atoms.length);
4737 atoms = tmp;
4738 }
4739 atoms[size++] = node;
4740 }
4741
4742 boolean match(Matcher matcher, int i, CharSequence seq) {
4743 for (int n = 0; n < size; n++) {
4744 if (atoms[n] == null) {
4745 if (conn.next.match(matcher, i, seq))
4746 return true;
4747 } else if (atoms[n].match(matcher, i, seq)) {
4748 return true;
4749 }
4750 }
4751 return false;
4752 }
4753
4754 boolean study(TreeInfo info) {
4755 int minL = info.minLength;
4756 int maxL = info.maxLength;
4757 boolean maxV = info.maxValid;
|
2047 */
2048 private static final boolean isSupplementary(int ch) {
2049 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
2050 Character.isSurrogate((char)ch);
2051 }
2052
2053 /**
2054 * The following methods handle the main parsing. They are sorted
2055 * according to their precedence order, the lowest one first.
2056 */
2057
2058 /**
2059 * The expression is parsed with branch nodes added for alternations.
2060 * This may be called recursively to parse sub expressions that may
2061 * contain alternations.
2062 */
2063 private Node expr(Node end) {
2064 Node prev = null;
2065 Node firstTail = null;
2066 Branch branch = null;
2067 BranchConn branchConn = null;
2068
2069 for (;;) {
2070 Node node = sequence(end);
2071 Node nodeTail = root; //double return
2072 if (prev == null) {
2073 prev = node;
2074 firstTail = nodeTail;
2075 } else {
2076 // Branch
2077 if (branchConn == null) {
2078 branchConn = new BranchConn();
2079 branchConn.next = end;
2080 }
2081 if (node == end) {
2082 // if the node returned from sequence() is "end"
2083 // we have an empty expr, set a null atom into
2084 // the branch to indicate to go "next" directly.
2085 node = null;
2086 } else {
2087 // the "tail.next" of each atom goes to branchConn
2195 break LOOP;
2196 case ']': // Now interpreting dangling ] and } as literals
2197 case '}':
2198 node = atom();
2199 break;
2200 case '?':
2201 case '*':
2202 case '+':
2203 next();
2204 throw error("Dangling meta character '" + ((char)ch) + "'");
2205 case 0:
2206 if (cursor >= patternLength) {
2207 break LOOP;
2208 }
2209 // Fall through
2210 default:
2211 node = atom();
2212 break;
2213 }
2214
2215 if (node instanceof LineEnding) {
2216 LineEnding le = (LineEnding)node;
2217 node = closureOfLineEnding(le);
2218
2219 if (node != le) {
2220 // LineEnding was replaced with an anonymous group
2221 if (head == null)
2222 head = node;
2223 else
2224 tail.next = node;
2225 // Double return: Tail was returned in root
2226 tail = root;
2227 continue;
2228 }
2229 } else {
2230 node = closure(node);
2231 }
2232
2233 /* save the top dot-greedy nodes (.*, .+) as well
2234 if (node instanceof GreedyCharProperty &&
2235 ((GreedyCharProperty)node).cp instanceof Dot) {
2236 topClosureNodes.add(node);
2237 }
2238 */
2239 if (head == null) {
2240 head = tail = node;
2241 } else {
2242 tail.next = node;
2243 tail = node;
2244 }
2245 }
2246 if (head == null) {
2247 return end;
2248 }
2249 tail.next = end;
2250 root = tail; //double return
2251 return head;
2252 }
3079
3080 accept(')', "Unclosed group");
3081 flags0 = save;
3082
3083 // Check for quantifiers
3084 Node node = closure(head);
3085 if (node == head) { // No closure
3086 root = tail;
3087 return node; // Dual return
3088 }
3089 if (head == tail) { // Zero length assertion
3090 root = node;
3091 return node; // Dual return
3092 }
3093
3094 // have group closure, clear all inner closure nodes from the
3095 // top list (no backtracking stopper optimization for inner
3096 if (saveTCNCount < topClosureNodes.size())
3097 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3098
3099 return groupWithClosure(node, head, tail, capturingGroup);
3100 }
3101
3102 /**
3103 * Transforms a Group with quantifiers into some special constructs
3104 * (such as Branch or Loop/GroupCurly), if necessary.
3105 *
3106 * This method is applied either to actual groups or to the Unicode
3107 * linebreak (aka \\R) represented as an anonymous group.
3108 */
3109 private Node groupWithClosure(Node node, Node head, Node tail,
3110 boolean capturingGroup)
3111 {
3112 if (node instanceof Ques) {
3113 Ques ques = (Ques) node;
3114 if (ques.type == Qtype.POSSESSIVE) {
3115 root = node;
3116 return node;
3117 }
3118 BranchConn branchConn = new BranchConn();
3119 tail = tail.next = branchConn;
3120 if (ques.type == Qtype.GREEDY) {
3121 head = new Branch(head, null, branchConn);
3122 } else { // Reluctant quantifier
3123 head = new Branch(null, head, branchConn);
3124 }
3125 root = tail;
3126 return head;
3127 } else if (node instanceof Curly) {
3128 Curly curly = (Curly) node;
3129 if (curly.type == Qtype.POSSESSIVE) {
3130 root = node;
3131 return node;
3132 }
3133 // Discover if the group is deterministic
3134 TreeInfo info = new TreeInfo();
3135 if (head.study(info)) { // Deterministic
3136 GroupTail temp = (GroupTail) tail;
3137 head = root = new GroupCurly(head.next, curly.cmin,
3138 curly.cmax, curly.type,
3139 ((GroupTail)tail).localIndex,
3140 ((GroupTail)tail).groupIndex,
3141 capturingGroup);
3142 return head;
3143 } else { // Non-deterministic
3281 } else if (ch == '+') {
3282 next();
3283 return Qtype.POSSESSIVE;
3284 }
3285 return Qtype.GREEDY;
3286 }
3287
3288 private Node curly(Node prev, int cmin) {
3289 Qtype qtype = qtype();
3290 if (qtype == Qtype.GREEDY) {
3291 if (prev instanceof BmpCharProperty) {
3292 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3293 } else if (prev instanceof CharProperty) {
3294 return new CharPropertyGreedy((CharProperty)prev, cmin);
3295 }
3296 }
3297 return new Curly(prev, cmin, MAX_REPS, qtype);
3298 }
3299
3300 /**
3301 * Processing repetition of a Unicode linebreak \\R.
3302 */
3303 private Node closureOfLineEnding(LineEnding le) {
3304 int ch = peek();
3305 if (ch != '?' && ch != '*' && ch != '+' && ch != '{') {
3306 return le;
3307 }
3308
3309 // Replace the LineEnding with an anonymous group
3310 // (?:\\u000D\\u000A|[\\u000A\\u000B\\u000C\\u000D\\u0085\\u2028\\u2029])
3311 Node grHead = createGroup(true);
3312 Node grTail = root;
3313 BranchConn branchConn = new BranchConn();
3314 branchConn.next = grTail;
3315 Node slice = new Slice(new int[] {0x0D, 0x0A});
3316 slice.next = branchConn;
3317 Node chClass = newCharProperty(x -> x == 0x0A || x == 0x0B ||
3318 x == 0x0C || x == 0x0D || x == 0x85 || x == 0x2028 ||
3319 x == 0x2029);
3320 chClass.next = branchConn;
3321 grHead.next = new Branch(slice, chClass, branchConn);
3322 return groupWithClosure(closure(grHead), grHead, grTail, false);
3323 }
3324
3325 /**
3326 * Processes repetition. If the next character peeked is a quantifier
3327 * then new nodes must be appended to handle the repetition.
3328 * Prev could be a single or a group, so it could be a chain of nodes.
3329 */
3330 private Node closure(Node prev) {
3331 int ch = peek();
3332 switch (ch) {
3333 case '?':
3334 return new Ques(prev, qtype());
3335 case '*':
3336 return curly(prev, 0);
3337 case '+':
3338 return curly(prev, 1);
3339 case '{':
3340 ch = skip();
3341 if (ASCII.isDigit(ch)) {
3342 int cmin = 0, cmax;
3343 try {
3344 do {
3345 cmin = Math.addExact(Math.multiplyExact(cmin, 10),
4761 * "next".
4762 */
4763 static final class BranchConn extends Node {
4764 BranchConn() {}
4765 boolean match(Matcher matcher, int i, CharSequence seq) {
4766 return next.match(matcher, i, seq);
4767 }
4768 boolean study(TreeInfo info) {
4769 return info.deterministic;
4770 }
4771 }
4772
4773 /**
4774 * Handles the branching of alternations. Note this is also used for
4775 * the ? quantifier to branch between the case where it matches once
4776 * and where it does not occur.
4777 */
4778 static final class Branch extends Node {
4779 Node[] atoms = new Node[2];
4780 int size = 2;
4781 BranchConn conn;
4782 Branch(Node first, Node second, BranchConn branchConn) {
4783 conn = branchConn;
4784 atoms[0] = first;
4785 atoms[1] = second;
4786 }
4787
4788 void add(Node node) {
4789 if (size >= atoms.length) {
4790 int len = ArraysSupport.newLength(size,
4791 1, /* minimum growth */
4792 size /* preferred growth */);
4793 atoms = Arrays.copyOf(atoms, len);
4794 }
4795 atoms[size++] = node;
4796 }
4797
4798 boolean match(Matcher matcher, int i, CharSequence seq) {
4799 for (int n = 0; n < size; n++) {
4800 if (atoms[n] == null) {
4801 if (conn.next.match(matcher, i, seq))
4802 return true;
4803 } else if (atoms[n].match(matcher, i, seq)) {
4804 return true;
4805 }
4806 }
4807 return false;
4808 }
4809
4810 boolean study(TreeInfo info) {
4811 int minL = info.minLength;
4812 int maxL = info.maxLength;
4813 boolean maxV = info.maxValid;
|