967
968 /**
969 * The starting point of state machine for the find operation. This allows
970 * a match to start anywhere in the input.
971 */
972 transient Node root;
973
974 /**
975 * The root of object tree for a match operation. The pattern is matched
976 * at the beginning. This may include a find that uses BnM or a First
977 * node.
978 */
979 transient Node matchRoot;
980
981 /**
982 * Temporary storage used by parsing pattern slice.
983 */
984 transient int[] buffer;
985
986 /**
987 * Map the "name" of the "named capturing group" to its group id
988 * node.
989 */
990 transient volatile Map<String, Integer> namedGroups;
991
992 /**
993 * Temporary storage used while parsing group references.
994 */
995 transient GroupHead[] groupNodes;
996
997 /**
998 * Temporary null terminated code point array used by pattern compiling.
999 */
1000 private transient int[] temp;
1001
1002 /**
1003 * The number of capturing groups in this Pattern. Used by matchers to
1004 * allocate storage needed to perform a match.
1005 */
1006 transient int capturingGroupCount;
1009 * The local variable count used by parsing tree. Used by matchers to
1010 * allocate storage needed to perform a match.
1011 */
1012 transient int localCount;
1013
1014 /**
1015 * Index into the pattern string that keeps track of how much has been
1016 * parsed.
1017 */
1018 private transient int cursor;
1019
1020 /**
1021 * Holds the length of the pattern string.
1022 */
1023 private transient int patternLength;
1024
1025 /**
1026 * If the Start node might possibly match supplementary characters.
1027 * It is set to true during compiling if
1028 * (1) There is supplementary char in pattern, or
1029 * (2) There is complement node of Category or Block
1030 */
1031 private transient boolean hasSupplementary;
1032
1033 /**
1034 * Compiles the given regular expression into a pattern.
1035 *
1036 * @param regex
1037 * The expression to be compiled
1038 * @return the given regular expression compiled into a pattern
1039 * @throws PatternSyntaxException
1040 * If the expression's syntax is invalid
1041 */
1042 public static Pattern compile(String regex) {
1043 return new Pattern(regex, 0);
1044 }
1045
1046 /**
1047 * Compiles the given regular expression into a pattern with the given
1048 * flags.
1049 *
1737 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1738 }
1739
1740 // Release temporary storage
1741 temp = null;
1742 buffer = null;
1743 groupNodes = null;
1744 patternLength = 0;
1745 compiled = true;
1746 }
1747
1748 Map<String, Integer> namedGroups() {
1749 Map<String, Integer> groups = namedGroups;
1750 if (groups == null) {
1751 namedGroups = groups = new HashMap<>(2);
1752 }
1753 return groups;
1754 }
1755
1756 /**
1757 * Used to print out a subtree of the Pattern to help with debugging.
1758 */
1759 private static void printObjectTree(Node node) {
1760 while(node != null) {
1761 if (node instanceof Prolog) {
1762 System.out.println(node);
1763 printObjectTree(((Prolog)node).loop);
1764 System.out.println("**** end contents prolog loop");
1765 } else if (node instanceof Loop) {
1766 System.out.println(node);
1767 printObjectTree(((Loop)node).body);
1768 System.out.println("**** end contents Loop body");
1769 } else if (node instanceof Curly) {
1770 System.out.println(node);
1771 printObjectTree(((Curly)node).atom);
1772 System.out.println("**** end contents Curly body");
1773 } else if (node instanceof GroupCurly) {
1774 System.out.println(node);
1775 printObjectTree(((GroupCurly)node).atom);
1776 System.out.println("**** end contents GroupCurly body");
1777 } else if (node instanceof GroupTail) {
1778 System.out.println(node);
1779 System.out.println("Tail next is "+node.next);
1780 return;
1781 } else {
1782 System.out.println(node);
1783 }
1784 node = node.next;
1785 if (node != null)
1786 System.out.println("->next:");
1787 if (node == Pattern.accept) {
1788 System.out.println("Accept Node");
1789 node = null;
1790 }
1791 }
1792 }
1793
1794 /**
1795 * Used to accumulate information about a subtree of the object graph
1796 * so that optimizations can be applied to the subtree.
1797 */
1798 static final class TreeInfo {
1799 int minLength;
1800 int maxLength;
1801 boolean maxValid;
1802 boolean deterministic;
1803
1804 TreeInfo() {
1805 reset();
1806 }
1807 void reset() {
1808 minLength = 0;
1809 maxLength = 0;
1810 maxValid = true;
1811 deterministic = true;
1812 }
1813 }
1814
2066 Node node = null;
2067 LOOP:
2068 for (;;) {
2069 int ch = peek();
2070 switch (ch) {
2071 case '(':
2072 // Because group handles its own closure,
2073 // we need to treat it differently
2074 node = group0();
2075 // Check for comment or flag group
2076 if (node == null)
2077 continue;
2078 if (head == null)
2079 head = node;
2080 else
2081 tail.next = node;
2082 // Double return: Tail was returned in root
2083 tail = root;
2084 continue;
2085 case '[':
2086 node = clazz(true);
2087 break;
2088 case '\\':
2089 ch = nextEscaped();
2090 if (ch == 'p' || ch == 'P') {
2091 boolean oneLetter = true;
2092 boolean comp = (ch == 'P');
2093 ch = next(); // Consume { if present
2094 if (ch != '{') {
2095 unread();
2096 } else {
2097 oneLetter = false;
2098 }
2099 node = family(oneLetter, comp);
2100 } else {
2101 unread();
2102 node = atom();
2103 }
2104 break;
2105 case '^':
2106 next();
2107 if (has(MULTILINE)) {
2108 if (has(UNIX_LINES))
2109 node = new UnixCaret();
2110 else
2111 node = new Caret();
2112 } else {
2113 node = new Begin();
2114 }
2115 break;
2116 case '$':
2117 next();
2118 if (has(UNIX_LINES))
2119 node = new UnixDollar(has(MULTILINE));
2120 else
2121 node = new Dollar(has(MULTILINE));
2122 break;
2123 case '.':
2124 next();
2125 if (has(DOTALL)) {
2126 node = new All();
2127 } else {
2128 if (has(UNIX_LINES))
2129 node = new UnixDot();
2130 else {
2131 node = new Dot();
2132 }
2133 }
2134 break;
2135 case '|':
2136 case ')':
2137 break LOOP;
2138 case ']': // Now interpreting dangling ] and } as literals
2139 case '}':
2140 node = atom();
2141 break;
2142 case '?':
2143 case '*':
2144 case '+':
2145 next();
2146 throw error("Dangling meta character '" + ((char)ch) + "'");
2147 case 0:
2148 if (cursor >= patternLength) {
2149 break LOOP;
2150 }
2151 // Fall through
2152 default:
2153 node = atom();
2154 break;
2155 }
2156
2157 node = closure(node);
2158
2159 if (head == null) {
2160 head = tail = node;
2161 } else {
2162 tail.next = node;
2163 tail = node;
2164 }
2165 }
2166 if (head == null) {
2167 return end;
2168 }
2169 tail.next = end;
2170 root = tail; //double return
2171 return head;
2172 }
2173
2174 @SuppressWarnings("fallthrough")
2175 /**
2176 * Parse and add a new Single or Slice.
2177 */
2178 private Node atom() {
2196 case '^':
2197 case '(':
2198 case '[':
2199 case '|':
2200 case ')':
2201 break;
2202 case '\\':
2203 ch = nextEscaped();
2204 if (ch == 'p' || ch == 'P') { // Property
2205 if (first > 0) { // Slice is waiting; handle it first
2206 unread();
2207 break;
2208 } else { // No slice; just return the family node
2209 boolean comp = (ch == 'P');
2210 boolean oneLetter = true;
2211 ch = next(); // Consume { if present
2212 if (ch != '{')
2213 unread();
2214 else
2215 oneLetter = false;
2216 return family(oneLetter, comp);
2217 }
2218 }
2219 unread();
2220 prev = cursor;
2221 ch = escape(false, first == 0, false);
2222 if (ch >= 0) {
2223 append(ch, first);
2224 first++;
2225 if (isSupplementary(ch)) {
2226 hasSupplementary = true;
2227 }
2228 ch = peek();
2229 continue;
2230 } else if (first == 0) {
2231 return root;
2232 }
2233 // Unwind meta escape sequence
2234 cursor = prev;
2235 break;
2236 case 0:
2237 if (cursor >= patternLength) {
2238 break;
2239 }
2240 // Fall through
2241 default:
2242 prev = cursor;
2243 append(ch, first);
2244 first++;
2245 if (isSupplementary(ch)) {
2246 hasSupplementary = true;
2247 }
2248 ch = next();
2249 continue;
2250 }
2251 break;
2252 }
2253 if (first == 1) {
2254 return newSingle(buffer[0]);
2255 } else {
2256 return newSlice(buffer, first, hasSupplementary);
2257 }
2258 }
2259
2260 private void append(int ch, int len) {
2261 if (len >= buffer.length) {
2262 int[] tmp = new int[len+len];
2263 System.arraycopy(buffer, 0, tmp, 0, len);
2264 buffer = tmp;
2265 }
2266 buffer[len] = ch;
2267 }
2268
2269 /**
2270 * Parses a backref greedily, taking as many numbers as it
2271 * can. The first digit is always treated as a backref, but
2272 * multi digit numbers are only treated as a backref if at
2273 * least that many backrefs exist at this point in the regex.
2274 */
2329 case '6':
2330 case '7':
2331 case '8':
2332 case '9':
2333 if (inclass) break;
2334 if (create) {
2335 root = ref((ch - '0'));
2336 }
2337 return -1;
2338 case 'A':
2339 if (inclass) break;
2340 if (create) root = new Begin();
2341 return -1;
2342 case 'B':
2343 if (inclass) break;
2344 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2345 return -1;
2346 case 'C':
2347 break;
2348 case 'D':
2349 if (create) root = has(UNICODE_CHARACTER_CLASS)
2350 ? new Utype(UnicodeProp.DIGIT).complement()
2351 : new Ctype(ASCII.DIGIT).complement();
2352 return -1;
2353 case 'E':
2354 case 'F':
2355 break;
2356 case 'G':
2357 if (inclass) break;
2358 if (create) root = new LastMatch();
2359 return -1;
2360 case 'H':
2361 if (create) root = new HorizWS().complement();
2362 return -1;
2363 case 'I':
2364 case 'J':
2365 case 'K':
2366 case 'L':
2367 case 'M':
2368 break;
2369 case 'N':
2370 return N();
2371 case 'O':
2372 case 'P':
2373 case 'Q':
2374 break;
2375 case 'R':
2376 if (inclass) break;
2377 if (create) root = new LineEnding();
2378 return -1;
2379 case 'S':
2380 if (create) root = has(UNICODE_CHARACTER_CLASS)
2381 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382 : new Ctype(ASCII.SPACE).complement();
2383 return -1;
2384 case 'T':
2385 case 'U':
2386 break;
2387 case 'V':
2388 if (create) root = new VertWS().complement();
2389 return -1;
2390 case 'W':
2391 if (create) root = has(UNICODE_CHARACTER_CLASS)
2392 ? new Utype(UnicodeProp.WORD).complement()
2393 : new Ctype(ASCII.WORD).complement();
2394 return -1;
2395 case 'X':
2396 if (inclass) break;
2397 if (create) {
2398 root = new XGrapheme();
2399 }
2400 return -1;
2401 case 'Y':
2402 break;
2403 case 'Z':
2404 if (inclass) break;
2405 if (create) {
2406 if (has(UNIX_LINES))
2407 root = new UnixDollar(false);
2408 else
2409 root = new Dollar(false);
2410 }
2411 return -1;
2412 case 'a':
2413 return '\007';
2414 case 'b':
2415 if (inclass) break;
2416 if (create) {
2417 if (peek() == '{') {
2418 if (skip() == 'g') {
2419 if (read() == '}') {
2420 root = new GraphemeBound();
2421 return -1;
2422 }
2423 break; // error missing trailing }
2424 }
2425 unread(); unread();
2426 }
2427 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428 }
2429 return -1;
2430 case 'c':
2431 return c();
2432 case 'd':
2433 if (create) root = has(UNICODE_CHARACTER_CLASS)
2434 ? new Utype(UnicodeProp.DIGIT)
2435 : new Ctype(ASCII.DIGIT);
2436 return -1;
2437 case 'e':
2438 return '\033';
2439 case 'f':
2440 return '\f';
2441 case 'g':
2442 break;
2443 case 'h':
2444 if (create) root = new HorizWS();
2445 return -1;
2446 case 'i':
2447 case 'j':
2448 break;
2449 case 'k':
2450 if (inclass)
2451 break;
2452 if (read() != '<')
2453 throw error("\\k is not followed by '<' for named capturing group");
2454 String name = groupname(read());
2455 if (!namedGroups().containsKey(name))
2456 throw error("(named capturing group <"+ name+"> does not exit");
2457 if (create) {
2458 if (has(CASE_INSENSITIVE))
2459 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2460 else
2461 root = new BackRef(namedGroups().get(name));
2462 }
2463 return -1;
2464 case 'l':
2465 case 'm':
2466 break;
2467 case 'n':
2468 return '\n';
2469 case 'o':
2470 case 'p':
2471 case 'q':
2472 break;
2473 case 'r':
2474 return '\r';
2475 case 's':
2476 if (create) root = has(UNICODE_CHARACTER_CLASS)
2477 ? new Utype(UnicodeProp.WHITE_SPACE)
2478 : new Ctype(ASCII.SPACE);
2479 return -1;
2480 case 't':
2481 return '\t';
2482 case 'u':
2483 return u();
2484 case 'v':
2485 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2486 // undocumented). In JDK8 '\v' is specified as a predefined
2487 // character class for all vertical whitespace characters.
2488 // So [-1, root=VertWS node] pair is returned (instead of a
2489 // single 0x0B). This breaks the range if '\v' is used as
2490 // the start or end value, such as [\v-...] or [...-\v], in
2491 // which a single definite value (0x0B) is expected. For
2492 // compatibility concern '\013'/0x0B is returned if isrange.
2493 if (isrange)
2494 return '\013';
2495 if (create) root = new VertWS();
2496 return -1;
2497 case 'w':
2498 if (create) root = has(UNICODE_CHARACTER_CLASS)
2499 ? new Utype(UnicodeProp.WORD)
2500 : new Ctype(ASCII.WORD);
2501 return -1;
2502 case 'x':
2503 return x();
2504 case 'y':
2505 break;
2506 case 'z':
2507 if (inclass) break;
2508 if (create) root = new End();
2509 return -1;
2510 default:
2511 return ch;
2512 }
2513 throw error("Illegal/unsupported escape sequence");
2514 }
2515
2516 /**
2517 * Parse a character class, and return the node that matches it.
2518 *
2519 * Consumes a ] on the way out if consume is true. Usually consume
2520 * is true except for the case of [abc&&def] where def is a separate
2521 * right hand node with "understood" brackets.
2522 */
2523 private CharProperty clazz(boolean consume) {
2524 CharProperty prev = null;
2525 CharProperty node = null;
2526 BitClass bits = new BitClass();
2527 boolean include = true;
2528 boolean firstInClass = true;
2529 int ch = next();
2530 for (;;) {
2531 switch (ch) {
2532 case '^':
2533 // Negates if first char in a class, otherwise literal
2534 if (firstInClass) {
2535 if (temp[cursor-1] != '[')
2536 break;
2537 ch = next();
2538 include = !include;
2539 continue;
2540 } else {
2541 // ^ not first in class, treat as literal
2542 break;
2543 }
2544 case '[':
2545 firstInClass = false;
2546 node = clazz(true);
2547 if (prev == null)
2548 prev = node;
2549 else
2550 prev = union(prev, node);
2551 ch = peek();
2552 continue;
2553 case '&':
2554 firstInClass = false;
2555 ch = next();
2556 if (ch == '&') {
2557 ch = next();
2558 CharProperty rightNode = null;
2559 while (ch != ']' && ch != '&') {
2560 if (ch == '[') {
2561 if (rightNode == null)
2562 rightNode = clazz(true);
2563 else
2564 rightNode = union(rightNode, clazz(true));
2565 } else { // abc&&def
2566 unread();
2567 rightNode = clazz(false);
2568 }
2569 ch = peek();
2570 }
2571 if (rightNode != null)
2572 node = rightNode;
2573 if (prev == null) {
2574 if (rightNode == null)
2575 throw error("Bad class syntax");
2576 else
2577 prev = rightNode;
2578 } else {
2579 prev = intersection(prev, node);
2580 }
2581 } else {
2582 // treat as a literal &
2583 unread();
2584 break;
2585 }
2586 continue;
2587 case 0:
2588 firstInClass = false;
2589 if (cursor >= patternLength)
2590 throw error("Unclosed character class");
2591 break;
2592 case ']':
2593 firstInClass = false;
2594 if (prev != null) {
2595 if (consume)
2596 next();
2597 return prev;
2598 }
2599 break;
2600 default:
2601 firstInClass = false;
2602 break;
2603 }
2604 node = range(bits);
2605 if (include) {
2606 if (prev == null) {
2607 prev = node;
2608 } else {
2609 if (prev != node)
2610 prev = union(prev, node);
2611 }
2612 } else {
2613 if (prev == null) {
2614 prev = node.complement();
2615 } else {
2616 if (prev != node)
2617 prev = setDifference(prev, node);
2618 }
2619 }
2620 ch = peek();
2621 }
2622 }
2623
2624 private CharProperty bitsOrSingle(BitClass bits, int ch) {
2625 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2626 Use "single" node instead of bits when dealing with unicode
2627 case folding for codepoints listed below.
2628 (1)Uppercase out of range: u+00ff, u+00b5
2629 toUpperCase(u+00ff) -> u+0178
2630 toUpperCase(u+00b5) -> u+039c
2631 (2)LatinSmallLetterLongS u+17f
2632 toUpperCase(u+017f) -> u+0053
2633 (3)LatinSmallLetterDotlessI u+131
2634 toUpperCase(u+0131) -> u+0049
2635 (4)LatinCapitalLetterIWithDotAbove u+0130
2636 toLowerCase(u+0130) -> u+0069
2637 (5)KelvinSign u+212a
2638 toLowerCase(u+212a) ==> u+006B
2639 (6)AngstromSign u+212b
2640 toLowerCase(u+212b) ==> u+00e5
2641 */
2642 int d;
2643 if (ch < 256 &&
2644 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2645 (ch == 0xff || ch == 0xb5 ||
2646 ch == 0x49 || ch == 0x69 || //I and i
2647 ch == 0x53 || ch == 0x73 || //S and s
2648 ch == 0x4b || ch == 0x6b || //K and k
2649 ch == 0xc5 || ch == 0xe5))) //A+ring
2650 return bits.add(ch, flags());
2651 return newSingle(ch);
2652 }
2653
2654 /**
2655 * Parse a single character or a character range in a character class
2656 * and return its representative node.
2657 */
2658 private CharProperty range(BitClass bits) {
2659 int ch = peek();
2660 if (ch == '\\') {
2661 ch = nextEscaped();
2662 if (ch == 'p' || ch == 'P') { // A property
2663 boolean comp = (ch == 'P');
2664 boolean oneLetter = true;
2665 // Consume { if present
2666 ch = next();
2667 if (ch != '{')
2668 unread();
2669 else
2670 oneLetter = false;
2671 return family(oneLetter, comp);
2672 } else { // ordinary escape
2673 boolean isrange = temp[cursor+1] == '-';
2674 unread();
2675 ch = escape(true, true, isrange);
2676 if (ch == -1)
2677 return (CharProperty) root;
2678 }
2679 } else {
2680 next();
2681 }
2682 if (ch >= 0) {
2683 if (peek() == '-') {
2684 int endRange = temp[cursor+1];
2685 if (endRange == '[') {
2686 return bitsOrSingle(bits, ch);
2687 }
2688 if (endRange != ']') {
2689 next();
2690 int m = peek();
2691 if (m == '\\') {
2692 m = escape(true, false, true);
2693 } else {
2694 next();
2695 }
2696 if (m < ch) {
2697 throw error("Illegal character range");
2698 }
2699 if (has(CASE_INSENSITIVE))
2700 return caseInsensitiveRangeFor(ch, m);
2701 else
2702 return rangeFor(ch, m);
2703 }
2704 }
2705 return bitsOrSingle(bits, ch);
2706 }
2707 throw error("Unexpected character '"+((char)ch)+"'");
2708 }
2709
2710 /**
2711 * Parses a Unicode character family and returns its representative node.
2712 */
2713 private CharProperty family(boolean singleLetter,
2714 boolean maybeComplement)
2715 {
2716 next();
2717 String name;
2718 CharProperty node = null;
2719
2720 if (singleLetter) {
2721 int c = temp[cursor];
2722 if (!Character.isSupplementaryCodePoint(c)) {
2723 name = String.valueOf((char)c);
2724 } else {
2725 name = new String(temp, cursor, 1);
2726 }
2727 read();
2728 } else {
2729 int i = cursor;
2730 mark('}');
2731 while(read() != '}') {
2732 }
2733 mark('\000');
2734 int j = cursor;
2735 if (j > patternLength)
2736 throw error("Unclosed character family");
2737 if (i + 1 >= j)
2738 throw error("Empty character family");
2739 name = new String(temp, i, j-i-1);
2740 }
2741
2742 int i = name.indexOf('=');
2743 if (i != -1) {
2744 // property construct \p{name=value}
2745 String value = name.substring(i + 1);
2746 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2747 switch (name) {
2748 case "sc":
2749 case "script":
2750 node = unicodeScriptPropertyFor(value);
2751 break;
2752 case "blk":
2753 case "block":
2754 node = unicodeBlockPropertyFor(value);
2755 break;
2756 case "gc":
2757 case "general_category":
2758 node = charPropertyNodeFor(value);
2759 break;
2760 default:
2761 throw error("Unknown Unicode property {name=<" + name + ">, "
2762 + "value=<" + value + ">}");
2763 }
2764 } else {
2765 if (name.startsWith("In")) {
2766 // \p{inBlockName}
2767 node = unicodeBlockPropertyFor(name.substring(2));
2768 } else if (name.startsWith("Is")) {
2769 // \p{isGeneralCategory} and \p{isScriptName}
2770 name = name.substring(2);
2771 UnicodeProp uprop = UnicodeProp.forName(name);
2772 if (uprop != null)
2773 node = new Utype(uprop);
2774 if (node == null)
2775 node = CharPropertyNames.charPropertyFor(name);
2776 if (node == null)
2777 node = unicodeScriptPropertyFor(name);
2778 } else {
2779 if (has(UNICODE_CHARACTER_CLASS)) {
2780 UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
2781 if (uprop != null)
2782 node = new Utype(uprop);
2783 }
2784 if (node == null)
2785 node = charPropertyNodeFor(name);
2786 }
2787 }
2788 if (maybeComplement) {
2789 if (node instanceof Category || node instanceof Block)
2790 hasSupplementary = true;
2791 node = node.complement();
2792 }
2793 return node;
2794 }
2795
2796
2797 /**
2798 * Returns a CharProperty matching all characters belong to
2799 * a UnicodeScript.
2800 */
2801 private CharProperty unicodeScriptPropertyFor(String name) {
2802 final Character.UnicodeScript script;
2803 try {
2804 script = Character.UnicodeScript.forName(name);
2805 } catch (IllegalArgumentException iae) {
2806 throw error("Unknown character script name {" + name + "}");
2807 }
2808 return new Script(script);
2809 }
2810
2811 /**
2812 * Returns a CharProperty matching all characters in a UnicodeBlock.
2813 */
2814 private CharProperty unicodeBlockPropertyFor(String name) {
2815 final Character.UnicodeBlock block;
2816 try {
2817 block = Character.UnicodeBlock.forName(name);
2818 } catch (IllegalArgumentException iae) {
2819 throw error("Unknown character block name {" + name + "}");
2820 }
2821 return new Block(block);
2822 }
2823
2824 /**
2825 * Returns a CharProperty matching all characters in a named property.
2826 */
2827 private CharProperty charPropertyNodeFor(String name) {
2828 CharProperty p = CharPropertyNames.charPropertyFor(name);
2829 if (p == null)
2830 throw error("Unknown character property name {" + name + "}");
2831 return p;
2832 }
2833
2834 /**
2835 * Parses and returns the name of a "named capturing group", the trailing
2836 * ">" is consumed after parsing.
2837 */
2838 private String groupname(int ch) {
2839 StringBuilder sb = new StringBuilder();
2840 sb.append(Character.toChars(ch));
2841 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2842 ASCII.isDigit(ch)) {
2843 sb.append(Character.toChars(ch));
2844 }
2845 if (sb.length() == 0)
2846 throw error("named capturing group has 0 length name");
2847 if (ch != '>')
2848 throw error("named capturing group is missing trailing '>'");
2849 return sb.toString();
2850 }
2851
2867 case ':': // (?:xxx) pure group
2868 head = createGroup(true);
2869 tail = root;
2870 head.next = expr(tail);
2871 break;
2872 case '=': // (?=xxx) and (?!xxx) lookahead
2873 case '!':
2874 head = createGroup(true);
2875 tail = root;
2876 head.next = expr(tail);
2877 if (ch == '=') {
2878 head = tail = new Pos(head);
2879 } else {
2880 head = tail = new Neg(head);
2881 }
2882 break;
2883 case '>': // (?>xxx) independent group
2884 head = createGroup(true);
2885 tail = root;
2886 head.next = expr(tail);
2887 head = tail = new Ques(head, INDEPENDENT);
2888 break;
2889 case '<': // (?<xxx) look behind
2890 ch = read();
2891 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2892 // named captured group
2893 String name = groupname(ch);
2894 if (namedGroups().containsKey(name))
2895 throw error("Named capturing group <" + name
2896 + "> is already defined");
2897 capturingGroup = true;
2898 head = createGroup(false);
2899 tail = root;
2900 namedGroups().put(name, capturingGroupCount-1);
2901 head.next = expr(tail);
2902 break;
2903 }
2904 int start = cursor;
2905 head = createGroup(true);
2906 tail = root;
2907 head.next = expr(tail);
2953 tail = root;
2954 head.next = expr(tail);
2955 }
2956
2957 accept(')', "Unclosed group");
2958 flags = save;
2959
2960 // Check for quantifiers
2961 Node node = closure(head);
2962 if (node == head) { // No closure
2963 root = tail;
2964 return node; // Dual return
2965 }
2966 if (head == tail) { // Zero length assertion
2967 root = node;
2968 return node; // Dual return
2969 }
2970
2971 if (node instanceof Ques) {
2972 Ques ques = (Ques) node;
2973 if (ques.type == POSSESSIVE) {
2974 root = node;
2975 return node;
2976 }
2977 tail.next = new BranchConn();
2978 tail = tail.next;
2979 if (ques.type == GREEDY) {
2980 head = new Branch(head, null, tail);
2981 } else { // Reluctant quantifier
2982 head = new Branch(null, head, tail);
2983 }
2984 root = tail;
2985 return head;
2986 } else if (node instanceof Curly) {
2987 Curly curly = (Curly) node;
2988 if (curly.type == POSSESSIVE) {
2989 root = node;
2990 return node;
2991 }
2992 // Discover if the group is deterministic
2993 TreeInfo info = new TreeInfo();
2994 if (head.study(info)) { // Deterministic
2995 GroupTail temp = (GroupTail) tail;
2996 head = root = new GroupCurly(head.next, curly.cmin,
2997 curly.cmax, curly.type,
2998 ((GroupTail)tail).localIndex,
2999 ((GroupTail)tail).groupIndex,
3000 capturingGroup);
3001 return head;
3002 } else { // Non-deterministic
3003 int temp = ((GroupHead) head).localIndex;
3004 Loop loop;
3005 if (curly.type == GREEDY)
3006 loop = new Loop(this.localCount, temp);
3007 else // Reluctant Curly
3008 loop = new LazyLoop(this.localCount, temp);
3009 Prolog prolog = new Prolog(loop);
3010 this.localCount += 1;
3011 loop.cmin = curly.cmin;
3012 loop.cmax = curly.cmax;
3013 loop.body = head;
3014 tail.next = loop;
3015 root = loop;
3016 return prolog; // Dual return
3017 }
3018 }
3019 throw error("Internal logic error");
3020 }
3021
3022 /**
3023 * Create group head and tail nodes using double return. If the group is
3024 * created with anonymous true then it is a pure group and should not
3025 * affect group counting.
3026 */
3027 private Node createGroup(boolean anonymous) {
3028 int localIndex = localCount++;
3029 int groupIndex = 0;
3030 if (!anonymous)
3031 groupIndex = capturingGroupCount++;
3032 GroupHead head = new GroupHead(localIndex);
3033 root = new GroupTail(localIndex, groupIndex);
3034 if (!anonymous && groupIndex < 10)
3035 groupNodes[groupIndex] = head;
3036 return head;
3037 }
3038
3039 @SuppressWarnings("fallthrough")
3040 /**
3041 * Parses inlined match flags and set them appropriately.
3042 */
3043 private void addFlag() {
3044 int ch = peek();
3045 for (;;) {
3046 switch (ch) {
3047 case 'i':
3048 flags |= CASE_INSENSITIVE;
3049 break;
3050 case 'm':
3051 flags |= MULTILINE;
3052 break;
3053 case 's':
3102 case 'u':
3103 flags &= ~UNICODE_CASE;
3104 break;
3105 case 'c':
3106 flags &= ~CANON_EQ;
3107 break;
3108 case 'x':
3109 flags &= ~COMMENTS;
3110 break;
3111 case 'U':
3112 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3113 default:
3114 return;
3115 }
3116 ch = next();
3117 }
3118 }
3119
3120 static final int MAX_REPS = 0x7FFFFFFF;
3121
3122 static final int GREEDY = 0;
3123
3124 static final int LAZY = 1;
3125
3126 static final int POSSESSIVE = 2;
3127
3128 static final int INDEPENDENT = 3;
3129
3130 /**
3131 * Processes repetition. If the next character peeked is a quantifier
3132 * then new nodes must be appended to handle the repetition.
3133 * Prev could be a single or a group, so it could be a chain of nodes.
3134 */
3135 private Node closure(Node prev) {
3136 Node atom;
3137 int ch = peek();
3138 switch (ch) {
3139 case '?':
3140 ch = next();
3141 if (ch == '?') {
3142 next();
3143 return new Ques(prev, LAZY);
3144 } else if (ch == '+') {
3145 next();
3146 return new Ques(prev, POSSESSIVE);
3147 }
3148 return new Ques(prev, GREEDY);
3149 case '*':
3150 ch = next();
3151 if (ch == '?') {
3152 next();
3153 return new Curly(prev, 0, MAX_REPS, LAZY);
3154 } else if (ch == '+') {
3155 next();
3156 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
3157 }
3158 return new Curly(prev, 0, MAX_REPS, GREEDY);
3159 case '+':
3160 ch = next();
3161 if (ch == '?') {
3162 next();
3163 return new Curly(prev, 1, MAX_REPS, LAZY);
3164 } else if (ch == '+') {
3165 next();
3166 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
3167 }
3168 return new Curly(prev, 1, MAX_REPS, GREEDY);
3169 case '{':
3170 ch = temp[cursor+1];
3171 if (ASCII.isDigit(ch)) {
3172 skip();
3173 int cmin = 0;
3174 do {
3175 cmin = cmin * 10 + (ch - '0');
3176 } while (ASCII.isDigit(ch = read()));
3177 int cmax = cmin;
3178 if (ch == ',') {
3179 ch = read();
3180 cmax = MAX_REPS;
3181 if (ch != '}') {
3182 cmax = 0;
3183 while (ASCII.isDigit(ch)) {
3184 cmax = cmax * 10 + (ch - '0');
3185 ch = read();
3186 }
3187 }
3188 }
3189 if (ch != '}')
3190 throw error("Unclosed counted closure");
3191 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3192 throw error("Illegal repetition range");
3193 Curly curly;
3194 ch = peek();
3195 if (ch == '?') {
3196 next();
3197 curly = new Curly(prev, cmin, cmax, LAZY);
3198 } else if (ch == '+') {
3199 next();
3200 curly = new Curly(prev, cmin, cmax, POSSESSIVE);
3201 } else {
3202 curly = new Curly(prev, cmin, cmax, GREEDY);
3203 }
3204 return curly;
3205 } else {
3206 throw error("Illegal repetition");
3207 }
3208 default:
3209 return prev;
3210 }
3211 }
3212
3213 /**
3214 * Utility method for parsing control escape sequences.
3215 */
3216 private int c() {
3217 if (cursor < patternLength) {
3218 return read() ^ 64;
3219 }
3220 throw error("Illegal control escape sequence");
3221 }
3222
3359
3360 private static final int countCodePoints(CharSequence seq) {
3361 int length = seq.length();
3362 int n = 0;
3363 for (int i = 0; i < length; ) {
3364 n++;
3365 if (Character.isHighSurrogate(seq.charAt(i++))) {
3366 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3367 i++;
3368 }
3369 }
3370 }
3371 return n;
3372 }
3373
3374 /**
3375 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3376 * never matches values above Latin-1, and a complemented BitClass always
3377 * matches values above Latin-1.
3378 */
3379 private static final class BitClass extends BmpCharProperty {
3380 final boolean[] bits;
3381 BitClass() { bits = new boolean[256]; }
3382 private BitClass(boolean[] bits) { this.bits = bits; }
3383 BitClass add(int c, int flags) {
3384 assert c >= 0 && c <= 255;
3385 if ((flags & CASE_INSENSITIVE) != 0) {
3386 if (ASCII.isAscii(c)) {
3387 bits[ASCII.toUpper(c)] = true;
3388 bits[ASCII.toLower(c)] = true;
3389 } else if ((flags & UNICODE_CASE) != 0) {
3390 bits[Character.toLowerCase(c)] = true;
3391 bits[Character.toUpperCase(c)] = true;
3392 }
3393 }
3394 bits[c] = true;
3395 return this;
3396 }
3397 boolean isSatisfiedBy(int ch) {
3398 return ch < 256 && bits[ch];
3399 }
3400 }
3401
3402 /**
3403 * Returns a suitably optimized, single character matcher.
3404 */
3405 private CharProperty newSingle(final int ch) {
3406 if (has(CASE_INSENSITIVE)) {
3407 int lower, upper;
3408 if (has(UNICODE_CASE)) {
3409 upper = Character.toUpperCase(ch);
3410 lower = Character.toLowerCase(upper);
3411 if (upper != lower)
3412 return new SingleU(lower);
3413 } else if (ASCII.isAscii(ch)) {
3414 lower = ASCII.toLower(ch);
3415 upper = ASCII.toUpper(ch);
3416 if (lower != upper)
3417 return new SingleI(lower, upper);
3418 }
3419 }
3420 if (isSupplementary(ch))
3421 return new SingleS(ch); // Match a given Unicode character
3422 return new Single(ch); // Match a given BMP character
3423 }
3424
3425 /**
3426 * Utility method for creating a string slice matcher.
3427 */
3428 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3429 int[] tmp = new int[count];
3430 if (has(CASE_INSENSITIVE)) {
3431 if (has(UNICODE_CASE)) {
3432 for (int i = 0; i < count; i++) {
3433 tmp[i] = Character.toLowerCase(
3434 Character.toUpperCase(buf[i]));
3435 }
3436 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3437 }
3438 for (int i = 0; i < count; i++) {
3439 tmp[i] = ASCII.toLower(buf[i]);
3440 }
3441 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3442 }
3810 if (i < matcher.to && seq.charAt(i) == 0x0A)
3811 i++;
3812 return next.match(matcher, i, seq);
3813 }
3814 } else {
3815 matcher.hitEnd = true;
3816 }
3817 return false;
3818 }
3819 boolean study(TreeInfo info) {
3820 info.minLength++;
3821 info.maxLength += 2;
3822 return next.study(info);
3823 }
3824 }
3825
3826 /**
3827 * Abstract node class to match one character satisfying some
3828 * boolean property.
3829 */
3830 private abstract static class CharProperty extends Node {
3831 abstract boolean isSatisfiedBy(int ch);
3832 CharProperty complement() {
3833 return new CharProperty() {
3834 boolean isSatisfiedBy(int ch) {
3835 return ! CharProperty.this.isSatisfiedBy(ch);}};
3836 }
3837 boolean match(Matcher matcher, int i, CharSequence seq) {
3838 if (i < matcher.to) {
3839 int ch = Character.codePointAt(seq, i);
3840 return isSatisfiedBy(ch)
3841 && next.match(matcher, i+Character.charCount(ch), seq);
3842 } else {
3843 matcher.hitEnd = true;
3844 return false;
3845 }
3846 }
3847 boolean study(TreeInfo info) {
3848 info.minLength++;
3849 info.maxLength++;
3850 return next.study(info);
3851 }
3852 }
3853
3854 /**
3855 * Optimized version of CharProperty that works only for
3856 * properties never satisfied by Supplementary characters.
3857 */
3858 private abstract static class BmpCharProperty extends CharProperty {
3859 boolean match(Matcher matcher, int i, CharSequence seq) {
3860 if (i < matcher.to) {
3861 return isSatisfiedBy(seq.charAt(i))
3862 && next.match(matcher, i+1, seq);
3863 } else {
3864 matcher.hitEnd = true;
3865 return false;
3866 }
3867 }
3868 }
3869
3870 /**
3871 * Node class that matches a Supplementary Unicode character
3872 */
3873 static final class SingleS extends CharProperty {
3874 final int c;
3875 SingleS(int c) { this.c = c; }
3876 boolean isSatisfiedBy(int ch) {
3877 return ch == c;
3878 }
3879 }
3880
3881 /**
3882 * Optimization -- matches a given BMP character
3883 */
3884 static final class Single extends BmpCharProperty {
3885 final int c;
3886 Single(int c) { this.c = c; }
3887 boolean isSatisfiedBy(int ch) {
3888 return ch == c;
3889 }
3890 }
3891
3892 /**
3893 * Case insensitive matches a given BMP character
3894 */
3895 static final class SingleI extends BmpCharProperty {
3896 final int lower;
3897 final int upper;
3898 SingleI(int lower, int upper) {
3899 this.lower = lower;
3900 this.upper = upper;
3901 }
3902 boolean isSatisfiedBy(int ch) {
3903 return ch == lower || ch == upper;
3904 }
3905 }
3906
3907 /**
3908 * Unicode case insensitive matches a given Unicode character
3909 */
3910 static final class SingleU extends CharProperty {
3911 final int lower;
3912 SingleU(int lower) {
3913 this.lower = lower;
3914 }
3915 boolean isSatisfiedBy(int ch) {
3916 return lower == ch ||
3917 lower == Character.toLowerCase(Character.toUpperCase(ch));
3918 }
3919 }
3920
3921 /**
3922 * Node class that matches a Unicode block.
3923 */
3924 static final class Block extends CharProperty {
3925 final Character.UnicodeBlock block;
3926 Block(Character.UnicodeBlock block) {
3927 this.block = block;
3928 }
3929 boolean isSatisfiedBy(int ch) {
3930 return block == Character.UnicodeBlock.of(ch);
3931 }
3932 }
3933
3934 /**
3935 * Node class that matches a Unicode script
3936 */
3937 static final class Script extends CharProperty {
3938 final Character.UnicodeScript script;
3939 Script(Character.UnicodeScript script) {
3940 this.script = script;
3941 }
3942 boolean isSatisfiedBy(int ch) {
3943 return script == Character.UnicodeScript.of(ch);
3944 }
3945 }
3946
3947 /**
3948 * Node class that matches a Unicode category.
3949 */
3950 static final class Category extends CharProperty {
3951 final int typeMask;
3952 Category(int typeMask) { this.typeMask = typeMask; }
3953 boolean isSatisfiedBy(int ch) {
3954 return (typeMask & (1 << Character.getType(ch))) != 0;
3955 }
3956 }
3957
3958 /**
3959 * Node class that matches a Unicode "type"
3960 */
3961 static final class Utype extends CharProperty {
3962 final UnicodeProp uprop;
3963 Utype(UnicodeProp uprop) { this.uprop = uprop; }
3964 boolean isSatisfiedBy(int ch) {
3965 return uprop.is(ch);
3966 }
3967 }
3968
3969 /**
3970 * Node class that matches a POSIX type.
3971 */
3972 static final class Ctype extends BmpCharProperty {
3973 final int ctype;
3974 Ctype(int ctype) { this.ctype = ctype; }
3975 boolean isSatisfiedBy(int ch) {
3976 return ch < 128 && ASCII.isType(ch, ctype);
3977 }
3978 }
3979
3980 /**
3981 * Node class that matches a Perl vertical whitespace
3982 */
3983 static final class VertWS extends BmpCharProperty {
3984 boolean isSatisfiedBy(int cp) {
3985 return (cp >= 0x0A && cp <= 0x0D) ||
3986 cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987 }
3988 }
3989
3990 /**
3991 * Node class that matches a Perl horizontal whitespace
3992 */
3993 static final class HorizWS extends BmpCharProperty {
3994 boolean isSatisfiedBy(int cp) {
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996 cp == 0x1680 || cp == 0x180e ||
3997 cp >= 0x2000 && cp <= 0x200a ||
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999 }
4000 }
4001
4002 /**
4003 * Node class that matches an unicode extended grapheme cluster
4004 */
4005 static class XGrapheme extends Node {
4006 boolean match(Matcher matcher, int i, CharSequence seq) {
4007 if (i < matcher.to) {
4008 int ch0 = Character.codePointAt(seq, i);
4009 i += Character.charCount(ch0);
4010 while (i < matcher.to) {
4011 int ch1 = Character.codePointAt(seq, i);
4012 if (Grapheme.isBoundary(ch0, ch1))
4013 break;
4014 ch0 = ch1;
4015 i += Character.charCount(ch1);
4016 }
4017 return next.match(matcher, i, seq);
4018 }
4019 matcher.hitEnd = true;
4020 return false;
4021 }
4022
4200 return false;
4201 }
4202 }
4203 return next.match(matcher, x, seq);
4204 }
4205 }
4206
4207 /**
4208 * Node class for a case insensitive sequence of literal characters.
4209 * Uses unicode case folding.
4210 */
4211 static final class SliceUS extends SliceIS {
4212 SliceUS(int[] buf) {
4213 super(buf);
4214 }
4215 int toLower(int c) {
4216 return Character.toLowerCase(Character.toUpperCase(c));
4217 }
4218 }
4219
4220 private static boolean inRange(int lower, int ch, int upper) {
4221 return lower <= ch && ch <= upper;
4222 }
4223
4224 /**
4225 * Returns node for matching characters within an explicit value range.
4226 */
4227 private static CharProperty rangeFor(final int lower,
4228 final int upper) {
4229 return new CharProperty() {
4230 boolean isSatisfiedBy(int ch) {
4231 return inRange(lower, ch, upper);}};
4232 }
4233
4234 /**
4235 * Returns node for matching characters within an explicit value
4236 * range in a case insensitive manner.
4237 */
4238 private CharProperty caseInsensitiveRangeFor(final int lower,
4239 final int upper) {
4240 if (has(UNICODE_CASE))
4241 return new CharProperty() {
4242 boolean isSatisfiedBy(int ch) {
4243 if (inRange(lower, ch, upper))
4244 return true;
4245 int up = Character.toUpperCase(ch);
4246 return inRange(lower, up, upper) ||
4247 inRange(lower, Character.toLowerCase(up), upper);}};
4248 return new CharProperty() {
4249 boolean isSatisfiedBy(int ch) {
4250 return inRange(lower, ch, upper) ||
4251 ASCII.isAscii(ch) &&
4252 (inRange(lower, ASCII.toUpper(ch), upper) ||
4253 inRange(lower, ASCII.toLower(ch), upper));
4254 }};
4255 }
4256
4257 /**
4258 * Implements the Unicode category ALL and the dot metacharacter when
4259 * in dotall mode.
4260 */
4261 static final class All extends CharProperty {
4262 boolean isSatisfiedBy(int ch) {
4263 return true;
4264 }
4265 }
4266
4267 /**
4268 * Node class for the dot metacharacter when dotall is not enabled.
4269 */
4270 static final class Dot extends CharProperty {
4271 boolean isSatisfiedBy(int ch) {
4272 return (ch != '\n' && ch != '\r'
4273 && (ch|1) != '\u2029'
4274 && ch != '\u0085');
4275 }
4276 }
4277
4278 /**
4279 * Node class for the dot metacharacter when dotall is not enabled
4280 * but UNIX_LINES is enabled.
4281 */
4282 static final class UnixDot extends CharProperty {
4283 boolean isSatisfiedBy(int ch) {
4284 return ch != '\n';
4285 }
4286 }
4287
4288 /**
4289 * The 0 or 1 quantifier. This one class implements all three types.
4290 */
4291 static final class Ques extends Node {
4292 Node atom;
4293 int type;
4294 Ques(Node node, int type) {
4295 this.atom = node;
4296 this.type = type;
4297 }
4298 boolean match(Matcher matcher, int i, CharSequence seq) {
4299 switch (type) {
4300 case GREEDY:
4301 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4302 || next.match(matcher, i, seq);
4303 case LAZY:
4304 return next.match(matcher, i, seq)
4305 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4306 case POSSESSIVE:
4307 if (atom.match(matcher, i, seq)) i = matcher.last;
4308 return next.match(matcher, i, seq);
4309 default:
4310 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4311 }
4312 }
4313 boolean study(TreeInfo info) {
4314 if (type != INDEPENDENT) {
4315 int minL = info.minLength;
4316 atom.study(info);
4317 info.minLength = minL;
4318 info.deterministic = false;
4319 return next.study(info);
4320 } else {
4321 atom.study(info);
4322 return next.study(info);
4323 }
4324 }
4325 }
4326
4327 /**
4328 * Handles the curly-brace style repetition with a specified minimum and
4329 * maximum occurrences. The * quantifier is handled as a special case.
4330 * This class handles the three types.
4331 */
4332 static final class Curly extends Node {
4333 Node atom;
4334 int type;
4335 int cmin;
4336 int cmax;
4337
4338 Curly(Node node, int cmin, int cmax, int type) {
4339 this.atom = node;
4340 this.type = type;
4341 this.cmin = cmin;
4342 this.cmax = cmax;
4343 }
4344 boolean match(Matcher matcher, int i, CharSequence seq) {
4345 int j;
4346 for (j = 0; j < cmin; j++) {
4347 if (atom.match(matcher, i, seq)) {
4348 i = matcher.last;
4349 continue;
4350 }
4351 return false;
4352 }
4353 if (type == GREEDY)
4354 return match0(matcher, i, j, seq);
4355 else if (type == LAZY)
4356 return match1(matcher, i, j, seq);
4357 else
4358 return match2(matcher, i, j, seq);
4359 }
4360 // Greedy match.
4361 // i is the index to start matching at
4362 // j is the number of atoms that have matched
4363 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4364 if (j >= cmax) {
4365 // We have matched the maximum... continue with the rest of
4366 // the regular expression
4367 return next.match(matcher, i, seq);
4368 }
4369 int backLimit = j;
4370 while (atom.match(matcher, i, seq)) {
4371 // k is the length of this match
4372 int k = matcher.last - i;
4373 if (k == 0) // Zero length match
4374 break;
4375 // Move up index and number matched
4457 }
4458
4459 if (info.deterministic && cmin == cmax)
4460 info.deterministic = detm;
4461 else
4462 info.deterministic = false;
4463 return next.study(info);
4464 }
4465 }
4466
4467 /**
4468 * Handles the curly-brace style repetition with a specified minimum and
4469 * maximum occurrences in deterministic cases. This is an iterative
4470 * optimization over the Prolog and Loop system which would handle this
4471 * in a recursive way. The * quantifier is handled as a special case.
4472 * If capture is true then this class saves group settings and ensures
4473 * that groups are unset when backing off of a group match.
4474 */
4475 static final class GroupCurly extends Node {
4476 Node atom;
4477 int type;
4478 int cmin;
4479 int cmax;
4480 int localIndex;
4481 int groupIndex;
4482 boolean capture;
4483
4484 GroupCurly(Node node, int cmin, int cmax, int type, int local,
4485 int group, boolean capture) {
4486 this.atom = node;
4487 this.type = type;
4488 this.cmin = cmin;
4489 this.cmax = cmax;
4490 this.localIndex = local;
4491 this.groupIndex = group;
4492 this.capture = capture;
4493 }
4494 boolean match(Matcher matcher, int i, CharSequence seq) {
4495 int[] groups = matcher.groups;
4496 int[] locals = matcher.locals;
4497 int save0 = locals[localIndex];
4498 int save1 = 0;
4499 int save2 = 0;
4500
4501 if (capture) {
4502 save1 = groups[groupIndex];
4503 save2 = groups[groupIndex+1];
4504 }
4505
4506 // Notify GroupTail there is no need to setup group info
4507 // because it will be set here
4508 locals[localIndex] = -1;
4509
4510 boolean ret = true;
4511 for (int j = 0; j < cmin; j++) {
4512 if (atom.match(matcher, i, seq)) {
4513 if (capture) {
4514 groups[groupIndex] = i;
4515 groups[groupIndex+1] = matcher.last;
4516 }
4517 i = matcher.last;
4518 } else {
4519 ret = false;
4520 break;
4521 }
4522 }
4523 if (ret) {
4524 if (type == GREEDY) {
4525 ret = match0(matcher, i, cmin, seq);
4526 } else if (type == LAZY) {
4527 ret = match1(matcher, i, cmin, seq);
4528 } else {
4529 ret = match2(matcher, i, cmin, seq);
4530 }
4531 }
4532 if (!ret) {
4533 locals[localIndex] = save0;
4534 if (capture) {
4535 groups[groupIndex] = save1;
4536 groups[groupIndex+1] = save2;
4537 }
4538 }
4539 return ret;
4540 }
4541 // Aggressive group match
4542 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4543 // don't back off passing the starting "j"
4544 int min = j;
4545 int[] groups = matcher.groups;
4546 int save0 = 0;
4752
4753 info.minLength += minL;
4754 info.maxLength += maxL;
4755 info.maxValid &= maxV;
4756 info.deterministic = false;
4757 return false;
4758 }
4759 }
4760
4761 /**
4762 * The GroupHead saves the location where the group begins in the locals
4763 * and restores them when the match is done.
4764 *
4765 * The matchRef is used when a reference to this group is accessed later
4766 * in the expression. The locals will have a negative value in them to
4767 * indicate that we do not want to unset the group if the reference
4768 * doesn't match.
4769 */
4770 static final class GroupHead extends Node {
4771 int localIndex;
4772 GroupHead(int localCount) {
4773 localIndex = localCount;
4774 }
4775 boolean match(Matcher matcher, int i, CharSequence seq) {
4776 int save = matcher.locals[localIndex];
4777 matcher.locals[localIndex] = i;
4778 boolean ret = next.match(matcher, i, seq);
4779 matcher.locals[localIndex] = save;
4780 return ret;
4781 }
4782 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4783 int save = matcher.locals[localIndex];
4784 matcher.locals[localIndex] = ~i; // HACK
4785 boolean ret = next.match(matcher, i, seq);
4786 matcher.locals[localIndex] = save;
4787 return ret;
4788 }
4789 }
4790
4791 /**
5345 int startIndex = (!matcher.transparentBounds) ?
5346 matcher.from : 0;
5347 int from = Math.max(i - rmaxChars, startIndex);
5348 matcher.lookbehindTo = i;
5349 // Relax transparent region boundaries for lookbehind
5350 if (matcher.transparentBounds)
5351 matcher.from = 0;
5352 for (int j = i - rminChars;
5353 !conditionMatched && j >= from;
5354 j -= j>from ? countChars(seq, j, -1) : 1) {
5355 conditionMatched = cond.match(matcher, j, seq);
5356 }
5357 //Reinstate region boundaries
5358 matcher.from = savedFrom;
5359 matcher.lookbehindTo = savedLBT;
5360 return !conditionMatched && next.match(matcher, i, seq);
5361 }
5362 }
5363
5364 /**
5365 * Returns the set union of two CharProperty nodes.
5366 */
5367 private static CharProperty union(final CharProperty lhs,
5368 final CharProperty rhs) {
5369 return new CharProperty() {
5370 boolean isSatisfiedBy(int ch) {
5371 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
5372 }
5373
5374 /**
5375 * Returns the set intersection of two CharProperty nodes.
5376 */
5377 private static CharProperty intersection(final CharProperty lhs,
5378 final CharProperty rhs) {
5379 return new CharProperty() {
5380 boolean isSatisfiedBy(int ch) {
5381 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
5382 }
5383
5384 /**
5385 * Returns the set difference of two CharProperty nodes.
5386 */
5387 private static CharProperty setDifference(final CharProperty lhs,
5388 final CharProperty rhs) {
5389 return new CharProperty() {
5390 boolean isSatisfiedBy(int ch) {
5391 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
5392 }
5393
5394 /**
5395 * Handles word boundaries. Includes a field to allow this one class to
5396 * deal with the different types of word boundaries we can match. The word
5397 * characters include underscores, letters, and digits. Non spacing marks
5398 * can are also part of a word if they have a base character, otherwise
5399 * they are ignored for purposes of finding word boundaries.
5400 */
5401 static final class Bound extends Node {
5402 static int LEFT = 0x1;
5403 static int RIGHT= 0x2;
5404 static int BOTH = 0x3;
5405 static int NONE = 0x4;
5406 int type;
5407 boolean useUWORD;
5408 Bound(int n, boolean useUWORD) {
5409 type = n;
5410 this.useUWORD = useUWORD;
5411 }
5412
5413 boolean isWord(int ch) {
5414 return useUWORD ? UnicodeProp.WORD.is(ch)
5415 : (ch == '_' || Character.isLetterOrDigit(ch));
5416 }
5417
5418 int check(Matcher matcher, int i, CharSequence seq) {
5419 int ch;
5420 boolean left = false;
5421 int startIndex = matcher.from;
5422 int endIndex = matcher.to;
5423 if (matcher.transparentBounds) {
5424 startIndex = 0;
5425 endIndex = matcher.getTextLength();
5426 }
5427 if (i > startIndex) {
5428 ch = Character.codePointBefore(seq, i);
5429 left = (isWord(ch) ||
5430 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5431 && hasBaseCharacter(matcher, i-1, seq)));
5432 }
5433 boolean right = false;
5434 if (i < endIndex) {
5640 i += countChars(seq, i, n);
5641 continue NEXT;
5642 }
5643 }
5644 // Entire pattern matched starting at i
5645 matcher.first = i;
5646 boolean ret = next.match(matcher, i + lengthInChars, seq);
5647 if (ret) {
5648 matcher.first = i;
5649 matcher.groups[0] = matcher.first;
5650 matcher.groups[1] = matcher.last;
5651 return true;
5652 }
5653 i += countChars(seq, i, 1);
5654 }
5655 matcher.hitEnd = true;
5656 return false;
5657 }
5658 }
5659
5660 ///////////////////////////////////////////////////////////////////////////////
5661 ///////////////////////////////////////////////////////////////////////////////
5662
5663 /**
5664 * This must be the very first initializer.
5665 */
5666 static Node accept = new Node();
5667
5668 static Node lastAccept = new LastNode();
5669
5670 private static class CharPropertyNames {
5671
5672 static CharProperty charPropertyFor(String name) {
5673 CharPropertyFactory m = map.get(name);
5674 return m == null ? null : m.make();
5675 }
5676
5677 private abstract static class CharPropertyFactory {
5678 abstract CharProperty make();
5679 }
5680
5681 private static void defCategory(String name,
5682 final int typeMask) {
5683 map.put(name, new CharPropertyFactory() {
5684 CharProperty make() { return new Category(typeMask);}});
5685 }
5686
5687 private static void defRange(String name,
5688 final int lower, final int upper) {
5689 map.put(name, new CharPropertyFactory() {
5690 CharProperty make() { return rangeFor(lower, upper);}});
5691 }
5692
5693 private static void defCtype(String name,
5694 final int ctype) {
5695 map.put(name, new CharPropertyFactory() {
5696 CharProperty make() { return new Ctype(ctype);}});
5697 }
5698
5699 private abstract static class CloneableProperty
5700 extends CharProperty implements Cloneable
5701 {
5702 public CloneableProperty clone() {
5703 try {
5704 return (CloneableProperty) super.clone();
5705 } catch (CloneNotSupportedException e) {
5706 throw new AssertionError(e);
5707 }
5708 }
5709 }
5710
5711 private static void defClone(String name,
5712 final CloneableProperty p) {
5713 map.put(name, new CharPropertyFactory() {
5714 CharProperty make() { return p.clone();}});
5715 }
5716
5717 private static final HashMap<String, CharPropertyFactory> map
5718 = new HashMap<>();
5719
5720 static {
5721 // Unicode character property aliases, defined in
5722 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5723 defCategory("Cn", 1<<Character.UNASSIGNED);
5724 defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
5725 defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
5726 defCategory("Lt", 1<<Character.TITLECASE_LETTER);
5727 defCategory("Lm", 1<<Character.MODIFIER_LETTER);
5728 defCategory("Lo", 1<<Character.OTHER_LETTER);
5729 defCategory("Mn", 1<<Character.NON_SPACING_MARK);
5730 defCategory("Me", 1<<Character.ENCLOSING_MARK);
5731 defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
5732 defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
5733 defCategory("Nl", 1<<Character.LETTER_NUMBER);
5734 defCategory("No", 1<<Character.OTHER_NUMBER);
5735 defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
5736 defCategory("Zl", 1<<Character.LINE_SEPARATOR);
5737 defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
5738 defCategory("Cc", 1<<Character.CONTROL);
5739 defCategory("Cf", 1<<Character.FORMAT);
5740 defCategory("Co", 1<<Character.PRIVATE_USE);
5741 defCategory("Cs", 1<<Character.SURROGATE);
5742 defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
5743 defCategory("Ps", 1<<Character.START_PUNCTUATION);
5744 defCategory("Pe", 1<<Character.END_PUNCTUATION);
5745 defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
5746 defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
5747 defCategory("Sm", 1<<Character.MATH_SYMBOL);
5748 defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
5749 defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
5750 defCategory("So", 1<<Character.OTHER_SYMBOL);
5751 defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
5752 defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
5753 defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
5754 (1<<Character.LOWERCASE_LETTER) |
5755 (1<<Character.TITLECASE_LETTER) |
5756 (1<<Character.MODIFIER_LETTER) |
5757 (1<<Character.OTHER_LETTER)));
5758 defCategory("M", ((1<<Character.NON_SPACING_MARK) |
5759 (1<<Character.ENCLOSING_MARK) |
5760 (1<<Character.COMBINING_SPACING_MARK)));
5761 defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
5762 (1<<Character.LETTER_NUMBER) |
5763 (1<<Character.OTHER_NUMBER)));
5764 defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
5765 (1<<Character.LINE_SEPARATOR) |
5766 (1<<Character.PARAGRAPH_SEPARATOR)));
5767 defCategory("C", ((1<<Character.CONTROL) |
5768 (1<<Character.FORMAT) |
5769 (1<<Character.PRIVATE_USE) |
5770 (1<<Character.SURROGATE))); // Other
5771 defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
5772 (1<<Character.START_PUNCTUATION) |
5773 (1<<Character.END_PUNCTUATION) |
5774 (1<<Character.CONNECTOR_PUNCTUATION) |
5775 (1<<Character.OTHER_PUNCTUATION) |
5776 (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
5777 (1<<Character.FINAL_QUOTE_PUNCTUATION)));
5778 defCategory("S", ((1<<Character.MATH_SYMBOL) |
5779 (1<<Character.CURRENCY_SYMBOL) |
5780 (1<<Character.MODIFIER_SYMBOL) |
5781 (1<<Character.OTHER_SYMBOL)));
5782 defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
5783 (1<<Character.LOWERCASE_LETTER) |
5784 (1<<Character.TITLECASE_LETTER)));
5785 defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
5786 (1<<Character.LOWERCASE_LETTER) |
5787 (1<<Character.TITLECASE_LETTER) |
5788 (1<<Character.MODIFIER_LETTER) |
5789 (1<<Character.OTHER_LETTER) |
5790 (1<<Character.DECIMAL_DIGIT_NUMBER)));
5791 defRange("L1", 0x00, 0xFF); // Latin-1
5792 map.put("all", new CharPropertyFactory() {
5793 CharProperty make() { return new All(); }});
5794
5795 // Posix regular expression character classes, defined in
5796 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5797 defRange("ASCII", 0x00, 0x7F); // ASCII
5798 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5799 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5800 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5801 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5802 defRange("Digit", '0', '9'); // Numeric characters
5803 defCtype("Graph", ASCII.GRAPH); // printable and visible
5804 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5805 defRange("Print", 0x20, 0x7E); // Printable characters
5806 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5807 defCtype("Space", ASCII.SPACE); // Space characters
5808 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5809 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5810
5811 // Java character properties, defined by methods in Character.java
5812 defClone("javaLowerCase", new CloneableProperty() {
5813 boolean isSatisfiedBy(int ch) {
5814 return Character.isLowerCase(ch);}});
5815 defClone("javaUpperCase", new CloneableProperty() {
5816 boolean isSatisfiedBy(int ch) {
5817 return Character.isUpperCase(ch);}});
5818 defClone("javaAlphabetic", new CloneableProperty() {
5819 boolean isSatisfiedBy(int ch) {
5820 return Character.isAlphabetic(ch);}});
5821 defClone("javaIdeographic", new CloneableProperty() {
5822 boolean isSatisfiedBy(int ch) {
5823 return Character.isIdeographic(ch);}});
5824 defClone("javaTitleCase", new CloneableProperty() {
5825 boolean isSatisfiedBy(int ch) {
5826 return Character.isTitleCase(ch);}});
5827 defClone("javaDigit", new CloneableProperty() {
5828 boolean isSatisfiedBy(int ch) {
5829 return Character.isDigit(ch);}});
5830 defClone("javaDefined", new CloneableProperty() {
5831 boolean isSatisfiedBy(int ch) {
5832 return Character.isDefined(ch);}});
5833 defClone("javaLetter", new CloneableProperty() {
5834 boolean isSatisfiedBy(int ch) {
5835 return Character.isLetter(ch);}});
5836 defClone("javaLetterOrDigit", new CloneableProperty() {
5837 boolean isSatisfiedBy(int ch) {
5838 return Character.isLetterOrDigit(ch);}});
5839 defClone("javaJavaIdentifierStart", new CloneableProperty() {
5840 boolean isSatisfiedBy(int ch) {
5841 return Character.isJavaIdentifierStart(ch);}});
5842 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5843 boolean isSatisfiedBy(int ch) {
5844 return Character.isJavaIdentifierPart(ch);}});
5845 defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
5846 boolean isSatisfiedBy(int ch) {
5847 return Character.isUnicodeIdentifierStart(ch);}});
5848 defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
5849 boolean isSatisfiedBy(int ch) {
5850 return Character.isUnicodeIdentifierPart(ch);}});
5851 defClone("javaIdentifierIgnorable", new CloneableProperty() {
5852 boolean isSatisfiedBy(int ch) {
5853 return Character.isIdentifierIgnorable(ch);}});
5854 defClone("javaSpaceChar", new CloneableProperty() {
5855 boolean isSatisfiedBy(int ch) {
5856 return Character.isSpaceChar(ch);}});
5857 defClone("javaWhitespace", new CloneableProperty() {
5858 boolean isSatisfiedBy(int ch) {
5859 return Character.isWhitespace(ch);}});
5860 defClone("javaISOControl", new CloneableProperty() {
5861 boolean isSatisfiedBy(int ch) {
5862 return Character.isISOControl(ch);}});
5863 defClone("javaMirrored", new CloneableProperty() {
5864 boolean isSatisfiedBy(int ch) {
5865 return Character.isMirrored(ch);}});
5866 }
5867 }
5868
5869 /**
5870 * Creates a predicate which can be used to match a string.
5871 *
5872 * @return The predicate which can be used for matching on a string
5873 * @since 1.8
5874 */
5875 public Predicate<String> asPredicate() {
5876 return s -> matcher(s).find();
5877 }
5878
5879 /**
5880 * Creates a stream from the given input sequence around matches of this
5881 * pattern.
5882 *
5883 * <p> The stream returned by this method contains each substring of the
5884 * input sequence that is terminated by another subsequence that matches
5885 * this pattern or is terminated by the end of the input sequence. The
5886 * substrings in the stream are in the order in which they occur in the
5887 * input. Trailing empty strings will be discarded and not encountered in
5888 * the stream.
5889 *
|
967
968 /**
969 * The starting point of state machine for the find operation. This allows
970 * a match to start anywhere in the input.
971 */
972 transient Node root;
973
974 /**
975 * The root of object tree for a match operation. The pattern is matched
976 * at the beginning. This may include a find that uses BnM or a First
977 * node.
978 */
979 transient Node matchRoot;
980
981 /**
982 * Temporary storage used by parsing pattern slice.
983 */
984 transient int[] buffer;
985
986 /**
987 * A temporary storage used for predicate for double return.
988 */
989 transient CharPredicate predicate;
990
991 /**
992 * Map the "name" of the "named capturing group" to its group id
993 * node.
994 */
995 transient volatile Map<String, Integer> namedGroups;
996
997 /**
998 * Temporary storage used while parsing group references.
999 */
1000 transient GroupHead[] groupNodes;
1001
1002 /**
1003 * Temporary null terminated code point array used by pattern compiling.
1004 */
1005 private transient int[] temp;
1006
1007 /**
1008 * The number of capturing groups in this Pattern. Used by matchers to
1009 * allocate storage needed to perform a match.
1010 */
1011 transient int capturingGroupCount;
1014 * The local variable count used by parsing tree. Used by matchers to
1015 * allocate storage needed to perform a match.
1016 */
1017 transient int localCount;
1018
1019 /**
1020 * Index into the pattern string that keeps track of how much has been
1021 * parsed.
1022 */
1023 private transient int cursor;
1024
1025 /**
1026 * Holds the length of the pattern string.
1027 */
1028 private transient int patternLength;
1029
1030 /**
1031 * If the Start node might possibly match supplementary characters.
1032 * It is set to true during compiling if
1033 * (1) There is supplementary char in pattern, or
1034 * (2) There is complement node of a "family" CharProperty
1035 */
1036 private transient boolean hasSupplementary;
1037
1038 /**
1039 * Compiles the given regular expression into a pattern.
1040 *
1041 * @param regex
1042 * The expression to be compiled
1043 * @return the given regular expression compiled into a pattern
1044 * @throws PatternSyntaxException
1045 * If the expression's syntax is invalid
1046 */
1047 public static Pattern compile(String regex) {
1048 return new Pattern(regex, 0);
1049 }
1050
1051 /**
1052 * Compiles the given regular expression into a pattern with the given
1053 * flags.
1054 *
1742 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1743 }
1744
1745 // Release temporary storage
1746 temp = null;
1747 buffer = null;
1748 groupNodes = null;
1749 patternLength = 0;
1750 compiled = true;
1751 }
1752
1753 Map<String, Integer> namedGroups() {
1754 Map<String, Integer> groups = namedGroups;
1755 if (groups == null) {
1756 namedGroups = groups = new HashMap<>(2);
1757 }
1758 return groups;
1759 }
1760
1761 /**
1762 * Used to accumulate information about a subtree of the object graph
1763 * so that optimizations can be applied to the subtree.
1764 */
1765 static final class TreeInfo {
1766 int minLength;
1767 int maxLength;
1768 boolean maxValid;
1769 boolean deterministic;
1770
1771 TreeInfo() {
1772 reset();
1773 }
1774 void reset() {
1775 minLength = 0;
1776 maxLength = 0;
1777 maxValid = true;
1778 deterministic = true;
1779 }
1780 }
1781
2033 Node node = null;
2034 LOOP:
2035 for (;;) {
2036 int ch = peek();
2037 switch (ch) {
2038 case '(':
2039 // Because group handles its own closure,
2040 // we need to treat it differently
2041 node = group0();
2042 // Check for comment or flag group
2043 if (node == null)
2044 continue;
2045 if (head == null)
2046 head = node;
2047 else
2048 tail.next = node;
2049 // Double return: Tail was returned in root
2050 tail = root;
2051 continue;
2052 case '[':
2053 node = newCharProperty(clazz(true));
2054 break;
2055 case '\\':
2056 ch = nextEscaped();
2057 if (ch == 'p' || ch == 'P') {
2058 boolean oneLetter = true;
2059 boolean comp = (ch == 'P');
2060 ch = next(); // Consume { if present
2061 if (ch != '{') {
2062 unread();
2063 } else {
2064 oneLetter = false;
2065 }
2066 node = newCharProperty(family(oneLetter, comp));
2067 } else {
2068 unread();
2069 node = atom();
2070 }
2071 break;
2072 case '^':
2073 next();
2074 if (has(MULTILINE)) {
2075 if (has(UNIX_LINES))
2076 node = new UnixCaret();
2077 else
2078 node = new Caret();
2079 } else {
2080 node = new Begin();
2081 }
2082 break;
2083 case '$':
2084 next();
2085 if (has(UNIX_LINES))
2086 node = new UnixDollar(has(MULTILINE));
2087 else
2088 node = new Dollar(has(MULTILINE));
2089 break;
2090 case '.':
2091 next();
2092 if (has(DOTALL)) {
2093 node = new CharProperty(ALL);
2094 } else {
2095 if (has(UNIX_LINES)) {
2096 node = new CharProperty(UNIXDOT);
2097 } else {
2098 node = new CharProperty(DOT);
2099 }
2100 }
2101 break;
2102 case '|':
2103 case ')':
2104 break LOOP;
2105 case ']': // Now interpreting dangling ] and } as literals
2106 case '}':
2107 node = atom();
2108 break;
2109 case '?':
2110 case '*':
2111 case '+':
2112 next();
2113 throw error("Dangling meta character '" + ((char)ch) + "'");
2114 case 0:
2115 if (cursor >= patternLength) {
2116 break LOOP;
2117 }
2118 // Fall through
2119 default:
2120 node = atom();
2121 break;
2122 }
2123
2124 node = closure(node);
2125 if (head == null) {
2126 head = tail = node;
2127 } else {
2128 tail.next = node;
2129 tail = node;
2130 }
2131 }
2132 if (head == null) {
2133 return end;
2134 }
2135 tail.next = end;
2136 root = tail; //double return
2137 return head;
2138 }
2139
2140 @SuppressWarnings("fallthrough")
2141 /**
2142 * Parse and add a new Single or Slice.
2143 */
2144 private Node atom() {
2162 case '^':
2163 case '(':
2164 case '[':
2165 case '|':
2166 case ')':
2167 break;
2168 case '\\':
2169 ch = nextEscaped();
2170 if (ch == 'p' || ch == 'P') { // Property
2171 if (first > 0) { // Slice is waiting; handle it first
2172 unread();
2173 break;
2174 } else { // No slice; just return the family node
2175 boolean comp = (ch == 'P');
2176 boolean oneLetter = true;
2177 ch = next(); // Consume { if present
2178 if (ch != '{')
2179 unread();
2180 else
2181 oneLetter = false;
2182 return newCharProperty(family(oneLetter, comp));
2183 }
2184 }
2185 unread();
2186 prev = cursor;
2187 ch = escape(false, first == 0, false);
2188 if (ch >= 0) {
2189 append(ch, first);
2190 first++;
2191 if (isSupplementary(ch)) {
2192 hasSupplementary = true;
2193 }
2194 ch = peek();
2195 continue;
2196 } else if (first == 0) {
2197 return root;
2198 }
2199 // Unwind meta escape sequence
2200 cursor = prev;
2201 break;
2202 case 0:
2203 if (cursor >= patternLength) {
2204 break;
2205 }
2206 // Fall through
2207 default:
2208 prev = cursor;
2209 append(ch, first);
2210 first++;
2211 if (isSupplementary(ch)) {
2212 hasSupplementary = true;
2213 }
2214 ch = next();
2215 continue;
2216 }
2217 break;
2218 }
2219 if (first == 1) {
2220 return newCharProperty(single(buffer[0]));
2221 } else {
2222 return newSlice(buffer, first, hasSupplementary);
2223 }
2224 }
2225
2226 private void append(int ch, int len) {
2227 if (len >= buffer.length) {
2228 int[] tmp = new int[len+len];
2229 System.arraycopy(buffer, 0, tmp, 0, len);
2230 buffer = tmp;
2231 }
2232 buffer[len] = ch;
2233 }
2234
2235 /**
2236 * Parses a backref greedily, taking as many numbers as it
2237 * can. The first digit is always treated as a backref, but
2238 * multi digit numbers are only treated as a backref if at
2239 * least that many backrefs exist at this point in the regex.
2240 */
2295 case '6':
2296 case '7':
2297 case '8':
2298 case '9':
2299 if (inclass) break;
2300 if (create) {
2301 root = ref((ch - '0'));
2302 }
2303 return -1;
2304 case 'A':
2305 if (inclass) break;
2306 if (create) root = new Begin();
2307 return -1;
2308 case 'B':
2309 if (inclass) break;
2310 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2311 return -1;
2312 case 'C':
2313 break;
2314 case 'D':
2315 if (create) {
2316 predicate = has(UNICODE_CHARACTER_CLASS) ?
2317 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2318 predicate = predicate.negate();
2319 if (!inclass)
2320 root = newCharProperty(predicate);
2321 }
2322 return -1;
2323 case 'E':
2324 case 'F':
2325 break;
2326 case 'G':
2327 if (inclass) break;
2328 if (create) root = new LastMatch();
2329 return -1;
2330 case 'H':
2331 if (create) {
2332 predicate = HorizWS.negate();
2333 if (!inclass)
2334 root = newCharProperty(predicate);
2335 }
2336 return -1;
2337 case 'I':
2338 case 'J':
2339 case 'K':
2340 case 'L':
2341 case 'M':
2342 break;
2343 case 'N':
2344 return N();
2345 case 'O':
2346 case 'P':
2347 case 'Q':
2348 break;
2349 case 'R':
2350 if (inclass) break;
2351 if (create) root = new LineEnding();
2352 return -1;
2353 case 'S':
2354 if (create) {
2355 predicate = has(UNICODE_CHARACTER_CLASS) ?
2356 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2357 predicate = predicate.negate();
2358 if (!inclass)
2359 root = newCharProperty(predicate);
2360 }
2361 return -1;
2362 case 'T':
2363 case 'U':
2364 break;
2365 case 'V':
2366 if (create) {
2367 predicate = VertWS.negate();
2368 if (!inclass)
2369 root = newCharProperty(predicate);
2370 }
2371 return -1;
2372 case 'W':
2373 if (create) {
2374 predicate = has(UNICODE_CHARACTER_CLASS) ?
2375 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2376 predicate = predicate.negate();
2377 if (!inclass)
2378 root = newCharProperty(predicate);
2379 }
2380 return -1;
2381 case 'X':
2382 if (inclass) break;
2383 if (create) {
2384 root = new XGrapheme();
2385 }
2386 return -1;
2387 case 'Y':
2388 break;
2389 case 'Z':
2390 if (inclass) break;
2391 if (create) {
2392 if (has(UNIX_LINES))
2393 root = new UnixDollar(false);
2394 else
2395 root = new Dollar(false);
2396 }
2397 return -1;
2398 case 'a':
2399 return '\007';
2400 case 'b':
2401 if (inclass) break;
2402 if (create) {
2403 if (peek() == '{') {
2404 if (skip() == 'g') {
2405 if (read() == '}') {
2406 root = new GraphemeBound();
2407 return -1;
2408 }
2409 break; // error missing trailing }
2410 }
2411 unread(); unread();
2412 }
2413 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2414 }
2415 return -1;
2416 case 'c':
2417 return c();
2418 case 'd':
2419 if (create) {
2420 predicate = has(UNICODE_CHARACTER_CLASS) ?
2421 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2422 if (!inclass)
2423 root = newCharProperty(predicate);
2424 }
2425 return -1;
2426 case 'e':
2427 return '\033';
2428 case 'f':
2429 return '\f';
2430 case 'g':
2431 break;
2432 case 'h':
2433 if (create) {
2434 predicate = HorizWS;
2435 if (!inclass)
2436 root = newCharProperty(predicate);
2437 }
2438 return -1;
2439 case 'i':
2440 case 'j':
2441 break;
2442 case 'k':
2443 if (inclass)
2444 break;
2445 if (read() != '<')
2446 throw error("\\k is not followed by '<' for named capturing group");
2447 String name = groupname(read());
2448 if (!namedGroups().containsKey(name))
2449 throw error("(named capturing group <"+ name+"> does not exit");
2450 if (create) {
2451 if (has(CASE_INSENSITIVE))
2452 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2453 else
2454 root = new BackRef(namedGroups().get(name));
2455 }
2456 return -1;
2457 case 'l':
2458 case 'm':
2459 break;
2460 case 'n':
2461 return '\n';
2462 case 'o':
2463 case 'p':
2464 case 'q':
2465 break;
2466 case 'r':
2467 return '\r';
2468 case 's':
2469 if (create) {
2470 predicate = has(UNICODE_CHARACTER_CLASS) ?
2471 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2472 if (!inclass)
2473 root = newCharProperty(predicate);
2474 }
2475 return -1;
2476 case 't':
2477 return '\t';
2478 case 'u':
2479 return u();
2480 case 'v':
2481 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2482 // undocumented). In JDK8 '\v' is specified as a predefined
2483 // character class for all vertical whitespace characters.
2484 // So [-1, root=VertWS node] pair is returned (instead of a
2485 // single 0x0B). This breaks the range if '\v' is used as
2486 // the start or end value, such as [\v-...] or [...-\v], in
2487 // which a single definite value (0x0B) is expected. For
2488 // compatibility concern '\013'/0x0B is returned if isrange.
2489 if (isrange)
2490 return '\013';
2491 if (create) {
2492 predicate = VertWS;
2493 if (!inclass)
2494 root = newCharProperty(predicate);
2495 }
2496 return -1;
2497 case 'w':
2498 if (create) {
2499 predicate = has(UNICODE_CHARACTER_CLASS) ?
2500 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2501 if (!inclass)
2502 root = newCharProperty(predicate);
2503 }
2504 return -1;
2505 case 'x':
2506 return x();
2507 case 'y':
2508 break;
2509 case 'z':
2510 if (inclass) break;
2511 if (create) root = new End();
2512 return -1;
2513 default:
2514 return ch;
2515 }
2516 throw error("Illegal/unsupported escape sequence");
2517 }
2518
2519 /**
2520 * Parse a character class, and return the node that matches it.
2521 *
2522 * Consumes a ] on the way out if consume is true. Usually consume
2523 * is true except for the case of [abc&&def] where def is a separate
2524 * right hand node with "understood" brackets.
2525 */
2526 private CharPredicate clazz(boolean consume) {
2527 CharPredicate prev = null;
2528 CharPredicate curr = null;
2529 BitClass bits = new BitClass();
2530 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
2531
2532 boolean isNeg = false;
2533 boolean hasBits = false;
2534 int ch = next();
2535
2536 // Negates if first char in a class, otherwise literal
2537 if (ch == '^' && temp[cursor-1] == '[') {
2538 ch = next();
2539 isNeg = true;
2540 }
2541 for (;;) {
2542 switch (ch) {
2543 case '[':
2544 curr = clazz(true);
2545 if (prev == null)
2546 prev = curr;
2547 else
2548 prev = prev.union(curr);
2549 ch = peek();
2550 continue;
2551 case '&':
2552 ch = next();
2553 if (ch == '&') {
2554 ch = next();
2555 CharPredicate right = null;
2556 while (ch != ']' && ch != '&') {
2557 if (ch == '[') {
2558 if (right == null)
2559 right = clazz(true);
2560 else
2561 right = right.union(clazz(true));
2562 } else { // abc&&def
2563 unread();
2564 right = clazz(false);
2565 }
2566 ch = peek();
2567 }
2568 if (hasBits) {
2569 // bits used, union has high precedence
2570 if (prev == null) {
2571 prev = curr = bitsP;
2572 } else {
2573 prev = prev.union(bitsP);
2574 }
2575 hasBits = false;
2576 }
2577 if (right != null)
2578 curr = right;
2579 if (prev == null) {
2580 if (right == null)
2581 throw error("Bad class syntax");
2582 else
2583 prev = right;
2584 } else {
2585 prev = prev.and(curr);
2586 }
2587 } else {
2588 // treat as a literal &
2589 unread();
2590 break;
2591 }
2592 continue;
2593 case 0:
2594 if (cursor >= patternLength)
2595 throw error("Unclosed character class");
2596 break;
2597 case ']':
2598 if (prev != null || hasBits) {
2599 if (consume)
2600 next();
2601 if (prev == null)
2602 prev = bitsP;
2603 else if (hasBits)
2604 prev = prev.union(bitsP);
2605 if (isNeg)
2606 return prev.negate();
2607 return prev;
2608 }
2609 break;
2610 default:
2611 break;
2612 }
2613 curr = range(bits);
2614 if (curr == null) { // the bits used
2615 hasBits = true;
2616 } else {
2617 if (prev == null)
2618 prev = curr;
2619 else if (prev != curr)
2620 prev = prev.union(curr);
2621 }
2622 ch = peek();
2623 }
2624 }
2625
2626 private CharPredicate bitsOrSingle(BitClass bits, int ch) {
2627 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2628 Use "single" node instead of bits when dealing with unicode
2629 case folding for codepoints listed below.
2630 (1)Uppercase out of range: u+00ff, u+00b5
2631 toUpperCase(u+00ff) -> u+0178
2632 toUpperCase(u+00b5) -> u+039c
2633 (2)LatinSmallLetterLongS u+17f
2634 toUpperCase(u+017f) -> u+0053
2635 (3)LatinSmallLetterDotlessI u+131
2636 toUpperCase(u+0131) -> u+0049
2637 (4)LatinCapitalLetterIWithDotAbove u+0130
2638 toLowerCase(u+0130) -> u+0069
2639 (5)KelvinSign u+212a
2640 toLowerCase(u+212a) ==> u+006B
2641 (6)AngstromSign u+212b
2642 toLowerCase(u+212b) ==> u+00e5
2643 */
2644 int d;
2645 if (ch < 256 &&
2646 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2647 (ch == 0xff || ch == 0xb5 ||
2648 ch == 0x49 || ch == 0x69 || //I and i
2649 ch == 0x53 || ch == 0x73 || //S and s
2650 ch == 0x4b || ch == 0x6b || //K and k
2651 ch == 0xc5 || ch == 0xe5))) { //A+ring {
2652 bits.add(ch, flags());
2653 return null;
2654 }
2655 return single(ch);
2656 }
2657
2658 /**
2659 * Returns a suitably optimized, single character predicate
2660 */
2661 private CharPredicate single(final int ch) {
2662 if (has(CASE_INSENSITIVE)) {
2663 int lower, upper;
2664 if (has(UNICODE_CASE)) {
2665 upper = Character.toUpperCase(ch);
2666 lower = Character.toLowerCase(upper);
2667 // Unicode case insensitive matches
2668 if (upper != lower)
2669 return SingleU(lower);
2670 } else if (ASCII.isAscii(ch)) {
2671 lower = ASCII.toLower(ch);
2672 upper = ASCII.toUpper(ch);
2673 // Case insensitive matches a given BMP character
2674 if (lower != upper)
2675 return SingleI(lower, upper);
2676 }
2677 }
2678 if (isSupplementary(ch))
2679 return SingleS(ch);
2680 return Single(ch); // Match a given BMP character
2681 }
2682
2683 /**
2684 * Parse a single character or a character range in a character class
2685 * and return its representative node.
2686 */
2687 private CharPredicate range(BitClass bits) {
2688 int ch = peek();
2689 if (ch == '\\') {
2690 ch = nextEscaped();
2691 if (ch == 'p' || ch == 'P') { // A property
2692 boolean comp = (ch == 'P');
2693 boolean oneLetter = true;
2694 // Consume { if present
2695 ch = next();
2696 if (ch != '{')
2697 unread();
2698 else
2699 oneLetter = false;
2700 return family(oneLetter, comp);
2701 } else { // ordinary escape
2702 boolean isrange = temp[cursor+1] == '-';
2703 unread();
2704 ch = escape(true, true, isrange);
2705 if (ch == -1)
2706 return predicate;
2707 }
2708 } else {
2709 next();
2710 }
2711 if (ch >= 0) {
2712 if (peek() == '-') {
2713 int endRange = temp[cursor+1];
2714 if (endRange == '[') {
2715 return bitsOrSingle(bits, ch);
2716 }
2717 if (endRange != ']') {
2718 next();
2719 int m = peek();
2720 if (m == '\\') {
2721 m = escape(true, false, true);
2722 } else {
2723 next();
2724 }
2725 if (m < ch) {
2726 throw error("Illegal character range");
2727 }
2728 if (has(CASE_INSENSITIVE)) {
2729 if (has(UNICODE_CASE))
2730 return CIRangeU(ch, m);
2731 return CIRange(ch, m);
2732 } else {
2733 return Range(ch, m);
2734 }
2735 }
2736 }
2737 return bitsOrSingle(bits, ch);
2738 }
2739 throw error("Unexpected character '"+((char)ch)+"'");
2740 }
2741
2742 /**
2743 * Parses a Unicode character family and returns its representative node.
2744 */
2745 private CharPredicate family(boolean singleLetter,
2746 boolean isComplement)
2747 {
2748 next();
2749 String name;
2750 CharPredicate p = null;
2751
2752 if (singleLetter) {
2753 int c = temp[cursor];
2754 if (!Character.isSupplementaryCodePoint(c)) {
2755 name = String.valueOf((char)c);
2756 } else {
2757 name = new String(temp, cursor, 1);
2758 }
2759 read();
2760 } else {
2761 int i = cursor;
2762 mark('}');
2763 while(read() != '}') {
2764 }
2765 mark('\000');
2766 int j = cursor;
2767 if (j > patternLength)
2768 throw error("Unclosed character family");
2769 if (i + 1 >= j)
2770 throw error("Empty character family");
2771 name = new String(temp, i, j-i-1);
2772 }
2773
2774 int i = name.indexOf('=');
2775 if (i != -1) {
2776 // property construct \p{name=value}
2777 String value = name.substring(i + 1);
2778 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2779 switch (name) {
2780 case "sc":
2781 case "script":
2782 p = CharPredicates.forUnicodeScript(value);
2783 break;
2784 case "blk":
2785 case "block":
2786 p = CharPredicates.forUnicodeBlock(value);
2787 break;
2788 case "gc":
2789 case "general_category":
2790 p = CharPredicates.forProperty(value);
2791 break;
2792 default:
2793 break;
2794 }
2795 if (p == null)
2796 throw error("Unknown Unicode property {name=<" + name + ">, "
2797 + "value=<" + value + ">}");
2798
2799 } else {
2800 if (name.startsWith("In")) {
2801 // \p{InBlockName}
2802 p = CharPredicates.forUnicodeBlock(name.substring(2));
2803 } else if (name.startsWith("Is")) {
2804 // \p{IsGeneralCategory} and \p{IsScriptName}
2805 name = name.substring(2);
2806 p = CharPredicates.forUnicodeProperty(name);
2807 if (p == null)
2808 p = CharPredicates.forProperty(name);
2809 if (p == null)
2810 p = CharPredicates.forUnicodeScript(name);
2811 } else {
2812 if (has(UNICODE_CHARACTER_CLASS)) {
2813 p = CharPredicates.forPOSIXName(name);
2814 }
2815 if (p == null)
2816 p = CharPredicates.forProperty(name);
2817 }
2818 if (p == null)
2819 throw error("Unknown character property name {In/Is" + name + "}");
2820 }
2821 if (isComplement) {
2822 // it might be too expensive to detect if a complement of
2823 // CharProperty can match "certain" supplementary. So just
2824 // go with StartS.
2825 hasSupplementary = true;
2826 p = p.negate();
2827 }
2828 return p;
2829 }
2830
2831 private CharProperty newCharProperty(CharPredicate p) {
2832 if (p == null)
2833 return null;
2834 if (p instanceof BmpCharPredicate)
2835 return new BmpCharProperty((BmpCharPredicate)p);
2836 else
2837 return new CharProperty(p);
2838 }
2839
2840 /**
2841 * Parses and returns the name of a "named capturing group", the trailing
2842 * ">" is consumed after parsing.
2843 */
2844 private String groupname(int ch) {
2845 StringBuilder sb = new StringBuilder();
2846 sb.append(Character.toChars(ch));
2847 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2848 ASCII.isDigit(ch)) {
2849 sb.append(Character.toChars(ch));
2850 }
2851 if (sb.length() == 0)
2852 throw error("named capturing group has 0 length name");
2853 if (ch != '>')
2854 throw error("named capturing group is missing trailing '>'");
2855 return sb.toString();
2856 }
2857
2873 case ':': // (?:xxx) pure group
2874 head = createGroup(true);
2875 tail = root;
2876 head.next = expr(tail);
2877 break;
2878 case '=': // (?=xxx) and (?!xxx) lookahead
2879 case '!':
2880 head = createGroup(true);
2881 tail = root;
2882 head.next = expr(tail);
2883 if (ch == '=') {
2884 head = tail = new Pos(head);
2885 } else {
2886 head = tail = new Neg(head);
2887 }
2888 break;
2889 case '>': // (?>xxx) independent group
2890 head = createGroup(true);
2891 tail = root;
2892 head.next = expr(tail);
2893 head = tail = new Ques(head, Qtype.INDEPENDENT);
2894 break;
2895 case '<': // (?<xxx) look behind
2896 ch = read();
2897 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2898 // named captured group
2899 String name = groupname(ch);
2900 if (namedGroups().containsKey(name))
2901 throw error("Named capturing group <" + name
2902 + "> is already defined");
2903 capturingGroup = true;
2904 head = createGroup(false);
2905 tail = root;
2906 namedGroups().put(name, capturingGroupCount-1);
2907 head.next = expr(tail);
2908 break;
2909 }
2910 int start = cursor;
2911 head = createGroup(true);
2912 tail = root;
2913 head.next = expr(tail);
2959 tail = root;
2960 head.next = expr(tail);
2961 }
2962
2963 accept(')', "Unclosed group");
2964 flags = save;
2965
2966 // Check for quantifiers
2967 Node node = closure(head);
2968 if (node == head) { // No closure
2969 root = tail;
2970 return node; // Dual return
2971 }
2972 if (head == tail) { // Zero length assertion
2973 root = node;
2974 return node; // Dual return
2975 }
2976
2977 if (node instanceof Ques) {
2978 Ques ques = (Ques) node;
2979 if (ques.type == Qtype.POSSESSIVE) {
2980 root = node;
2981 return node;
2982 }
2983 tail.next = new BranchConn();
2984 tail = tail.next;
2985 if (ques.type == Qtype.GREEDY) {
2986 head = new Branch(head, null, tail);
2987 } else { // Reluctant quantifier
2988 head = new Branch(null, head, tail);
2989 }
2990 root = tail;
2991 return head;
2992 } else if (node instanceof Curly) {
2993 Curly curly = (Curly) node;
2994 if (curly.type == Qtype.POSSESSIVE) {
2995 root = node;
2996 return node;
2997 }
2998 // Discover if the group is deterministic
2999 TreeInfo info = new TreeInfo();
3000 if (head.study(info)) { // Deterministic
3001 GroupTail temp = (GroupTail) tail;
3002 head = root = new GroupCurly(head.next, curly.cmin,
3003 curly.cmax, curly.type,
3004 ((GroupTail)tail).localIndex,
3005 ((GroupTail)tail).groupIndex,
3006 capturingGroup);
3007 return head;
3008 } else { // Non-deterministic
3009 int temp = ((GroupHead) head).localIndex;
3010 Loop loop;
3011 if (curly.type == Qtype.GREEDY)
3012 loop = new Loop(this.localCount, temp);
3013 else // Reluctant Curly
3014 loop = new LazyLoop(this.localCount, temp);
3015 Prolog prolog = new Prolog(loop);
3016 this.localCount += 1;
3017 loop.cmin = curly.cmin;
3018 loop.cmax = curly.cmax;
3019 loop.body = head;
3020 tail.next = loop;
3021 root = loop;
3022 return prolog; // Dual return
3023 }
3024 }
3025 throw error("Internal logic error");
3026 }
3027
3028 /**
3029 * Create group head and tail nodes using double return. If the group is
3030 * created with anonymous true then it is a pure group and should not
3031 * affect group counting.
3032 */
3033 private Node createGroup(boolean anonymous) {
3034 int localIndex = localCount++;
3035 int groupIndex = 0;
3036 if (!anonymous)
3037 groupIndex = capturingGroupCount++;
3038 GroupHead head = new GroupHead(localIndex);
3039 root = new GroupTail(localIndex, groupIndex);
3040
3041 // for debug/print only, head.match does NOT need the "tail" info
3042 head.tail = (GroupTail)root;
3043
3044 if (!anonymous && groupIndex < 10)
3045 groupNodes[groupIndex] = head;
3046 return head;
3047 }
3048
3049 @SuppressWarnings("fallthrough")
3050 /**
3051 * Parses inlined match flags and set them appropriately.
3052 */
3053 private void addFlag() {
3054 int ch = peek();
3055 for (;;) {
3056 switch (ch) {
3057 case 'i':
3058 flags |= CASE_INSENSITIVE;
3059 break;
3060 case 'm':
3061 flags |= MULTILINE;
3062 break;
3063 case 's':
3112 case 'u':
3113 flags &= ~UNICODE_CASE;
3114 break;
3115 case 'c':
3116 flags &= ~CANON_EQ;
3117 break;
3118 case 'x':
3119 flags &= ~COMMENTS;
3120 break;
3121 case 'U':
3122 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3123 default:
3124 return;
3125 }
3126 ch = next();
3127 }
3128 }
3129
3130 static final int MAX_REPS = 0x7FFFFFFF;
3131
3132 static enum Qtype {
3133 GREEDY, LAZY, POSSESSIVE, INDEPENDENT
3134 }
3135
3136 private Node curly(Node prev, int cmin) {
3137 int ch = next();
3138 if (ch == '?') {
3139 next();
3140 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
3141 } else if (ch == '+') {
3142 next();
3143 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
3144 }
3145 if (prev instanceof BmpCharProperty) {
3146 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3147 } else if (prev instanceof CharProperty) {
3148 return new CharPropertyGreedy((CharProperty)prev, cmin);
3149 }
3150 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
3151 }
3152
3153 /**
3154 * Processes repetition. If the next character peeked is a quantifier
3155 * then new nodes must be appended to handle the repetition.
3156 * Prev could be a single or a group, so it could be a chain of nodes.
3157 */
3158 private Node closure(Node prev) {
3159 Node atom;
3160 int ch = peek();
3161 switch (ch) {
3162 case '?':
3163 ch = next();
3164 if (ch == '?') {
3165 next();
3166 return new Ques(prev, Qtype.LAZY);
3167 } else if (ch == '+') {
3168 next();
3169 return new Ques(prev, Qtype.POSSESSIVE);
3170 }
3171 return new Ques(prev, Qtype.GREEDY);
3172 case '*':
3173 return curly(prev, 0);
3174 case '+':
3175 return curly(prev, 1);
3176 case '{':
3177 ch = temp[cursor+1];
3178 if (ASCII.isDigit(ch)) {
3179 skip();
3180 int cmin = 0;
3181 do {
3182 cmin = cmin * 10 + (ch - '0');
3183 } while (ASCII.isDigit(ch = read()));
3184 int cmax = cmin;
3185 if (ch == ',') {
3186 ch = read();
3187 cmax = MAX_REPS;
3188 if (ch != '}') {
3189 cmax = 0;
3190 while (ASCII.isDigit(ch)) {
3191 cmax = cmax * 10 + (ch - '0');
3192 ch = read();
3193 }
3194 }
3195 }
3196 if (ch != '}')
3197 throw error("Unclosed counted closure");
3198 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3199 throw error("Illegal repetition range");
3200 Curly curly;
3201 ch = peek();
3202 if (ch == '?') {
3203 next();
3204 curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
3205 } else if (ch == '+') {
3206 next();
3207 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3208 } else {
3209 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
3210 }
3211 return curly;
3212 } else {
3213 throw error("Illegal repetition");
3214 }
3215 default:
3216 return prev;
3217 }
3218 }
3219
3220 /**
3221 * Utility method for parsing control escape sequences.
3222 */
3223 private int c() {
3224 if (cursor < patternLength) {
3225 return read() ^ 64;
3226 }
3227 throw error("Illegal control escape sequence");
3228 }
3229
3366
3367 private static final int countCodePoints(CharSequence seq) {
3368 int length = seq.length();
3369 int n = 0;
3370 for (int i = 0; i < length; ) {
3371 n++;
3372 if (Character.isHighSurrogate(seq.charAt(i++))) {
3373 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3374 i++;
3375 }
3376 }
3377 }
3378 return n;
3379 }
3380
3381 /**
3382 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3383 * never matches values above Latin-1, and a complemented BitClass always
3384 * matches values above Latin-1.
3385 */
3386 static final class BitClass extends BmpCharProperty {
3387 final boolean[] bits;
3388 BitClass() {
3389 this(new boolean[256]);
3390 }
3391 private BitClass(boolean[] bits) {
3392 super( ch -> ch < 256 && bits[ch]);
3393 this.bits = bits;
3394 }
3395 BitClass add(int c, int flags) {
3396 assert c >= 0 && c <= 255;
3397 if ((flags & CASE_INSENSITIVE) != 0) {
3398 if (ASCII.isAscii(c)) {
3399 bits[ASCII.toUpper(c)] = true;
3400 bits[ASCII.toLower(c)] = true;
3401 } else if ((flags & UNICODE_CASE) != 0) {
3402 bits[Character.toLowerCase(c)] = true;
3403 bits[Character.toUpperCase(c)] = true;
3404 }
3405 }
3406 bits[c] = true;
3407 return this;
3408 }
3409 }
3410
3411 /**
3412 * Utility method for creating a string slice matcher.
3413 */
3414 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3415 int[] tmp = new int[count];
3416 if (has(CASE_INSENSITIVE)) {
3417 if (has(UNICODE_CASE)) {
3418 for (int i = 0; i < count; i++) {
3419 tmp[i] = Character.toLowerCase(
3420 Character.toUpperCase(buf[i]));
3421 }
3422 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3423 }
3424 for (int i = 0; i < count; i++) {
3425 tmp[i] = ASCII.toLower(buf[i]);
3426 }
3427 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3428 }
3796 if (i < matcher.to && seq.charAt(i) == 0x0A)
3797 i++;
3798 return next.match(matcher, i, seq);
3799 }
3800 } else {
3801 matcher.hitEnd = true;
3802 }
3803 return false;
3804 }
3805 boolean study(TreeInfo info) {
3806 info.minLength++;
3807 info.maxLength += 2;
3808 return next.study(info);
3809 }
3810 }
3811
3812 /**
3813 * Abstract node class to match one character satisfying some
3814 * boolean property.
3815 */
3816 static class CharProperty extends Node {
3817 CharPredicate predicate;
3818
3819 CharProperty (CharPredicate predicate) {
3820 this.predicate = predicate;
3821 }
3822 boolean match(Matcher matcher, int i, CharSequence seq) {
3823 if (i < matcher.to) {
3824 int ch = Character.codePointAt(seq, i);
3825 return predicate.is(ch) &&
3826 next.match(matcher, i + Character.charCount(ch), seq);
3827 } else {
3828 matcher.hitEnd = true;
3829 return false;
3830 }
3831 }
3832 boolean study(TreeInfo info) {
3833 info.minLength++;
3834 info.maxLength++;
3835 return next.study(info);
3836 }
3837 }
3838
3839 /**
3840 * Optimized version of CharProperty that works only for
3841 * properties never satisfied by Supplementary characters.
3842 */
3843 private static class BmpCharProperty extends CharProperty {
3844 BmpCharProperty (BmpCharPredicate predicate) {
3845 super(predicate);
3846 }
3847 boolean match(Matcher matcher, int i, CharSequence seq) {
3848 if (i < matcher.to) {
3849 return predicate.is(seq.charAt(i)) &&
3850 next.match(matcher, i + 1, seq);
3851 } else {
3852 matcher.hitEnd = true;
3853 return false;
3854 }
3855 }
3856 }
3857
3858 /**
3859 * Node class that matches an unicode extended grapheme cluster
3860 */
3861 static class XGrapheme extends Node {
3862 boolean match(Matcher matcher, int i, CharSequence seq) {
3863 if (i < matcher.to) {
3864 int ch0 = Character.codePointAt(seq, i);
3865 i += Character.charCount(ch0);
3866 while (i < matcher.to) {
3867 int ch1 = Character.codePointAt(seq, i);
3868 if (Grapheme.isBoundary(ch0, ch1))
3869 break;
3870 ch0 = ch1;
3871 i += Character.charCount(ch1);
3872 }
3873 return next.match(matcher, i, seq);
3874 }
3875 matcher.hitEnd = true;
3876 return false;
3877 }
3878
4056 return false;
4057 }
4058 }
4059 return next.match(matcher, x, seq);
4060 }
4061 }
4062
4063 /**
4064 * Node class for a case insensitive sequence of literal characters.
4065 * Uses unicode case folding.
4066 */
4067 static final class SliceUS extends SliceIS {
4068 SliceUS(int[] buf) {
4069 super(buf);
4070 }
4071 int toLower(int c) {
4072 return Character.toLowerCase(Character.toUpperCase(c));
4073 }
4074 }
4075
4076 /**
4077 * The 0 or 1 quantifier. This one class implements all three types.
4078 */
4079 static final class Ques extends Node {
4080 Node atom;
4081 Qtype type;
4082 Ques(Node node, Qtype type) {
4083 this.atom = node;
4084 this.type = type;
4085 }
4086 boolean match(Matcher matcher, int i, CharSequence seq) {
4087 switch (type) {
4088 case GREEDY:
4089 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4090 || next.match(matcher, i, seq);
4091 case LAZY:
4092 return next.match(matcher, i, seq)
4093 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4094 case POSSESSIVE:
4095 if (atom.match(matcher, i, seq)) i = matcher.last;
4096 return next.match(matcher, i, seq);
4097 default:
4098 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4099 }
4100 }
4101 boolean study(TreeInfo info) {
4102 if (type != Qtype.INDEPENDENT) {
4103 int minL = info.minLength;
4104 atom.study(info);
4105 info.minLength = minL;
4106 info.deterministic = false;
4107 return next.study(info);
4108 } else {
4109 atom.study(info);
4110 return next.study(info);
4111 }
4112 }
4113 }
4114
4115 /**
4116 * Handles the greedy style repetition with the minimum either be
4117 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
4118 */
4119 static class CharPropertyGreedy extends Node {
4120 final CharPredicate predicate;
4121 final int cmin;
4122
4123 CharPropertyGreedy(CharProperty cp, int cmin) {
4124 this.predicate = cp.predicate;
4125 this.cmin = cmin;
4126 }
4127 boolean match(Matcher matcher, int i, CharSequence seq) {
4128 int n = 0;
4129 int to = matcher.to;
4130 // greedy, all the way down
4131 while (i < to) {
4132 int ch = Character.codePointAt(seq, i);
4133 if (!predicate.is(ch))
4134 break;
4135 i += Character.charCount(ch);
4136 n++;
4137 }
4138 if (i >= to) {
4139 matcher.hitEnd = true;
4140 }
4141 while (n >= cmin) {
4142 if (next.match(matcher, i, seq))
4143 return true;
4144 if (n == cmin)
4145 return false;
4146 // backing off if match fails
4147 int ch = Character.codePointBefore(seq, i);
4148 i -= Character.charCount(ch);
4149 n--;
4150 }
4151 return false;
4152 }
4153
4154 boolean study(TreeInfo info) {
4155 info.minLength += cmin;
4156 if (info.maxValid) {
4157 info.maxLength += MAX_REPS;
4158 }
4159 info.deterministic = false;
4160 return next.study(info);
4161 }
4162 }
4163
4164 static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
4165
4166 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
4167 super(bcp, cmin);
4168 }
4169
4170 boolean match(Matcher matcher, int i, CharSequence seq) {
4171 int n = 0;
4172 int to = matcher.to;
4173 while (i < to && predicate.is(seq.charAt(i))) {
4174 i++; n++;
4175 }
4176 if (i >= to) {
4177 matcher.hitEnd = true;
4178 }
4179 while (n >= cmin) {
4180 if (next.match(matcher, i, seq))
4181 return true;
4182 i--; n--; // backing off if match fails
4183 }
4184 return false;
4185 }
4186 }
4187
4188 /**
4189 * Handles the curly-brace style repetition with a specified minimum and
4190 * maximum occurrences. The * quantifier is handled as a special case.
4191 * This class handles the three types.
4192 */
4193 static final class Curly extends Node {
4194 Node atom;
4195 Qtype type;
4196 int cmin;
4197 int cmax;
4198
4199 Curly(Node node, int cmin, int cmax, Qtype type) {
4200 this.atom = node;
4201 this.type = type;
4202 this.cmin = cmin;
4203 this.cmax = cmax;
4204 }
4205 boolean match(Matcher matcher, int i, CharSequence seq) {
4206 int j;
4207 for (j = 0; j < cmin; j++) {
4208 if (atom.match(matcher, i, seq)) {
4209 i = matcher.last;
4210 continue;
4211 }
4212 return false;
4213 }
4214 if (type == Qtype.GREEDY)
4215 return match0(matcher, i, j, seq);
4216 else if (type == Qtype.LAZY)
4217 return match1(matcher, i, j, seq);
4218 else
4219 return match2(matcher, i, j, seq);
4220 }
4221 // Greedy match.
4222 // i is the index to start matching at
4223 // j is the number of atoms that have matched
4224 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4225 if (j >= cmax) {
4226 // We have matched the maximum... continue with the rest of
4227 // the regular expression
4228 return next.match(matcher, i, seq);
4229 }
4230 int backLimit = j;
4231 while (atom.match(matcher, i, seq)) {
4232 // k is the length of this match
4233 int k = matcher.last - i;
4234 if (k == 0) // Zero length match
4235 break;
4236 // Move up index and number matched
4318 }
4319
4320 if (info.deterministic && cmin == cmax)
4321 info.deterministic = detm;
4322 else
4323 info.deterministic = false;
4324 return next.study(info);
4325 }
4326 }
4327
4328 /**
4329 * Handles the curly-brace style repetition with a specified minimum and
4330 * maximum occurrences in deterministic cases. This is an iterative
4331 * optimization over the Prolog and Loop system which would handle this
4332 * in a recursive way. The * quantifier is handled as a special case.
4333 * If capture is true then this class saves group settings and ensures
4334 * that groups are unset when backing off of a group match.
4335 */
4336 static final class GroupCurly extends Node {
4337 Node atom;
4338 Qtype type;
4339 int cmin;
4340 int cmax;
4341 int localIndex;
4342 int groupIndex;
4343 boolean capture;
4344
4345 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
4346 int group, boolean capture) {
4347 this.atom = node;
4348 this.type = type;
4349 this.cmin = cmin;
4350 this.cmax = cmax;
4351 this.localIndex = local;
4352 this.groupIndex = group;
4353 this.capture = capture;
4354 }
4355 boolean match(Matcher matcher, int i, CharSequence seq) {
4356 int[] groups = matcher.groups;
4357 int[] locals = matcher.locals;
4358 int save0 = locals[localIndex];
4359 int save1 = 0;
4360 int save2 = 0;
4361
4362 if (capture) {
4363 save1 = groups[groupIndex];
4364 save2 = groups[groupIndex+1];
4365 }
4366
4367 // Notify GroupTail there is no need to setup group info
4368 // because it will be set here
4369 locals[localIndex] = -1;
4370
4371 boolean ret = true;
4372 for (int j = 0; j < cmin; j++) {
4373 if (atom.match(matcher, i, seq)) {
4374 if (capture) {
4375 groups[groupIndex] = i;
4376 groups[groupIndex+1] = matcher.last;
4377 }
4378 i = matcher.last;
4379 } else {
4380 ret = false;
4381 break;
4382 }
4383 }
4384 if (ret) {
4385 if (type == Qtype.GREEDY) {
4386 ret = match0(matcher, i, cmin, seq);
4387 } else if (type == Qtype.LAZY) {
4388 ret = match1(matcher, i, cmin, seq);
4389 } else {
4390 ret = match2(matcher, i, cmin, seq);
4391 }
4392 }
4393 if (!ret) {
4394 locals[localIndex] = save0;
4395 if (capture) {
4396 groups[groupIndex] = save1;
4397 groups[groupIndex+1] = save2;
4398 }
4399 }
4400 return ret;
4401 }
4402 // Aggressive group match
4403 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4404 // don't back off passing the starting "j"
4405 int min = j;
4406 int[] groups = matcher.groups;
4407 int save0 = 0;
4613
4614 info.minLength += minL;
4615 info.maxLength += maxL;
4616 info.maxValid &= maxV;
4617 info.deterministic = false;
4618 return false;
4619 }
4620 }
4621
4622 /**
4623 * The GroupHead saves the location where the group begins in the locals
4624 * and restores them when the match is done.
4625 *
4626 * The matchRef is used when a reference to this group is accessed later
4627 * in the expression. The locals will have a negative value in them to
4628 * indicate that we do not want to unset the group if the reference
4629 * doesn't match.
4630 */
4631 static final class GroupHead extends Node {
4632 int localIndex;
4633 GroupTail tail; // for debug/print only, match does not need to know
4634 GroupHead(int localCount) {
4635 localIndex = localCount;
4636 }
4637 boolean match(Matcher matcher, int i, CharSequence seq) {
4638 int save = matcher.locals[localIndex];
4639 matcher.locals[localIndex] = i;
4640 boolean ret = next.match(matcher, i, seq);
4641 matcher.locals[localIndex] = save;
4642 return ret;
4643 }
4644 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4645 int save = matcher.locals[localIndex];
4646 matcher.locals[localIndex] = ~i; // HACK
4647 boolean ret = next.match(matcher, i, seq);
4648 matcher.locals[localIndex] = save;
4649 return ret;
4650 }
4651 }
4652
4653 /**
5207 int startIndex = (!matcher.transparentBounds) ?
5208 matcher.from : 0;
5209 int from = Math.max(i - rmaxChars, startIndex);
5210 matcher.lookbehindTo = i;
5211 // Relax transparent region boundaries for lookbehind
5212 if (matcher.transparentBounds)
5213 matcher.from = 0;
5214 for (int j = i - rminChars;
5215 !conditionMatched && j >= from;
5216 j -= j>from ? countChars(seq, j, -1) : 1) {
5217 conditionMatched = cond.match(matcher, j, seq);
5218 }
5219 //Reinstate region boundaries
5220 matcher.from = savedFrom;
5221 matcher.lookbehindTo = savedLBT;
5222 return !conditionMatched && next.match(matcher, i, seq);
5223 }
5224 }
5225
5226 /**
5227 * Handles word boundaries. Includes a field to allow this one class to
5228 * deal with the different types of word boundaries we can match. The word
5229 * characters include underscores, letters, and digits. Non spacing marks
5230 * can are also part of a word if they have a base character, otherwise
5231 * they are ignored for purposes of finding word boundaries.
5232 */
5233 static final class Bound extends Node {
5234 static int LEFT = 0x1;
5235 static int RIGHT= 0x2;
5236 static int BOTH = 0x3;
5237 static int NONE = 0x4;
5238 int type;
5239 boolean useUWORD;
5240 Bound(int n, boolean useUWORD) {
5241 type = n;
5242 this.useUWORD = useUWORD;
5243 }
5244
5245 boolean isWord(int ch) {
5246 return useUWORD ? CharPredicates.WORD.is(ch)
5247 : (ch == '_' || Character.isLetterOrDigit(ch));
5248 }
5249
5250 int check(Matcher matcher, int i, CharSequence seq) {
5251 int ch;
5252 boolean left = false;
5253 int startIndex = matcher.from;
5254 int endIndex = matcher.to;
5255 if (matcher.transparentBounds) {
5256 startIndex = 0;
5257 endIndex = matcher.getTextLength();
5258 }
5259 if (i > startIndex) {
5260 ch = Character.codePointBefore(seq, i);
5261 left = (isWord(ch) ||
5262 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5263 && hasBaseCharacter(matcher, i-1, seq)));
5264 }
5265 boolean right = false;
5266 if (i < endIndex) {
5472 i += countChars(seq, i, n);
5473 continue NEXT;
5474 }
5475 }
5476 // Entire pattern matched starting at i
5477 matcher.first = i;
5478 boolean ret = next.match(matcher, i + lengthInChars, seq);
5479 if (ret) {
5480 matcher.first = i;
5481 matcher.groups[0] = matcher.first;
5482 matcher.groups[1] = matcher.last;
5483 return true;
5484 }
5485 i += countChars(seq, i, 1);
5486 }
5487 matcher.hitEnd = true;
5488 return false;
5489 }
5490 }
5491
5492 @FunctionalInterface
5493 static interface CharPredicate {
5494 boolean is(int ch);
5495
5496 default CharPredicate and(CharPredicate p) {
5497 return ch -> is(ch) && p.is(ch);
5498 }
5499 default CharPredicate union(CharPredicate p) {
5500 return ch -> is(ch) || p.is(ch);
5501 }
5502 default CharPredicate union(CharPredicate p1,
5503 CharPredicate p2 ) {
5504 return ch -> is(ch) || p1.is(ch) || p2.is(ch);
5505 }
5506 default CharPredicate negate() {
5507 return ch -> !is(ch);
5508 }
5509 }
5510
5511 static interface BmpCharPredicate extends CharPredicate {
5512
5513 default CharPredicate and(CharPredicate p) {
5514 if(p instanceof BmpCharPredicate)
5515 return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch));
5516 return ch -> is(ch) && p.is(ch);
5517 }
5518 default CharPredicate union(CharPredicate p) {
5519 if (p instanceof BmpCharPredicate)
5520 return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch));
5521 return ch -> is(ch) || p.is(ch);
5522 }
5523 static CharPredicate union(CharPredicate... predicates) {
5524 CharPredicate cp = ch -> {
5525 for (CharPredicate p : predicates) {
5526 if (!p.is(ch))
5527 return false;
5528 }
5529 return true;
5530 };
5531 for (CharPredicate p : predicates) {
5532 if (! (p instanceof BmpCharPredicate))
5533 return cp;
5534 }
5535 return (BmpCharPredicate)cp;
5536 }
5537 }
5538
5539 /**
5540 * matches a Perl vertical whitespace
5541 */
5542 static BmpCharPredicate VertWS = cp ->
5543 (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
5544
5545 /**
5546 * matches a Perl horizontal whitespace
5547 */
5548 static BmpCharPredicate HorizWS = cp ->
5549 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
5550 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
5551 cp == 0x205f || cp == 0x3000;
5552
5553 /**
5554 * for the Unicode category ALL and the dot metacharacter when
5555 * in dotall mode.
5556 */
5557 static CharPredicate ALL = ch -> true;
5558
5559 /**
5560 * for the dot metacharacter when dotall is not enabled.
5561 */
5562 static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
5563 && (ch|1) != '\u2029'
5564 && ch != '\u0085');
5565 /**
5566 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
5567 */
5568 static CharPredicate UNIXDOT = ch -> ch != '\n';
5569
5570 /**
5571 * Indicate that matches a Supplementary Unicode character
5572 */
5573 static CharPredicate SingleS(int c) {
5574 return ch -> ch == c;
5575 }
5576
5577 /**
5578 * A bmp/optimized predicate of single
5579 */
5580 static BmpCharPredicate Single(int c) {
5581 return ch -> ch == c;
5582 }
5583
5584 /**
5585 * Case insensitive matches a given BMP character
5586 */
5587 static BmpCharPredicate SingleI(int lower, int upper) {
5588 return ch -> ch == lower || ch == upper;
5589 }
5590
5591 /**
5592 * Unicode case insensitive matches a given Unicode character
5593 */
5594 static CharPredicate SingleU(int lower) {
5595 return ch -> lower == ch ||
5596 lower == Character.toLowerCase(Character.toUpperCase(ch));
5597 }
5598
5599 private static boolean inRange(int lower, int ch, int upper) {
5600 return lower <= ch && ch <= upper;
5601 }
5602
5603 /**
5604 * Charactrs within a explicit value range
5605 */
5606 static CharPredicate Range(int lower, int upper) {
5607 if (upper < Character.MIN_HIGH_SURROGATE ||
5608 lower > Character.MAX_HIGH_SURROGATE &&
5609 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
5610 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
5611 return ch -> inRange(lower, ch, upper);
5612 }
5613
5614 /**
5615 * Charactrs within a explicit value range in a case insensitive manner.
5616 */
5617 static CharPredicate CIRange(int lower, int upper) {
5618 return ch -> inRange(lower, ch, upper) ||
5619 ASCII.isAscii(ch) &&
5620 (inRange(lower, ASCII.toUpper(ch), upper) ||
5621 inRange(lower, ASCII.toLower(ch), upper));
5622 }
5623
5624 static CharPredicate CIRangeU(int lower, int upper) {
5625 return ch -> {
5626 if (inRange(lower, ch, upper))
5627 return true;
5628 int up = Character.toUpperCase(ch);
5629 return inRange(lower, up, upper) ||
5630 inRange(lower, Character.toLowerCase(up), upper);
5631 };
5632 }
5633
5634 /**
5635 * This must be the very first initializer.
5636 */
5637 static Node accept = new Node();
5638
5639 static Node lastAccept = new LastNode();
5640
5641 /**
5642 * Creates a predicate which can be used to match a string.
5643 *
5644 * @return The predicate which can be used for matching on a string
5645 * @since 1.8
5646 */
5647 public Predicate<String> asPredicate() {
5648 return s -> matcher(s).find();
5649 }
5650
5651 /**
5652 * Creates a stream from the given input sequence around matches of this
5653 * pattern.
5654 *
5655 * <p> The stream returned by this method contains each substring of the
5656 * input sequence that is terminated by another subsequence that matches
5657 * this pattern or is terminated by the end of the input sequence. The
5658 * substrings in the stream are in the order in which they occur in the
5659 * input. Trailing empty strings will be discarded and not encountered in
5660 * the stream.
5661 *
|