9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.text.Normalizer;
29 import java.util.Locale;
30 import java.util.Iterator;
31 import java.util.Map;
32 import java.util.ArrayList;
33 import java.util.HashMap;
34 import java.util.Arrays;
35 import java.util.NoSuchElementException;
36 import java.util.Spliterator;
37 import java.util.Spliterators;
38 import java.util.function.Predicate;
39 import java.util.stream.Stream;
40 import java.util.stream.StreamSupport;
41
42
43 /**
44 * A compiled representation of a regular expression.
45 *
46 * <p> A regular expression, specified as a string, must first be compiled into
47 * an instance of this class. The resulting pattern can then be used to create
48 * a {@link Matcher} object that can match arbitrary {@linkplain
49 * java.lang.CharSequence character sequences} against the regular
50 * expression. All of the state involved in performing a match resides in the
51 * matcher, so many matchers can share the same pattern.
52 *
53 * <p> A typical invocation sequence is thus
967
968 /**
969 * The starting point of state machine for the find operation. This allows
970 * a match to start anywhere in the input.
971 */
972 transient Node root;
973
974 /**
975 * The root of object tree for a match operation. The pattern is matched
976 * at the beginning. This may include a find that uses BnM or a First
977 * node.
978 */
979 transient Node matchRoot;
980
981 /**
982 * Temporary storage used by parsing pattern slice.
983 */
984 transient int[] buffer;
985
986 /**
987 * Map the "name" of the "named capturing group" to its group id
988 * node.
989 */
990 transient volatile Map<String, Integer> namedGroups;
991
992 /**
993 * Temporary storage used while parsing group references.
994 */
995 transient GroupHead[] groupNodes;
996
997 /**
998 * Temporary null terminated code point array used by pattern compiling.
999 */
1000 private transient int[] temp;
1001
1002 /**
1003 * The number of capturing groups in this Pattern. Used by matchers to
1004 * allocate storage needed to perform a match.
1005 */
1006 transient int capturingGroupCount;
1009 * The local variable count used by parsing tree. Used by matchers to
1010 * allocate storage needed to perform a match.
1011 */
1012 transient int localCount;
1013
1014 /**
1015 * Index into the pattern string that keeps track of how much has been
1016 * parsed.
1017 */
1018 private transient int cursor;
1019
1020 /**
1021 * Holds the length of the pattern string.
1022 */
1023 private transient int patternLength;
1024
1025 /**
1026 * If the Start node might possibly match supplementary characters.
1027 * It is set to true during compiling if
1028 * (1) There is supplementary char in pattern, or
1029 * (2) There is complement node of Category or Block
1030 */
1031 private transient boolean hasSupplementary;
1032
1033 /**
1034 * Compiles the given regular expression into a pattern.
1035 *
1036 * @param regex
1037 * The expression to be compiled
1038 * @return the given regular expression compiled into a pattern
1039 * @throws PatternSyntaxException
1040 * If the expression's syntax is invalid
1041 */
1042 public static Pattern compile(String regex) {
1043 return new Pattern(regex, 0);
1044 }
1045
1046 /**
1047 * Compiles the given regular expression into a pattern with the given
1048 * flags.
1049 *
1361 pattern = p;
1362 flags = f;
1363
1364 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1365 if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1366 flags |= UNICODE_CASE;
1367
1368 // Reset group index count
1369 capturingGroupCount = 1;
1370 localCount = 0;
1371
1372 if (pattern.length() > 0) {
1373 compile();
1374 } else {
1375 root = new Start(lastAccept);
1376 matchRoot = lastAccept;
1377 }
1378 }
1379
1380 /**
1381 * The pattern is converted to normalized form ({@linkplain
1382 * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
1383 * and then a pure group is constructed to match canonical
1384 * equivalences of the characters.
1385 */
1386 private void normalize() {
1387 int lastCodePoint = -1;
1388
1389 // Convert pattern into normalized form
1390 normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1391 patternLength = normalizedPattern.length();
1392
1393 // Modify pattern to match canonical equivalences
1394 StringBuilder newPattern = new StringBuilder(patternLength);
1395 for(int i=0; i<patternLength; ) {
1396 int c = normalizedPattern.codePointAt(i);
1397 StringBuilder sequenceBuffer;
1398 if ((Character.getType(c) == Character.NON_SPACING_MARK)
1399 && (lastCodePoint != -1)) {
1400 sequenceBuffer = new StringBuilder();
1401 sequenceBuffer.appendCodePoint(lastCodePoint);
1402 sequenceBuffer.appendCodePoint(c);
1403 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1404 i += Character.charCount(c);
1405 if (i >= patternLength)
1406 break;
1407 c = normalizedPattern.codePointAt(i);
1408 sequenceBuffer.appendCodePoint(c);
1409 }
1410 String ea = produceEquivalentAlternation(
1411 sequenceBuffer.toString());
1412 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
1413 newPattern.append("(?:").append(ea).append(")");
1414 } else if (c == '[' && lastCodePoint != '\\') {
1415 i = normalizeCharClass(newPattern, i);
1416 } else {
1417 newPattern.appendCodePoint(c);
1418 }
1419 lastCodePoint = c;
1420 i += Character.charCount(c);
1421 }
1422 normalizedPattern = newPattern.toString();
1423 }
1424
1425 /**
1426 * Complete the character class being parsed and add a set
1427 * of alternations to it that will match the canonical equivalences
1428 * of the characters within the class.
1429 */
1430 private int normalizeCharClass(StringBuilder newPattern, int i) {
1431 StringBuilder charClass = new StringBuilder();
1432 StringBuilder eq = null;
1433 int lastCodePoint = -1;
1434 String result;
1435
1436 i++;
1437 charClass.append("[");
1438 while(true) {
1439 int c = normalizedPattern.codePointAt(i);
1440 StringBuilder sequenceBuffer;
1441
1442 if (c == ']' && lastCodePoint != '\\') {
1443 charClass.append((char)c);
1444 break;
1445 } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
1446 sequenceBuffer = new StringBuilder();
1447 sequenceBuffer.appendCodePoint(lastCodePoint);
1448 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1449 sequenceBuffer.appendCodePoint(c);
1450 i += Character.charCount(c);
1451 if (i >= normalizedPattern.length())
1452 break;
1453 c = normalizedPattern.codePointAt(i);
1454 }
1455 String ea = produceEquivalentAlternation(
1456 sequenceBuffer.toString());
1457
1458 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
1459 if (eq == null)
1460 eq = new StringBuilder();
1461 eq.append('|');
1462 eq.append(ea);
1463 } else {
1464 charClass.appendCodePoint(c);
1465 i++;
1466 }
1467 if (i == normalizedPattern.length())
1468 throw error("Unclosed character class");
1469 lastCodePoint = c;
1470 }
1471
1472 if (eq != null) {
1473 result = "(?:"+charClass.toString()+eq.toString()+")";
1474 } else {
1475 result = charClass.toString();
1476 }
1477
1478 newPattern.append(result);
1479 return i;
1480 }
1481
1482 /**
1483 * Given a specific sequence composed of a regular character and
1484 * combining marks that follow it, produce the alternation that will
1485 * match all canonical equivalences of that sequence.
1486 */
1487 private String produceEquivalentAlternation(String source) {
1488 int len = countChars(source, 0, 1);
1489 if (source.length() == len)
1490 // source has one character.
1491 return source;
1492
1493 String base = source.substring(0,len);
1494 String combiningMarks = source.substring(len);
1495
1496 String[] perms = producePermutations(combiningMarks);
1497 StringBuilder result = new StringBuilder(source);
1498
1499 // Add combined permutations
1500 for(int x=0; x<perms.length; x++) {
1501 String next = base + perms[x];
1502 if (x>0)
1503 result.append("|"+next);
1504 next = composeOneStep(next);
1505 if (next != null)
1506 result.append("|"+produceEquivalentAlternation(next));
1507 }
1508 return result.toString();
1509 }
1510
1511 /**
1512 * Returns an array of strings that have all the possible
1513 * permutations of the characters in the input string.
1514 * This is used to get a list of all possible orderings
1515 * of a set of combining marks. Note that some of the permutations
1516 * are invalid because of combining class collisions, and these
1517 * possibilities must be removed because they are not canonically
1518 * equivalent.
1519 */
1520 private String[] producePermutations(String input) {
1521 if (input.length() == countChars(input, 0, 1))
1522 return new String[] {input};
1523
1524 if (input.length() == countChars(input, 0, 2)) {
1525 int c0 = Character.codePointAt(input, 0);
1526 int c1 = Character.codePointAt(input, Character.charCount(c0));
1527 if (getClass(c1) == getClass(c0)) {
1528 return new String[] {input};
1529 }
1530 String[] result = new String[2];
1531 result[0] = input;
1532 StringBuilder sb = new StringBuilder(2);
1533 sb.appendCodePoint(c1);
1534 sb.appendCodePoint(c0);
1535 result[1] = sb.toString();
1536 return result;
1537 }
1538
1539 int length = 1;
1540 int nCodePoints = countCodePoints(input);
1558 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1559 len = countChars(input, offset, 1);
1560 for(int y=x-1; y>=0; y--) {
1561 if (combClass[y] == combClass[x]) {
1562 continue loop;
1563 }
1564 }
1565 StringBuilder sb = new StringBuilder(input);
1566 String otherChars = sb.delete(offset, offset+len).toString();
1567 String[] subResult = producePermutations(otherChars);
1568
1569 String prefix = input.substring(offset, offset+len);
1570 for (String sre : subResult)
1571 temp[index++] = prefix + sre;
1572 }
1573 String[] result = new String[index];
1574 System.arraycopy(temp, 0, result, 0, index);
1575 return result;
1576 }
1577
1578 private int getClass(int c) {
1579 return sun.text.Normalizer.getCombiningClass(c);
1580 }
1581
1582 /**
1583 * Attempts to compose input by combining the first character
1584 * with the first combining mark following it. Returns a String
1585 * that is the composition of the leading character with its first
1586 * combining mark followed by the remaining combining marks. Returns
1587 * null if the first two characters cannot be further composed.
1588 */
1589 private String composeOneStep(String input) {
1590 int len = countChars(input, 0, 2);
1591 String firstTwoCharacters = input.substring(0, len);
1592 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1593
1594 if (result.equals(firstTwoCharacters))
1595 return null;
1596 else {
1597 String remainder = input.substring(len);
1598 return result + remainder;
1599 }
1600 }
1601
1602 /**
1603 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1604 * See the description of `quotemeta' in perlfunc(1).
1605 */
1606 private void RemoveQEQuoting() {
1607 final int pLen = patternLength;
1608 int i = 0;
1609 while (i < pLen-1) {
1610 if (temp[i] != '\\')
1611 i += 1;
1612 else if (temp[i + 1] != 'Q')
1613 i += 2;
1660 newtemp[j++] = c;
1661 if (i != pLen)
1662 newtemp[j++] = temp[i++];
1663 }
1664 }
1665
1666 beginQuote = false;
1667 }
1668
1669 patternLength = j;
1670 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1671 }
1672
1673 /**
1674 * Copies regular expression to an int array and invokes the parsing
1675 * of the expression which will create the object tree.
1676 */
1677 private void compile() {
1678 // Handle canonical equivalences
1679 if (has(CANON_EQ) && !has(LITERAL)) {
1680 normalize();
1681 } else {
1682 normalizedPattern = pattern;
1683 }
1684 patternLength = normalizedPattern.length();
1685
1686 // Copy pattern to int array for convenience
1687 // Use double zero to terminate pattern
1688 temp = new int[patternLength + 2];
1689
1690 hasSupplementary = false;
1691 int c, count = 0;
1692 // Convert all chars into code points
1693 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1694 c = normalizedPattern.codePointAt(x);
1695 if (isSupplementary(c)) {
1696 hasSupplementary = true;
1697 }
1698 temp[count++] = c;
1699 }
1700
1737 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1738 }
1739
1740 // Release temporary storage
1741 temp = null;
1742 buffer = null;
1743 groupNodes = null;
1744 patternLength = 0;
1745 compiled = true;
1746 }
1747
1748 Map<String, Integer> namedGroups() {
1749 Map<String, Integer> groups = namedGroups;
1750 if (groups == null) {
1751 namedGroups = groups = new HashMap<>(2);
1752 }
1753 return groups;
1754 }
1755
1756 /**
1757 * Used to print out a subtree of the Pattern to help with debugging.
1758 */
1759 private static void printObjectTree(Node node) {
1760 while(node != null) {
1761 if (node instanceof Prolog) {
1762 System.out.println(node);
1763 printObjectTree(((Prolog)node).loop);
1764 System.out.println("**** end contents prolog loop");
1765 } else if (node instanceof Loop) {
1766 System.out.println(node);
1767 printObjectTree(((Loop)node).body);
1768 System.out.println("**** end contents Loop body");
1769 } else if (node instanceof Curly) {
1770 System.out.println(node);
1771 printObjectTree(((Curly)node).atom);
1772 System.out.println("**** end contents Curly body");
1773 } else if (node instanceof GroupCurly) {
1774 System.out.println(node);
1775 printObjectTree(((GroupCurly)node).atom);
1776 System.out.println("**** end contents GroupCurly body");
1777 } else if (node instanceof GroupTail) {
1778 System.out.println(node);
1779 System.out.println("Tail next is "+node.next);
1780 return;
1781 } else {
1782 System.out.println(node);
1783 }
1784 node = node.next;
1785 if (node != null)
1786 System.out.println("->next:");
1787 if (node == Pattern.accept) {
1788 System.out.println("Accept Node");
1789 node = null;
1790 }
1791 }
1792 }
1793
1794 /**
1795 * Used to accumulate information about a subtree of the object graph
1796 * so that optimizations can be applied to the subtree.
1797 */
1798 static final class TreeInfo {
1799 int minLength;
1800 int maxLength;
1801 boolean maxValid;
1802 boolean deterministic;
1803
1804 TreeInfo() {
1805 reset();
1806 }
1807 void reset() {
1808 minLength = 0;
1809 maxLength = 0;
1810 maxValid = true;
1811 deterministic = true;
1812 }
1813 }
1814
2066 Node node = null;
2067 LOOP:
2068 for (;;) {
2069 int ch = peek();
2070 switch (ch) {
2071 case '(':
2072 // Because group handles its own closure,
2073 // we need to treat it differently
2074 node = group0();
2075 // Check for comment or flag group
2076 if (node == null)
2077 continue;
2078 if (head == null)
2079 head = node;
2080 else
2081 tail.next = node;
2082 // Double return: Tail was returned in root
2083 tail = root;
2084 continue;
2085 case '[':
2086 node = clazz(true);
2087 break;
2088 case '\\':
2089 ch = nextEscaped();
2090 if (ch == 'p' || ch == 'P') {
2091 boolean oneLetter = true;
2092 boolean comp = (ch == 'P');
2093 ch = next(); // Consume { if present
2094 if (ch != '{') {
2095 unread();
2096 } else {
2097 oneLetter = false;
2098 }
2099 node = family(oneLetter, comp);
2100 } else {
2101 unread();
2102 node = atom();
2103 }
2104 break;
2105 case '^':
2106 next();
2107 if (has(MULTILINE)) {
2108 if (has(UNIX_LINES))
2109 node = new UnixCaret();
2110 else
2111 node = new Caret();
2112 } else {
2113 node = new Begin();
2114 }
2115 break;
2116 case '$':
2117 next();
2118 if (has(UNIX_LINES))
2119 node = new UnixDollar(has(MULTILINE));
2120 else
2121 node = new Dollar(has(MULTILINE));
2122 break;
2123 case '.':
2124 next();
2125 if (has(DOTALL)) {
2126 node = new All();
2127 } else {
2128 if (has(UNIX_LINES))
2129 node = new UnixDot();
2130 else {
2131 node = new Dot();
2132 }
2133 }
2134 break;
2135 case '|':
2136 case ')':
2137 break LOOP;
2138 case ']': // Now interpreting dangling ] and } as literals
2139 case '}':
2140 node = atom();
2141 break;
2142 case '?':
2143 case '*':
2144 case '+':
2145 next();
2146 throw error("Dangling meta character '" + ((char)ch) + "'");
2147 case 0:
2148 if (cursor >= patternLength) {
2149 break LOOP;
2150 }
2151 // Fall through
2152 default:
2153 node = atom();
2154 break;
2155 }
2156
2157 node = closure(node);
2158
2159 if (head == null) {
2160 head = tail = node;
2161 } else {
2162 tail.next = node;
2163 tail = node;
2164 }
2165 }
2166 if (head == null) {
2167 return end;
2168 }
2169 tail.next = end;
2170 root = tail; //double return
2171 return head;
2172 }
2173
2174 @SuppressWarnings("fallthrough")
2175 /**
2176 * Parse and add a new Single or Slice.
2177 */
2178 private Node atom() {
2196 case '^':
2197 case '(':
2198 case '[':
2199 case '|':
2200 case ')':
2201 break;
2202 case '\\':
2203 ch = nextEscaped();
2204 if (ch == 'p' || ch == 'P') { // Property
2205 if (first > 0) { // Slice is waiting; handle it first
2206 unread();
2207 break;
2208 } else { // No slice; just return the family node
2209 boolean comp = (ch == 'P');
2210 boolean oneLetter = true;
2211 ch = next(); // Consume { if present
2212 if (ch != '{')
2213 unread();
2214 else
2215 oneLetter = false;
2216 return family(oneLetter, comp);
2217 }
2218 }
2219 unread();
2220 prev = cursor;
2221 ch = escape(false, first == 0, false);
2222 if (ch >= 0) {
2223 append(ch, first);
2224 first++;
2225 if (isSupplementary(ch)) {
2226 hasSupplementary = true;
2227 }
2228 ch = peek();
2229 continue;
2230 } else if (first == 0) {
2231 return root;
2232 }
2233 // Unwind meta escape sequence
2234 cursor = prev;
2235 break;
2236 case 0:
2237 if (cursor >= patternLength) {
2238 break;
2239 }
2240 // Fall through
2241 default:
2242 prev = cursor;
2243 append(ch, first);
2244 first++;
2245 if (isSupplementary(ch)) {
2246 hasSupplementary = true;
2247 }
2248 ch = next();
2249 continue;
2250 }
2251 break;
2252 }
2253 if (first == 1) {
2254 return newSingle(buffer[0]);
2255 } else {
2256 return newSlice(buffer, first, hasSupplementary);
2257 }
2258 }
2259
2260 private void append(int ch, int len) {
2261 if (len >= buffer.length) {
2262 int[] tmp = new int[len+len];
2263 System.arraycopy(buffer, 0, tmp, 0, len);
2264 buffer = tmp;
2265 }
2266 buffer[len] = ch;
2267 }
2268
2269 /**
2270 * Parses a backref greedily, taking as many numbers as it
2271 * can. The first digit is always treated as a backref, but
2272 * multi digit numbers are only treated as a backref if at
2273 * least that many backrefs exist at this point in the regex.
2274 */
2329 case '6':
2330 case '7':
2331 case '8':
2332 case '9':
2333 if (inclass) break;
2334 if (create) {
2335 root = ref((ch - '0'));
2336 }
2337 return -1;
2338 case 'A':
2339 if (inclass) break;
2340 if (create) root = new Begin();
2341 return -1;
2342 case 'B':
2343 if (inclass) break;
2344 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2345 return -1;
2346 case 'C':
2347 break;
2348 case 'D':
2349 if (create) root = has(UNICODE_CHARACTER_CLASS)
2350 ? new Utype(UnicodeProp.DIGIT).complement()
2351 : new Ctype(ASCII.DIGIT).complement();
2352 return -1;
2353 case 'E':
2354 case 'F':
2355 break;
2356 case 'G':
2357 if (inclass) break;
2358 if (create) root = new LastMatch();
2359 return -1;
2360 case 'H':
2361 if (create) root = new HorizWS().complement();
2362 return -1;
2363 case 'I':
2364 case 'J':
2365 case 'K':
2366 case 'L':
2367 case 'M':
2368 break;
2369 case 'N':
2370 return N();
2371 case 'O':
2372 case 'P':
2373 case 'Q':
2374 break;
2375 case 'R':
2376 if (inclass) break;
2377 if (create) root = new LineEnding();
2378 return -1;
2379 case 'S':
2380 if (create) root = has(UNICODE_CHARACTER_CLASS)
2381 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382 : new Ctype(ASCII.SPACE).complement();
2383 return -1;
2384 case 'T':
2385 case 'U':
2386 break;
2387 case 'V':
2388 if (create) root = new VertWS().complement();
2389 return -1;
2390 case 'W':
2391 if (create) root = has(UNICODE_CHARACTER_CLASS)
2392 ? new Utype(UnicodeProp.WORD).complement()
2393 : new Ctype(ASCII.WORD).complement();
2394 return -1;
2395 case 'X':
2396 if (inclass) break;
2397 if (create) {
2398 root = new XGrapheme();
2399 }
2400 return -1;
2401 case 'Y':
2402 break;
2403 case 'Z':
2404 if (inclass) break;
2405 if (create) {
2406 if (has(UNIX_LINES))
2407 root = new UnixDollar(false);
2408 else
2409 root = new Dollar(false);
2410 }
2411 return -1;
2412 case 'a':
2413 return '\007';
2414 case 'b':
2415 if (inclass) break;
2416 if (create) {
2417 if (peek() == '{') {
2418 if (skip() == 'g') {
2419 if (read() == '}') {
2420 root = new GraphemeBound();
2421 return -1;
2422 }
2423 break; // error missing trailing }
2424 }
2425 unread(); unread();
2426 }
2427 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428 }
2429 return -1;
2430 case 'c':
2431 return c();
2432 case 'd':
2433 if (create) root = has(UNICODE_CHARACTER_CLASS)
2434 ? new Utype(UnicodeProp.DIGIT)
2435 : new Ctype(ASCII.DIGIT);
2436 return -1;
2437 case 'e':
2438 return '\033';
2439 case 'f':
2440 return '\f';
2441 case 'g':
2442 break;
2443 case 'h':
2444 if (create) root = new HorizWS();
2445 return -1;
2446 case 'i':
2447 case 'j':
2448 break;
2449 case 'k':
2450 if (inclass)
2451 break;
2452 if (read() != '<')
2453 throw error("\\k is not followed by '<' for named capturing group");
2454 String name = groupname(read());
2455 if (!namedGroups().containsKey(name))
2456 throw error("(named capturing group <"+ name+"> does not exit");
2457 if (create) {
2458 if (has(CASE_INSENSITIVE))
2459 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2460 else
2461 root = new BackRef(namedGroups().get(name));
2462 }
2463 return -1;
2464 case 'l':
2465 case 'm':
2466 break;
2467 case 'n':
2468 return '\n';
2469 case 'o':
2470 case 'p':
2471 case 'q':
2472 break;
2473 case 'r':
2474 return '\r';
2475 case 's':
2476 if (create) root = has(UNICODE_CHARACTER_CLASS)
2477 ? new Utype(UnicodeProp.WHITE_SPACE)
2478 : new Ctype(ASCII.SPACE);
2479 return -1;
2480 case 't':
2481 return '\t';
2482 case 'u':
2483 return u();
2484 case 'v':
2485 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2486 // undocumented). In JDK8 '\v' is specified as a predefined
2487 // character class for all vertical whitespace characters.
2488 // So [-1, root=VertWS node] pair is returned (instead of a
2489 // single 0x0B). This breaks the range if '\v' is used as
2490 // the start or end value, such as [\v-...] or [...-\v], in
2491 // which a single definite value (0x0B) is expected. For
2492 // compatibility concern '\013'/0x0B is returned if isrange.
2493 if (isrange)
2494 return '\013';
2495 if (create) root = new VertWS();
2496 return -1;
2497 case 'w':
2498 if (create) root = has(UNICODE_CHARACTER_CLASS)
2499 ? new Utype(UnicodeProp.WORD)
2500 : new Ctype(ASCII.WORD);
2501 return -1;
2502 case 'x':
2503 return x();
2504 case 'y':
2505 break;
2506 case 'z':
2507 if (inclass) break;
2508 if (create) root = new End();
2509 return -1;
2510 default:
2511 return ch;
2512 }
2513 throw error("Illegal/unsupported escape sequence");
2514 }
2515
2516 /**
2517 * Parse a character class, and return the node that matches it.
2518 *
2519 * Consumes a ] on the way out if consume is true. Usually consume
2520 * is true except for the case of [abc&&def] where def is a separate
2521 * right hand node with "understood" brackets.
2522 */
2523 private CharProperty clazz(boolean consume) {
2524 CharProperty prev = null;
2525 CharProperty node = null;
2526 BitClass bits = new BitClass();
2527 boolean include = true;
2528 boolean firstInClass = true;
2529 int ch = next();
2530 for (;;) {
2531 switch (ch) {
2532 case '^':
2533 // Negates if first char in a class, otherwise literal
2534 if (firstInClass) {
2535 if (temp[cursor-1] != '[')
2536 break;
2537 ch = next();
2538 include = !include;
2539 continue;
2540 } else {
2541 // ^ not first in class, treat as literal
2542 break;
2543 }
2544 case '[':
2545 firstInClass = false;
2546 node = clazz(true);
2547 if (prev == null)
2548 prev = node;
2549 else
2550 prev = union(prev, node);
2551 ch = peek();
2552 continue;
2553 case '&':
2554 firstInClass = false;
2555 ch = next();
2556 if (ch == '&') {
2557 ch = next();
2558 CharProperty rightNode = null;
2559 while (ch != ']' && ch != '&') {
2560 if (ch == '[') {
2561 if (rightNode == null)
2562 rightNode = clazz(true);
2563 else
2564 rightNode = union(rightNode, clazz(true));
2565 } else { // abc&&def
2566 unread();
2567 rightNode = clazz(false);
2568 }
2569 ch = peek();
2570 }
2571 if (rightNode != null)
2572 node = rightNode;
2573 if (prev == null) {
2574 if (rightNode == null)
2575 throw error("Bad class syntax");
2576 else
2577 prev = rightNode;
2578 } else {
2579 prev = intersection(prev, node);
2580 }
2581 } else {
2582 // treat as a literal &
2583 unread();
2584 break;
2585 }
2586 continue;
2587 case 0:
2588 firstInClass = false;
2589 if (cursor >= patternLength)
2590 throw error("Unclosed character class");
2591 break;
2592 case ']':
2593 firstInClass = false;
2594 if (prev != null) {
2595 if (consume)
2596 next();
2597 return prev;
2598 }
2599 break;
2600 default:
2601 firstInClass = false;
2602 break;
2603 }
2604 node = range(bits);
2605 if (include) {
2606 if (prev == null) {
2607 prev = node;
2608 } else {
2609 if (prev != node)
2610 prev = union(prev, node);
2611 }
2612 } else {
2613 if (prev == null) {
2614 prev = node.complement();
2615 } else {
2616 if (prev != node)
2617 prev = setDifference(prev, node);
2618 }
2619 }
2620 ch = peek();
2621 }
2622 }
2623
2624 private CharProperty bitsOrSingle(BitClass bits, int ch) {
2625 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2626 Use "single" node instead of bits when dealing with unicode
2627 case folding for codepoints listed below.
2628 (1)Uppercase out of range: u+00ff, u+00b5
2629 toUpperCase(u+00ff) -> u+0178
2630 toUpperCase(u+00b5) -> u+039c
2631 (2)LatinSmallLetterLongS u+17f
2632 toUpperCase(u+017f) -> u+0053
2633 (3)LatinSmallLetterDotlessI u+131
2634 toUpperCase(u+0131) -> u+0049
2635 (4)LatinCapitalLetterIWithDotAbove u+0130
2636 toLowerCase(u+0130) -> u+0069
2637 (5)KelvinSign u+212a
2638 toLowerCase(u+212a) ==> u+006B
2639 (6)AngstromSign u+212b
2640 toLowerCase(u+212b) ==> u+00e5
2641 */
2642 int d;
2643 if (ch < 256 &&
2644 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2645 (ch == 0xff || ch == 0xb5 ||
2646 ch == 0x49 || ch == 0x69 || //I and i
2647 ch == 0x53 || ch == 0x73 || //S and s
2648 ch == 0x4b || ch == 0x6b || //K and k
2649 ch == 0xc5 || ch == 0xe5))) //A+ring
2650 return bits.add(ch, flags());
2651 return newSingle(ch);
2652 }
2653
2654 /**
2655 * Parse a single character or a character range in a character class
2656 * and return its representative node.
2657 */
2658 private CharProperty range(BitClass bits) {
2659 int ch = peek();
2660 if (ch == '\\') {
2661 ch = nextEscaped();
2662 if (ch == 'p' || ch == 'P') { // A property
2663 boolean comp = (ch == 'P');
2664 boolean oneLetter = true;
2665 // Consume { if present
2666 ch = next();
2667 if (ch != '{')
2668 unread();
2669 else
2670 oneLetter = false;
2671 return family(oneLetter, comp);
2672 } else { // ordinary escape
2673 boolean isrange = temp[cursor+1] == '-';
2674 unread();
2675 ch = escape(true, true, isrange);
2676 if (ch == -1)
2677 return (CharProperty) root;
2678 }
2679 } else {
2680 next();
2681 }
2682 if (ch >= 0) {
2683 if (peek() == '-') {
2684 int endRange = temp[cursor+1];
2685 if (endRange == '[') {
2686 return bitsOrSingle(bits, ch);
2687 }
2688 if (endRange != ']') {
2689 next();
2690 int m = peek();
2691 if (m == '\\') {
2692 m = escape(true, false, true);
2693 } else {
2694 next();
2695 }
2696 if (m < ch) {
2697 throw error("Illegal character range");
2698 }
2699 if (has(CASE_INSENSITIVE))
2700 return caseInsensitiveRangeFor(ch, m);
2701 else
2702 return rangeFor(ch, m);
2703 }
2704 }
2705 return bitsOrSingle(bits, ch);
2706 }
2707 throw error("Unexpected character '"+((char)ch)+"'");
2708 }
2709
2710 /**
2711 * Parses a Unicode character family and returns its representative node.
2712 */
2713 private CharProperty family(boolean singleLetter,
2714 boolean maybeComplement)
2715 {
2716 next();
2717 String name;
2718 CharProperty node = null;
2719
2720 if (singleLetter) {
2721 int c = temp[cursor];
2722 if (!Character.isSupplementaryCodePoint(c)) {
2723 name = String.valueOf((char)c);
2724 } else {
2725 name = new String(temp, cursor, 1);
2726 }
2727 read();
2728 } else {
2729 int i = cursor;
2730 mark('}');
2731 while(read() != '}') {
2732 }
2733 mark('\000');
2734 int j = cursor;
2735 if (j > patternLength)
2736 throw error("Unclosed character family");
2737 if (i + 1 >= j)
2738 throw error("Empty character family");
2739 name = new String(temp, i, j-i-1);
2740 }
2741
2742 int i = name.indexOf('=');
2743 if (i != -1) {
2744 // property construct \p{name=value}
2745 String value = name.substring(i + 1);
2746 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2747 switch (name) {
2748 case "sc":
2749 case "script":
2750 node = unicodeScriptPropertyFor(value);
2751 break;
2752 case "blk":
2753 case "block":
2754 node = unicodeBlockPropertyFor(value);
2755 break;
2756 case "gc":
2757 case "general_category":
2758 node = charPropertyNodeFor(value);
2759 break;
2760 default:
2761 throw error("Unknown Unicode property {name=<" + name + ">, "
2762 + "value=<" + value + ">}");
2763 }
2764 } else {
2765 if (name.startsWith("In")) {
2766 // \p{inBlockName}
2767 node = unicodeBlockPropertyFor(name.substring(2));
2768 } else if (name.startsWith("Is")) {
2769 // \p{isGeneralCategory} and \p{isScriptName}
2770 name = name.substring(2);
2771 UnicodeProp uprop = UnicodeProp.forName(name);
2772 if (uprop != null)
2773 node = new Utype(uprop);
2774 if (node == null)
2775 node = CharPropertyNames.charPropertyFor(name);
2776 if (node == null)
2777 node = unicodeScriptPropertyFor(name);
2778 } else {
2779 if (has(UNICODE_CHARACTER_CLASS)) {
2780 UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
2781 if (uprop != null)
2782 node = new Utype(uprop);
2783 }
2784 if (node == null)
2785 node = charPropertyNodeFor(name);
2786 }
2787 }
2788 if (maybeComplement) {
2789 if (node instanceof Category || node instanceof Block)
2790 hasSupplementary = true;
2791 node = node.complement();
2792 }
2793 return node;
2794 }
2795
2796
2797 /**
2798 * Returns a CharProperty matching all characters belong to
2799 * a UnicodeScript.
2800 */
2801 private CharProperty unicodeScriptPropertyFor(String name) {
2802 final Character.UnicodeScript script;
2803 try {
2804 script = Character.UnicodeScript.forName(name);
2805 } catch (IllegalArgumentException iae) {
2806 throw error("Unknown character script name {" + name + "}");
2807 }
2808 return new Script(script);
2809 }
2810
2811 /**
2812 * Returns a CharProperty matching all characters in a UnicodeBlock.
2813 */
2814 private CharProperty unicodeBlockPropertyFor(String name) {
2815 final Character.UnicodeBlock block;
2816 try {
2817 block = Character.UnicodeBlock.forName(name);
2818 } catch (IllegalArgumentException iae) {
2819 throw error("Unknown character block name {" + name + "}");
2820 }
2821 return new Block(block);
2822 }
2823
2824 /**
2825 * Returns a CharProperty matching all characters in a named property.
2826 */
2827 private CharProperty charPropertyNodeFor(String name) {
2828 CharProperty p = CharPropertyNames.charPropertyFor(name);
2829 if (p == null)
2830 throw error("Unknown character property name {" + name + "}");
2831 return p;
2832 }
2833
2834 /**
2835 * Parses and returns the name of a "named capturing group", the trailing
2836 * ">" is consumed after parsing.
2837 */
2838 private String groupname(int ch) {
2839 StringBuilder sb = new StringBuilder();
2840 sb.append(Character.toChars(ch));
2841 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2842 ASCII.isDigit(ch)) {
2843 sb.append(Character.toChars(ch));
2844 }
2845 if (sb.length() == 0)
2846 throw error("named capturing group has 0 length name");
2847 if (ch != '>')
2848 throw error("named capturing group is missing trailing '>'");
2849 return sb.toString();
2850 }
2851
2867 case ':': // (?:xxx) pure group
2868 head = createGroup(true);
2869 tail = root;
2870 head.next = expr(tail);
2871 break;
2872 case '=': // (?=xxx) and (?!xxx) lookahead
2873 case '!':
2874 head = createGroup(true);
2875 tail = root;
2876 head.next = expr(tail);
2877 if (ch == '=') {
2878 head = tail = new Pos(head);
2879 } else {
2880 head = tail = new Neg(head);
2881 }
2882 break;
2883 case '>': // (?>xxx) independent group
2884 head = createGroup(true);
2885 tail = root;
2886 head.next = expr(tail);
2887 head = tail = new Ques(head, INDEPENDENT);
2888 break;
2889 case '<': // (?<xxx) look behind
2890 ch = read();
2891 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2892 // named captured group
2893 String name = groupname(ch);
2894 if (namedGroups().containsKey(name))
2895 throw error("Named capturing group <" + name
2896 + "> is already defined");
2897 capturingGroup = true;
2898 head = createGroup(false);
2899 tail = root;
2900 namedGroups().put(name, capturingGroupCount-1);
2901 head.next = expr(tail);
2902 break;
2903 }
2904 int start = cursor;
2905 head = createGroup(true);
2906 tail = root;
2907 head.next = expr(tail);
2953 tail = root;
2954 head.next = expr(tail);
2955 }
2956
2957 accept(')', "Unclosed group");
2958 flags = save;
2959
2960 // Check for quantifiers
2961 Node node = closure(head);
2962 if (node == head) { // No closure
2963 root = tail;
2964 return node; // Dual return
2965 }
2966 if (head == tail) { // Zero length assertion
2967 root = node;
2968 return node; // Dual return
2969 }
2970
2971 if (node instanceof Ques) {
2972 Ques ques = (Ques) node;
2973 if (ques.type == POSSESSIVE) {
2974 root = node;
2975 return node;
2976 }
2977 tail.next = new BranchConn();
2978 tail = tail.next;
2979 if (ques.type == GREEDY) {
2980 head = new Branch(head, null, tail);
2981 } else { // Reluctant quantifier
2982 head = new Branch(null, head, tail);
2983 }
2984 root = tail;
2985 return head;
2986 } else if (node instanceof Curly) {
2987 Curly curly = (Curly) node;
2988 if (curly.type == POSSESSIVE) {
2989 root = node;
2990 return node;
2991 }
2992 // Discover if the group is deterministic
2993 TreeInfo info = new TreeInfo();
2994 if (head.study(info)) { // Deterministic
2995 GroupTail temp = (GroupTail) tail;
2996 head = root = new GroupCurly(head.next, curly.cmin,
2997 curly.cmax, curly.type,
2998 ((GroupTail)tail).localIndex,
2999 ((GroupTail)tail).groupIndex,
3000 capturingGroup);
3001 return head;
3002 } else { // Non-deterministic
3003 int temp = ((GroupHead) head).localIndex;
3004 Loop loop;
3005 if (curly.type == GREEDY)
3006 loop = new Loop(this.localCount, temp);
3007 else // Reluctant Curly
3008 loop = new LazyLoop(this.localCount, temp);
3009 Prolog prolog = new Prolog(loop);
3010 this.localCount += 1;
3011 loop.cmin = curly.cmin;
3012 loop.cmax = curly.cmax;
3013 loop.body = head;
3014 tail.next = loop;
3015 root = loop;
3016 return prolog; // Dual return
3017 }
3018 }
3019 throw error("Internal logic error");
3020 }
3021
3022 /**
3023 * Create group head and tail nodes using double return. If the group is
3024 * created with anonymous true then it is a pure group and should not
3025 * affect group counting.
3026 */
3027 private Node createGroup(boolean anonymous) {
3028 int localIndex = localCount++;
3029 int groupIndex = 0;
3030 if (!anonymous)
3031 groupIndex = capturingGroupCount++;
3032 GroupHead head = new GroupHead(localIndex);
3033 root = new GroupTail(localIndex, groupIndex);
3034 if (!anonymous && groupIndex < 10)
3035 groupNodes[groupIndex] = head;
3036 return head;
3037 }
3038
3039 @SuppressWarnings("fallthrough")
3040 /**
3041 * Parses inlined match flags and set them appropriately.
3042 */
3043 private void addFlag() {
3044 int ch = peek();
3045 for (;;) {
3046 switch (ch) {
3047 case 'i':
3048 flags |= CASE_INSENSITIVE;
3049 break;
3050 case 'm':
3051 flags |= MULTILINE;
3052 break;
3053 case 's':
3102 case 'u':
3103 flags &= ~UNICODE_CASE;
3104 break;
3105 case 'c':
3106 flags &= ~CANON_EQ;
3107 break;
3108 case 'x':
3109 flags &= ~COMMENTS;
3110 break;
3111 case 'U':
3112 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3113 default:
3114 return;
3115 }
3116 ch = next();
3117 }
3118 }
3119
3120 static final int MAX_REPS = 0x7FFFFFFF;
3121
3122 static final int GREEDY = 0;
3123
3124 static final int LAZY = 1;
3125
3126 static final int POSSESSIVE = 2;
3127
3128 static final int INDEPENDENT = 3;
3129
3130 /**
3131 * Processes repetition. If the next character peeked is a quantifier
3132 * then new nodes must be appended to handle the repetition.
3133 * Prev could be a single or a group, so it could be a chain of nodes.
3134 */
3135 private Node closure(Node prev) {
3136 Node atom;
3137 int ch = peek();
3138 switch (ch) {
3139 case '?':
3140 ch = next();
3141 if (ch == '?') {
3142 next();
3143 return new Ques(prev, LAZY);
3144 } else if (ch == '+') {
3145 next();
3146 return new Ques(prev, POSSESSIVE);
3147 }
3148 return new Ques(prev, GREEDY);
3149 case '*':
3150 ch = next();
3151 if (ch == '?') {
3152 next();
3153 return new Curly(prev, 0, MAX_REPS, LAZY);
3154 } else if (ch == '+') {
3155 next();
3156 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
3157 }
3158 return new Curly(prev, 0, MAX_REPS, GREEDY);
3159 case '+':
3160 ch = next();
3161 if (ch == '?') {
3162 next();
3163 return new Curly(prev, 1, MAX_REPS, LAZY);
3164 } else if (ch == '+') {
3165 next();
3166 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
3167 }
3168 return new Curly(prev, 1, MAX_REPS, GREEDY);
3169 case '{':
3170 ch = temp[cursor+1];
3171 if (ASCII.isDigit(ch)) {
3172 skip();
3173 int cmin = 0;
3174 do {
3175 cmin = cmin * 10 + (ch - '0');
3176 } while (ASCII.isDigit(ch = read()));
3177 int cmax = cmin;
3178 if (ch == ',') {
3179 ch = read();
3180 cmax = MAX_REPS;
3181 if (ch != '}') {
3182 cmax = 0;
3183 while (ASCII.isDigit(ch)) {
3184 cmax = cmax * 10 + (ch - '0');
3185 ch = read();
3186 }
3187 }
3188 }
3189 if (ch != '}')
3190 throw error("Unclosed counted closure");
3191 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3192 throw error("Illegal repetition range");
3193 Curly curly;
3194 ch = peek();
3195 if (ch == '?') {
3196 next();
3197 curly = new Curly(prev, cmin, cmax, LAZY);
3198 } else if (ch == '+') {
3199 next();
3200 curly = new Curly(prev, cmin, cmax, POSSESSIVE);
3201 } else {
3202 curly = new Curly(prev, cmin, cmax, GREEDY);
3203 }
3204 return curly;
3205 } else {
3206 throw error("Illegal repetition");
3207 }
3208 default:
3209 return prev;
3210 }
3211 }
3212
3213 /**
3214 * Utility method for parsing control escape sequences.
3215 */
3216 private int c() {
3217 if (cursor < patternLength) {
3218 return read() ^ 64;
3219 }
3220 throw error("Illegal control escape sequence");
3221 }
3222
3359
3360 private static final int countCodePoints(CharSequence seq) {
3361 int length = seq.length();
3362 int n = 0;
3363 for (int i = 0; i < length; ) {
3364 n++;
3365 if (Character.isHighSurrogate(seq.charAt(i++))) {
3366 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3367 i++;
3368 }
3369 }
3370 }
3371 return n;
3372 }
3373
3374 /**
3375 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3376 * never matches values above Latin-1, and a complemented BitClass always
3377 * matches values above Latin-1.
3378 */
3379 private static final class BitClass extends BmpCharProperty {
3380 final boolean[] bits;
3381 BitClass() { bits = new boolean[256]; }
3382 private BitClass(boolean[] bits) { this.bits = bits; }
3383 BitClass add(int c, int flags) {
3384 assert c >= 0 && c <= 255;
3385 if ((flags & CASE_INSENSITIVE) != 0) {
3386 if (ASCII.isAscii(c)) {
3387 bits[ASCII.toUpper(c)] = true;
3388 bits[ASCII.toLower(c)] = true;
3389 } else if ((flags & UNICODE_CASE) != 0) {
3390 bits[Character.toLowerCase(c)] = true;
3391 bits[Character.toUpperCase(c)] = true;
3392 }
3393 }
3394 bits[c] = true;
3395 return this;
3396 }
3397 boolean isSatisfiedBy(int ch) {
3398 return ch < 256 && bits[ch];
3399 }
3400 }
3401
3402 /**
3403 * Returns a suitably optimized, single character matcher.
3404 */
3405 private CharProperty newSingle(final int ch) {
3406 if (has(CASE_INSENSITIVE)) {
3407 int lower, upper;
3408 if (has(UNICODE_CASE)) {
3409 upper = Character.toUpperCase(ch);
3410 lower = Character.toLowerCase(upper);
3411 if (upper != lower)
3412 return new SingleU(lower);
3413 } else if (ASCII.isAscii(ch)) {
3414 lower = ASCII.toLower(ch);
3415 upper = ASCII.toUpper(ch);
3416 if (lower != upper)
3417 return new SingleI(lower, upper);
3418 }
3419 }
3420 if (isSupplementary(ch))
3421 return new SingleS(ch); // Match a given Unicode character
3422 return new Single(ch); // Match a given BMP character
3423 }
3424
3425 /**
3426 * Utility method for creating a string slice matcher.
3427 */
3428 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3429 int[] tmp = new int[count];
3430 if (has(CASE_INSENSITIVE)) {
3431 if (has(UNICODE_CASE)) {
3432 for (int i = 0; i < count; i++) {
3433 tmp[i] = Character.toLowerCase(
3434 Character.toUpperCase(buf[i]));
3435 }
3436 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3437 }
3438 for (int i = 0; i < count; i++) {
3439 tmp[i] = ASCII.toLower(buf[i]);
3440 }
3441 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3442 }
3810 if (i < matcher.to && seq.charAt(i) == 0x0A)
3811 i++;
3812 return next.match(matcher, i, seq);
3813 }
3814 } else {
3815 matcher.hitEnd = true;
3816 }
3817 return false;
3818 }
3819 boolean study(TreeInfo info) {
3820 info.minLength++;
3821 info.maxLength += 2;
3822 return next.study(info);
3823 }
3824 }
3825
3826 /**
3827 * Abstract node class to match one character satisfying some
3828 * boolean property.
3829 */
3830 private abstract static class CharProperty extends Node {
3831 abstract boolean isSatisfiedBy(int ch);
3832 CharProperty complement() {
3833 return new CharProperty() {
3834 boolean isSatisfiedBy(int ch) {
3835 return ! CharProperty.this.isSatisfiedBy(ch);}};
3836 }
3837 boolean match(Matcher matcher, int i, CharSequence seq) {
3838 if (i < matcher.to) {
3839 int ch = Character.codePointAt(seq, i);
3840 return isSatisfiedBy(ch)
3841 && next.match(matcher, i+Character.charCount(ch), seq);
3842 } else {
3843 matcher.hitEnd = true;
3844 return false;
3845 }
3846 }
3847 boolean study(TreeInfo info) {
3848 info.minLength++;
3849 info.maxLength++;
3850 return next.study(info);
3851 }
3852 }
3853
3854 /**
3855 * Optimized version of CharProperty that works only for
3856 * properties never satisfied by Supplementary characters.
3857 */
3858 private abstract static class BmpCharProperty extends CharProperty {
3859 boolean match(Matcher matcher, int i, CharSequence seq) {
3860 if (i < matcher.to) {
3861 return isSatisfiedBy(seq.charAt(i))
3862 && next.match(matcher, i+1, seq);
3863 } else {
3864 matcher.hitEnd = true;
3865 return false;
3866 }
3867 }
3868 }
3869
3870 /**
3871 * Node class that matches a Supplementary Unicode character
3872 */
3873 static final class SingleS extends CharProperty {
3874 final int c;
3875 SingleS(int c) { this.c = c; }
3876 boolean isSatisfiedBy(int ch) {
3877 return ch == c;
3878 }
3879 }
3880
3881 /**
3882 * Optimization -- matches a given BMP character
3883 */
3884 static final class Single extends BmpCharProperty {
3885 final int c;
3886 Single(int c) { this.c = c; }
3887 boolean isSatisfiedBy(int ch) {
3888 return ch == c;
3889 }
3890 }
3891
3892 /**
3893 * Case insensitive matches a given BMP character
3894 */
3895 static final class SingleI extends BmpCharProperty {
3896 final int lower;
3897 final int upper;
3898 SingleI(int lower, int upper) {
3899 this.lower = lower;
3900 this.upper = upper;
3901 }
3902 boolean isSatisfiedBy(int ch) {
3903 return ch == lower || ch == upper;
3904 }
3905 }
3906
3907 /**
3908 * Unicode case insensitive matches a given Unicode character
3909 */
3910 static final class SingleU extends CharProperty {
3911 final int lower;
3912 SingleU(int lower) {
3913 this.lower = lower;
3914 }
3915 boolean isSatisfiedBy(int ch) {
3916 return lower == ch ||
3917 lower == Character.toLowerCase(Character.toUpperCase(ch));
3918 }
3919 }
3920
3921 /**
3922 * Node class that matches a Unicode block.
3923 */
3924 static final class Block extends CharProperty {
3925 final Character.UnicodeBlock block;
3926 Block(Character.UnicodeBlock block) {
3927 this.block = block;
3928 }
3929 boolean isSatisfiedBy(int ch) {
3930 return block == Character.UnicodeBlock.of(ch);
3931 }
3932 }
3933
3934 /**
3935 * Node class that matches a Unicode script
3936 */
3937 static final class Script extends CharProperty {
3938 final Character.UnicodeScript script;
3939 Script(Character.UnicodeScript script) {
3940 this.script = script;
3941 }
3942 boolean isSatisfiedBy(int ch) {
3943 return script == Character.UnicodeScript.of(ch);
3944 }
3945 }
3946
3947 /**
3948 * Node class that matches a Unicode category.
3949 */
3950 static final class Category extends CharProperty {
3951 final int typeMask;
3952 Category(int typeMask) { this.typeMask = typeMask; }
3953 boolean isSatisfiedBy(int ch) {
3954 return (typeMask & (1 << Character.getType(ch))) != 0;
3955 }
3956 }
3957
3958 /**
3959 * Node class that matches a Unicode "type"
3960 */
3961 static final class Utype extends CharProperty {
3962 final UnicodeProp uprop;
3963 Utype(UnicodeProp uprop) { this.uprop = uprop; }
3964 boolean isSatisfiedBy(int ch) {
3965 return uprop.is(ch);
3966 }
3967 }
3968
3969 /**
3970 * Node class that matches a POSIX type.
3971 */
3972 static final class Ctype extends BmpCharProperty {
3973 final int ctype;
3974 Ctype(int ctype) { this.ctype = ctype; }
3975 boolean isSatisfiedBy(int ch) {
3976 return ch < 128 && ASCII.isType(ch, ctype);
3977 }
3978 }
3979
3980 /**
3981 * Node class that matches a Perl vertical whitespace
3982 */
3983 static final class VertWS extends BmpCharProperty {
3984 boolean isSatisfiedBy(int cp) {
3985 return (cp >= 0x0A && cp <= 0x0D) ||
3986 cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987 }
3988 }
3989
3990 /**
3991 * Node class that matches a Perl horizontal whitespace
3992 */
3993 static final class HorizWS extends BmpCharProperty {
3994 boolean isSatisfiedBy(int cp) {
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996 cp == 0x1680 || cp == 0x180e ||
3997 cp >= 0x2000 && cp <= 0x200a ||
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999 }
4000 }
4001
4002 /**
4003 * Node class that matches an unicode extended grapheme cluster
4004 */
4005 static class XGrapheme extends Node {
4006 boolean match(Matcher matcher, int i, CharSequence seq) {
4007 if (i < matcher.to) {
4008 int ch0 = Character.codePointAt(seq, i);
4009 i += Character.charCount(ch0);
4010 while (i < matcher.to) {
4011 int ch1 = Character.codePointAt(seq, i);
4012 if (Grapheme.isBoundary(ch0, ch1))
4013 break;
4014 ch0 = ch1;
4015 i += Character.charCount(ch1);
4016 }
4017 return next.match(matcher, i, seq);
4018 }
4200 return false;
4201 }
4202 }
4203 return next.match(matcher, x, seq);
4204 }
4205 }
4206
4207 /**
4208 * Node class for a case insensitive sequence of literal characters.
4209 * Uses unicode case folding.
4210 */
4211 static final class SliceUS extends SliceIS {
4212 SliceUS(int[] buf) {
4213 super(buf);
4214 }
4215 int toLower(int c) {
4216 return Character.toLowerCase(Character.toUpperCase(c));
4217 }
4218 }
4219
4220 private static boolean inRange(int lower, int ch, int upper) {
4221 return lower <= ch && ch <= upper;
4222 }
4223
4224 /**
4225 * Returns node for matching characters within an explicit value range.
4226 */
4227 private static CharProperty rangeFor(final int lower,
4228 final int upper) {
4229 return new CharProperty() {
4230 boolean isSatisfiedBy(int ch) {
4231 return inRange(lower, ch, upper);}};
4232 }
4233
4234 /**
4235 * Returns node for matching characters within an explicit value
4236 * range in a case insensitive manner.
4237 */
4238 private CharProperty caseInsensitiveRangeFor(final int lower,
4239 final int upper) {
4240 if (has(UNICODE_CASE))
4241 return new CharProperty() {
4242 boolean isSatisfiedBy(int ch) {
4243 if (inRange(lower, ch, upper))
4244 return true;
4245 int up = Character.toUpperCase(ch);
4246 return inRange(lower, up, upper) ||
4247 inRange(lower, Character.toLowerCase(up), upper);}};
4248 return new CharProperty() {
4249 boolean isSatisfiedBy(int ch) {
4250 return inRange(lower, ch, upper) ||
4251 ASCII.isAscii(ch) &&
4252 (inRange(lower, ASCII.toUpper(ch), upper) ||
4253 inRange(lower, ASCII.toLower(ch), upper));
4254 }};
4255 }
4256
4257 /**
4258 * Implements the Unicode category ALL and the dot metacharacter when
4259 * in dotall mode.
4260 */
4261 static final class All extends CharProperty {
4262 boolean isSatisfiedBy(int ch) {
4263 return true;
4264 }
4265 }
4266
4267 /**
4268 * Node class for the dot metacharacter when dotall is not enabled.
4269 */
4270 static final class Dot extends CharProperty {
4271 boolean isSatisfiedBy(int ch) {
4272 return (ch != '\n' && ch != '\r'
4273 && (ch|1) != '\u2029'
4274 && ch != '\u0085');
4275 }
4276 }
4277
4278 /**
4279 * Node class for the dot metacharacter when dotall is not enabled
4280 * but UNIX_LINES is enabled.
4281 */
4282 static final class UnixDot extends CharProperty {
4283 boolean isSatisfiedBy(int ch) {
4284 return ch != '\n';
4285 }
4286 }
4287
4288 /**
4289 * The 0 or 1 quantifier. This one class implements all three types.
4290 */
4291 static final class Ques extends Node {
4292 Node atom;
4293 int type;
4294 Ques(Node node, int type) {
4295 this.atom = node;
4296 this.type = type;
4297 }
4298 boolean match(Matcher matcher, int i, CharSequence seq) {
4299 switch (type) {
4300 case GREEDY:
4301 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4302 || next.match(matcher, i, seq);
4303 case LAZY:
4304 return next.match(matcher, i, seq)
4305 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4306 case POSSESSIVE:
4307 if (atom.match(matcher, i, seq)) i = matcher.last;
4308 return next.match(matcher, i, seq);
4309 default:
4310 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4311 }
4312 }
4313 boolean study(TreeInfo info) {
4314 if (type != INDEPENDENT) {
4315 int minL = info.minLength;
4316 atom.study(info);
4317 info.minLength = minL;
4318 info.deterministic = false;
4319 return next.study(info);
4320 } else {
4321 atom.study(info);
4322 return next.study(info);
4323 }
4324 }
4325 }
4326
4327 /**
4328 * Handles the curly-brace style repetition with a specified minimum and
4329 * maximum occurrences. The * quantifier is handled as a special case.
4330 * This class handles the three types.
4331 */
4332 static final class Curly extends Node {
4333 Node atom;
4334 int type;
4335 int cmin;
4336 int cmax;
4337
4338 Curly(Node node, int cmin, int cmax, int type) {
4339 this.atom = node;
4340 this.type = type;
4341 this.cmin = cmin;
4342 this.cmax = cmax;
4343 }
4344 boolean match(Matcher matcher, int i, CharSequence seq) {
4345 int j;
4346 for (j = 0; j < cmin; j++) {
4347 if (atom.match(matcher, i, seq)) {
4348 i = matcher.last;
4349 continue;
4350 }
4351 return false;
4352 }
4353 if (type == GREEDY)
4354 return match0(matcher, i, j, seq);
4355 else if (type == LAZY)
4356 return match1(matcher, i, j, seq);
4357 else
4358 return match2(matcher, i, j, seq);
4359 }
4360 // Greedy match.
4361 // i is the index to start matching at
4362 // j is the number of atoms that have matched
4363 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4364 if (j >= cmax) {
4365 // We have matched the maximum... continue with the rest of
4366 // the regular expression
4367 return next.match(matcher, i, seq);
4368 }
4369 int backLimit = j;
4370 while (atom.match(matcher, i, seq)) {
4371 // k is the length of this match
4372 int k = matcher.last - i;
4373 if (k == 0) // Zero length match
4374 break;
4375 // Move up index and number matched
4457 }
4458
4459 if (info.deterministic && cmin == cmax)
4460 info.deterministic = detm;
4461 else
4462 info.deterministic = false;
4463 return next.study(info);
4464 }
4465 }
4466
4467 /**
4468 * Handles the curly-brace style repetition with a specified minimum and
4469 * maximum occurrences in deterministic cases. This is an iterative
4470 * optimization over the Prolog and Loop system which would handle this
4471 * in a recursive way. The * quantifier is handled as a special case.
4472 * If capture is true then this class saves group settings and ensures
4473 * that groups are unset when backing off of a group match.
4474 */
4475 static final class GroupCurly extends Node {
4476 Node atom;
4477 int type;
4478 int cmin;
4479 int cmax;
4480 int localIndex;
4481 int groupIndex;
4482 boolean capture;
4483
4484 GroupCurly(Node node, int cmin, int cmax, int type, int local,
4485 int group, boolean capture) {
4486 this.atom = node;
4487 this.type = type;
4488 this.cmin = cmin;
4489 this.cmax = cmax;
4490 this.localIndex = local;
4491 this.groupIndex = group;
4492 this.capture = capture;
4493 }
4494 boolean match(Matcher matcher, int i, CharSequence seq) {
4495 int[] groups = matcher.groups;
4496 int[] locals = matcher.locals;
4497 int save0 = locals[localIndex];
4498 int save1 = 0;
4499 int save2 = 0;
4500
4501 if (capture) {
4502 save1 = groups[groupIndex];
4503 save2 = groups[groupIndex+1];
4504 }
4505
4506 // Notify GroupTail there is no need to setup group info
4507 // because it will be set here
4508 locals[localIndex] = -1;
4509
4510 boolean ret = true;
4511 for (int j = 0; j < cmin; j++) {
4512 if (atom.match(matcher, i, seq)) {
4513 if (capture) {
4514 groups[groupIndex] = i;
4515 groups[groupIndex+1] = matcher.last;
4516 }
4517 i = matcher.last;
4518 } else {
4519 ret = false;
4520 break;
4521 }
4522 }
4523 if (ret) {
4524 if (type == GREEDY) {
4525 ret = match0(matcher, i, cmin, seq);
4526 } else if (type == LAZY) {
4527 ret = match1(matcher, i, cmin, seq);
4528 } else {
4529 ret = match2(matcher, i, cmin, seq);
4530 }
4531 }
4532 if (!ret) {
4533 locals[localIndex] = save0;
4534 if (capture) {
4535 groups[groupIndex] = save1;
4536 groups[groupIndex+1] = save2;
4537 }
4538 }
4539 return ret;
4540 }
4541 // Aggressive group match
4542 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4543 // don't back off passing the starting "j"
4544 int min = j;
4545 int[] groups = matcher.groups;
4546 int save0 = 0;
4752
4753 info.minLength += minL;
4754 info.maxLength += maxL;
4755 info.maxValid &= maxV;
4756 info.deterministic = false;
4757 return false;
4758 }
4759 }
4760
4761 /**
4762 * The GroupHead saves the location where the group begins in the locals
4763 * and restores them when the match is done.
4764 *
4765 * The matchRef is used when a reference to this group is accessed later
4766 * in the expression. The locals will have a negative value in them to
4767 * indicate that we do not want to unset the group if the reference
4768 * doesn't match.
4769 */
4770 static final class GroupHead extends Node {
4771 int localIndex;
4772 GroupHead(int localCount) {
4773 localIndex = localCount;
4774 }
4775 boolean match(Matcher matcher, int i, CharSequence seq) {
4776 int save = matcher.locals[localIndex];
4777 matcher.locals[localIndex] = i;
4778 boolean ret = next.match(matcher, i, seq);
4779 matcher.locals[localIndex] = save;
4780 return ret;
4781 }
4782 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4783 int save = matcher.locals[localIndex];
4784 matcher.locals[localIndex] = ~i; // HACK
4785 boolean ret = next.match(matcher, i, seq);
4786 matcher.locals[localIndex] = save;
4787 return ret;
4788 }
4789 }
4790
4791 /**
5345 int startIndex = (!matcher.transparentBounds) ?
5346 matcher.from : 0;
5347 int from = Math.max(i - rmaxChars, startIndex);
5348 matcher.lookbehindTo = i;
5349 // Relax transparent region boundaries for lookbehind
5350 if (matcher.transparentBounds)
5351 matcher.from = 0;
5352 for (int j = i - rminChars;
5353 !conditionMatched && j >= from;
5354 j -= j>from ? countChars(seq, j, -1) : 1) {
5355 conditionMatched = cond.match(matcher, j, seq);
5356 }
5357 //Reinstate region boundaries
5358 matcher.from = savedFrom;
5359 matcher.lookbehindTo = savedLBT;
5360 return !conditionMatched && next.match(matcher, i, seq);
5361 }
5362 }
5363
5364 /**
5365 * Returns the set union of two CharProperty nodes.
5366 */
5367 private static CharProperty union(final CharProperty lhs,
5368 final CharProperty rhs) {
5369 return new CharProperty() {
5370 boolean isSatisfiedBy(int ch) {
5371 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
5372 }
5373
5374 /**
5375 * Returns the set intersection of two CharProperty nodes.
5376 */
5377 private static CharProperty intersection(final CharProperty lhs,
5378 final CharProperty rhs) {
5379 return new CharProperty() {
5380 boolean isSatisfiedBy(int ch) {
5381 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
5382 }
5383
5384 /**
5385 * Returns the set difference of two CharProperty nodes.
5386 */
5387 private static CharProperty setDifference(final CharProperty lhs,
5388 final CharProperty rhs) {
5389 return new CharProperty() {
5390 boolean isSatisfiedBy(int ch) {
5391 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
5392 }
5393
5394 /**
5395 * Handles word boundaries. Includes a field to allow this one class to
5396 * deal with the different types of word boundaries we can match. The word
5397 * characters include underscores, letters, and digits. Non spacing marks
5398 * can are also part of a word if they have a base character, otherwise
5399 * they are ignored for purposes of finding word boundaries.
5400 */
5401 static final class Bound extends Node {
5402 static int LEFT = 0x1;
5403 static int RIGHT= 0x2;
5404 static int BOTH = 0x3;
5405 static int NONE = 0x4;
5406 int type;
5407 boolean useUWORD;
5408 Bound(int n, boolean useUWORD) {
5409 type = n;
5410 this.useUWORD = useUWORD;
5411 }
5412
5413 boolean isWord(int ch) {
5414 return useUWORD ? UnicodeProp.WORD.is(ch)
5415 : (ch == '_' || Character.isLetterOrDigit(ch));
5416 }
5417
5418 int check(Matcher matcher, int i, CharSequence seq) {
5419 int ch;
5420 boolean left = false;
5421 int startIndex = matcher.from;
5422 int endIndex = matcher.to;
5423 if (matcher.transparentBounds) {
5424 startIndex = 0;
5425 endIndex = matcher.getTextLength();
5426 }
5427 if (i > startIndex) {
5428 ch = Character.codePointBefore(seq, i);
5429 left = (isWord(ch) ||
5430 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5431 && hasBaseCharacter(matcher, i-1, seq)));
5432 }
5433 boolean right = false;
5434 if (i < endIndex) {
5640 i += countChars(seq, i, n);
5641 continue NEXT;
5642 }
5643 }
5644 // Entire pattern matched starting at i
5645 matcher.first = i;
5646 boolean ret = next.match(matcher, i + lengthInChars, seq);
5647 if (ret) {
5648 matcher.first = i;
5649 matcher.groups[0] = matcher.first;
5650 matcher.groups[1] = matcher.last;
5651 return true;
5652 }
5653 i += countChars(seq, i, 1);
5654 }
5655 matcher.hitEnd = true;
5656 return false;
5657 }
5658 }
5659
5660 ///////////////////////////////////////////////////////////////////////////////
5661 ///////////////////////////////////////////////////////////////////////////////
5662
5663 /**
5664 * This must be the very first initializer.
5665 */
5666 static Node accept = new Node();
5667
5668 static Node lastAccept = new LastNode();
5669
5670 private static class CharPropertyNames {
5671
5672 static CharProperty charPropertyFor(String name) {
5673 CharPropertyFactory m = map.get(name);
5674 return m == null ? null : m.make();
5675 }
5676
5677 private abstract static class CharPropertyFactory {
5678 abstract CharProperty make();
5679 }
5680
5681 private static void defCategory(String name,
5682 final int typeMask) {
5683 map.put(name, new CharPropertyFactory() {
5684 CharProperty make() { return new Category(typeMask);}});
5685 }
5686
5687 private static void defRange(String name,
5688 final int lower, final int upper) {
5689 map.put(name, new CharPropertyFactory() {
5690 CharProperty make() { return rangeFor(lower, upper);}});
5691 }
5692
5693 private static void defCtype(String name,
5694 final int ctype) {
5695 map.put(name, new CharPropertyFactory() {
5696 CharProperty make() { return new Ctype(ctype);}});
5697 }
5698
5699 private abstract static class CloneableProperty
5700 extends CharProperty implements Cloneable
5701 {
5702 public CloneableProperty clone() {
5703 try {
5704 return (CloneableProperty) super.clone();
5705 } catch (CloneNotSupportedException e) {
5706 throw new AssertionError(e);
5707 }
5708 }
5709 }
5710
5711 private static void defClone(String name,
5712 final CloneableProperty p) {
5713 map.put(name, new CharPropertyFactory() {
5714 CharProperty make() { return p.clone();}});
5715 }
5716
5717 private static final HashMap<String, CharPropertyFactory> map
5718 = new HashMap<>();
5719
5720 static {
5721 // Unicode character property aliases, defined in
5722 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5723 defCategory("Cn", 1<<Character.UNASSIGNED);
5724 defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
5725 defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
5726 defCategory("Lt", 1<<Character.TITLECASE_LETTER);
5727 defCategory("Lm", 1<<Character.MODIFIER_LETTER);
5728 defCategory("Lo", 1<<Character.OTHER_LETTER);
5729 defCategory("Mn", 1<<Character.NON_SPACING_MARK);
5730 defCategory("Me", 1<<Character.ENCLOSING_MARK);
5731 defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
5732 defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
5733 defCategory("Nl", 1<<Character.LETTER_NUMBER);
5734 defCategory("No", 1<<Character.OTHER_NUMBER);
5735 defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
5736 defCategory("Zl", 1<<Character.LINE_SEPARATOR);
5737 defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
5738 defCategory("Cc", 1<<Character.CONTROL);
5739 defCategory("Cf", 1<<Character.FORMAT);
5740 defCategory("Co", 1<<Character.PRIVATE_USE);
5741 defCategory("Cs", 1<<Character.SURROGATE);
5742 defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
5743 defCategory("Ps", 1<<Character.START_PUNCTUATION);
5744 defCategory("Pe", 1<<Character.END_PUNCTUATION);
5745 defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
5746 defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
5747 defCategory("Sm", 1<<Character.MATH_SYMBOL);
5748 defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
5749 defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
5750 defCategory("So", 1<<Character.OTHER_SYMBOL);
5751 defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
5752 defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
5753 defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
5754 (1<<Character.LOWERCASE_LETTER) |
5755 (1<<Character.TITLECASE_LETTER) |
5756 (1<<Character.MODIFIER_LETTER) |
5757 (1<<Character.OTHER_LETTER)));
5758 defCategory("M", ((1<<Character.NON_SPACING_MARK) |
5759 (1<<Character.ENCLOSING_MARK) |
5760 (1<<Character.COMBINING_SPACING_MARK)));
5761 defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
5762 (1<<Character.LETTER_NUMBER) |
5763 (1<<Character.OTHER_NUMBER)));
5764 defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
5765 (1<<Character.LINE_SEPARATOR) |
5766 (1<<Character.PARAGRAPH_SEPARATOR)));
5767 defCategory("C", ((1<<Character.CONTROL) |
5768 (1<<Character.FORMAT) |
5769 (1<<Character.PRIVATE_USE) |
5770 (1<<Character.SURROGATE))); // Other
5771 defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
5772 (1<<Character.START_PUNCTUATION) |
5773 (1<<Character.END_PUNCTUATION) |
5774 (1<<Character.CONNECTOR_PUNCTUATION) |
5775 (1<<Character.OTHER_PUNCTUATION) |
5776 (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
5777 (1<<Character.FINAL_QUOTE_PUNCTUATION)));
5778 defCategory("S", ((1<<Character.MATH_SYMBOL) |
5779 (1<<Character.CURRENCY_SYMBOL) |
5780 (1<<Character.MODIFIER_SYMBOL) |
5781 (1<<Character.OTHER_SYMBOL)));
5782 defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
5783 (1<<Character.LOWERCASE_LETTER) |
5784 (1<<Character.TITLECASE_LETTER)));
5785 defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
5786 (1<<Character.LOWERCASE_LETTER) |
5787 (1<<Character.TITLECASE_LETTER) |
5788 (1<<Character.MODIFIER_LETTER) |
5789 (1<<Character.OTHER_LETTER) |
5790 (1<<Character.DECIMAL_DIGIT_NUMBER)));
5791 defRange("L1", 0x00, 0xFF); // Latin-1
5792 map.put("all", new CharPropertyFactory() {
5793 CharProperty make() { return new All(); }});
5794
5795 // Posix regular expression character classes, defined in
5796 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5797 defRange("ASCII", 0x00, 0x7F); // ASCII
5798 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5799 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5800 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5801 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5802 defRange("Digit", '0', '9'); // Numeric characters
5803 defCtype("Graph", ASCII.GRAPH); // printable and visible
5804 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5805 defRange("Print", 0x20, 0x7E); // Printable characters
5806 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5807 defCtype("Space", ASCII.SPACE); // Space characters
5808 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5809 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5810
5811 // Java character properties, defined by methods in Character.java
5812 defClone("javaLowerCase", new CloneableProperty() {
5813 boolean isSatisfiedBy(int ch) {
5814 return Character.isLowerCase(ch);}});
5815 defClone("javaUpperCase", new CloneableProperty() {
5816 boolean isSatisfiedBy(int ch) {
5817 return Character.isUpperCase(ch);}});
5818 defClone("javaAlphabetic", new CloneableProperty() {
5819 boolean isSatisfiedBy(int ch) {
5820 return Character.isAlphabetic(ch);}});
5821 defClone("javaIdeographic", new CloneableProperty() {
5822 boolean isSatisfiedBy(int ch) {
5823 return Character.isIdeographic(ch);}});
5824 defClone("javaTitleCase", new CloneableProperty() {
5825 boolean isSatisfiedBy(int ch) {
5826 return Character.isTitleCase(ch);}});
5827 defClone("javaDigit", new CloneableProperty() {
5828 boolean isSatisfiedBy(int ch) {
5829 return Character.isDigit(ch);}});
5830 defClone("javaDefined", new CloneableProperty() {
5831 boolean isSatisfiedBy(int ch) {
5832 return Character.isDefined(ch);}});
5833 defClone("javaLetter", new CloneableProperty() {
5834 boolean isSatisfiedBy(int ch) {
5835 return Character.isLetter(ch);}});
5836 defClone("javaLetterOrDigit", new CloneableProperty() {
5837 boolean isSatisfiedBy(int ch) {
5838 return Character.isLetterOrDigit(ch);}});
5839 defClone("javaJavaIdentifierStart", new CloneableProperty() {
5840 boolean isSatisfiedBy(int ch) {
5841 return Character.isJavaIdentifierStart(ch);}});
5842 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5843 boolean isSatisfiedBy(int ch) {
5844 return Character.isJavaIdentifierPart(ch);}});
5845 defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
5846 boolean isSatisfiedBy(int ch) {
5847 return Character.isUnicodeIdentifierStart(ch);}});
5848 defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
5849 boolean isSatisfiedBy(int ch) {
5850 return Character.isUnicodeIdentifierPart(ch);}});
5851 defClone("javaIdentifierIgnorable", new CloneableProperty() {
5852 boolean isSatisfiedBy(int ch) {
5853 return Character.isIdentifierIgnorable(ch);}});
5854 defClone("javaSpaceChar", new CloneableProperty() {
5855 boolean isSatisfiedBy(int ch) {
5856 return Character.isSpaceChar(ch);}});
5857 defClone("javaWhitespace", new CloneableProperty() {
5858 boolean isSatisfiedBy(int ch) {
5859 return Character.isWhitespace(ch);}});
5860 defClone("javaISOControl", new CloneableProperty() {
5861 boolean isSatisfiedBy(int ch) {
5862 return Character.isISOControl(ch);}});
5863 defClone("javaMirrored", new CloneableProperty() {
5864 boolean isSatisfiedBy(int ch) {
5865 return Character.isMirrored(ch);}});
5866 }
5867 }
5868
5869 /**
5870 * Creates a predicate which can be used to match a string.
5871 *
5872 * @return The predicate which can be used for matching on a string
5873 * @since 1.8
5874 */
5875 public Predicate<String> asPredicate() {
5876 return s -> matcher(s).find();
5877 }
5878
5879 /**
5880 * Creates a stream from the given input sequence around matches of this
5881 * pattern.
5882 *
5883 * <p> The stream returned by this method contains each substring of the
5884 * input sequence that is terminated by another subsequence that matches
5885 * this pattern or is terminated by the end of the input sequence. The
5886 * substrings in the stream are in the order in which they occur in the
5887 * input. Trailing empty strings will be discarded and not encountered in
5888 * the stream.
|
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.text.Normalizer;
29 import java.text.Normalizer.Form;
30 import java.util.Locale;
31 import java.util.Iterator;
32 import java.util.Map;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.LinkedHashSet;
36 import java.util.Set;
37 import java.util.Arrays;
38 import java.util.NoSuchElementException;
39 import java.util.Spliterator;
40 import java.util.Spliterators;
41 import java.util.function.Predicate;
42 import java.util.stream.Stream;
43 import java.util.stream.StreamSupport;
44
45
46 /**
47 * A compiled representation of a regular expression.
48 *
49 * <p> A regular expression, specified as a string, must first be compiled into
50 * an instance of this class. The resulting pattern can then be used to create
51 * a {@link Matcher} object that can match arbitrary {@linkplain
52 * java.lang.CharSequence character sequences} against the regular
53 * expression. All of the state involved in performing a match resides in the
54 * matcher, so many matchers can share the same pattern.
55 *
56 * <p> A typical invocation sequence is thus
970
971 /**
972 * The starting point of state machine for the find operation. This allows
973 * a match to start anywhere in the input.
974 */
975 transient Node root;
976
977 /**
978 * The root of object tree for a match operation. The pattern is matched
979 * at the beginning. This may include a find that uses BnM or a First
980 * node.
981 */
982 transient Node matchRoot;
983
984 /**
985 * Temporary storage used by parsing pattern slice.
986 */
987 transient int[] buffer;
988
989 /**
990 * A temporary storage used for predicate for double return.
991 */
992 transient CharPredicate predicate;
993
994 /**
995 * Map the "name" of the "named capturing group" to its group id
996 * node.
997 */
998 transient volatile Map<String, Integer> namedGroups;
999
1000 /**
1001 * Temporary storage used while parsing group references.
1002 */
1003 transient GroupHead[] groupNodes;
1004
1005 /**
1006 * Temporary null terminated code point array used by pattern compiling.
1007 */
1008 private transient int[] temp;
1009
1010 /**
1011 * The number of capturing groups in this Pattern. Used by matchers to
1012 * allocate storage needed to perform a match.
1013 */
1014 transient int capturingGroupCount;
1017 * The local variable count used by parsing tree. Used by matchers to
1018 * allocate storage needed to perform a match.
1019 */
1020 transient int localCount;
1021
1022 /**
1023 * Index into the pattern string that keeps track of how much has been
1024 * parsed.
1025 */
1026 private transient int cursor;
1027
1028 /**
1029 * Holds the length of the pattern string.
1030 */
1031 private transient int patternLength;
1032
1033 /**
1034 * If the Start node might possibly match supplementary characters.
1035 * It is set to true during compiling if
1036 * (1) There is supplementary char in pattern, or
1037 * (2) There is complement node of a "family" CharProperty
1038 */
1039 private transient boolean hasSupplementary;
1040
1041 /**
1042 * Compiles the given regular expression into a pattern.
1043 *
1044 * @param regex
1045 * The expression to be compiled
1046 * @return the given regular expression compiled into a pattern
1047 * @throws PatternSyntaxException
1048 * If the expression's syntax is invalid
1049 */
1050 public static Pattern compile(String regex) {
1051 return new Pattern(regex, 0);
1052 }
1053
1054 /**
1055 * Compiles the given regular expression into a pattern with the given
1056 * flags.
1057 *
1369 pattern = p;
1370 flags = f;
1371
1372 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1373 if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1374 flags |= UNICODE_CASE;
1375
1376 // Reset group index count
1377 capturingGroupCount = 1;
1378 localCount = 0;
1379
1380 if (pattern.length() > 0) {
1381 compile();
1382 } else {
1383 root = new Start(lastAccept);
1384 matchRoot = lastAccept;
1385 }
1386 }
1387
1388 /**
1389 * The pattern is converted to normalized form ({@link
1390 * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
1391 * followed by canonical composition for the character class
1392 * part, and {@link java.text.Normalizer.Form.NFD NFD},
1393 * canonical decomposition) for the rest), and then a pure
1394 * group is constructed to match canonical equivalences of the
1395 * characters.
1396 */
1397 private static String normalize(String pattern) {
1398 int plen = pattern.length();
1399 StringBuilder pbuf = new StringBuilder(plen);
1400 char last = 0;
1401 int lastStart = 0;
1402 char cc = 0;
1403 for (int i = 0; i < plen;) {
1404 char c = pattern.charAt(i);
1405 if (cc == 0 && // top level
1406 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
1407 i += 2; last = 0;
1408 continue;
1409 }
1410 if (c == '[' && last != '\\') {
1411 if (cc == 0) {
1412 if (lastStart < i)
1413 normalizeSlice(pattern, lastStart, i, pbuf);
1414 lastStart = i;
1415 }
1416 cc++;
1417 } else if (c == ']' && last != '\\') {
1418 cc--;
1419 if (cc == 0) {
1420 normalizeClazz(pattern, lastStart, i + 1, pbuf);
1421 lastStart = i + 1;
1422 }
1423 }
1424 last = c;
1425 i++;
1426 }
1427 assert (cc == 0);
1428 if (lastStart < plen)
1429 normalizeSlice(pattern, lastStart, plen, pbuf);
1430 return pbuf.toString();
1431 }
1432
1433 private static void normalizeSlice(String src, int off, int limit,
1434 StringBuilder dst)
1435 {
1436 int len = src.length();
1437 int off0 = off;
1438 while (off < limit && ASCII.isAscii(src.charAt(off))) {
1439 off++;
1440 }
1441 if (off == limit) {
1442 dst.append(src, off0, limit);
1443 return;
1444 }
1445 off--;
1446 if (off < off0)
1447 off = off0;
1448 else
1449 dst.append(src, off0, off);
1450 while (off < limit) {
1451 int ch0 = src.codePointAt(off);
1452 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
1453 dst.append((char)ch0);
1454 off++;
1455 continue;
1456 }
1457 int j = off + Character.charCount(ch0);
1458 int ch1;
1459 while (j < limit) {
1460 ch1 = src.codePointAt(j);
1461 if (Grapheme.isBoundary(ch0, ch1))
1462 break;
1463 ch0 = ch1;
1464 j += Character.charCount(ch1);
1465 }
1466 String seq = src.substring(off, j);
1467 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
1468 off = j;
1469 if (nfd.length() > 1) {
1470 ch0 = nfd.codePointAt(0);
1471 ch1 = nfd.codePointAt(Character.charCount(ch0));
1472 if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
1473 Set<String> altns = new LinkedHashSet<>();
1474 altns.add(seq);
1475 produceEquivalentAlternation(nfd, altns);
1476 dst.append("(?:");
1477 altns.forEach( s -> dst.append(s + "|"));
1478 dst.delete(dst.length() - 1, dst.length());
1479 dst.append(")");
1480 continue;
1481 }
1482 }
1483 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
1484 if (!seq.equals(nfc) && !nfd.equals(nfc))
1485 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")");
1486 else if (!seq.equals(nfd))
1487 dst.append("(?:" + seq + "|" + nfd + ")");
1488 else
1489 dst.append(seq);
1490 }
1491 }
1492
1493 private static void normalizeClazz(String src, int off, int limit,
1494 StringBuilder dst)
1495 {
1496 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
1497 }
1498
1499 /**
1500 * Given a specific sequence composed of a regular character and
1501 * combining marks that follow it, produce the alternation that will
1502 * match all canonical equivalences of that sequence.
1503 */
1504 private static void produceEquivalentAlternation(String src,
1505 Set<String> dst)
1506 {
1507 int len = countChars(src, 0, 1);
1508 if (src.length() == len) {
1509 dst.add(src); // source has one character.
1510 return;
1511 }
1512 String base = src.substring(0,len);
1513 String combiningMarks = src.substring(len);
1514 String[] perms = producePermutations(combiningMarks);
1515 // Add combined permutations
1516 for(int x = 0; x < perms.length; x++) {
1517 String next = base + perms[x];
1518 dst.add(next);
1519 next = composeOneStep(next);
1520 if (next != null) {
1521 produceEquivalentAlternation(next, dst);
1522 }
1523 }
1524 }
1525
1526 /**
1527 * Returns an array of strings that have all the possible
1528 * permutations of the characters in the input string.
1529 * This is used to get a list of all possible orderings
1530 * of a set of combining marks. Note that some of the permutations
1531 * are invalid because of combining class collisions, and these
1532 * possibilities must be removed because they are not canonically
1533 * equivalent.
1534 */
1535 private static String[] producePermutations(String input) {
1536 if (input.length() == countChars(input, 0, 1))
1537 return new String[] {input};
1538
1539 if (input.length() == countChars(input, 0, 2)) {
1540 int c0 = Character.codePointAt(input, 0);
1541 int c1 = Character.codePointAt(input, Character.charCount(c0));
1542 if (getClass(c1) == getClass(c0)) {
1543 return new String[] {input};
1544 }
1545 String[] result = new String[2];
1546 result[0] = input;
1547 StringBuilder sb = new StringBuilder(2);
1548 sb.appendCodePoint(c1);
1549 sb.appendCodePoint(c0);
1550 result[1] = sb.toString();
1551 return result;
1552 }
1553
1554 int length = 1;
1555 int nCodePoints = countCodePoints(input);
1573 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1574 len = countChars(input, offset, 1);
1575 for(int y=x-1; y>=0; y--) {
1576 if (combClass[y] == combClass[x]) {
1577 continue loop;
1578 }
1579 }
1580 StringBuilder sb = new StringBuilder(input);
1581 String otherChars = sb.delete(offset, offset+len).toString();
1582 String[] subResult = producePermutations(otherChars);
1583
1584 String prefix = input.substring(offset, offset+len);
1585 for (String sre : subResult)
1586 temp[index++] = prefix + sre;
1587 }
1588 String[] result = new String[index];
1589 System.arraycopy(temp, 0, result, 0, index);
1590 return result;
1591 }
1592
1593 private static int getClass(int c) {
1594 return sun.text.Normalizer.getCombiningClass(c);
1595 }
1596
1597 /**
1598 * Attempts to compose input by combining the first character
1599 * with the first combining mark following it. Returns a String
1600 * that is the composition of the leading character with its first
1601 * combining mark followed by the remaining combining marks. Returns
1602 * null if the first two characters cannot be further composed.
1603 */
1604 private static String composeOneStep(String input) {
1605 int len = countChars(input, 0, 2);
1606 String firstTwoCharacters = input.substring(0, len);
1607 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1608 if (result.equals(firstTwoCharacters))
1609 return null;
1610 else {
1611 String remainder = input.substring(len);
1612 return result + remainder;
1613 }
1614 }
1615
1616 /**
1617 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1618 * See the description of `quotemeta' in perlfunc(1).
1619 */
1620 private void RemoveQEQuoting() {
1621 final int pLen = patternLength;
1622 int i = 0;
1623 while (i < pLen-1) {
1624 if (temp[i] != '\\')
1625 i += 1;
1626 else if (temp[i + 1] != 'Q')
1627 i += 2;
1674 newtemp[j++] = c;
1675 if (i != pLen)
1676 newtemp[j++] = temp[i++];
1677 }
1678 }
1679
1680 beginQuote = false;
1681 }
1682
1683 patternLength = j;
1684 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1685 }
1686
1687 /**
1688 * Copies regular expression to an int array and invokes the parsing
1689 * of the expression which will create the object tree.
1690 */
1691 private void compile() {
1692 // Handle canonical equivalences
1693 if (has(CANON_EQ) && !has(LITERAL)) {
1694 normalizedPattern = normalize(pattern);
1695 } else {
1696 normalizedPattern = pattern;
1697 }
1698 patternLength = normalizedPattern.length();
1699
1700 // Copy pattern to int array for convenience
1701 // Use double zero to terminate pattern
1702 temp = new int[patternLength + 2];
1703
1704 hasSupplementary = false;
1705 int c, count = 0;
1706 // Convert all chars into code points
1707 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1708 c = normalizedPattern.codePointAt(x);
1709 if (isSupplementary(c)) {
1710 hasSupplementary = true;
1711 }
1712 temp[count++] = c;
1713 }
1714
1751 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1752 }
1753
1754 // Release temporary storage
1755 temp = null;
1756 buffer = null;
1757 groupNodes = null;
1758 patternLength = 0;
1759 compiled = true;
1760 }
1761
1762 Map<String, Integer> namedGroups() {
1763 Map<String, Integer> groups = namedGroups;
1764 if (groups == null) {
1765 namedGroups = groups = new HashMap<>(2);
1766 }
1767 return groups;
1768 }
1769
1770 /**
1771 * Used to accumulate information about a subtree of the object graph
1772 * so that optimizations can be applied to the subtree.
1773 */
1774 static final class TreeInfo {
1775 int minLength;
1776 int maxLength;
1777 boolean maxValid;
1778 boolean deterministic;
1779
1780 TreeInfo() {
1781 reset();
1782 }
1783 void reset() {
1784 minLength = 0;
1785 maxLength = 0;
1786 maxValid = true;
1787 deterministic = true;
1788 }
1789 }
1790
2042 Node node = null;
2043 LOOP:
2044 for (;;) {
2045 int ch = peek();
2046 switch (ch) {
2047 case '(':
2048 // Because group handles its own closure,
2049 // we need to treat it differently
2050 node = group0();
2051 // Check for comment or flag group
2052 if (node == null)
2053 continue;
2054 if (head == null)
2055 head = node;
2056 else
2057 tail.next = node;
2058 // Double return: Tail was returned in root
2059 tail = root;
2060 continue;
2061 case '[':
2062 if (has(CANON_EQ) && !has(LITERAL))
2063 node = new NFCCharProperty(clazz(true));
2064 else
2065 node = newCharProperty(clazz(true));
2066 break;
2067 case '\\':
2068 ch = nextEscaped();
2069 if (ch == 'p' || ch == 'P') {
2070 boolean oneLetter = true;
2071 boolean comp = (ch == 'P');
2072 ch = next(); // Consume { if present
2073 if (ch != '{') {
2074 unread();
2075 } else {
2076 oneLetter = false;
2077 }
2078 // node = newCharProperty(family(oneLetter, comp));
2079 if (has(CANON_EQ) && !has(LITERAL))
2080 node = new NFCCharProperty(family(oneLetter, comp));
2081 else
2082 node = newCharProperty(family(oneLetter, comp));
2083 } else {
2084 unread();
2085 node = atom();
2086 }
2087 break;
2088 case '^':
2089 next();
2090 if (has(MULTILINE)) {
2091 if (has(UNIX_LINES))
2092 node = new UnixCaret();
2093 else
2094 node = new Caret();
2095 } else {
2096 node = new Begin();
2097 }
2098 break;
2099 case '$':
2100 next();
2101 if (has(UNIX_LINES))
2102 node = new UnixDollar(has(MULTILINE));
2103 else
2104 node = new Dollar(has(MULTILINE));
2105 break;
2106 case '.':
2107 next();
2108 if (has(DOTALL)) {
2109 node = new CharProperty(ALL);
2110 } else {
2111 if (has(UNIX_LINES)) {
2112 node = new CharProperty(UNIXDOT);
2113 } else {
2114 node = new CharProperty(DOT);
2115 }
2116 }
2117 break;
2118 case '|':
2119 case ')':
2120 break LOOP;
2121 case ']': // Now interpreting dangling ] and } as literals
2122 case '}':
2123 node = atom();
2124 break;
2125 case '?':
2126 case '*':
2127 case '+':
2128 next();
2129 throw error("Dangling meta character '" + ((char)ch) + "'");
2130 case 0:
2131 if (cursor >= patternLength) {
2132 break LOOP;
2133 }
2134 // Fall through
2135 default:
2136 node = atom();
2137 break;
2138 }
2139
2140 node = closure(node);
2141 if (head == null) {
2142 head = tail = node;
2143 } else {
2144 tail.next = node;
2145 tail = node;
2146 }
2147 }
2148 if (head == null) {
2149 return end;
2150 }
2151 tail.next = end;
2152 root = tail; //double return
2153 return head;
2154 }
2155
2156 @SuppressWarnings("fallthrough")
2157 /**
2158 * Parse and add a new Single or Slice.
2159 */
2160 private Node atom() {
2178 case '^':
2179 case '(':
2180 case '[':
2181 case '|':
2182 case ')':
2183 break;
2184 case '\\':
2185 ch = nextEscaped();
2186 if (ch == 'p' || ch == 'P') { // Property
2187 if (first > 0) { // Slice is waiting; handle it first
2188 unread();
2189 break;
2190 } else { // No slice; just return the family node
2191 boolean comp = (ch == 'P');
2192 boolean oneLetter = true;
2193 ch = next(); // Consume { if present
2194 if (ch != '{')
2195 unread();
2196 else
2197 oneLetter = false;
2198 if (has(CANON_EQ) && !has(LITERAL))
2199 return new NFCCharProperty(family(oneLetter, comp));
2200 else
2201 return newCharProperty(family(oneLetter, comp));
2202 }
2203 }
2204 unread();
2205 prev = cursor;
2206 ch = escape(false, first == 0, false);
2207 if (ch >= 0) {
2208 append(ch, first);
2209 first++;
2210 if (isSupplementary(ch)) {
2211 hasSupplementary = true;
2212 }
2213 ch = peek();
2214 continue;
2215 } else if (first == 0) {
2216 return root;
2217 }
2218 // Unwind meta escape sequence
2219 cursor = prev;
2220 break;
2221 case 0:
2222 if (cursor >= patternLength) {
2223 break;
2224 }
2225 // Fall through
2226 default:
2227 prev = cursor;
2228 append(ch, first);
2229 first++;
2230 if (isSupplementary(ch)) {
2231 hasSupplementary = true;
2232 }
2233 ch = next();
2234 continue;
2235 }
2236 break;
2237 }
2238 if (first == 1) {
2239 return newCharProperty(single(buffer[0]));
2240 } else {
2241 return newSlice(buffer, first, hasSupplementary);
2242 }
2243 }
2244
2245 private void append(int ch, int len) {
2246 if (len >= buffer.length) {
2247 int[] tmp = new int[len+len];
2248 System.arraycopy(buffer, 0, tmp, 0, len);
2249 buffer = tmp;
2250 }
2251 buffer[len] = ch;
2252 }
2253
2254 /**
2255 * Parses a backref greedily, taking as many numbers as it
2256 * can. The first digit is always treated as a backref, but
2257 * multi digit numbers are only treated as a backref if at
2258 * least that many backrefs exist at this point in the regex.
2259 */
2314 case '6':
2315 case '7':
2316 case '8':
2317 case '9':
2318 if (inclass) break;
2319 if (create) {
2320 root = ref((ch - '0'));
2321 }
2322 return -1;
2323 case 'A':
2324 if (inclass) break;
2325 if (create) root = new Begin();
2326 return -1;
2327 case 'B':
2328 if (inclass) break;
2329 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2330 return -1;
2331 case 'C':
2332 break;
2333 case 'D':
2334 if (create) {
2335 predicate = has(UNICODE_CHARACTER_CLASS) ?
2336 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2337 predicate = predicate.negate();
2338 if (!inclass)
2339 root = newCharProperty(predicate);
2340 }
2341 return -1;
2342 case 'E':
2343 case 'F':
2344 break;
2345 case 'G':
2346 if (inclass) break;
2347 if (create) root = new LastMatch();
2348 return -1;
2349 case 'H':
2350 if (create) {
2351 predicate = HorizWS.negate();
2352 if (!inclass)
2353 root = newCharProperty(predicate);
2354 }
2355 return -1;
2356 case 'I':
2357 case 'J':
2358 case 'K':
2359 case 'L':
2360 case 'M':
2361 break;
2362 case 'N':
2363 return N();
2364 case 'O':
2365 case 'P':
2366 case 'Q':
2367 break;
2368 case 'R':
2369 if (inclass) break;
2370 if (create) root = new LineEnding();
2371 return -1;
2372 case 'S':
2373 if (create) {
2374 predicate = has(UNICODE_CHARACTER_CLASS) ?
2375 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2376 predicate = predicate.negate();
2377 if (!inclass)
2378 root = newCharProperty(predicate);
2379 }
2380 return -1;
2381 case 'T':
2382 case 'U':
2383 break;
2384 case 'V':
2385 if (create) {
2386 predicate = VertWS.negate();
2387 if (!inclass)
2388 root = newCharProperty(predicate);
2389 }
2390 return -1;
2391 case 'W':
2392 if (create) {
2393 predicate = has(UNICODE_CHARACTER_CLASS) ?
2394 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2395 predicate = predicate.negate();
2396 if (!inclass)
2397 root = newCharProperty(predicate);
2398 }
2399 return -1;
2400 case 'X':
2401 if (inclass) break;
2402 if (create) {
2403 root = new XGrapheme();
2404 }
2405 return -1;
2406 case 'Y':
2407 break;
2408 case 'Z':
2409 if (inclass) break;
2410 if (create) {
2411 if (has(UNIX_LINES))
2412 root = new UnixDollar(false);
2413 else
2414 root = new Dollar(false);
2415 }
2416 return -1;
2417 case 'a':
2418 return '\007';
2419 case 'b':
2420 if (inclass) break;
2421 if (create) {
2422 if (peek() == '{') {
2423 if (skip() == 'g') {
2424 if (read() == '}') {
2425 root = new GraphemeBound();
2426 return -1;
2427 }
2428 break; // error missing trailing }
2429 }
2430 unread(); unread();
2431 }
2432 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2433 }
2434 return -1;
2435 case 'c':
2436 return c();
2437 case 'd':
2438 if (create) {
2439 predicate = has(UNICODE_CHARACTER_CLASS) ?
2440 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2441 if (!inclass)
2442 root = newCharProperty(predicate);
2443 }
2444 return -1;
2445 case 'e':
2446 return '\033';
2447 case 'f':
2448 return '\f';
2449 case 'g':
2450 break;
2451 case 'h':
2452 if (create) {
2453 predicate = HorizWS;
2454 if (!inclass)
2455 root = newCharProperty(predicate);
2456 }
2457 return -1;
2458 case 'i':
2459 case 'j':
2460 break;
2461 case 'k':
2462 if (inclass)
2463 break;
2464 if (read() != '<')
2465 throw error("\\k is not followed by '<' for named capturing group");
2466 String name = groupname(read());
2467 if (!namedGroups().containsKey(name))
2468 throw error("(named capturing group <"+ name+"> does not exit");
2469 if (create) {
2470 if (has(CASE_INSENSITIVE))
2471 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2472 else
2473 root = new BackRef(namedGroups().get(name));
2474 }
2475 return -1;
2476 case 'l':
2477 case 'm':
2478 break;
2479 case 'n':
2480 return '\n';
2481 case 'o':
2482 case 'p':
2483 case 'q':
2484 break;
2485 case 'r':
2486 return '\r';
2487 case 's':
2488 if (create) {
2489 predicate = has(UNICODE_CHARACTER_CLASS) ?
2490 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2491 if (!inclass)
2492 root = newCharProperty(predicate);
2493 }
2494 return -1;
2495 case 't':
2496 return '\t';
2497 case 'u':
2498 return u();
2499 case 'v':
2500 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2501 // undocumented). In JDK8 '\v' is specified as a predefined
2502 // character class for all vertical whitespace characters.
2503 // So [-1, root=VertWS node] pair is returned (instead of a
2504 // single 0x0B). This breaks the range if '\v' is used as
2505 // the start or end value, such as [\v-...] or [...-\v], in
2506 // which a single definite value (0x0B) is expected. For
2507 // compatibility concern '\013'/0x0B is returned if isrange.
2508 if (isrange)
2509 return '\013';
2510 if (create) {
2511 predicate = VertWS;
2512 if (!inclass)
2513 root = newCharProperty(predicate);
2514 }
2515 return -1;
2516 case 'w':
2517 if (create) {
2518 predicate = has(UNICODE_CHARACTER_CLASS) ?
2519 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2520 if (!inclass)
2521 root = newCharProperty(predicate);
2522 }
2523 return -1;
2524 case 'x':
2525 return x();
2526 case 'y':
2527 break;
2528 case 'z':
2529 if (inclass) break;
2530 if (create) root = new End();
2531 return -1;
2532 default:
2533 return ch;
2534 }
2535 throw error("Illegal/unsupported escape sequence");
2536 }
2537
2538 /**
2539 * Parse a character class, and return the node that matches it.
2540 *
2541 * Consumes a ] on the way out if consume is true. Usually consume
2542 * is true except for the case of [abc&&def] where def is a separate
2543 * right hand node with "understood" brackets.
2544 */
2545 private CharPredicate clazz(boolean consume) {
2546 CharPredicate prev = null;
2547 CharPredicate curr = null;
2548 BitClass bits = new BitClass();
2549 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
2550
2551 boolean isNeg = false;
2552 boolean hasBits = false;
2553 int ch = next();
2554
2555 // Negates if first char in a class, otherwise literal
2556 if (ch == '^' && temp[cursor-1] == '[') {
2557 ch = next();
2558 isNeg = true;
2559 }
2560 for (;;) {
2561 switch (ch) {
2562 case '[':
2563 curr = clazz(true);
2564 if (prev == null)
2565 prev = curr;
2566 else
2567 prev = prev.union(curr);
2568 ch = peek();
2569 continue;
2570 case '&':
2571 ch = next();
2572 if (ch == '&') {
2573 ch = next();
2574 CharPredicate right = null;
2575 while (ch != ']' && ch != '&') {
2576 if (ch == '[') {
2577 if (right == null)
2578 right = clazz(true);
2579 else
2580 right = right.union(clazz(true));
2581 } else { // abc&&def
2582 unread();
2583 right = clazz(false);
2584 }
2585 ch = peek();
2586 }
2587 if (hasBits) {
2588 // bits used, union has high precedence
2589 if (prev == null) {
2590 prev = curr = bitsP;
2591 } else {
2592 prev = prev.union(bitsP);
2593 }
2594 hasBits = false;
2595 }
2596 if (right != null)
2597 curr = right;
2598 if (prev == null) {
2599 if (right == null)
2600 throw error("Bad class syntax");
2601 else
2602 prev = right;
2603 } else {
2604 prev = prev.and(curr);
2605 }
2606 } else {
2607 // treat as a literal &
2608 unread();
2609 break;
2610 }
2611 continue;
2612 case 0:
2613 if (cursor >= patternLength)
2614 throw error("Unclosed character class");
2615 break;
2616 case ']':
2617 if (prev != null || hasBits) {
2618 if (consume)
2619 next();
2620 if (prev == null)
2621 prev = bitsP;
2622 else if (hasBits)
2623 prev = prev.union(bitsP);
2624 if (isNeg)
2625 return prev.negate();
2626 return prev;
2627 }
2628 break;
2629 default:
2630 break;
2631 }
2632 curr = range(bits);
2633 if (curr == null) { // the bits used
2634 hasBits = true;
2635 } else {
2636 if (prev == null)
2637 prev = curr;
2638 else if (prev != curr)
2639 prev = prev.union(curr);
2640 }
2641 ch = peek();
2642 }
2643 }
2644
2645 private CharPredicate bitsOrSingle(BitClass bits, int ch) {
2646 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2647 Use "single" node instead of bits when dealing with unicode
2648 case folding for codepoints listed below.
2649 (1)Uppercase out of range: u+00ff, u+00b5
2650 toUpperCase(u+00ff) -> u+0178
2651 toUpperCase(u+00b5) -> u+039c
2652 (2)LatinSmallLetterLongS u+17f
2653 toUpperCase(u+017f) -> u+0053
2654 (3)LatinSmallLetterDotlessI u+131
2655 toUpperCase(u+0131) -> u+0049
2656 (4)LatinCapitalLetterIWithDotAbove u+0130
2657 toLowerCase(u+0130) -> u+0069
2658 (5)KelvinSign u+212a
2659 toLowerCase(u+212a) ==> u+006B
2660 (6)AngstromSign u+212b
2661 toLowerCase(u+212b) ==> u+00e5
2662 */
2663 int d;
2664 if (ch < 256 &&
2665 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2666 (ch == 0xff || ch == 0xb5 ||
2667 ch == 0x49 || ch == 0x69 || //I and i
2668 ch == 0x53 || ch == 0x73 || //S and s
2669 ch == 0x4b || ch == 0x6b || //K and k
2670 ch == 0xc5 || ch == 0xe5))) { //A+ring {
2671 bits.add(ch, flags());
2672 return null;
2673 }
2674 return single(ch);
2675 }
2676
2677 /**
2678 * Returns a suitably optimized, single character predicate
2679 */
2680 private CharPredicate single(final int ch) {
2681 if (has(CASE_INSENSITIVE)) {
2682 int lower, upper;
2683 if (has(UNICODE_CASE)) {
2684 upper = Character.toUpperCase(ch);
2685 lower = Character.toLowerCase(upper);
2686 // Unicode case insensitive matches
2687 if (upper != lower)
2688 return SingleU(lower);
2689 } else if (ASCII.isAscii(ch)) {
2690 lower = ASCII.toLower(ch);
2691 upper = ASCII.toUpper(ch);
2692 // Case insensitive matches a given BMP character
2693 if (lower != upper)
2694 return SingleI(lower, upper);
2695 }
2696 }
2697 if (isSupplementary(ch))
2698 return SingleS(ch);
2699 return Single(ch); // Match a given BMP character
2700 }
2701
2702 /**
2703 * Parse a single character or a character range in a character class
2704 * and return its representative node.
2705 */
2706 private CharPredicate range(BitClass bits) {
2707 int ch = peek();
2708 if (ch == '\\') {
2709 ch = nextEscaped();
2710 if (ch == 'p' || ch == 'P') { // A property
2711 boolean comp = (ch == 'P');
2712 boolean oneLetter = true;
2713 // Consume { if present
2714 ch = next();
2715 if (ch != '{')
2716 unread();
2717 else
2718 oneLetter = false;
2719 return family(oneLetter, comp);
2720 } else { // ordinary escape
2721 boolean isrange = temp[cursor+1] == '-';
2722 unread();
2723 ch = escape(true, true, isrange);
2724 if (ch == -1)
2725 return predicate;
2726 }
2727 } else {
2728 next();
2729 }
2730 if (ch >= 0) {
2731 if (peek() == '-') {
2732 int endRange = temp[cursor+1];
2733 if (endRange == '[') {
2734 return bitsOrSingle(bits, ch);
2735 }
2736 if (endRange != ']') {
2737 next();
2738 int m = peek();
2739 if (m == '\\') {
2740 m = escape(true, false, true);
2741 } else {
2742 next();
2743 }
2744 if (m < ch) {
2745 throw error("Illegal character range");
2746 }
2747 if (has(CASE_INSENSITIVE)) {
2748 if (has(UNICODE_CASE))
2749 return CIRangeU(ch, m);
2750 return CIRange(ch, m);
2751 } else {
2752 return Range(ch, m);
2753 }
2754 }
2755 }
2756 return bitsOrSingle(bits, ch);
2757 }
2758 throw error("Unexpected character '"+((char)ch)+"'");
2759 }
2760
2761 /**
2762 * Parses a Unicode character family and returns its representative node.
2763 */
2764 private CharPredicate family(boolean singleLetter,
2765 boolean isComplement)
2766 {
2767 next();
2768 String name;
2769 CharPredicate p = null;
2770
2771 if (singleLetter) {
2772 int c = temp[cursor];
2773 if (!Character.isSupplementaryCodePoint(c)) {
2774 name = String.valueOf((char)c);
2775 } else {
2776 name = new String(temp, cursor, 1);
2777 }
2778 read();
2779 } else {
2780 int i = cursor;
2781 mark('}');
2782 while(read() != '}') {
2783 }
2784 mark('\000');
2785 int j = cursor;
2786 if (j > patternLength)
2787 throw error("Unclosed character family");
2788 if (i + 1 >= j)
2789 throw error("Empty character family");
2790 name = new String(temp, i, j-i-1);
2791 }
2792
2793 int i = name.indexOf('=');
2794 if (i != -1) {
2795 // property construct \p{name=value}
2796 String value = name.substring(i + 1);
2797 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2798 switch (name) {
2799 case "sc":
2800 case "script":
2801 p = CharPredicates.forUnicodeScript(value);
2802 break;
2803 case "blk":
2804 case "block":
2805 p = CharPredicates.forUnicodeBlock(value);
2806 break;
2807 case "gc":
2808 case "general_category":
2809 p = CharPredicates.forProperty(value);
2810 break;
2811 default:
2812 break;
2813 }
2814 if (p == null)
2815 throw error("Unknown Unicode property {name=<" + name + ">, "
2816 + "value=<" + value + ">}");
2817
2818 } else {
2819 if (name.startsWith("In")) {
2820 // \p{InBlockName}
2821 p = CharPredicates.forUnicodeBlock(name.substring(2));
2822 } else if (name.startsWith("Is")) {
2823 // \p{IsGeneralCategory} and \p{IsScriptName}
2824 name = name.substring(2);
2825 p = CharPredicates.forUnicodeProperty(name);
2826 if (p == null)
2827 p = CharPredicates.forProperty(name);
2828 if (p == null)
2829 p = CharPredicates.forUnicodeScript(name);
2830 } else {
2831 if (has(UNICODE_CHARACTER_CLASS)) {
2832 p = CharPredicates.forPOSIXName(name);
2833 }
2834 if (p == null)
2835 p = CharPredicates.forProperty(name);
2836 }
2837 if (p == null)
2838 throw error("Unknown character property name {In/Is" + name + "}");
2839 }
2840 if (isComplement) {
2841 // it might be too expensive to detect if a complement of
2842 // CharProperty can match "certain" supplementary. So just
2843 // go with StartS.
2844 hasSupplementary = true;
2845 p = p.negate();
2846 }
2847 return p;
2848 }
2849
2850 private CharProperty newCharProperty(CharPredicate p) {
2851 if (p == null)
2852 return null;
2853 if (p instanceof BmpCharPredicate)
2854 return new BmpCharProperty((BmpCharPredicate)p);
2855 else
2856 return new CharProperty(p);
2857 }
2858
2859 /**
2860 * Parses and returns the name of a "named capturing group", the trailing
2861 * ">" is consumed after parsing.
2862 */
2863 private String groupname(int ch) {
2864 StringBuilder sb = new StringBuilder();
2865 sb.append(Character.toChars(ch));
2866 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2867 ASCII.isDigit(ch)) {
2868 sb.append(Character.toChars(ch));
2869 }
2870 if (sb.length() == 0)
2871 throw error("named capturing group has 0 length name");
2872 if (ch != '>')
2873 throw error("named capturing group is missing trailing '>'");
2874 return sb.toString();
2875 }
2876
2892 case ':': // (?:xxx) pure group
2893 head = createGroup(true);
2894 tail = root;
2895 head.next = expr(tail);
2896 break;
2897 case '=': // (?=xxx) and (?!xxx) lookahead
2898 case '!':
2899 head = createGroup(true);
2900 tail = root;
2901 head.next = expr(tail);
2902 if (ch == '=') {
2903 head = tail = new Pos(head);
2904 } else {
2905 head = tail = new Neg(head);
2906 }
2907 break;
2908 case '>': // (?>xxx) independent group
2909 head = createGroup(true);
2910 tail = root;
2911 head.next = expr(tail);
2912 head = tail = new Ques(head, Qtype.INDEPENDENT);
2913 break;
2914 case '<': // (?<xxx) look behind
2915 ch = read();
2916 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2917 // named captured group
2918 String name = groupname(ch);
2919 if (namedGroups().containsKey(name))
2920 throw error("Named capturing group <" + name
2921 + "> is already defined");
2922 capturingGroup = true;
2923 head = createGroup(false);
2924 tail = root;
2925 namedGroups().put(name, capturingGroupCount-1);
2926 head.next = expr(tail);
2927 break;
2928 }
2929 int start = cursor;
2930 head = createGroup(true);
2931 tail = root;
2932 head.next = expr(tail);
2978 tail = root;
2979 head.next = expr(tail);
2980 }
2981
2982 accept(')', "Unclosed group");
2983 flags = save;
2984
2985 // Check for quantifiers
2986 Node node = closure(head);
2987 if (node == head) { // No closure
2988 root = tail;
2989 return node; // Dual return
2990 }
2991 if (head == tail) { // Zero length assertion
2992 root = node;
2993 return node; // Dual return
2994 }
2995
2996 if (node instanceof Ques) {
2997 Ques ques = (Ques) node;
2998 if (ques.type == Qtype.POSSESSIVE) {
2999 root = node;
3000 return node;
3001 }
3002 tail.next = new BranchConn();
3003 tail = tail.next;
3004 if (ques.type == Qtype.GREEDY) {
3005 head = new Branch(head, null, tail);
3006 } else { // Reluctant quantifier
3007 head = new Branch(null, head, tail);
3008 }
3009 root = tail;
3010 return head;
3011 } else if (node instanceof Curly) {
3012 Curly curly = (Curly) node;
3013 if (curly.type == Qtype.POSSESSIVE) {
3014 root = node;
3015 return node;
3016 }
3017 // Discover if the group is deterministic
3018 TreeInfo info = new TreeInfo();
3019 if (head.study(info)) { // Deterministic
3020 GroupTail temp = (GroupTail) tail;
3021 head = root = new GroupCurly(head.next, curly.cmin,
3022 curly.cmax, curly.type,
3023 ((GroupTail)tail).localIndex,
3024 ((GroupTail)tail).groupIndex,
3025 capturingGroup);
3026 return head;
3027 } else { // Non-deterministic
3028 int temp = ((GroupHead) head).localIndex;
3029 Loop loop;
3030 if (curly.type == Qtype.GREEDY)
3031 loop = new Loop(this.localCount, temp);
3032 else // Reluctant Curly
3033 loop = new LazyLoop(this.localCount, temp);
3034 Prolog prolog = new Prolog(loop);
3035 this.localCount += 1;
3036 loop.cmin = curly.cmin;
3037 loop.cmax = curly.cmax;
3038 loop.body = head;
3039 tail.next = loop;
3040 root = loop;
3041 return prolog; // Dual return
3042 }
3043 }
3044 throw error("Internal logic error");
3045 }
3046
3047 /**
3048 * Create group head and tail nodes using double return. If the group is
3049 * created with anonymous true then it is a pure group and should not
3050 * affect group counting.
3051 */
3052 private Node createGroup(boolean anonymous) {
3053 int localIndex = localCount++;
3054 int groupIndex = 0;
3055 if (!anonymous)
3056 groupIndex = capturingGroupCount++;
3057 GroupHead head = new GroupHead(localIndex);
3058 root = new GroupTail(localIndex, groupIndex);
3059
3060 // for debug/print only, head.match does NOT need the "tail" info
3061 head.tail = (GroupTail)root;
3062
3063 if (!anonymous && groupIndex < 10)
3064 groupNodes[groupIndex] = head;
3065 return head;
3066 }
3067
3068 @SuppressWarnings("fallthrough")
3069 /**
3070 * Parses inlined match flags and set them appropriately.
3071 */
3072 private void addFlag() {
3073 int ch = peek();
3074 for (;;) {
3075 switch (ch) {
3076 case 'i':
3077 flags |= CASE_INSENSITIVE;
3078 break;
3079 case 'm':
3080 flags |= MULTILINE;
3081 break;
3082 case 's':
3131 case 'u':
3132 flags &= ~UNICODE_CASE;
3133 break;
3134 case 'c':
3135 flags &= ~CANON_EQ;
3136 break;
3137 case 'x':
3138 flags &= ~COMMENTS;
3139 break;
3140 case 'U':
3141 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3142 default:
3143 return;
3144 }
3145 ch = next();
3146 }
3147 }
3148
3149 static final int MAX_REPS = 0x7FFFFFFF;
3150
3151 static enum Qtype {
3152 GREEDY, LAZY, POSSESSIVE, INDEPENDENT
3153 }
3154
3155 private Node curly(Node prev, int cmin) {
3156 int ch = next();
3157 if (ch == '?') {
3158 next();
3159 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
3160 } else if (ch == '+') {
3161 next();
3162 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
3163 }
3164 if (prev instanceof BmpCharProperty) {
3165 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3166 } else if (prev instanceof CharProperty) {
3167 return new CharPropertyGreedy((CharProperty)prev, cmin);
3168 }
3169 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
3170 }
3171
3172 /**
3173 * Processes repetition. If the next character peeked is a quantifier
3174 * then new nodes must be appended to handle the repetition.
3175 * Prev could be a single or a group, so it could be a chain of nodes.
3176 */
3177 private Node closure(Node prev) {
3178 Node atom;
3179 int ch = peek();
3180 switch (ch) {
3181 case '?':
3182 ch = next();
3183 if (ch == '?') {
3184 next();
3185 return new Ques(prev, Qtype.LAZY);
3186 } else if (ch == '+') {
3187 next();
3188 return new Ques(prev, Qtype.POSSESSIVE);
3189 }
3190 return new Ques(prev, Qtype.GREEDY);
3191 case '*':
3192 return curly(prev, 0);
3193 case '+':
3194 return curly(prev, 1);
3195 case '{':
3196 ch = temp[cursor+1];
3197 if (ASCII.isDigit(ch)) {
3198 skip();
3199 int cmin = 0;
3200 do {
3201 cmin = cmin * 10 + (ch - '0');
3202 } while (ASCII.isDigit(ch = read()));
3203 int cmax = cmin;
3204 if (ch == ',') {
3205 ch = read();
3206 cmax = MAX_REPS;
3207 if (ch != '}') {
3208 cmax = 0;
3209 while (ASCII.isDigit(ch)) {
3210 cmax = cmax * 10 + (ch - '0');
3211 ch = read();
3212 }
3213 }
3214 }
3215 if (ch != '}')
3216 throw error("Unclosed counted closure");
3217 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3218 throw error("Illegal repetition range");
3219 Curly curly;
3220 ch = peek();
3221 if (ch == '?') {
3222 next();
3223 curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
3224 } else if (ch == '+') {
3225 next();
3226 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3227 } else {
3228 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
3229 }
3230 return curly;
3231 } else {
3232 throw error("Illegal repetition");
3233 }
3234 default:
3235 return prev;
3236 }
3237 }
3238
3239 /**
3240 * Utility method for parsing control escape sequences.
3241 */
3242 private int c() {
3243 if (cursor < patternLength) {
3244 return read() ^ 64;
3245 }
3246 throw error("Illegal control escape sequence");
3247 }
3248
3385
3386 private static final int countCodePoints(CharSequence seq) {
3387 int length = seq.length();
3388 int n = 0;
3389 for (int i = 0; i < length; ) {
3390 n++;
3391 if (Character.isHighSurrogate(seq.charAt(i++))) {
3392 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3393 i++;
3394 }
3395 }
3396 }
3397 return n;
3398 }
3399
3400 /**
3401 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3402 * never matches values above Latin-1, and a complemented BitClass always
3403 * matches values above Latin-1.
3404 */
3405 static final class BitClass extends BmpCharProperty {
3406 final boolean[] bits;
3407 BitClass() {
3408 this(new boolean[256]);
3409 }
3410 private BitClass(boolean[] bits) {
3411 super( ch -> ch < 256 && bits[ch]);
3412 this.bits = bits;
3413 }
3414 BitClass add(int c, int flags) {
3415 assert c >= 0 && c <= 255;
3416 if ((flags & CASE_INSENSITIVE) != 0) {
3417 if (ASCII.isAscii(c)) {
3418 bits[ASCII.toUpper(c)] = true;
3419 bits[ASCII.toLower(c)] = true;
3420 } else if ((flags & UNICODE_CASE) != 0) {
3421 bits[Character.toLowerCase(c)] = true;
3422 bits[Character.toUpperCase(c)] = true;
3423 }
3424 }
3425 bits[c] = true;
3426 return this;
3427 }
3428 }
3429
3430 /**
3431 * Utility method for creating a string slice matcher.
3432 */
3433 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3434 int[] tmp = new int[count];
3435 if (has(CASE_INSENSITIVE)) {
3436 if (has(UNICODE_CASE)) {
3437 for (int i = 0; i < count; i++) {
3438 tmp[i] = Character.toLowerCase(
3439 Character.toUpperCase(buf[i]));
3440 }
3441 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3442 }
3443 for (int i = 0; i < count; i++) {
3444 tmp[i] = ASCII.toLower(buf[i]);
3445 }
3446 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3447 }
3815 if (i < matcher.to && seq.charAt(i) == 0x0A)
3816 i++;
3817 return next.match(matcher, i, seq);
3818 }
3819 } else {
3820 matcher.hitEnd = true;
3821 }
3822 return false;
3823 }
3824 boolean study(TreeInfo info) {
3825 info.minLength++;
3826 info.maxLength += 2;
3827 return next.study(info);
3828 }
3829 }
3830
3831 /**
3832 * Abstract node class to match one character satisfying some
3833 * boolean property.
3834 */
3835 static class CharProperty extends Node {
3836 CharPredicate predicate;
3837
3838 CharProperty (CharPredicate predicate) {
3839 this.predicate = predicate;
3840 }
3841 boolean match(Matcher matcher, int i, CharSequence seq) {
3842 if (i < matcher.to) {
3843 int ch = Character.codePointAt(seq, i);
3844 return predicate.is(ch) &&
3845 next.match(matcher, i + Character.charCount(ch), seq);
3846 } else {
3847 matcher.hitEnd = true;
3848 return false;
3849 }
3850 }
3851 boolean study(TreeInfo info) {
3852 info.minLength++;
3853 info.maxLength++;
3854 return next.study(info);
3855 }
3856 }
3857
3858 /**
3859 * Optimized version of CharProperty that works only for
3860 * properties never satisfied by Supplementary characters.
3861 */
3862 private static class BmpCharProperty extends CharProperty {
3863 BmpCharProperty (BmpCharPredicate predicate) {
3864 super(predicate);
3865 }
3866 boolean match(Matcher matcher, int i, CharSequence seq) {
3867 if (i < matcher.to) {
3868 return predicate.is(seq.charAt(i)) &&
3869 next.match(matcher, i + 1, seq);
3870 } else {
3871 matcher.hitEnd = true;
3872 return false;
3873 }
3874 }
3875 }
3876
3877 private static class NFCCharProperty extends Node {
3878 CharPredicate predicate;
3879 NFCCharProperty (CharPredicate predicate) {
3880 this.predicate = predicate;
3881 }
3882
3883 boolean match(Matcher matcher, int i, CharSequence seq) {
3884 if (i < matcher.to) {
3885 int ch0 = Character.codePointAt(seq, i);
3886 int n = Character.charCount(ch0);
3887 int j = i + n;
3888 while (j < matcher.to) {
3889 int ch1 = Character.codePointAt(seq, j);
3890 if (Grapheme.isBoundary(ch0, ch1))
3891 break;
3892 ch0 = ch1;
3893 j += Character.charCount(ch1);
3894 }
3895 if (i + n == j) { // single, assume nfc cp
3896 if (predicate.is(ch0))
3897 return next.match(matcher, j, seq);
3898 } else {
3899 while (i + n < j) {
3900 String nfc = Normalizer.normalize(
3901 seq.toString().substring(i, j), Normalizer.Form.NFC);
3902 if (nfc.codePointCount(0, nfc.length()) == 1) {
3903 if (predicate.is(nfc.codePointAt(0)) &&
3904 next.match(matcher, j, seq)) {
3905 return true;
3906 }
3907 }
3908
3909 ch0 = Character.codePointBefore(seq, j);
3910 j -= Character.charCount(ch0);
3911 }
3912 }
3913 if (j < matcher.to)
3914 return false;
3915 }
3916 matcher.hitEnd = true;
3917 return false;
3918 }
3919
3920 boolean study(TreeInfo info) {
3921 info.minLength++;
3922 info.deterministic = false;
3923 return next.study(info);
3924 }
3925 }
3926
3927 /**
3928 * Node class that matches an unicode extended grapheme cluster
3929 */
3930 static class XGrapheme extends Node {
3931 boolean match(Matcher matcher, int i, CharSequence seq) {
3932 if (i < matcher.to) {
3933 int ch0 = Character.codePointAt(seq, i);
3934 i += Character.charCount(ch0);
3935 while (i < matcher.to) {
3936 int ch1 = Character.codePointAt(seq, i);
3937 if (Grapheme.isBoundary(ch0, ch1))
3938 break;
3939 ch0 = ch1;
3940 i += Character.charCount(ch1);
3941 }
3942 return next.match(matcher, i, seq);
3943 }
4125 return false;
4126 }
4127 }
4128 return next.match(matcher, x, seq);
4129 }
4130 }
4131
4132 /**
4133 * Node class for a case insensitive sequence of literal characters.
4134 * Uses unicode case folding.
4135 */
4136 static final class SliceUS extends SliceIS {
4137 SliceUS(int[] buf) {
4138 super(buf);
4139 }
4140 int toLower(int c) {
4141 return Character.toLowerCase(Character.toUpperCase(c));
4142 }
4143 }
4144
4145 /**
4146 * The 0 or 1 quantifier. This one class implements all three types.
4147 */
4148 static final class Ques extends Node {
4149 Node atom;
4150 Qtype type;
4151 Ques(Node node, Qtype type) {
4152 this.atom = node;
4153 this.type = type;
4154 }
4155 boolean match(Matcher matcher, int i, CharSequence seq) {
4156 switch (type) {
4157 case GREEDY:
4158 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4159 || next.match(matcher, i, seq);
4160 case LAZY:
4161 return next.match(matcher, i, seq)
4162 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4163 case POSSESSIVE:
4164 if (atom.match(matcher, i, seq)) i = matcher.last;
4165 return next.match(matcher, i, seq);
4166 default:
4167 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4168 }
4169 }
4170 boolean study(TreeInfo info) {
4171 if (type != Qtype.INDEPENDENT) {
4172 int minL = info.minLength;
4173 atom.study(info);
4174 info.minLength = minL;
4175 info.deterministic = false;
4176 return next.study(info);
4177 } else {
4178 atom.study(info);
4179 return next.study(info);
4180 }
4181 }
4182 }
4183
4184 /**
4185 * Handles the greedy style repetition with the minimum either be
4186 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
4187 */
4188 static class CharPropertyGreedy extends Node {
4189 final CharPredicate predicate;
4190 final int cmin;
4191
4192 CharPropertyGreedy(CharProperty cp, int cmin) {
4193 this.predicate = cp.predicate;
4194 this.cmin = cmin;
4195 }
4196 boolean match(Matcher matcher, int i, CharSequence seq) {
4197 int n = 0;
4198 int to = matcher.to;
4199 // greedy, all the way down
4200 while (i < to) {
4201 int ch = Character.codePointAt(seq, i);
4202 if (!predicate.is(ch))
4203 break;
4204 i += Character.charCount(ch);
4205 n++;
4206 }
4207 if (i >= to) {
4208 matcher.hitEnd = true;
4209 }
4210 while (n >= cmin) {
4211 if (next.match(matcher, i, seq))
4212 return true;
4213 if (n == cmin)
4214 return false;
4215 // backing off if match fails
4216 int ch = Character.codePointBefore(seq, i);
4217 i -= Character.charCount(ch);
4218 n--;
4219 }
4220 return false;
4221 }
4222
4223 boolean study(TreeInfo info) {
4224 info.minLength += cmin;
4225 if (info.maxValid) {
4226 info.maxLength += MAX_REPS;
4227 }
4228 info.deterministic = false;
4229 return next.study(info);
4230 }
4231 }
4232
4233 static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
4234
4235 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
4236 super(bcp, cmin);
4237 }
4238
4239 boolean match(Matcher matcher, int i, CharSequence seq) {
4240 int n = 0;
4241 int to = matcher.to;
4242 while (i < to && predicate.is(seq.charAt(i))) {
4243 i++; n++;
4244 }
4245 if (i >= to) {
4246 matcher.hitEnd = true;
4247 }
4248 while (n >= cmin) {
4249 if (next.match(matcher, i, seq))
4250 return true;
4251 i--; n--; // backing off if match fails
4252 }
4253 return false;
4254 }
4255 }
4256
4257 /**
4258 * Handles the curly-brace style repetition with a specified minimum and
4259 * maximum occurrences. The * quantifier is handled as a special case.
4260 * This class handles the three types.
4261 */
4262 static final class Curly extends Node {
4263 Node atom;
4264 Qtype type;
4265 int cmin;
4266 int cmax;
4267
4268 Curly(Node node, int cmin, int cmax, Qtype type) {
4269 this.atom = node;
4270 this.type = type;
4271 this.cmin = cmin;
4272 this.cmax = cmax;
4273 }
4274 boolean match(Matcher matcher, int i, CharSequence seq) {
4275 int j;
4276 for (j = 0; j < cmin; j++) {
4277 if (atom.match(matcher, i, seq)) {
4278 i = matcher.last;
4279 continue;
4280 }
4281 return false;
4282 }
4283 if (type == Qtype.GREEDY)
4284 return match0(matcher, i, j, seq);
4285 else if (type == Qtype.LAZY)
4286 return match1(matcher, i, j, seq);
4287 else
4288 return match2(matcher, i, j, seq);
4289 }
4290 // Greedy match.
4291 // i is the index to start matching at
4292 // j is the number of atoms that have matched
4293 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4294 if (j >= cmax) {
4295 // We have matched the maximum... continue with the rest of
4296 // the regular expression
4297 return next.match(matcher, i, seq);
4298 }
4299 int backLimit = j;
4300 while (atom.match(matcher, i, seq)) {
4301 // k is the length of this match
4302 int k = matcher.last - i;
4303 if (k == 0) // Zero length match
4304 break;
4305 // Move up index and number matched
4387 }
4388
4389 if (info.deterministic && cmin == cmax)
4390 info.deterministic = detm;
4391 else
4392 info.deterministic = false;
4393 return next.study(info);
4394 }
4395 }
4396
4397 /**
4398 * Handles the curly-brace style repetition with a specified minimum and
4399 * maximum occurrences in deterministic cases. This is an iterative
4400 * optimization over the Prolog and Loop system which would handle this
4401 * in a recursive way. The * quantifier is handled as a special case.
4402 * If capture is true then this class saves group settings and ensures
4403 * that groups are unset when backing off of a group match.
4404 */
4405 static final class GroupCurly extends Node {
4406 Node atom;
4407 Qtype type;
4408 int cmin;
4409 int cmax;
4410 int localIndex;
4411 int groupIndex;
4412 boolean capture;
4413
4414 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
4415 int group, boolean capture) {
4416 this.atom = node;
4417 this.type = type;
4418 this.cmin = cmin;
4419 this.cmax = cmax;
4420 this.localIndex = local;
4421 this.groupIndex = group;
4422 this.capture = capture;
4423 }
4424 boolean match(Matcher matcher, int i, CharSequence seq) {
4425 int[] groups = matcher.groups;
4426 int[] locals = matcher.locals;
4427 int save0 = locals[localIndex];
4428 int save1 = 0;
4429 int save2 = 0;
4430
4431 if (capture) {
4432 save1 = groups[groupIndex];
4433 save2 = groups[groupIndex+1];
4434 }
4435
4436 // Notify GroupTail there is no need to setup group info
4437 // because it will be set here
4438 locals[localIndex] = -1;
4439
4440 boolean ret = true;
4441 for (int j = 0; j < cmin; j++) {
4442 if (atom.match(matcher, i, seq)) {
4443 if (capture) {
4444 groups[groupIndex] = i;
4445 groups[groupIndex+1] = matcher.last;
4446 }
4447 i = matcher.last;
4448 } else {
4449 ret = false;
4450 break;
4451 }
4452 }
4453 if (ret) {
4454 if (type == Qtype.GREEDY) {
4455 ret = match0(matcher, i, cmin, seq);
4456 } else if (type == Qtype.LAZY) {
4457 ret = match1(matcher, i, cmin, seq);
4458 } else {
4459 ret = match2(matcher, i, cmin, seq);
4460 }
4461 }
4462 if (!ret) {
4463 locals[localIndex] = save0;
4464 if (capture) {
4465 groups[groupIndex] = save1;
4466 groups[groupIndex+1] = save2;
4467 }
4468 }
4469 return ret;
4470 }
4471 // Aggressive group match
4472 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4473 // don't back off passing the starting "j"
4474 int min = j;
4475 int[] groups = matcher.groups;
4476 int save0 = 0;
4682
4683 info.minLength += minL;
4684 info.maxLength += maxL;
4685 info.maxValid &= maxV;
4686 info.deterministic = false;
4687 return false;
4688 }
4689 }
4690
4691 /**
4692 * The GroupHead saves the location where the group begins in the locals
4693 * and restores them when the match is done.
4694 *
4695 * The matchRef is used when a reference to this group is accessed later
4696 * in the expression. The locals will have a negative value in them to
4697 * indicate that we do not want to unset the group if the reference
4698 * doesn't match.
4699 */
4700 static final class GroupHead extends Node {
4701 int localIndex;
4702 GroupTail tail; // for debug/print only, match does not need to know
4703 GroupHead(int localCount) {
4704 localIndex = localCount;
4705 }
4706 boolean match(Matcher matcher, int i, CharSequence seq) {
4707 int save = matcher.locals[localIndex];
4708 matcher.locals[localIndex] = i;
4709 boolean ret = next.match(matcher, i, seq);
4710 matcher.locals[localIndex] = save;
4711 return ret;
4712 }
4713 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4714 int save = matcher.locals[localIndex];
4715 matcher.locals[localIndex] = ~i; // HACK
4716 boolean ret = next.match(matcher, i, seq);
4717 matcher.locals[localIndex] = save;
4718 return ret;
4719 }
4720 }
4721
4722 /**
5276 int startIndex = (!matcher.transparentBounds) ?
5277 matcher.from : 0;
5278 int from = Math.max(i - rmaxChars, startIndex);
5279 matcher.lookbehindTo = i;
5280 // Relax transparent region boundaries for lookbehind
5281 if (matcher.transparentBounds)
5282 matcher.from = 0;
5283 for (int j = i - rminChars;
5284 !conditionMatched && j >= from;
5285 j -= j>from ? countChars(seq, j, -1) : 1) {
5286 conditionMatched = cond.match(matcher, j, seq);
5287 }
5288 //Reinstate region boundaries
5289 matcher.from = savedFrom;
5290 matcher.lookbehindTo = savedLBT;
5291 return !conditionMatched && next.match(matcher, i, seq);
5292 }
5293 }
5294
5295 /**
5296 * Handles word boundaries. Includes a field to allow this one class to
5297 * deal with the different types of word boundaries we can match. The word
5298 * characters include underscores, letters, and digits. Non spacing marks
5299 * can are also part of a word if they have a base character, otherwise
5300 * they are ignored for purposes of finding word boundaries.
5301 */
5302 static final class Bound extends Node {
5303 static int LEFT = 0x1;
5304 static int RIGHT= 0x2;
5305 static int BOTH = 0x3;
5306 static int NONE = 0x4;
5307 int type;
5308 boolean useUWORD;
5309 Bound(int n, boolean useUWORD) {
5310 type = n;
5311 this.useUWORD = useUWORD;
5312 }
5313
5314 boolean isWord(int ch) {
5315 return useUWORD ? CharPredicates.WORD.is(ch)
5316 : (ch == '_' || Character.isLetterOrDigit(ch));
5317 }
5318
5319 int check(Matcher matcher, int i, CharSequence seq) {
5320 int ch;
5321 boolean left = false;
5322 int startIndex = matcher.from;
5323 int endIndex = matcher.to;
5324 if (matcher.transparentBounds) {
5325 startIndex = 0;
5326 endIndex = matcher.getTextLength();
5327 }
5328 if (i > startIndex) {
5329 ch = Character.codePointBefore(seq, i);
5330 left = (isWord(ch) ||
5331 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5332 && hasBaseCharacter(matcher, i-1, seq)));
5333 }
5334 boolean right = false;
5335 if (i < endIndex) {
5541 i += countChars(seq, i, n);
5542 continue NEXT;
5543 }
5544 }
5545 // Entire pattern matched starting at i
5546 matcher.first = i;
5547 boolean ret = next.match(matcher, i + lengthInChars, seq);
5548 if (ret) {
5549 matcher.first = i;
5550 matcher.groups[0] = matcher.first;
5551 matcher.groups[1] = matcher.last;
5552 return true;
5553 }
5554 i += countChars(seq, i, 1);
5555 }
5556 matcher.hitEnd = true;
5557 return false;
5558 }
5559 }
5560
5561 @FunctionalInterface
5562 static interface CharPredicate {
5563 boolean is(int ch);
5564
5565 default CharPredicate and(CharPredicate p) {
5566 return ch -> is(ch) && p.is(ch);
5567 }
5568 default CharPredicate union(CharPredicate p) {
5569 return ch -> is(ch) || p.is(ch);
5570 }
5571 default CharPredicate union(CharPredicate p1,
5572 CharPredicate p2 ) {
5573 return ch -> is(ch) || p1.is(ch) || p2.is(ch);
5574 }
5575 default CharPredicate negate() {
5576 return ch -> !is(ch);
5577 }
5578 }
5579
5580 static interface BmpCharPredicate extends CharPredicate {
5581
5582 default CharPredicate and(CharPredicate p) {
5583 if(p instanceof BmpCharPredicate)
5584 return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch));
5585 return ch -> is(ch) && p.is(ch);
5586 }
5587 default CharPredicate union(CharPredicate p) {
5588 if (p instanceof BmpCharPredicate)
5589 return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch));
5590 return ch -> is(ch) || p.is(ch);
5591 }
5592 static CharPredicate union(CharPredicate... predicates) {
5593 CharPredicate cp = ch -> {
5594 for (CharPredicate p : predicates) {
5595 if (!p.is(ch))
5596 return false;
5597 }
5598 return true;
5599 };
5600 for (CharPredicate p : predicates) {
5601 if (! (p instanceof BmpCharPredicate))
5602 return cp;
5603 }
5604 return (BmpCharPredicate)cp;
5605 }
5606 }
5607
5608 /**
5609 * matches a Perl vertical whitespace
5610 */
5611 static BmpCharPredicate VertWS = cp ->
5612 (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
5613
5614 /**
5615 * matches a Perl horizontal whitespace
5616 */
5617 static BmpCharPredicate HorizWS = cp ->
5618 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
5619 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
5620 cp == 0x205f || cp == 0x3000;
5621
5622 /**
5623 * for the Unicode category ALL and the dot metacharacter when
5624 * in dotall mode.
5625 */
5626 static CharPredicate ALL = ch -> true;
5627
5628 /**
5629 * for the dot metacharacter when dotall is not enabled.
5630 */
5631 static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
5632 && (ch|1) != '\u2029'
5633 && ch != '\u0085');
5634 /**
5635 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
5636 */
5637 static CharPredicate UNIXDOT = ch -> ch != '\n';
5638
5639 /**
5640 * Indicate that matches a Supplementary Unicode character
5641 */
5642 static CharPredicate SingleS(int c) {
5643 return ch -> ch == c;
5644 }
5645
5646 /**
5647 * A bmp/optimized predicate of single
5648 */
5649 static BmpCharPredicate Single(int c) {
5650 return ch -> ch == c;
5651 }
5652
5653 /**
5654 * Case insensitive matches a given BMP character
5655 */
5656 static BmpCharPredicate SingleI(int lower, int upper) {
5657 return ch -> ch == lower || ch == upper;
5658 }
5659
5660 /**
5661 * Unicode case insensitive matches a given Unicode character
5662 */
5663 static CharPredicate SingleU(int lower) {
5664 return ch -> lower == ch ||
5665 lower == Character.toLowerCase(Character.toUpperCase(ch));
5666 }
5667
5668 private static boolean inRange(int lower, int ch, int upper) {
5669 return lower <= ch && ch <= upper;
5670 }
5671
5672 /**
5673 * Charactrs within a explicit value range
5674 */
5675 static CharPredicate Range(int lower, int upper) {
5676 if (upper < Character.MIN_HIGH_SURROGATE ||
5677 lower > Character.MAX_HIGH_SURROGATE &&
5678 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
5679 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
5680 return ch -> inRange(lower, ch, upper);
5681 }
5682
5683 /**
5684 * Charactrs within a explicit value range in a case insensitive manner.
5685 */
5686 static CharPredicate CIRange(int lower, int upper) {
5687 return ch -> inRange(lower, ch, upper) ||
5688 ASCII.isAscii(ch) &&
5689 (inRange(lower, ASCII.toUpper(ch), upper) ||
5690 inRange(lower, ASCII.toLower(ch), upper));
5691 }
5692
5693 static CharPredicate CIRangeU(int lower, int upper) {
5694 return ch -> {
5695 if (inRange(lower, ch, upper))
5696 return true;
5697 int up = Character.toUpperCase(ch);
5698 return inRange(lower, up, upper) ||
5699 inRange(lower, Character.toLowerCase(up), upper);
5700 };
5701 }
5702
5703 /**
5704 * This must be the very first initializer.
5705 */
5706 static Node accept = new Node();
5707
5708 static Node lastAccept = new LastNode();
5709
5710 /**
5711 * Creates a predicate which can be used to match a string.
5712 *
5713 * @return The predicate which can be used for matching on a string
5714 * @since 1.8
5715 */
5716 public Predicate<String> asPredicate() {
5717 return s -> matcher(s).find();
5718 }
5719
5720 /**
5721 * Creates a stream from the given input sequence around matches of this
5722 * pattern.
5723 *
5724 * <p> The stream returned by this method contains each substring of the
5725 * input sequence that is terminated by another subsequence that matches
5726 * this pattern or is terminated by the end of the input sequence. The
5727 * substrings in the stream are in the order in which they occur in the
5728 * input. Trailing empty strings will be discarded and not encountered in
5729 * the stream.
|