1 /*
2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.text.Normalizer;
29 import java.util.Locale;
30 import java.util.Iterator;
31 import java.util.Map;
32 import java.util.ArrayList;
33 import java.util.HashMap;
34 import java.util.Arrays;
35 import java.util.NoSuchElementException;
36 import java.util.Spliterator;
37 import java.util.Spliterators;
38 import java.util.function.Predicate;
39 import java.util.stream.Stream;
40 import java.util.stream.StreamSupport;
41
42
43 /**
44 * A compiled representation of a regular expression.
45 *
46 * <p> A regular expression, specified as a string, must first be compiled into
47 * an instance of this class. The resulting pattern can then be used to create
48 * a {@link Matcher} object that can match arbitrary {@linkplain
49 * java.lang.CharSequence character sequences} against the regular
50 * expression. All of the state involved in performing a match resides in the
51 * matcher, so many matchers can share the same pattern.
52 *
53 * <p> A typical invocation sequence is thus
967
968 /**
969 * The starting point of state machine for the find operation. This allows
970 * a match to start anywhere in the input.
971 */
972 transient Node root;
973
974 /**
975 * The root of object tree for a match operation. The pattern is matched
976 * at the beginning. This may include a find that uses BnM or a First
977 * node.
978 */
979 transient Node matchRoot;
980
981 /**
982 * Temporary storage used by parsing pattern slice.
983 */
984 transient int[] buffer;
985
986 /**
987 * Map the "name" of the "named capturing group" to its group id
988 * node.
989 */
990 transient volatile Map<String, Integer> namedGroups;
991
992 /**
993 * Temporary storage used while parsing group references.
994 */
995 transient GroupHead[] groupNodes;
996
997 /**
998 * Temporary null terminated code point array used by pattern compiling.
999 */
1000 private transient int[] temp;
1001
1002 /**
1003 * The number of capturing groups in this Pattern. Used by matchers to
1004 * allocate storage needed to perform a match.
1005 */
1006 transient int capturingGroupCount;
1007
1008 /**
1009 * The local variable count used by parsing tree. Used by matchers to
1010 * allocate storage needed to perform a match.
1011 */
1012 transient int localCount;
1013
1014 /**
1015 * Index into the pattern string that keeps track of how much has been
1016 * parsed.
1017 */
1018 private transient int cursor;
1019
1020 /**
1021 * Holds the length of the pattern string.
1022 */
1023 private transient int patternLength;
1024
1025 /**
1026 * If the Start node might possibly match supplementary characters.
1027 * It is set to true during compiling if
1028 * (1) There is supplementary char in pattern, or
1029 * (2) There is complement node of Category or Block
1030 */
1031 private transient boolean hasSupplementary;
1032
1033 /**
1034 * Compiles the given regular expression into a pattern.
1035 *
1036 * @param regex
1037 * The expression to be compiled
1038 * @return the given regular expression compiled into a pattern
1039 * @throws PatternSyntaxException
1040 * If the expression's syntax is invalid
1041 */
1042 public static Pattern compile(String regex) {
1043 return new Pattern(regex, 0);
1044 }
1045
1046 /**
1047 * Compiles the given regular expression into a pattern with the given
1048 * flags.
1049 *
1321 } while ((slashEIndex = s.indexOf("\\E", current)) != -1);
1322
1323 return sb.append(s, current, s.length())
1324 .append("\\E")
1325 .toString();
1326 }
1327
1328 /**
1329 * Recompile the Pattern instance from a stream. The original pattern
1330 * string is read in and the object tree is recompiled from it.
1331 */
1332 private void readObject(java.io.ObjectInputStream s)
1333 throws java.io.IOException, ClassNotFoundException {
1334
1335 // Read in all fields
1336 s.defaultReadObject();
1337
1338 // Initialize counts
1339 capturingGroupCount = 1;
1340 localCount = 0;
1341
1342 // if length > 0, the Pattern is lazily compiled
1343 if (pattern.length() == 0) {
1344 root = new Start(lastAccept);
1345 matchRoot = lastAccept;
1346 compiled = true;
1347 }
1348 }
1349
1350 /**
1351 * This private constructor is used to create all Patterns. The pattern
1352 * string and match flags are all that is needed to completely describe
1353 * a Pattern. An empty pattern string results in an object tree with
1354 * only a Start node and a LastNode node.
1355 */
1356 private Pattern(String p, int f) {
1357 if ((f & ~ALL_FLAGS) != 0) {
1358 throw new IllegalArgumentException("Unknown flag 0x"
1359 + Integer.toHexString(f));
1360 }
1361 pattern = p;
1362 flags = f;
1363
1364 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1365 if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1366 flags |= UNICODE_CASE;
1367
1368 // Reset group index count
1369 capturingGroupCount = 1;
1370 localCount = 0;
1371
1372 if (pattern.length() > 0) {
1373 compile();
1374 } else {
1375 root = new Start(lastAccept);
1376 matchRoot = lastAccept;
1377 }
1378 }
1379
1380 /**
1381 * The pattern is converted to normalized form ({@linkplain
1382 * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
1383 * and then a pure group is constructed to match canonical
1384 * equivalences of the characters.
1385 */
1386 private void normalize() {
1387 int lastCodePoint = -1;
1388
1389 // Convert pattern into normalized form
1390 normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1391 patternLength = normalizedPattern.length();
1392
1393 // Modify pattern to match canonical equivalences
1394 StringBuilder newPattern = new StringBuilder(patternLength);
1395 for(int i=0; i<patternLength; ) {
1396 int c = normalizedPattern.codePointAt(i);
1397 StringBuilder sequenceBuffer;
1398 if ((Character.getType(c) == Character.NON_SPACING_MARK)
1399 && (lastCodePoint != -1)) {
1400 sequenceBuffer = new StringBuilder();
1401 sequenceBuffer.appendCodePoint(lastCodePoint);
1402 sequenceBuffer.appendCodePoint(c);
1403 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1404 i += Character.charCount(c);
1405 if (i >= patternLength)
1406 break;
1407 c = normalizedPattern.codePointAt(i);
1408 sequenceBuffer.appendCodePoint(c);
1409 }
1410 String ea = produceEquivalentAlternation(
1411 sequenceBuffer.toString());
1412 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
1413 newPattern.append("(?:").append(ea).append(")");
1414 } else if (c == '[' && lastCodePoint != '\\') {
1415 i = normalizeCharClass(newPattern, i);
1416 } else {
1417 newPattern.appendCodePoint(c);
1418 }
1419 lastCodePoint = c;
1420 i += Character.charCount(c);
1421 }
1422 normalizedPattern = newPattern.toString();
1423 }
1424
1425 /**
1426 * Complete the character class being parsed and add a set
1427 * of alternations to it that will match the canonical equivalences
1428 * of the characters within the class.
1429 */
1430 private int normalizeCharClass(StringBuilder newPattern, int i) {
1431 StringBuilder charClass = new StringBuilder();
1432 StringBuilder eq = null;
1433 int lastCodePoint = -1;
1434 String result;
1435
1436 i++;
1437 charClass.append("[");
1438 while(true) {
1439 int c = normalizedPattern.codePointAt(i);
1440 StringBuilder sequenceBuffer;
1441
1442 if (c == ']' && lastCodePoint != '\\') {
1443 charClass.append((char)c);
1444 break;
1445 } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
1446 sequenceBuffer = new StringBuilder();
1447 sequenceBuffer.appendCodePoint(lastCodePoint);
1448 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1449 sequenceBuffer.appendCodePoint(c);
1450 i += Character.charCount(c);
1451 if (i >= normalizedPattern.length())
1452 break;
1453 c = normalizedPattern.codePointAt(i);
1454 }
1455 String ea = produceEquivalentAlternation(
1456 sequenceBuffer.toString());
1457
1458 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
1459 if (eq == null)
1460 eq = new StringBuilder();
1461 eq.append('|');
1462 eq.append(ea);
1463 } else {
1464 charClass.appendCodePoint(c);
1465 i++;
1466 }
1467 if (i == normalizedPattern.length())
1468 throw error("Unclosed character class");
1469 lastCodePoint = c;
1470 }
1471
1472 if (eq != null) {
1473 result = "(?:"+charClass.toString()+eq.toString()+")";
1474 } else {
1475 result = charClass.toString();
1476 }
1477
1478 newPattern.append(result);
1479 return i;
1480 }
1481
1482 /**
1483 * Given a specific sequence composed of a regular character and
1484 * combining marks that follow it, produce the alternation that will
1485 * match all canonical equivalences of that sequence.
1486 */
1487 private String produceEquivalentAlternation(String source) {
1488 int len = countChars(source, 0, 1);
1489 if (source.length() == len)
1490 // source has one character.
1491 return source;
1492
1493 String base = source.substring(0,len);
1494 String combiningMarks = source.substring(len);
1495
1496 String[] perms = producePermutations(combiningMarks);
1497 StringBuilder result = new StringBuilder(source);
1498
1499 // Add combined permutations
1500 for(int x=0; x<perms.length; x++) {
1501 String next = base + perms[x];
1502 if (x>0)
1503 result.append("|"+next);
1504 next = composeOneStep(next);
1505 if (next != null)
1506 result.append("|"+produceEquivalentAlternation(next));
1507 }
1508 return result.toString();
1509 }
1510
1511 /**
1512 * Returns an array of strings that have all the possible
1513 * permutations of the characters in the input string.
1514 * This is used to get a list of all possible orderings
1515 * of a set of combining marks. Note that some of the permutations
1516 * are invalid because of combining class collisions, and these
1517 * possibilities must be removed because they are not canonically
1518 * equivalent.
1519 */
1520 private String[] producePermutations(String input) {
1521 if (input.length() == countChars(input, 0, 1))
1522 return new String[] {input};
1523
1524 if (input.length() == countChars(input, 0, 2)) {
1525 int c0 = Character.codePointAt(input, 0);
1526 int c1 = Character.codePointAt(input, Character.charCount(c0));
1527 if (getClass(c1) == getClass(c0)) {
1528 return new String[] {input};
1529 }
1530 String[] result = new String[2];
1531 result[0] = input;
1532 StringBuilder sb = new StringBuilder(2);
1533 sb.appendCodePoint(c1);
1534 sb.appendCodePoint(c0);
1535 result[1] = sb.toString();
1536 return result;
1537 }
1538
1539 int length = 1;
1540 int nCodePoints = countCodePoints(input);
1558 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1559 len = countChars(input, offset, 1);
1560 for(int y=x-1; y>=0; y--) {
1561 if (combClass[y] == combClass[x]) {
1562 continue loop;
1563 }
1564 }
1565 StringBuilder sb = new StringBuilder(input);
1566 String otherChars = sb.delete(offset, offset+len).toString();
1567 String[] subResult = producePermutations(otherChars);
1568
1569 String prefix = input.substring(offset, offset+len);
1570 for (String sre : subResult)
1571 temp[index++] = prefix + sre;
1572 }
1573 String[] result = new String[index];
1574 System.arraycopy(temp, 0, result, 0, index);
1575 return result;
1576 }
1577
1578 private int getClass(int c) {
1579 return sun.text.Normalizer.getCombiningClass(c);
1580 }
1581
1582 /**
1583 * Attempts to compose input by combining the first character
1584 * with the first combining mark following it. Returns a String
1585 * that is the composition of the leading character with its first
1586 * combining mark followed by the remaining combining marks. Returns
1587 * null if the first two characters cannot be further composed.
1588 */
1589 private String composeOneStep(String input) {
1590 int len = countChars(input, 0, 2);
1591 String firstTwoCharacters = input.substring(0, len);
1592 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1593
1594 if (result.equals(firstTwoCharacters))
1595 return null;
1596 else {
1597 String remainder = input.substring(len);
1598 return result + remainder;
1599 }
1600 }
1601
1602 /**
1603 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1604 * See the description of `quotemeta' in perlfunc(1).
1605 */
1606 private void RemoveQEQuoting() {
1607 final int pLen = patternLength;
1608 int i = 0;
1609 while (i < pLen-1) {
1610 if (temp[i] != '\\')
1611 i += 1;
1612 else if (temp[i + 1] != 'Q')
1613 i += 2;
1660 newtemp[j++] = c;
1661 if (i != pLen)
1662 newtemp[j++] = temp[i++];
1663 }
1664 }
1665
1666 beginQuote = false;
1667 }
1668
1669 patternLength = j;
1670 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1671 }
1672
1673 /**
1674 * Copies regular expression to an int array and invokes the parsing
1675 * of the expression which will create the object tree.
1676 */
1677 private void compile() {
1678 // Handle canonical equivalences
1679 if (has(CANON_EQ) && !has(LITERAL)) {
1680 normalize();
1681 } else {
1682 normalizedPattern = pattern;
1683 }
1684 patternLength = normalizedPattern.length();
1685
1686 // Copy pattern to int array for convenience
1687 // Use double zero to terminate pattern
1688 temp = new int[patternLength + 2];
1689
1690 hasSupplementary = false;
1691 int c, count = 0;
1692 // Convert all chars into code points
1693 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1694 c = normalizedPattern.codePointAt(x);
1695 if (isSupplementary(c)) {
1696 hasSupplementary = true;
1697 }
1698 temp[count++] = c;
1699 }
1700
1701 patternLength = count; // patternLength now in code points
1702
1703 if (! has(LITERAL))
1704 RemoveQEQuoting();
1705
1706 // Allocate all temporary objects here.
1707 buffer = new int[32];
1708 groupNodes = new GroupHead[10];
1709 namedGroups = null;
1710
1711 if (has(LITERAL)) {
1712 // Literal pattern handling
1713 matchRoot = newSlice(temp, patternLength, hasSupplementary);
1714 matchRoot.next = lastAccept;
1715 } else {
1716 // Start recursive descent parsing
1717 matchRoot = expr(lastAccept);
1718 // Check extra pattern characters
1719 if (patternLength != cursor) {
1720 if (peek() == ')') {
1721 throw error("Unmatched closing ')'");
1722 } else {
1723 throw error("Unexpected internal error");
1724 }
1725 }
1726 }
1727
1728 // Peephole optimization
1729 if (matchRoot instanceof Slice) {
1730 root = BnM.optimize(matchRoot);
1731 if (root == matchRoot) {
1732 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1733 }
1734 } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1735 root = matchRoot;
1736 } else {
1737 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1738 }
1739
1740 // Release temporary storage
1741 temp = null;
1742 buffer = null;
1743 groupNodes = null;
1744 patternLength = 0;
1745 compiled = true;
1746 }
1747
1748 Map<String, Integer> namedGroups() {
1749 Map<String, Integer> groups = namedGroups;
1750 if (groups == null) {
1751 namedGroups = groups = new HashMap<>(2);
1752 }
1753 return groups;
1754 }
1755
1756 /**
1757 * Used to print out a subtree of the Pattern to help with debugging.
1758 */
1759 private static void printObjectTree(Node node) {
1760 while(node != null) {
1761 if (node instanceof Prolog) {
1762 System.out.println(node);
1763 printObjectTree(((Prolog)node).loop);
1764 System.out.println("**** end contents prolog loop");
1765 } else if (node instanceof Loop) {
1766 System.out.println(node);
1767 printObjectTree(((Loop)node).body);
1768 System.out.println("**** end contents Loop body");
1769 } else if (node instanceof Curly) {
1770 System.out.println(node);
1771 printObjectTree(((Curly)node).atom);
1772 System.out.println("**** end contents Curly body");
1773 } else if (node instanceof GroupCurly) {
1774 System.out.println(node);
1775 printObjectTree(((GroupCurly)node).atom);
1776 System.out.println("**** end contents GroupCurly body");
1777 } else if (node instanceof GroupTail) {
1778 System.out.println(node);
1779 System.out.println("Tail next is "+node.next);
1780 return;
1781 } else {
1782 System.out.println(node);
1783 }
1784 node = node.next;
1785 if (node != null)
1786 System.out.println("->next:");
1787 if (node == Pattern.accept) {
1788 System.out.println("Accept Node");
1789 node = null;
1790 }
1791 }
1792 }
1793
1794 /**
1795 * Used to accumulate information about a subtree of the object graph
1796 * so that optimizations can be applied to the subtree.
1797 */
1798 static final class TreeInfo {
1799 int minLength;
1800 int maxLength;
1801 boolean maxValid;
1802 boolean deterministic;
1803
1804 TreeInfo() {
1805 reset();
1806 }
1807 void reset() {
1808 minLength = 0;
1809 maxLength = 0;
1810 maxValid = true;
1811 deterministic = true;
1812 }
1813 }
1814
2066 Node node = null;
2067 LOOP:
2068 for (;;) {
2069 int ch = peek();
2070 switch (ch) {
2071 case '(':
2072 // Because group handles its own closure,
2073 // we need to treat it differently
2074 node = group0();
2075 // Check for comment or flag group
2076 if (node == null)
2077 continue;
2078 if (head == null)
2079 head = node;
2080 else
2081 tail.next = node;
2082 // Double return: Tail was returned in root
2083 tail = root;
2084 continue;
2085 case '[':
2086 node = clazz(true);
2087 break;
2088 case '\\':
2089 ch = nextEscaped();
2090 if (ch == 'p' || ch == 'P') {
2091 boolean oneLetter = true;
2092 boolean comp = (ch == 'P');
2093 ch = next(); // Consume { if present
2094 if (ch != '{') {
2095 unread();
2096 } else {
2097 oneLetter = false;
2098 }
2099 node = family(oneLetter, comp);
2100 } else {
2101 unread();
2102 node = atom();
2103 }
2104 break;
2105 case '^':
2106 next();
2107 if (has(MULTILINE)) {
2108 if (has(UNIX_LINES))
2109 node = new UnixCaret();
2110 else
2111 node = new Caret();
2112 } else {
2113 node = new Begin();
2114 }
2115 break;
2116 case '$':
2117 next();
2118 if (has(UNIX_LINES))
2119 node = new UnixDollar(has(MULTILINE));
2120 else
2121 node = new Dollar(has(MULTILINE));
2122 break;
2123 case '.':
2124 next();
2125 if (has(DOTALL)) {
2126 node = new All();
2127 } else {
2128 if (has(UNIX_LINES))
2129 node = new UnixDot();
2130 else {
2131 node = new Dot();
2132 }
2133 }
2134 break;
2135 case '|':
2136 case ')':
2137 break LOOP;
2138 case ']': // Now interpreting dangling ] and } as literals
2139 case '}':
2140 node = atom();
2141 break;
2142 case '?':
2143 case '*':
2144 case '+':
2145 next();
2146 throw error("Dangling meta character '" + ((char)ch) + "'");
2147 case 0:
2148 if (cursor >= patternLength) {
2149 break LOOP;
2150 }
2151 // Fall through
2152 default:
2153 node = atom();
2154 break;
2155 }
2156
2157 node = closure(node);
2158
2159 if (head == null) {
2160 head = tail = node;
2161 } else {
2162 tail.next = node;
2163 tail = node;
2164 }
2165 }
2166 if (head == null) {
2167 return end;
2168 }
2169 tail.next = end;
2170 root = tail; //double return
2171 return head;
2172 }
2173
2174 @SuppressWarnings("fallthrough")
2175 /**
2176 * Parse and add a new Single or Slice.
2177 */
2178 private Node atom() {
2196 case '^':
2197 case '(':
2198 case '[':
2199 case '|':
2200 case ')':
2201 break;
2202 case '\\':
2203 ch = nextEscaped();
2204 if (ch == 'p' || ch == 'P') { // Property
2205 if (first > 0) { // Slice is waiting; handle it first
2206 unread();
2207 break;
2208 } else { // No slice; just return the family node
2209 boolean comp = (ch == 'P');
2210 boolean oneLetter = true;
2211 ch = next(); // Consume { if present
2212 if (ch != '{')
2213 unread();
2214 else
2215 oneLetter = false;
2216 return family(oneLetter, comp);
2217 }
2218 }
2219 unread();
2220 prev = cursor;
2221 ch = escape(false, first == 0, false);
2222 if (ch >= 0) {
2223 append(ch, first);
2224 first++;
2225 if (isSupplementary(ch)) {
2226 hasSupplementary = true;
2227 }
2228 ch = peek();
2229 continue;
2230 } else if (first == 0) {
2231 return root;
2232 }
2233 // Unwind meta escape sequence
2234 cursor = prev;
2235 break;
2236 case 0:
2237 if (cursor >= patternLength) {
2238 break;
2239 }
2240 // Fall through
2241 default:
2242 prev = cursor;
2243 append(ch, first);
2244 first++;
2245 if (isSupplementary(ch)) {
2246 hasSupplementary = true;
2247 }
2248 ch = next();
2249 continue;
2250 }
2251 break;
2252 }
2253 if (first == 1) {
2254 return newSingle(buffer[0]);
2255 } else {
2256 return newSlice(buffer, first, hasSupplementary);
2257 }
2258 }
2259
2260 private void append(int ch, int len) {
2261 if (len >= buffer.length) {
2262 int[] tmp = new int[len+len];
2263 System.arraycopy(buffer, 0, tmp, 0, len);
2264 buffer = tmp;
2265 }
2266 buffer[len] = ch;
2267 }
2268
2269 /**
2270 * Parses a backref greedily, taking as many numbers as it
2271 * can. The first digit is always treated as a backref, but
2272 * multi digit numbers are only treated as a backref if at
2273 * least that many backrefs exist at this point in the regex.
2274 */
2285 case '5':
2286 case '6':
2287 case '7':
2288 case '8':
2289 case '9':
2290 int newRefNum = (refNum * 10) + (ch - '0');
2291 // Add another number if it doesn't make a group
2292 // that doesn't exist
2293 if (capturingGroupCount - 1 < newRefNum) {
2294 done = true;
2295 break;
2296 }
2297 refNum = newRefNum;
2298 read();
2299 break;
2300 default:
2301 done = true;
2302 break;
2303 }
2304 }
2305 if (has(CASE_INSENSITIVE))
2306 return new CIBackRef(refNum, has(UNICODE_CASE));
2307 else
2308 return new BackRef(refNum);
2309 }
2310
2311 /**
2312 * Parses an escape sequence to determine the actual value that needs
2313 * to be matched.
2314 * If -1 is returned and create was true a new object was added to the tree
2315 * to handle the escape sequence.
2316 * If the returned value is greater than zero, it is the value that
2317 * matches the escape sequence.
2318 */
2319 private int escape(boolean inclass, boolean create, boolean isrange) {
2320 int ch = skip();
2321 switch (ch) {
2322 case '0':
2323 return o();
2324 case '1':
2329 case '6':
2330 case '7':
2331 case '8':
2332 case '9':
2333 if (inclass) break;
2334 if (create) {
2335 root = ref((ch - '0'));
2336 }
2337 return -1;
2338 case 'A':
2339 if (inclass) break;
2340 if (create) root = new Begin();
2341 return -1;
2342 case 'B':
2343 if (inclass) break;
2344 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2345 return -1;
2346 case 'C':
2347 break;
2348 case 'D':
2349 if (create) root = has(UNICODE_CHARACTER_CLASS)
2350 ? new Utype(UnicodeProp.DIGIT).complement()
2351 : new Ctype(ASCII.DIGIT).complement();
2352 return -1;
2353 case 'E':
2354 case 'F':
2355 break;
2356 case 'G':
2357 if (inclass) break;
2358 if (create) root = new LastMatch();
2359 return -1;
2360 case 'H':
2361 if (create) root = new HorizWS().complement();
2362 return -1;
2363 case 'I':
2364 case 'J':
2365 case 'K':
2366 case 'L':
2367 case 'M':
2368 break;
2369 case 'N':
2370 return N();
2371 case 'O':
2372 case 'P':
2373 case 'Q':
2374 break;
2375 case 'R':
2376 if (inclass) break;
2377 if (create) root = new LineEnding();
2378 return -1;
2379 case 'S':
2380 if (create) root = has(UNICODE_CHARACTER_CLASS)
2381 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382 : new Ctype(ASCII.SPACE).complement();
2383 return -1;
2384 case 'T':
2385 case 'U':
2386 break;
2387 case 'V':
2388 if (create) root = new VertWS().complement();
2389 return -1;
2390 case 'W':
2391 if (create) root = has(UNICODE_CHARACTER_CLASS)
2392 ? new Utype(UnicodeProp.WORD).complement()
2393 : new Ctype(ASCII.WORD).complement();
2394 return -1;
2395 case 'X':
2396 if (inclass) break;
2397 if (create) {
2398 root = new XGrapheme();
2399 }
2400 return -1;
2401 case 'Y':
2402 break;
2403 case 'Z':
2404 if (inclass) break;
2405 if (create) {
2406 if (has(UNIX_LINES))
2407 root = new UnixDollar(false);
2408 else
2409 root = new Dollar(false);
2410 }
2411 return -1;
2412 case 'a':
2413 return '\007';
2414 case 'b':
2415 if (inclass) break;
2416 if (create) {
2417 if (peek() == '{') {
2418 if (skip() == 'g') {
2419 if (read() == '}') {
2420 root = new GraphemeBound();
2421 return -1;
2422 }
2423 break; // error missing trailing }
2424 }
2425 unread(); unread();
2426 }
2427 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428 }
2429 return -1;
2430 case 'c':
2431 return c();
2432 case 'd':
2433 if (create) root = has(UNICODE_CHARACTER_CLASS)
2434 ? new Utype(UnicodeProp.DIGIT)
2435 : new Ctype(ASCII.DIGIT);
2436 return -1;
2437 case 'e':
2438 return '\033';
2439 case 'f':
2440 return '\f';
2441 case 'g':
2442 break;
2443 case 'h':
2444 if (create) root = new HorizWS();
2445 return -1;
2446 case 'i':
2447 case 'j':
2448 break;
2449 case 'k':
2450 if (inclass)
2451 break;
2452 if (read() != '<')
2453 throw error("\\k is not followed by '<' for named capturing group");
2454 String name = groupname(read());
2455 if (!namedGroups().containsKey(name))
2456 throw error("(named capturing group <"+ name+"> does not exit");
2457 if (create) {
2458 if (has(CASE_INSENSITIVE))
2459 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2460 else
2461 root = new BackRef(namedGroups().get(name));
2462 }
2463 return -1;
2464 case 'l':
2465 case 'm':
2466 break;
2467 case 'n':
2468 return '\n';
2469 case 'o':
2470 case 'p':
2471 case 'q':
2472 break;
2473 case 'r':
2474 return '\r';
2475 case 's':
2476 if (create) root = has(UNICODE_CHARACTER_CLASS)
2477 ? new Utype(UnicodeProp.WHITE_SPACE)
2478 : new Ctype(ASCII.SPACE);
2479 return -1;
2480 case 't':
2481 return '\t';
2482 case 'u':
2483 return u();
2484 case 'v':
2485 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2486 // undocumented). In JDK8 '\v' is specified as a predefined
2487 // character class for all vertical whitespace characters.
2488 // So [-1, root=VertWS node] pair is returned (instead of a
2489 // single 0x0B). This breaks the range if '\v' is used as
2490 // the start or end value, such as [\v-...] or [...-\v], in
2491 // which a single definite value (0x0B) is expected. For
2492 // compatibility concern '\013'/0x0B is returned if isrange.
2493 if (isrange)
2494 return '\013';
2495 if (create) root = new VertWS();
2496 return -1;
2497 case 'w':
2498 if (create) root = has(UNICODE_CHARACTER_CLASS)
2499 ? new Utype(UnicodeProp.WORD)
2500 : new Ctype(ASCII.WORD);
2501 return -1;
2502 case 'x':
2503 return x();
2504 case 'y':
2505 break;
2506 case 'z':
2507 if (inclass) break;
2508 if (create) root = new End();
2509 return -1;
2510 default:
2511 return ch;
2512 }
2513 throw error("Illegal/unsupported escape sequence");
2514 }
2515
2516 /**
2517 * Parse a character class, and return the node that matches it.
2518 *
2519 * Consumes a ] on the way out if consume is true. Usually consume
2520 * is true except for the case of [abc&&def] where def is a separate
2521 * right hand node with "understood" brackets.
2522 */
2523 private CharProperty clazz(boolean consume) {
2524 CharProperty prev = null;
2525 CharProperty node = null;
2526 BitClass bits = new BitClass();
2527 boolean include = true;
2528 boolean firstInClass = true;
2529 int ch = next();
2530 for (;;) {
2531 switch (ch) {
2532 case '^':
2533 // Negates if first char in a class, otherwise literal
2534 if (firstInClass) {
2535 if (temp[cursor-1] != '[')
2536 break;
2537 ch = next();
2538 include = !include;
2539 continue;
2540 } else {
2541 // ^ not first in class, treat as literal
2542 break;
2543 }
2544 case '[':
2545 firstInClass = false;
2546 node = clazz(true);
2547 if (prev == null)
2548 prev = node;
2549 else
2550 prev = union(prev, node);
2551 ch = peek();
2552 continue;
2553 case '&':
2554 firstInClass = false;
2555 ch = next();
2556 if (ch == '&') {
2557 ch = next();
2558 CharProperty rightNode = null;
2559 while (ch != ']' && ch != '&') {
2560 if (ch == '[') {
2561 if (rightNode == null)
2562 rightNode = clazz(true);
2563 else
2564 rightNode = union(rightNode, clazz(true));
2565 } else { // abc&&def
2566 unread();
2567 rightNode = clazz(false);
2568 }
2569 ch = peek();
2570 }
2571 if (rightNode != null)
2572 node = rightNode;
2573 if (prev == null) {
2574 if (rightNode == null)
2575 throw error("Bad class syntax");
2576 else
2577 prev = rightNode;
2578 } else {
2579 prev = intersection(prev, node);
2580 }
2581 } else {
2582 // treat as a literal &
2583 unread();
2584 break;
2585 }
2586 continue;
2587 case 0:
2588 firstInClass = false;
2589 if (cursor >= patternLength)
2590 throw error("Unclosed character class");
2591 break;
2592 case ']':
2593 firstInClass = false;
2594 if (prev != null) {
2595 if (consume)
2596 next();
2597 return prev;
2598 }
2599 break;
2600 default:
2601 firstInClass = false;
2602 break;
2603 }
2604 node = range(bits);
2605 if (include) {
2606 if (prev == null) {
2607 prev = node;
2608 } else {
2609 if (prev != node)
2610 prev = union(prev, node);
2611 }
2612 } else {
2613 if (prev == null) {
2614 prev = node.complement();
2615 } else {
2616 if (prev != node)
2617 prev = setDifference(prev, node);
2618 }
2619 }
2620 ch = peek();
2621 }
2622 }
2623
2624 private CharProperty bitsOrSingle(BitClass bits, int ch) {
2625 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2626 Use "single" node instead of bits when dealing with unicode
2627 case folding for codepoints listed below.
2628 (1)Uppercase out of range: u+00ff, u+00b5
2629 toUpperCase(u+00ff) -> u+0178
2630 toUpperCase(u+00b5) -> u+039c
2631 (2)LatinSmallLetterLongS u+17f
2632 toUpperCase(u+017f) -> u+0053
2633 (3)LatinSmallLetterDotlessI u+131
2634 toUpperCase(u+0131) -> u+0049
2635 (4)LatinCapitalLetterIWithDotAbove u+0130
2636 toLowerCase(u+0130) -> u+0069
2637 (5)KelvinSign u+212a
2638 toLowerCase(u+212a) ==> u+006B
2639 (6)AngstromSign u+212b
2640 toLowerCase(u+212b) ==> u+00e5
2641 */
2642 int d;
2643 if (ch < 256 &&
2644 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2645 (ch == 0xff || ch == 0xb5 ||
2646 ch == 0x49 || ch == 0x69 || //I and i
2647 ch == 0x53 || ch == 0x73 || //S and s
2648 ch == 0x4b || ch == 0x6b || //K and k
2649 ch == 0xc5 || ch == 0xe5))) //A+ring
2650 return bits.add(ch, flags());
2651 return newSingle(ch);
2652 }
2653
2654 /**
2655 * Parse a single character or a character range in a character class
2656 * and return its representative node.
2657 */
2658 private CharProperty range(BitClass bits) {
2659 int ch = peek();
2660 if (ch == '\\') {
2661 ch = nextEscaped();
2662 if (ch == 'p' || ch == 'P') { // A property
2663 boolean comp = (ch == 'P');
2664 boolean oneLetter = true;
2665 // Consume { if present
2666 ch = next();
2667 if (ch != '{')
2668 unread();
2669 else
2670 oneLetter = false;
2671 return family(oneLetter, comp);
2672 } else { // ordinary escape
2673 boolean isrange = temp[cursor+1] == '-';
2674 unread();
2675 ch = escape(true, true, isrange);
2676 if (ch == -1)
2677 return (CharProperty) root;
2678 }
2679 } else {
2680 next();
2681 }
2682 if (ch >= 0) {
2683 if (peek() == '-') {
2684 int endRange = temp[cursor+1];
2685 if (endRange == '[') {
2686 return bitsOrSingle(bits, ch);
2687 }
2688 if (endRange != ']') {
2689 next();
2690 int m = peek();
2691 if (m == '\\') {
2692 m = escape(true, false, true);
2693 } else {
2694 next();
2695 }
2696 if (m < ch) {
2697 throw error("Illegal character range");
2698 }
2699 if (has(CASE_INSENSITIVE))
2700 return caseInsensitiveRangeFor(ch, m);
2701 else
2702 return rangeFor(ch, m);
2703 }
2704 }
2705 return bitsOrSingle(bits, ch);
2706 }
2707 throw error("Unexpected character '"+((char)ch)+"'");
2708 }
2709
2710 /**
2711 * Parses a Unicode character family and returns its representative node.
2712 */
2713 private CharProperty family(boolean singleLetter,
2714 boolean maybeComplement)
2715 {
2716 next();
2717 String name;
2718 CharProperty node = null;
2719
2720 if (singleLetter) {
2721 int c = temp[cursor];
2722 if (!Character.isSupplementaryCodePoint(c)) {
2723 name = String.valueOf((char)c);
2724 } else {
2725 name = new String(temp, cursor, 1);
2726 }
2727 read();
2728 } else {
2729 int i = cursor;
2730 mark('}');
2731 while(read() != '}') {
2732 }
2733 mark('\000');
2734 int j = cursor;
2735 if (j > patternLength)
2736 throw error("Unclosed character family");
2737 if (i + 1 >= j)
2738 throw error("Empty character family");
2739 name = new String(temp, i, j-i-1);
2740 }
2741
2742 int i = name.indexOf('=');
2743 if (i != -1) {
2744 // property construct \p{name=value}
2745 String value = name.substring(i + 1);
2746 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2747 switch (name) {
2748 case "sc":
2749 case "script":
2750 node = unicodeScriptPropertyFor(value);
2751 break;
2752 case "blk":
2753 case "block":
2754 node = unicodeBlockPropertyFor(value);
2755 break;
2756 case "gc":
2757 case "general_category":
2758 node = charPropertyNodeFor(value);
2759 break;
2760 default:
2761 throw error("Unknown Unicode property {name=<" + name + ">, "
2762 + "value=<" + value + ">}");
2763 }
2764 } else {
2765 if (name.startsWith("In")) {
2766 // \p{inBlockName}
2767 node = unicodeBlockPropertyFor(name.substring(2));
2768 } else if (name.startsWith("Is")) {
2769 // \p{isGeneralCategory} and \p{isScriptName}
2770 name = name.substring(2);
2771 UnicodeProp uprop = UnicodeProp.forName(name);
2772 if (uprop != null)
2773 node = new Utype(uprop);
2774 if (node == null)
2775 node = CharPropertyNames.charPropertyFor(name);
2776 if (node == null)
2777 node = unicodeScriptPropertyFor(name);
2778 } else {
2779 if (has(UNICODE_CHARACTER_CLASS)) {
2780 UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
2781 if (uprop != null)
2782 node = new Utype(uprop);
2783 }
2784 if (node == null)
2785 node = charPropertyNodeFor(name);
2786 }
2787 }
2788 if (maybeComplement) {
2789 if (node instanceof Category || node instanceof Block)
2790 hasSupplementary = true;
2791 node = node.complement();
2792 }
2793 return node;
2794 }
2795
2796
2797 /**
2798 * Returns a CharProperty matching all characters belong to
2799 * a UnicodeScript.
2800 */
2801 private CharProperty unicodeScriptPropertyFor(String name) {
2802 final Character.UnicodeScript script;
2803 try {
2804 script = Character.UnicodeScript.forName(name);
2805 } catch (IllegalArgumentException iae) {
2806 throw error("Unknown character script name {" + name + "}");
2807 }
2808 return new Script(script);
2809 }
2810
2811 /**
2812 * Returns a CharProperty matching all characters in a UnicodeBlock.
2813 */
2814 private CharProperty unicodeBlockPropertyFor(String name) {
2815 final Character.UnicodeBlock block;
2816 try {
2817 block = Character.UnicodeBlock.forName(name);
2818 } catch (IllegalArgumentException iae) {
2819 throw error("Unknown character block name {" + name + "}");
2820 }
2821 return new Block(block);
2822 }
2823
2824 /**
2825 * Returns a CharProperty matching all characters in a named property.
2826 */
2827 private CharProperty charPropertyNodeFor(String name) {
2828 CharProperty p = CharPropertyNames.charPropertyFor(name);
2829 if (p == null)
2830 throw error("Unknown character property name {" + name + "}");
2831 return p;
2832 }
2833
2834 /**
2835 * Parses and returns the name of a "named capturing group", the trailing
2836 * ">" is consumed after parsing.
2837 */
2838 private String groupname(int ch) {
2839 StringBuilder sb = new StringBuilder();
2840 sb.append(Character.toChars(ch));
2841 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2842 ASCII.isDigit(ch)) {
2843 sb.append(Character.toChars(ch));
2844 }
2845 if (sb.length() == 0)
2846 throw error("named capturing group has 0 length name");
2847 if (ch != '>')
2848 throw error("named capturing group is missing trailing '>'");
2849 return sb.toString();
2850 }
2851
2852 /**
2853 * Parses a group and returns the head node of a set of nodes that process
2854 * the group. Sometimes a double return system is used where the tail is
2855 * returned in root.
2856 */
2857 private Node group0() {
2858 boolean capturingGroup = false;
2859 Node head = null;
2860 Node tail = null;
2861 int save = flags;
2862 root = null;
2863 int ch = next();
2864 if (ch == '?') {
2865 ch = skip();
2866 switch (ch) {
2867 case ':': // (?:xxx) pure group
2868 head = createGroup(true);
2869 tail = root;
2870 head.next = expr(tail);
2871 break;
2872 case '=': // (?=xxx) and (?!xxx) lookahead
2873 case '!':
2874 head = createGroup(true);
2875 tail = root;
2876 head.next = expr(tail);
2877 if (ch == '=') {
2878 head = tail = new Pos(head);
2879 } else {
2880 head = tail = new Neg(head);
2881 }
2882 break;
2883 case '>': // (?>xxx) independent group
2884 head = createGroup(true);
2885 tail = root;
2886 head.next = expr(tail);
2887 head = tail = new Ques(head, INDEPENDENT);
2888 break;
2889 case '<': // (?<xxx) look behind
2890 ch = read();
2891 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2892 // named captured group
2893 String name = groupname(ch);
2894 if (namedGroups().containsKey(name))
2895 throw error("Named capturing group <" + name
2896 + "> is already defined");
2897 capturingGroup = true;
2898 head = createGroup(false);
2899 tail = root;
2900 namedGroups().put(name, capturingGroupCount-1);
2901 head.next = expr(tail);
2902 break;
2903 }
2904 int start = cursor;
2905 head = createGroup(true);
2906 tail = root;
2907 head.next = expr(tail);
2911 if (info.maxValid == false) {
2912 throw error("Look-behind group does not have "
2913 + "an obvious maximum length");
2914 }
2915 boolean hasSupplementary = findSupplementary(start, patternLength);
2916 if (ch == '=') {
2917 head = tail = (hasSupplementary ?
2918 new BehindS(head, info.maxLength,
2919 info.minLength) :
2920 new Behind(head, info.maxLength,
2921 info.minLength));
2922 } else if (ch == '!') {
2923 head = tail = (hasSupplementary ?
2924 new NotBehindS(head, info.maxLength,
2925 info.minLength) :
2926 new NotBehind(head, info.maxLength,
2927 info.minLength));
2928 } else {
2929 throw error("Unknown look-behind group");
2930 }
2931 break;
2932 case '$':
2933 case '@':
2934 throw error("Unknown group type");
2935 default: // (?xxx:) inlined match flags
2936 unread();
2937 addFlag();
2938 ch = read();
2939 if (ch == ')') {
2940 return null; // Inline modifier only
2941 }
2942 if (ch != ':') {
2943 throw error("Unknown inline modifier");
2944 }
2945 head = createGroup(true);
2946 tail = root;
2947 head.next = expr(tail);
2948 break;
2949 }
2950 } else { // (xxx) a regular group
2951 capturingGroup = true;
2952 head = createGroup(false);
2953 tail = root;
2954 head.next = expr(tail);
2955 }
2956
2957 accept(')', "Unclosed group");
2958 flags = save;
2959
2960 // Check for quantifiers
2961 Node node = closure(head);
2962 if (node == head) { // No closure
2963 root = tail;
2964 return node; // Dual return
2965 }
2966 if (head == tail) { // Zero length assertion
2967 root = node;
2968 return node; // Dual return
2969 }
2970
2971 if (node instanceof Ques) {
2972 Ques ques = (Ques) node;
2973 if (ques.type == POSSESSIVE) {
2974 root = node;
2975 return node;
2976 }
2977 tail.next = new BranchConn();
2978 tail = tail.next;
2979 if (ques.type == GREEDY) {
2980 head = new Branch(head, null, tail);
2981 } else { // Reluctant quantifier
2982 head = new Branch(null, head, tail);
2983 }
2984 root = tail;
2985 return head;
2986 } else if (node instanceof Curly) {
2987 Curly curly = (Curly) node;
2988 if (curly.type == POSSESSIVE) {
2989 root = node;
2990 return node;
2991 }
2992 // Discover if the group is deterministic
2993 TreeInfo info = new TreeInfo();
2994 if (head.study(info)) { // Deterministic
2995 GroupTail temp = (GroupTail) tail;
2996 head = root = new GroupCurly(head.next, curly.cmin,
2997 curly.cmax, curly.type,
2998 ((GroupTail)tail).localIndex,
2999 ((GroupTail)tail).groupIndex,
3000 capturingGroup);
3001 return head;
3002 } else { // Non-deterministic
3003 int temp = ((GroupHead) head).localIndex;
3004 Loop loop;
3005 if (curly.type == GREEDY)
3006 loop = new Loop(this.localCount, temp);
3007 else // Reluctant Curly
3008 loop = new LazyLoop(this.localCount, temp);
3009 Prolog prolog = new Prolog(loop);
3010 this.localCount += 1;
3011 loop.cmin = curly.cmin;
3012 loop.cmax = curly.cmax;
3013 loop.body = head;
3014 tail.next = loop;
3015 root = loop;
3016 return prolog; // Dual return
3017 }
3018 }
3019 throw error("Internal logic error");
3020 }
3021
3022 /**
3023 * Create group head and tail nodes using double return. If the group is
3024 * created with anonymous true then it is a pure group and should not
3025 * affect group counting.
3026 */
3027 private Node createGroup(boolean anonymous) {
3028 int localIndex = localCount++;
3029 int groupIndex = 0;
3030 if (!anonymous)
3031 groupIndex = capturingGroupCount++;
3032 GroupHead head = new GroupHead(localIndex);
3033 root = new GroupTail(localIndex, groupIndex);
3034 if (!anonymous && groupIndex < 10)
3035 groupNodes[groupIndex] = head;
3036 return head;
3037 }
3038
3039 @SuppressWarnings("fallthrough")
3040 /**
3041 * Parses inlined match flags and set them appropriately.
3042 */
3043 private void addFlag() {
3044 int ch = peek();
3045 for (;;) {
3046 switch (ch) {
3047 case 'i':
3048 flags |= CASE_INSENSITIVE;
3049 break;
3050 case 'm':
3051 flags |= MULTILINE;
3052 break;
3053 case 's':
3102 case 'u':
3103 flags &= ~UNICODE_CASE;
3104 break;
3105 case 'c':
3106 flags &= ~CANON_EQ;
3107 break;
3108 case 'x':
3109 flags &= ~COMMENTS;
3110 break;
3111 case 'U':
3112 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3113 default:
3114 return;
3115 }
3116 ch = next();
3117 }
3118 }
3119
3120 static final int MAX_REPS = 0x7FFFFFFF;
3121
3122 static final int GREEDY = 0;
3123
3124 static final int LAZY = 1;
3125
3126 static final int POSSESSIVE = 2;
3127
3128 static final int INDEPENDENT = 3;
3129
3130 /**
3131 * Processes repetition. If the next character peeked is a quantifier
3132 * then new nodes must be appended to handle the repetition.
3133 * Prev could be a single or a group, so it could be a chain of nodes.
3134 */
3135 private Node closure(Node prev) {
3136 Node atom;
3137 int ch = peek();
3138 switch (ch) {
3139 case '?':
3140 ch = next();
3141 if (ch == '?') {
3142 next();
3143 return new Ques(prev, LAZY);
3144 } else if (ch == '+') {
3145 next();
3146 return new Ques(prev, POSSESSIVE);
3147 }
3148 return new Ques(prev, GREEDY);
3149 case '*':
3150 ch = next();
3151 if (ch == '?') {
3152 next();
3153 return new Curly(prev, 0, MAX_REPS, LAZY);
3154 } else if (ch == '+') {
3155 next();
3156 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
3157 }
3158 return new Curly(prev, 0, MAX_REPS, GREEDY);
3159 case '+':
3160 ch = next();
3161 if (ch == '?') {
3162 next();
3163 return new Curly(prev, 1, MAX_REPS, LAZY);
3164 } else if (ch == '+') {
3165 next();
3166 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
3167 }
3168 return new Curly(prev, 1, MAX_REPS, GREEDY);
3169 case '{':
3170 ch = temp[cursor+1];
3171 if (ASCII.isDigit(ch)) {
3172 skip();
3173 int cmin = 0;
3174 do {
3175 cmin = cmin * 10 + (ch - '0');
3176 } while (ASCII.isDigit(ch = read()));
3177 int cmax = cmin;
3178 if (ch == ',') {
3179 ch = read();
3180 cmax = MAX_REPS;
3181 if (ch != '}') {
3182 cmax = 0;
3183 while (ASCII.isDigit(ch)) {
3184 cmax = cmax * 10 + (ch - '0');
3185 ch = read();
3186 }
3187 }
3188 }
3189 if (ch != '}')
3190 throw error("Unclosed counted closure");
3191 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3192 throw error("Illegal repetition range");
3193 Curly curly;
3194 ch = peek();
3195 if (ch == '?') {
3196 next();
3197 curly = new Curly(prev, cmin, cmax, LAZY);
3198 } else if (ch == '+') {
3199 next();
3200 curly = new Curly(prev, cmin, cmax, POSSESSIVE);
3201 } else {
3202 curly = new Curly(prev, cmin, cmax, GREEDY);
3203 }
3204 return curly;
3205 } else {
3206 throw error("Illegal repetition");
3207 }
3208 default:
3209 return prev;
3210 }
3211 }
3212
3213 /**
3214 * Utility method for parsing control escape sequences.
3215 */
3216 private int c() {
3217 if (cursor < patternLength) {
3218 return read() ^ 64;
3219 }
3220 throw error("Illegal control escape sequence");
3221 }
3222
3359
3360 private static final int countCodePoints(CharSequence seq) {
3361 int length = seq.length();
3362 int n = 0;
3363 for (int i = 0; i < length; ) {
3364 n++;
3365 if (Character.isHighSurrogate(seq.charAt(i++))) {
3366 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3367 i++;
3368 }
3369 }
3370 }
3371 return n;
3372 }
3373
3374 /**
3375 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3376 * never matches values above Latin-1, and a complemented BitClass always
3377 * matches values above Latin-1.
3378 */
3379 private static final class BitClass extends BmpCharProperty {
3380 final boolean[] bits;
3381 BitClass() { bits = new boolean[256]; }
3382 private BitClass(boolean[] bits) { this.bits = bits; }
3383 BitClass add(int c, int flags) {
3384 assert c >= 0 && c <= 255;
3385 if ((flags & CASE_INSENSITIVE) != 0) {
3386 if (ASCII.isAscii(c)) {
3387 bits[ASCII.toUpper(c)] = true;
3388 bits[ASCII.toLower(c)] = true;
3389 } else if ((flags & UNICODE_CASE) != 0) {
3390 bits[Character.toLowerCase(c)] = true;
3391 bits[Character.toUpperCase(c)] = true;
3392 }
3393 }
3394 bits[c] = true;
3395 return this;
3396 }
3397 boolean isSatisfiedBy(int ch) {
3398 return ch < 256 && bits[ch];
3399 }
3400 }
3401
3402 /**
3403 * Returns a suitably optimized, single character matcher.
3404 */
3405 private CharProperty newSingle(final int ch) {
3406 if (has(CASE_INSENSITIVE)) {
3407 int lower, upper;
3408 if (has(UNICODE_CASE)) {
3409 upper = Character.toUpperCase(ch);
3410 lower = Character.toLowerCase(upper);
3411 if (upper != lower)
3412 return new SingleU(lower);
3413 } else if (ASCII.isAscii(ch)) {
3414 lower = ASCII.toLower(ch);
3415 upper = ASCII.toUpper(ch);
3416 if (lower != upper)
3417 return new SingleI(lower, upper);
3418 }
3419 }
3420 if (isSupplementary(ch))
3421 return new SingleS(ch); // Match a given Unicode character
3422 return new Single(ch); // Match a given BMP character
3423 }
3424
3425 /**
3426 * Utility method for creating a string slice matcher.
3427 */
3428 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3429 int[] tmp = new int[count];
3430 if (has(CASE_INSENSITIVE)) {
3431 if (has(UNICODE_CASE)) {
3432 for (int i = 0; i < count; i++) {
3433 tmp[i] = Character.toLowerCase(
3434 Character.toUpperCase(buf[i]));
3435 }
3436 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3437 }
3438 for (int i = 0; i < count; i++) {
3439 tmp[i] = ASCII.toLower(buf[i]);
3440 }
3441 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3442 }
3810 if (i < matcher.to && seq.charAt(i) == 0x0A)
3811 i++;
3812 return next.match(matcher, i, seq);
3813 }
3814 } else {
3815 matcher.hitEnd = true;
3816 }
3817 return false;
3818 }
3819 boolean study(TreeInfo info) {
3820 info.minLength++;
3821 info.maxLength += 2;
3822 return next.study(info);
3823 }
3824 }
3825
3826 /**
3827 * Abstract node class to match one character satisfying some
3828 * boolean property.
3829 */
3830 private abstract static class CharProperty extends Node {
3831 abstract boolean isSatisfiedBy(int ch);
3832 CharProperty complement() {
3833 return new CharProperty() {
3834 boolean isSatisfiedBy(int ch) {
3835 return ! CharProperty.this.isSatisfiedBy(ch);}};
3836 }
3837 boolean match(Matcher matcher, int i, CharSequence seq) {
3838 if (i < matcher.to) {
3839 int ch = Character.codePointAt(seq, i);
3840 return isSatisfiedBy(ch)
3841 && next.match(matcher, i+Character.charCount(ch), seq);
3842 } else {
3843 matcher.hitEnd = true;
3844 return false;
3845 }
3846 }
3847 boolean study(TreeInfo info) {
3848 info.minLength++;
3849 info.maxLength++;
3850 return next.study(info);
3851 }
3852 }
3853
3854 /**
3855 * Optimized version of CharProperty that works only for
3856 * properties never satisfied by Supplementary characters.
3857 */
3858 private abstract static class BmpCharProperty extends CharProperty {
3859 boolean match(Matcher matcher, int i, CharSequence seq) {
3860 if (i < matcher.to) {
3861 return isSatisfiedBy(seq.charAt(i))
3862 && next.match(matcher, i+1, seq);
3863 } else {
3864 matcher.hitEnd = true;
3865 return false;
3866 }
3867 }
3868 }
3869
3870 /**
3871 * Node class that matches a Supplementary Unicode character
3872 */
3873 static final class SingleS extends CharProperty {
3874 final int c;
3875 SingleS(int c) { this.c = c; }
3876 boolean isSatisfiedBy(int ch) {
3877 return ch == c;
3878 }
3879 }
3880
3881 /**
3882 * Optimization -- matches a given BMP character
3883 */
3884 static final class Single extends BmpCharProperty {
3885 final int c;
3886 Single(int c) { this.c = c; }
3887 boolean isSatisfiedBy(int ch) {
3888 return ch == c;
3889 }
3890 }
3891
3892 /**
3893 * Case insensitive matches a given BMP character
3894 */
3895 static final class SingleI extends BmpCharProperty {
3896 final int lower;
3897 final int upper;
3898 SingleI(int lower, int upper) {
3899 this.lower = lower;
3900 this.upper = upper;
3901 }
3902 boolean isSatisfiedBy(int ch) {
3903 return ch == lower || ch == upper;
3904 }
3905 }
3906
3907 /**
3908 * Unicode case insensitive matches a given Unicode character
3909 */
3910 static final class SingleU extends CharProperty {
3911 final int lower;
3912 SingleU(int lower) {
3913 this.lower = lower;
3914 }
3915 boolean isSatisfiedBy(int ch) {
3916 return lower == ch ||
3917 lower == Character.toLowerCase(Character.toUpperCase(ch));
3918 }
3919 }
3920
3921 /**
3922 * Node class that matches a Unicode block.
3923 */
3924 static final class Block extends CharProperty {
3925 final Character.UnicodeBlock block;
3926 Block(Character.UnicodeBlock block) {
3927 this.block = block;
3928 }
3929 boolean isSatisfiedBy(int ch) {
3930 return block == Character.UnicodeBlock.of(ch);
3931 }
3932 }
3933
3934 /**
3935 * Node class that matches a Unicode script
3936 */
3937 static final class Script extends CharProperty {
3938 final Character.UnicodeScript script;
3939 Script(Character.UnicodeScript script) {
3940 this.script = script;
3941 }
3942 boolean isSatisfiedBy(int ch) {
3943 return script == Character.UnicodeScript.of(ch);
3944 }
3945 }
3946
3947 /**
3948 * Node class that matches a Unicode category.
3949 */
3950 static final class Category extends CharProperty {
3951 final int typeMask;
3952 Category(int typeMask) { this.typeMask = typeMask; }
3953 boolean isSatisfiedBy(int ch) {
3954 return (typeMask & (1 << Character.getType(ch))) != 0;
3955 }
3956 }
3957
3958 /**
3959 * Node class that matches a Unicode "type"
3960 */
3961 static final class Utype extends CharProperty {
3962 final UnicodeProp uprop;
3963 Utype(UnicodeProp uprop) { this.uprop = uprop; }
3964 boolean isSatisfiedBy(int ch) {
3965 return uprop.is(ch);
3966 }
3967 }
3968
3969 /**
3970 * Node class that matches a POSIX type.
3971 */
3972 static final class Ctype extends BmpCharProperty {
3973 final int ctype;
3974 Ctype(int ctype) { this.ctype = ctype; }
3975 boolean isSatisfiedBy(int ch) {
3976 return ch < 128 && ASCII.isType(ch, ctype);
3977 }
3978 }
3979
3980 /**
3981 * Node class that matches a Perl vertical whitespace
3982 */
3983 static final class VertWS extends BmpCharProperty {
3984 boolean isSatisfiedBy(int cp) {
3985 return (cp >= 0x0A && cp <= 0x0D) ||
3986 cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987 }
3988 }
3989
3990 /**
3991 * Node class that matches a Perl horizontal whitespace
3992 */
3993 static final class HorizWS extends BmpCharProperty {
3994 boolean isSatisfiedBy(int cp) {
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996 cp == 0x1680 || cp == 0x180e ||
3997 cp >= 0x2000 && cp <= 0x200a ||
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999 }
4000 }
4001
4002 /**
4003 * Node class that matches an unicode extended grapheme cluster
4004 */
4005 static class XGrapheme extends Node {
4006 boolean match(Matcher matcher, int i, CharSequence seq) {
4007 if (i < matcher.to) {
4008 int ch0 = Character.codePointAt(seq, i);
4009 i += Character.charCount(ch0);
4010 while (i < matcher.to) {
4011 int ch1 = Character.codePointAt(seq, i);
4012 if (Grapheme.isBoundary(ch0, ch1))
4013 break;
4014 ch0 = ch1;
4015 i += Character.charCount(ch1);
4016 }
4017 return next.match(matcher, i, seq);
4018 }
4200 return false;
4201 }
4202 }
4203 return next.match(matcher, x, seq);
4204 }
4205 }
4206
4207 /**
4208 * Node class for a case insensitive sequence of literal characters.
4209 * Uses unicode case folding.
4210 */
4211 static final class SliceUS extends SliceIS {
4212 SliceUS(int[] buf) {
4213 super(buf);
4214 }
4215 int toLower(int c) {
4216 return Character.toLowerCase(Character.toUpperCase(c));
4217 }
4218 }
4219
4220 private static boolean inRange(int lower, int ch, int upper) {
4221 return lower <= ch && ch <= upper;
4222 }
4223
4224 /**
4225 * Returns node for matching characters within an explicit value range.
4226 */
4227 private static CharProperty rangeFor(final int lower,
4228 final int upper) {
4229 return new CharProperty() {
4230 boolean isSatisfiedBy(int ch) {
4231 return inRange(lower, ch, upper);}};
4232 }
4233
4234 /**
4235 * Returns node for matching characters within an explicit value
4236 * range in a case insensitive manner.
4237 */
4238 private CharProperty caseInsensitiveRangeFor(final int lower,
4239 final int upper) {
4240 if (has(UNICODE_CASE))
4241 return new CharProperty() {
4242 boolean isSatisfiedBy(int ch) {
4243 if (inRange(lower, ch, upper))
4244 return true;
4245 int up = Character.toUpperCase(ch);
4246 return inRange(lower, up, upper) ||
4247 inRange(lower, Character.toLowerCase(up), upper);}};
4248 return new CharProperty() {
4249 boolean isSatisfiedBy(int ch) {
4250 return inRange(lower, ch, upper) ||
4251 ASCII.isAscii(ch) &&
4252 (inRange(lower, ASCII.toUpper(ch), upper) ||
4253 inRange(lower, ASCII.toLower(ch), upper));
4254 }};
4255 }
4256
4257 /**
4258 * Implements the Unicode category ALL and the dot metacharacter when
4259 * in dotall mode.
4260 */
4261 static final class All extends CharProperty {
4262 boolean isSatisfiedBy(int ch) {
4263 return true;
4264 }
4265 }
4266
4267 /**
4268 * Node class for the dot metacharacter when dotall is not enabled.
4269 */
4270 static final class Dot extends CharProperty {
4271 boolean isSatisfiedBy(int ch) {
4272 return (ch != '\n' && ch != '\r'
4273 && (ch|1) != '\u2029'
4274 && ch != '\u0085');
4275 }
4276 }
4277
4278 /**
4279 * Node class for the dot metacharacter when dotall is not enabled
4280 * but UNIX_LINES is enabled.
4281 */
4282 static final class UnixDot extends CharProperty {
4283 boolean isSatisfiedBy(int ch) {
4284 return ch != '\n';
4285 }
4286 }
4287
4288 /**
4289 * The 0 or 1 quantifier. This one class implements all three types.
4290 */
4291 static final class Ques extends Node {
4292 Node atom;
4293 int type;
4294 Ques(Node node, int type) {
4295 this.atom = node;
4296 this.type = type;
4297 }
4298 boolean match(Matcher matcher, int i, CharSequence seq) {
4299 switch (type) {
4300 case GREEDY:
4301 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4302 || next.match(matcher, i, seq);
4303 case LAZY:
4304 return next.match(matcher, i, seq)
4305 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4306 case POSSESSIVE:
4307 if (atom.match(matcher, i, seq)) i = matcher.last;
4308 return next.match(matcher, i, seq);
4309 default:
4310 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4311 }
4312 }
4313 boolean study(TreeInfo info) {
4314 if (type != INDEPENDENT) {
4315 int minL = info.minLength;
4316 atom.study(info);
4317 info.minLength = minL;
4318 info.deterministic = false;
4319 return next.study(info);
4320 } else {
4321 atom.study(info);
4322 return next.study(info);
4323 }
4324 }
4325 }
4326
4327 /**
4328 * Handles the curly-brace style repetition with a specified minimum and
4329 * maximum occurrences. The * quantifier is handled as a special case.
4330 * This class handles the three types.
4331 */
4332 static final class Curly extends Node {
4333 Node atom;
4334 int type;
4335 int cmin;
4336 int cmax;
4337
4338 Curly(Node node, int cmin, int cmax, int type) {
4339 this.atom = node;
4340 this.type = type;
4341 this.cmin = cmin;
4342 this.cmax = cmax;
4343 }
4344 boolean match(Matcher matcher, int i, CharSequence seq) {
4345 int j;
4346 for (j = 0; j < cmin; j++) {
4347 if (atom.match(matcher, i, seq)) {
4348 i = matcher.last;
4349 continue;
4350 }
4351 return false;
4352 }
4353 if (type == GREEDY)
4354 return match0(matcher, i, j, seq);
4355 else if (type == LAZY)
4356 return match1(matcher, i, j, seq);
4357 else
4358 return match2(matcher, i, j, seq);
4359 }
4360 // Greedy match.
4361 // i is the index to start matching at
4362 // j is the number of atoms that have matched
4363 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4364 if (j >= cmax) {
4365 // We have matched the maximum... continue with the rest of
4366 // the regular expression
4367 return next.match(matcher, i, seq);
4368 }
4369 int backLimit = j;
4370 while (atom.match(matcher, i, seq)) {
4371 // k is the length of this match
4372 int k = matcher.last - i;
4373 if (k == 0) // Zero length match
4374 break;
4375 // Move up index and number matched
4457 }
4458
4459 if (info.deterministic && cmin == cmax)
4460 info.deterministic = detm;
4461 else
4462 info.deterministic = false;
4463 return next.study(info);
4464 }
4465 }
4466
4467 /**
4468 * Handles the curly-brace style repetition with a specified minimum and
4469 * maximum occurrences in deterministic cases. This is an iterative
4470 * optimization over the Prolog and Loop system which would handle this
4471 * in a recursive way. The * quantifier is handled as a special case.
4472 * If capture is true then this class saves group settings and ensures
4473 * that groups are unset when backing off of a group match.
4474 */
4475 static final class GroupCurly extends Node {
4476 Node atom;
4477 int type;
4478 int cmin;
4479 int cmax;
4480 int localIndex;
4481 int groupIndex;
4482 boolean capture;
4483
4484 GroupCurly(Node node, int cmin, int cmax, int type, int local,
4485 int group, boolean capture) {
4486 this.atom = node;
4487 this.type = type;
4488 this.cmin = cmin;
4489 this.cmax = cmax;
4490 this.localIndex = local;
4491 this.groupIndex = group;
4492 this.capture = capture;
4493 }
4494 boolean match(Matcher matcher, int i, CharSequence seq) {
4495 int[] groups = matcher.groups;
4496 int[] locals = matcher.locals;
4497 int save0 = locals[localIndex];
4498 int save1 = 0;
4499 int save2 = 0;
4500
4501 if (capture) {
4502 save1 = groups[groupIndex];
4503 save2 = groups[groupIndex+1];
4504 }
4505
4506 // Notify GroupTail there is no need to setup group info
4507 // because it will be set here
4508 locals[localIndex] = -1;
4509
4510 boolean ret = true;
4511 for (int j = 0; j < cmin; j++) {
4512 if (atom.match(matcher, i, seq)) {
4513 if (capture) {
4514 groups[groupIndex] = i;
4515 groups[groupIndex+1] = matcher.last;
4516 }
4517 i = matcher.last;
4518 } else {
4519 ret = false;
4520 break;
4521 }
4522 }
4523 if (ret) {
4524 if (type == GREEDY) {
4525 ret = match0(matcher, i, cmin, seq);
4526 } else if (type == LAZY) {
4527 ret = match1(matcher, i, cmin, seq);
4528 } else {
4529 ret = match2(matcher, i, cmin, seq);
4530 }
4531 }
4532 if (!ret) {
4533 locals[localIndex] = save0;
4534 if (capture) {
4535 groups[groupIndex] = save1;
4536 groups[groupIndex+1] = save2;
4537 }
4538 }
4539 return ret;
4540 }
4541 // Aggressive group match
4542 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4543 // don't back off passing the starting "j"
4544 int min = j;
4545 int[] groups = matcher.groups;
4546 int save0 = 0;
4752
4753 info.minLength += minL;
4754 info.maxLength += maxL;
4755 info.maxValid &= maxV;
4756 info.deterministic = false;
4757 return false;
4758 }
4759 }
4760
4761 /**
4762 * The GroupHead saves the location where the group begins in the locals
4763 * and restores them when the match is done.
4764 *
4765 * The matchRef is used when a reference to this group is accessed later
4766 * in the expression. The locals will have a negative value in them to
4767 * indicate that we do not want to unset the group if the reference
4768 * doesn't match.
4769 */
4770 static final class GroupHead extends Node {
4771 int localIndex;
4772 GroupHead(int localCount) {
4773 localIndex = localCount;
4774 }
4775 boolean match(Matcher matcher, int i, CharSequence seq) {
4776 int save = matcher.locals[localIndex];
4777 matcher.locals[localIndex] = i;
4778 boolean ret = next.match(matcher, i, seq);
4779 matcher.locals[localIndex] = save;
4780 return ret;
4781 }
4782 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4783 int save = matcher.locals[localIndex];
4784 matcher.locals[localIndex] = ~i; // HACK
4785 boolean ret = next.match(matcher, i, seq);
4786 matcher.locals[localIndex] = save;
4787 return ret;
4788 }
4789 }
4790
4791 /**
4859 }
4860 boolean match(Matcher matcher, int i, CharSequence seq) {
4861 return loop.matchInit(matcher, i, seq);
4862 }
4863 boolean study(TreeInfo info) {
4864 return loop.study(info);
4865 }
4866 }
4867
4868 /**
4869 * Handles the repetition count for a greedy Curly. The matchInit
4870 * is called from the Prolog to save the index of where the group
4871 * beginning is stored. A zero length group check occurs in the
4872 * normal match but is skipped in the matchInit.
4873 */
4874 static class Loop extends Node {
4875 Node body;
4876 int countIndex; // local count index in matcher locals
4877 int beginIndex; // group beginning index
4878 int cmin, cmax;
4879 Loop(int countIndex, int beginIndex) {
4880 this.countIndex = countIndex;
4881 this.beginIndex = beginIndex;
4882 }
4883 boolean match(Matcher matcher, int i, CharSequence seq) {
4884 // Avoid infinite loop in zero-length case.
4885 if (i > matcher.locals[beginIndex]) {
4886 int count = matcher.locals[countIndex];
4887
4888 // This block is for before we reach the minimum
4889 // iterations required for the loop to match
4890 if (count < cmin) {
4891 matcher.locals[countIndex] = count + 1;
4892 boolean b = body.match(matcher, i, seq);
4893 // If match failed we must backtrack, so
4894 // the loop count should NOT be incremented
4895 if (!b)
4896 matcher.locals[countIndex] = count;
4897 // Return success or failure since we are under
4898 // minimum
4899 return b;
4900 }
4901 // This block is for after we have the minimum
4902 // iterations required for the loop to match
4903 if (count < cmax) {
4904 matcher.locals[countIndex] = count + 1;
4905 boolean b = body.match(matcher, i, seq);
4906 // If match failed we must backtrack, so
4907 // the loop count should NOT be incremented
4908 if (!b)
4909 matcher.locals[countIndex] = count;
4910 else
4911 return true;
4912 }
4913 }
4914 return next.match(matcher, i, seq);
4915 }
4916 boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4917 int save = matcher.locals[countIndex];
4918 boolean ret = false;
4919 if (0 < cmin) {
4920 matcher.locals[countIndex] = 1;
4921 ret = body.match(matcher, i, seq);
4922 } else if (0 < cmax) {
4923 matcher.locals[countIndex] = 1;
4924 ret = body.match(matcher, i, seq);
4925 if (ret == false)
4926 ret = next.match(matcher, i, seq);
4927 } else {
4928 ret = next.match(matcher, i, seq);
4929 }
4930 matcher.locals[countIndex] = save;
4931 return ret;
4932 }
4933 boolean study(TreeInfo info) {
4934 info.maxValid = false;
4935 info.deterministic = false;
4936 return false;
4937 }
4938 }
5345 int startIndex = (!matcher.transparentBounds) ?
5346 matcher.from : 0;
5347 int from = Math.max(i - rmaxChars, startIndex);
5348 matcher.lookbehindTo = i;
5349 // Relax transparent region boundaries for lookbehind
5350 if (matcher.transparentBounds)
5351 matcher.from = 0;
5352 for (int j = i - rminChars;
5353 !conditionMatched && j >= from;
5354 j -= j>from ? countChars(seq, j, -1) : 1) {
5355 conditionMatched = cond.match(matcher, j, seq);
5356 }
5357 //Reinstate region boundaries
5358 matcher.from = savedFrom;
5359 matcher.lookbehindTo = savedLBT;
5360 return !conditionMatched && next.match(matcher, i, seq);
5361 }
5362 }
5363
5364 /**
5365 * Returns the set union of two CharProperty nodes.
5366 */
5367 private static CharProperty union(final CharProperty lhs,
5368 final CharProperty rhs) {
5369 return new CharProperty() {
5370 boolean isSatisfiedBy(int ch) {
5371 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
5372 }
5373
5374 /**
5375 * Returns the set intersection of two CharProperty nodes.
5376 */
5377 private static CharProperty intersection(final CharProperty lhs,
5378 final CharProperty rhs) {
5379 return new CharProperty() {
5380 boolean isSatisfiedBy(int ch) {
5381 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
5382 }
5383
5384 /**
5385 * Returns the set difference of two CharProperty nodes.
5386 */
5387 private static CharProperty setDifference(final CharProperty lhs,
5388 final CharProperty rhs) {
5389 return new CharProperty() {
5390 boolean isSatisfiedBy(int ch) {
5391 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
5392 }
5393
5394 /**
5395 * Handles word boundaries. Includes a field to allow this one class to
5396 * deal with the different types of word boundaries we can match. The word
5397 * characters include underscores, letters, and digits. Non spacing marks
5398 * can are also part of a word if they have a base character, otherwise
5399 * they are ignored for purposes of finding word boundaries.
5400 */
5401 static final class Bound extends Node {
5402 static int LEFT = 0x1;
5403 static int RIGHT= 0x2;
5404 static int BOTH = 0x3;
5405 static int NONE = 0x4;
5406 int type;
5407 boolean useUWORD;
5408 Bound(int n, boolean useUWORD) {
5409 type = n;
5410 this.useUWORD = useUWORD;
5411 }
5412
5413 boolean isWord(int ch) {
5414 return useUWORD ? UnicodeProp.WORD.is(ch)
5415 : (ch == '_' || Character.isLetterOrDigit(ch));
5416 }
5417
5418 int check(Matcher matcher, int i, CharSequence seq) {
5419 int ch;
5420 boolean left = false;
5421 int startIndex = matcher.from;
5422 int endIndex = matcher.to;
5423 if (matcher.transparentBounds) {
5424 startIndex = 0;
5425 endIndex = matcher.getTextLength();
5426 }
5427 if (i > startIndex) {
5428 ch = Character.codePointBefore(seq, i);
5429 left = (isWord(ch) ||
5430 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5431 && hasBaseCharacter(matcher, i-1, seq)));
5432 }
5433 boolean right = false;
5434 if (i < endIndex) {
5640 i += countChars(seq, i, n);
5641 continue NEXT;
5642 }
5643 }
5644 // Entire pattern matched starting at i
5645 matcher.first = i;
5646 boolean ret = next.match(matcher, i + lengthInChars, seq);
5647 if (ret) {
5648 matcher.first = i;
5649 matcher.groups[0] = matcher.first;
5650 matcher.groups[1] = matcher.last;
5651 return true;
5652 }
5653 i += countChars(seq, i, 1);
5654 }
5655 matcher.hitEnd = true;
5656 return false;
5657 }
5658 }
5659
5660 ///////////////////////////////////////////////////////////////////////////////
5661 ///////////////////////////////////////////////////////////////////////////////
5662
5663 /**
5664 * This must be the very first initializer.
5665 */
5666 static Node accept = new Node();
5667
5668 static Node lastAccept = new LastNode();
5669
5670 private static class CharPropertyNames {
5671
5672 static CharProperty charPropertyFor(String name) {
5673 CharPropertyFactory m = map.get(name);
5674 return m == null ? null : m.make();
5675 }
5676
5677 private abstract static class CharPropertyFactory {
5678 abstract CharProperty make();
5679 }
5680
5681 private static void defCategory(String name,
5682 final int typeMask) {
5683 map.put(name, new CharPropertyFactory() {
5684 CharProperty make() { return new Category(typeMask);}});
5685 }
5686
5687 private static void defRange(String name,
5688 final int lower, final int upper) {
5689 map.put(name, new CharPropertyFactory() {
5690 CharProperty make() { return rangeFor(lower, upper);}});
5691 }
5692
5693 private static void defCtype(String name,
5694 final int ctype) {
5695 map.put(name, new CharPropertyFactory() {
5696 CharProperty make() { return new Ctype(ctype);}});
5697 }
5698
5699 private abstract static class CloneableProperty
5700 extends CharProperty implements Cloneable
5701 {
5702 public CloneableProperty clone() {
5703 try {
5704 return (CloneableProperty) super.clone();
5705 } catch (CloneNotSupportedException e) {
5706 throw new AssertionError(e);
5707 }
5708 }
5709 }
5710
5711 private static void defClone(String name,
5712 final CloneableProperty p) {
5713 map.put(name, new CharPropertyFactory() {
5714 CharProperty make() { return p.clone();}});
5715 }
5716
5717 private static final HashMap<String, CharPropertyFactory> map
5718 = new HashMap<>();
5719
5720 static {
5721 // Unicode character property aliases, defined in
5722 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5723 defCategory("Cn", 1<<Character.UNASSIGNED);
5724 defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
5725 defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
5726 defCategory("Lt", 1<<Character.TITLECASE_LETTER);
5727 defCategory("Lm", 1<<Character.MODIFIER_LETTER);
5728 defCategory("Lo", 1<<Character.OTHER_LETTER);
5729 defCategory("Mn", 1<<Character.NON_SPACING_MARK);
5730 defCategory("Me", 1<<Character.ENCLOSING_MARK);
5731 defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
5732 defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
5733 defCategory("Nl", 1<<Character.LETTER_NUMBER);
5734 defCategory("No", 1<<Character.OTHER_NUMBER);
5735 defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
5736 defCategory("Zl", 1<<Character.LINE_SEPARATOR);
5737 defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
5738 defCategory("Cc", 1<<Character.CONTROL);
5739 defCategory("Cf", 1<<Character.FORMAT);
5740 defCategory("Co", 1<<Character.PRIVATE_USE);
5741 defCategory("Cs", 1<<Character.SURROGATE);
5742 defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
5743 defCategory("Ps", 1<<Character.START_PUNCTUATION);
5744 defCategory("Pe", 1<<Character.END_PUNCTUATION);
5745 defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
5746 defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
5747 defCategory("Sm", 1<<Character.MATH_SYMBOL);
5748 defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
5749 defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
5750 defCategory("So", 1<<Character.OTHER_SYMBOL);
5751 defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
5752 defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
5753 defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
5754 (1<<Character.LOWERCASE_LETTER) |
5755 (1<<Character.TITLECASE_LETTER) |
5756 (1<<Character.MODIFIER_LETTER) |
5757 (1<<Character.OTHER_LETTER)));
5758 defCategory("M", ((1<<Character.NON_SPACING_MARK) |
5759 (1<<Character.ENCLOSING_MARK) |
5760 (1<<Character.COMBINING_SPACING_MARK)));
5761 defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
5762 (1<<Character.LETTER_NUMBER) |
5763 (1<<Character.OTHER_NUMBER)));
5764 defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
5765 (1<<Character.LINE_SEPARATOR) |
5766 (1<<Character.PARAGRAPH_SEPARATOR)));
5767 defCategory("C", ((1<<Character.CONTROL) |
5768 (1<<Character.FORMAT) |
5769 (1<<Character.PRIVATE_USE) |
5770 (1<<Character.SURROGATE))); // Other
5771 defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
5772 (1<<Character.START_PUNCTUATION) |
5773 (1<<Character.END_PUNCTUATION) |
5774 (1<<Character.CONNECTOR_PUNCTUATION) |
5775 (1<<Character.OTHER_PUNCTUATION) |
5776 (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
5777 (1<<Character.FINAL_QUOTE_PUNCTUATION)));
5778 defCategory("S", ((1<<Character.MATH_SYMBOL) |
5779 (1<<Character.CURRENCY_SYMBOL) |
5780 (1<<Character.MODIFIER_SYMBOL) |
5781 (1<<Character.OTHER_SYMBOL)));
5782 defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
5783 (1<<Character.LOWERCASE_LETTER) |
5784 (1<<Character.TITLECASE_LETTER)));
5785 defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
5786 (1<<Character.LOWERCASE_LETTER) |
5787 (1<<Character.TITLECASE_LETTER) |
5788 (1<<Character.MODIFIER_LETTER) |
5789 (1<<Character.OTHER_LETTER) |
5790 (1<<Character.DECIMAL_DIGIT_NUMBER)));
5791 defRange("L1", 0x00, 0xFF); // Latin-1
5792 map.put("all", new CharPropertyFactory() {
5793 CharProperty make() { return new All(); }});
5794
5795 // Posix regular expression character classes, defined in
5796 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5797 defRange("ASCII", 0x00, 0x7F); // ASCII
5798 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5799 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5800 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5801 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5802 defRange("Digit", '0', '9'); // Numeric characters
5803 defCtype("Graph", ASCII.GRAPH); // printable and visible
5804 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5805 defRange("Print", 0x20, 0x7E); // Printable characters
5806 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5807 defCtype("Space", ASCII.SPACE); // Space characters
5808 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5809 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5810
5811 // Java character properties, defined by methods in Character.java
5812 defClone("javaLowerCase", new CloneableProperty() {
5813 boolean isSatisfiedBy(int ch) {
5814 return Character.isLowerCase(ch);}});
5815 defClone("javaUpperCase", new CloneableProperty() {
5816 boolean isSatisfiedBy(int ch) {
5817 return Character.isUpperCase(ch);}});
5818 defClone("javaAlphabetic", new CloneableProperty() {
5819 boolean isSatisfiedBy(int ch) {
5820 return Character.isAlphabetic(ch);}});
5821 defClone("javaIdeographic", new CloneableProperty() {
5822 boolean isSatisfiedBy(int ch) {
5823 return Character.isIdeographic(ch);}});
5824 defClone("javaTitleCase", new CloneableProperty() {
5825 boolean isSatisfiedBy(int ch) {
5826 return Character.isTitleCase(ch);}});
5827 defClone("javaDigit", new CloneableProperty() {
5828 boolean isSatisfiedBy(int ch) {
5829 return Character.isDigit(ch);}});
5830 defClone("javaDefined", new CloneableProperty() {
5831 boolean isSatisfiedBy(int ch) {
5832 return Character.isDefined(ch);}});
5833 defClone("javaLetter", new CloneableProperty() {
5834 boolean isSatisfiedBy(int ch) {
5835 return Character.isLetter(ch);}});
5836 defClone("javaLetterOrDigit", new CloneableProperty() {
5837 boolean isSatisfiedBy(int ch) {
5838 return Character.isLetterOrDigit(ch);}});
5839 defClone("javaJavaIdentifierStart", new CloneableProperty() {
5840 boolean isSatisfiedBy(int ch) {
5841 return Character.isJavaIdentifierStart(ch);}});
5842 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5843 boolean isSatisfiedBy(int ch) {
5844 return Character.isJavaIdentifierPart(ch);}});
5845 defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
5846 boolean isSatisfiedBy(int ch) {
5847 return Character.isUnicodeIdentifierStart(ch);}});
5848 defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
5849 boolean isSatisfiedBy(int ch) {
5850 return Character.isUnicodeIdentifierPart(ch);}});
5851 defClone("javaIdentifierIgnorable", new CloneableProperty() {
5852 boolean isSatisfiedBy(int ch) {
5853 return Character.isIdentifierIgnorable(ch);}});
5854 defClone("javaSpaceChar", new CloneableProperty() {
5855 boolean isSatisfiedBy(int ch) {
5856 return Character.isSpaceChar(ch);}});
5857 defClone("javaWhitespace", new CloneableProperty() {
5858 boolean isSatisfiedBy(int ch) {
5859 return Character.isWhitespace(ch);}});
5860 defClone("javaISOControl", new CloneableProperty() {
5861 boolean isSatisfiedBy(int ch) {
5862 return Character.isISOControl(ch);}});
5863 defClone("javaMirrored", new CloneableProperty() {
5864 boolean isSatisfiedBy(int ch) {
5865 return Character.isMirrored(ch);}});
5866 }
5867 }
5868
5869 /**
5870 * Creates a predicate which can be used to match a string.
5871 *
5872 * @return The predicate which can be used for matching on a string
5873 * @since 1.8
5874 */
5875 public Predicate<String> asPredicate() {
5876 return s -> matcher(s).find();
5877 }
5878
5879 /**
5880 * Creates a stream from the given input sequence around matches of this
5881 * pattern.
5882 *
5883 * <p> The stream returned by this method contains each substring of the
5884 * input sequence that is terminated by another subsequence that matches
5885 * this pattern or is terminated by the end of the input sequence. The
5886 * substrings in the stream are in the order in which they occur in the
5887 * input. Trailing empty strings will be discarded and not encountered in
5888 * the stream.
5889 *
|
1 /*
2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.text.Normalizer;
29 import java.text.Normalizer.Form;
30 import java.util.Locale;
31 import java.util.Iterator;
32 import java.util.Map;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.LinkedHashSet;
36 import java.util.List;
37 import java.util.Set;
38 import java.util.Arrays;
39 import java.util.NoSuchElementException;
40 import java.util.Spliterator;
41 import java.util.Spliterators;
42 import java.util.function.Predicate;
43 import java.util.stream.Stream;
44 import java.util.stream.StreamSupport;
45
46
47 /**
48 * A compiled representation of a regular expression.
49 *
50 * <p> A regular expression, specified as a string, must first be compiled into
51 * an instance of this class. The resulting pattern can then be used to create
52 * a {@link Matcher} object that can match arbitrary {@linkplain
53 * java.lang.CharSequence character sequences} against the regular
54 * expression. All of the state involved in performing a match resides in the
55 * matcher, so many matchers can share the same pattern.
56 *
57 * <p> A typical invocation sequence is thus
971
972 /**
973 * The starting point of state machine for the find operation. This allows
974 * a match to start anywhere in the input.
975 */
976 transient Node root;
977
978 /**
979 * The root of object tree for a match operation. The pattern is matched
980 * at the beginning. This may include a find that uses BnM or a First
981 * node.
982 */
983 transient Node matchRoot;
984
985 /**
986 * Temporary storage used by parsing pattern slice.
987 */
988 transient int[] buffer;
989
990 /**
991 * A temporary storage used for predicate for double return.
992 */
993 transient CharPredicate predicate;
994
995 /**
996 * Map the "name" of the "named capturing group" to its group id
997 * node.
998 */
999 transient volatile Map<String, Integer> namedGroups;
1000
1001 /**
1002 * Temporary storage used while parsing group references.
1003 */
1004 transient GroupHead[] groupNodes;
1005
1006 /**
1007 * Temporary storage used to store the top level closure nodes.
1008 */
1009 transient List<Node> topClosureNodes;
1010
1011 /**
1012 * The number of top greedy closure nodes in this Pattern. Used by
1013 * matchers to allocate storage needed for a IntHashSet to keep the
1014 * beginning pos {@code i} of all failed match.
1015 */
1016 transient int localTCNCount;
1017
1018 /*
1019 * Turn off the stop-exponential-backtracking optimization if there
1020 * is a group ref in the pattern.
1021 */
1022 transient boolean hasGroupRef;
1023
1024 /**
1025 * Temporary null terminated code point array used by pattern compiling.
1026 */
1027 private transient int[] temp;
1028
1029 /**
1030 * The number of capturing groups in this Pattern. Used by matchers to
1031 * allocate storage needed to perform a match.
1032 */
1033 transient int capturingGroupCount;
1034
1035 /**
1036 * The local variable count used by parsing tree. Used by matchers to
1037 * allocate storage needed to perform a match.
1038 */
1039 transient int localCount;
1040
1041 /**
1042 * Index into the pattern string that keeps track of how much has been
1043 * parsed.
1044 */
1045 private transient int cursor;
1046
1047 /**
1048 * Holds the length of the pattern string.
1049 */
1050 private transient int patternLength;
1051
1052 /**
1053 * If the Start node might possibly match supplementary characters.
1054 * It is set to true during compiling if
1055 * (1) There is supplementary char in pattern, or
1056 * (2) There is complement node of a "family" CharProperty
1057 */
1058 private transient boolean hasSupplementary;
1059
1060 /**
1061 * Compiles the given regular expression into a pattern.
1062 *
1063 * @param regex
1064 * The expression to be compiled
1065 * @return the given regular expression compiled into a pattern
1066 * @throws PatternSyntaxException
1067 * If the expression's syntax is invalid
1068 */
1069 public static Pattern compile(String regex) {
1070 return new Pattern(regex, 0);
1071 }
1072
1073 /**
1074 * Compiles the given regular expression into a pattern with the given
1075 * flags.
1076 *
1348 } while ((slashEIndex = s.indexOf("\\E", current)) != -1);
1349
1350 return sb.append(s, current, s.length())
1351 .append("\\E")
1352 .toString();
1353 }
1354
1355 /**
1356 * Recompile the Pattern instance from a stream. The original pattern
1357 * string is read in and the object tree is recompiled from it.
1358 */
1359 private void readObject(java.io.ObjectInputStream s)
1360 throws java.io.IOException, ClassNotFoundException {
1361
1362 // Read in all fields
1363 s.defaultReadObject();
1364
1365 // Initialize counts
1366 capturingGroupCount = 1;
1367 localCount = 0;
1368 localTCNCount = 0;
1369
1370 // if length > 0, the Pattern is lazily compiled
1371 if (pattern.length() == 0) {
1372 root = new Start(lastAccept);
1373 matchRoot = lastAccept;
1374 compiled = true;
1375 }
1376 }
1377
1378 /**
1379 * This private constructor is used to create all Patterns. The pattern
1380 * string and match flags are all that is needed to completely describe
1381 * a Pattern. An empty pattern string results in an object tree with
1382 * only a Start node and a LastNode node.
1383 */
1384 private Pattern(String p, int f) {
1385 if ((f & ~ALL_FLAGS) != 0) {
1386 throw new IllegalArgumentException("Unknown flag 0x"
1387 + Integer.toHexString(f));
1388 }
1389 pattern = p;
1390 flags = f;
1391
1392 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1393 if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1394 flags |= UNICODE_CASE;
1395
1396 // Reset group index count
1397 capturingGroupCount = 1;
1398 localCount = 0;
1399 localTCNCount = 0;
1400
1401 if (pattern.length() > 0) {
1402 compile();
1403 } else {
1404 root = new Start(lastAccept);
1405 matchRoot = lastAccept;
1406 }
1407 }
1408
1409 /**
1410 * The pattern is converted to normalized form ({@link
1411 * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
1412 * followed by canonical composition for the character class
1413 * part, and {@link java.text.Normalizer.Form.NFD NFD},
1414 * canonical decomposition) for the rest), and then a pure
1415 * group is constructed to match canonical equivalences of the
1416 * characters.
1417 */
1418 private static String normalize(String pattern) {
1419 int plen = pattern.length();
1420 StringBuilder pbuf = new StringBuilder(plen);
1421 char last = 0;
1422 int lastStart = 0;
1423 char cc = 0;
1424 for (int i = 0; i < plen;) {
1425 char c = pattern.charAt(i);
1426 if (cc == 0 && // top level
1427 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
1428 i += 2; last = 0;
1429 continue;
1430 }
1431 if (c == '[' && last != '\\') {
1432 if (cc == 0) {
1433 if (lastStart < i)
1434 normalizeSlice(pattern, lastStart, i, pbuf);
1435 lastStart = i;
1436 }
1437 cc++;
1438 } else if (c == ']' && last != '\\') {
1439 cc--;
1440 if (cc == 0) {
1441 normalizeClazz(pattern, lastStart, i + 1, pbuf);
1442 lastStart = i + 1;
1443 }
1444 }
1445 last = c;
1446 i++;
1447 }
1448 assert (cc == 0);
1449 if (lastStart < plen)
1450 normalizeSlice(pattern, lastStart, plen, pbuf);
1451 return pbuf.toString();
1452 }
1453
1454 private static void normalizeSlice(String src, int off, int limit,
1455 StringBuilder dst)
1456 {
1457 int len = src.length();
1458 int off0 = off;
1459 while (off < limit && ASCII.isAscii(src.charAt(off))) {
1460 off++;
1461 }
1462 if (off == limit) {
1463 dst.append(src, off0, limit);
1464 return;
1465 }
1466 off--;
1467 if (off < off0)
1468 off = off0;
1469 else
1470 dst.append(src, off0, off);
1471 while (off < limit) {
1472 int ch0 = src.codePointAt(off);
1473 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
1474 dst.append((char)ch0);
1475 off++;
1476 continue;
1477 }
1478 int j = off + Character.charCount(ch0);
1479 int ch1;
1480 while (j < limit) {
1481 ch1 = src.codePointAt(j);
1482 if (Grapheme.isBoundary(ch0, ch1))
1483 break;
1484 ch0 = ch1;
1485 j += Character.charCount(ch1);
1486 }
1487 String seq = src.substring(off, j);
1488 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
1489 off = j;
1490 if (nfd.length() > 1) {
1491 ch0 = nfd.codePointAt(0);
1492 ch1 = nfd.codePointAt(Character.charCount(ch0));
1493 if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
1494 Set<String> altns = new LinkedHashSet<>();
1495 altns.add(seq);
1496 produceEquivalentAlternation(nfd, altns);
1497 dst.append("(?:");
1498 altns.forEach( s -> dst.append(s + "|"));
1499 dst.delete(dst.length() - 1, dst.length());
1500 dst.append(")");
1501 continue;
1502 }
1503 }
1504 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
1505 if (!seq.equals(nfc) && !nfd.equals(nfc))
1506 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")");
1507 else if (!seq.equals(nfd))
1508 dst.append("(?:" + seq + "|" + nfd + ")");
1509 else
1510 dst.append(seq);
1511 }
1512 }
1513
1514 private static void normalizeClazz(String src, int off, int limit,
1515 StringBuilder dst)
1516 {
1517 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
1518 }
1519
1520 /**
1521 * Given a specific sequence composed of a regular character and
1522 * combining marks that follow it, produce the alternation that will
1523 * match all canonical equivalences of that sequence.
1524 */
1525 private static void produceEquivalentAlternation(String src,
1526 Set<String> dst)
1527 {
1528 int len = countChars(src, 0, 1);
1529 if (src.length() == len) {
1530 dst.add(src); // source has one character.
1531 return;
1532 }
1533 String base = src.substring(0,len);
1534 String combiningMarks = src.substring(len);
1535 String[] perms = producePermutations(combiningMarks);
1536 // Add combined permutations
1537 for(int x = 0; x < perms.length; x++) {
1538 String next = base + perms[x];
1539 dst.add(next);
1540 next = composeOneStep(next);
1541 if (next != null) {
1542 produceEquivalentAlternation(next, dst);
1543 }
1544 }
1545 }
1546
1547 /**
1548 * Returns an array of strings that have all the possible
1549 * permutations of the characters in the input string.
1550 * This is used to get a list of all possible orderings
1551 * of a set of combining marks. Note that some of the permutations
1552 * are invalid because of combining class collisions, and these
1553 * possibilities must be removed because they are not canonically
1554 * equivalent.
1555 */
1556 private static String[] producePermutations(String input) {
1557 if (input.length() == countChars(input, 0, 1))
1558 return new String[] {input};
1559
1560 if (input.length() == countChars(input, 0, 2)) {
1561 int c0 = Character.codePointAt(input, 0);
1562 int c1 = Character.codePointAt(input, Character.charCount(c0));
1563 if (getClass(c1) == getClass(c0)) {
1564 return new String[] {input};
1565 }
1566 String[] result = new String[2];
1567 result[0] = input;
1568 StringBuilder sb = new StringBuilder(2);
1569 sb.appendCodePoint(c1);
1570 sb.appendCodePoint(c0);
1571 result[1] = sb.toString();
1572 return result;
1573 }
1574
1575 int length = 1;
1576 int nCodePoints = countCodePoints(input);
1594 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1595 len = countChars(input, offset, 1);
1596 for(int y=x-1; y>=0; y--) {
1597 if (combClass[y] == combClass[x]) {
1598 continue loop;
1599 }
1600 }
1601 StringBuilder sb = new StringBuilder(input);
1602 String otherChars = sb.delete(offset, offset+len).toString();
1603 String[] subResult = producePermutations(otherChars);
1604
1605 String prefix = input.substring(offset, offset+len);
1606 for (String sre : subResult)
1607 temp[index++] = prefix + sre;
1608 }
1609 String[] result = new String[index];
1610 System.arraycopy(temp, 0, result, 0, index);
1611 return result;
1612 }
1613
1614 private static int getClass(int c) {
1615 return sun.text.Normalizer.getCombiningClass(c);
1616 }
1617
1618 /**
1619 * Attempts to compose input by combining the first character
1620 * with the first combining mark following it. Returns a String
1621 * that is the composition of the leading character with its first
1622 * combining mark followed by the remaining combining marks. Returns
1623 * null if the first two characters cannot be further composed.
1624 */
1625 private static String composeOneStep(String input) {
1626 int len = countChars(input, 0, 2);
1627 String firstTwoCharacters = input.substring(0, len);
1628 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1629 if (result.equals(firstTwoCharacters))
1630 return null;
1631 else {
1632 String remainder = input.substring(len);
1633 return result + remainder;
1634 }
1635 }
1636
1637 /**
1638 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1639 * See the description of `quotemeta' in perlfunc(1).
1640 */
1641 private void RemoveQEQuoting() {
1642 final int pLen = patternLength;
1643 int i = 0;
1644 while (i < pLen-1) {
1645 if (temp[i] != '\\')
1646 i += 1;
1647 else if (temp[i + 1] != 'Q')
1648 i += 2;
1695 newtemp[j++] = c;
1696 if (i != pLen)
1697 newtemp[j++] = temp[i++];
1698 }
1699 }
1700
1701 beginQuote = false;
1702 }
1703
1704 patternLength = j;
1705 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1706 }
1707
1708 /**
1709 * Copies regular expression to an int array and invokes the parsing
1710 * of the expression which will create the object tree.
1711 */
1712 private void compile() {
1713 // Handle canonical equivalences
1714 if (has(CANON_EQ) && !has(LITERAL)) {
1715 normalizedPattern = normalize(pattern);
1716 } else {
1717 normalizedPattern = pattern;
1718 }
1719 patternLength = normalizedPattern.length();
1720
1721 // Copy pattern to int array for convenience
1722 // Use double zero to terminate pattern
1723 temp = new int[patternLength + 2];
1724
1725 hasSupplementary = false;
1726 int c, count = 0;
1727 // Convert all chars into code points
1728 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1729 c = normalizedPattern.codePointAt(x);
1730 if (isSupplementary(c)) {
1731 hasSupplementary = true;
1732 }
1733 temp[count++] = c;
1734 }
1735
1736 patternLength = count; // patternLength now in code points
1737
1738 if (! has(LITERAL))
1739 RemoveQEQuoting();
1740
1741 // Allocate all temporary objects here.
1742 buffer = new int[32];
1743 groupNodes = new GroupHead[10];
1744 namedGroups = null;
1745 topClosureNodes = new ArrayList<>(10);
1746
1747 if (has(LITERAL)) {
1748 // Literal pattern handling
1749 matchRoot = newSlice(temp, patternLength, hasSupplementary);
1750 matchRoot.next = lastAccept;
1751 } else {
1752 // Start recursive descent parsing
1753 matchRoot = expr(lastAccept);
1754 // Check extra pattern characters
1755 if (patternLength != cursor) {
1756 if (peek() == ')') {
1757 throw error("Unmatched closing ')'");
1758 } else {
1759 throw error("Unexpected internal error");
1760 }
1761 }
1762 }
1763
1764 // Peephole optimization
1765 if (matchRoot instanceof Slice) {
1766 root = BnM.optimize(matchRoot);
1767 if (root == matchRoot) {
1768 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1769 }
1770 } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1771 root = matchRoot;
1772 } else {
1773 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1774 }
1775
1776 // Optimize the greedy Loop to prevent exponential backtracking, IF there
1777 // is no group ref in this pattern. With a non-negative localTCNCount value,
1778 // the greedy type Loop, Curly will skip the backtracking for any starting
1779 // position "i" that failed in the past.
1780 if (!hasGroupRef) {
1781 for (Node node : topClosureNodes) {
1782 if (node instanceof Loop) {
1783 // non-deterministic-greedy-group
1784 ((Loop)node).posIndex = localTCNCount++;
1785 }
1786 }
1787 }
1788
1789 // Release temporary storage
1790 temp = null;
1791 buffer = null;
1792 groupNodes = null;
1793 patternLength = 0;
1794 compiled = true;
1795 topClosureNodes = null;
1796 }
1797
1798 Map<String, Integer> namedGroups() {
1799 Map<String, Integer> groups = namedGroups;
1800 if (groups == null) {
1801 namedGroups = groups = new HashMap<>(2);
1802 }
1803 return groups;
1804 }
1805
1806 /**
1807 * Used to accumulate information about a subtree of the object graph
1808 * so that optimizations can be applied to the subtree.
1809 */
1810 static final class TreeInfo {
1811 int minLength;
1812 int maxLength;
1813 boolean maxValid;
1814 boolean deterministic;
1815
1816 TreeInfo() {
1817 reset();
1818 }
1819 void reset() {
1820 minLength = 0;
1821 maxLength = 0;
1822 maxValid = true;
1823 deterministic = true;
1824 }
1825 }
1826
2078 Node node = null;
2079 LOOP:
2080 for (;;) {
2081 int ch = peek();
2082 switch (ch) {
2083 case '(':
2084 // Because group handles its own closure,
2085 // we need to treat it differently
2086 node = group0();
2087 // Check for comment or flag group
2088 if (node == null)
2089 continue;
2090 if (head == null)
2091 head = node;
2092 else
2093 tail.next = node;
2094 // Double return: Tail was returned in root
2095 tail = root;
2096 continue;
2097 case '[':
2098 if (has(CANON_EQ) && !has(LITERAL))
2099 node = new NFCCharProperty(clazz(true));
2100 else
2101 node = newCharProperty(clazz(true));
2102 break;
2103 case '\\':
2104 ch = nextEscaped();
2105 if (ch == 'p' || ch == 'P') {
2106 boolean oneLetter = true;
2107 boolean comp = (ch == 'P');
2108 ch = next(); // Consume { if present
2109 if (ch != '{') {
2110 unread();
2111 } else {
2112 oneLetter = false;
2113 }
2114 // node = newCharProperty(family(oneLetter, comp));
2115 if (has(CANON_EQ) && !has(LITERAL))
2116 node = new NFCCharProperty(family(oneLetter, comp));
2117 else
2118 node = newCharProperty(family(oneLetter, comp));
2119 } else {
2120 unread();
2121 node = atom();
2122 }
2123 break;
2124 case '^':
2125 next();
2126 if (has(MULTILINE)) {
2127 if (has(UNIX_LINES))
2128 node = new UnixCaret();
2129 else
2130 node = new Caret();
2131 } else {
2132 node = new Begin();
2133 }
2134 break;
2135 case '$':
2136 next();
2137 if (has(UNIX_LINES))
2138 node = new UnixDollar(has(MULTILINE));
2139 else
2140 node = new Dollar(has(MULTILINE));
2141 break;
2142 case '.':
2143 next();
2144 if (has(DOTALL)) {
2145 node = new CharProperty(ALL);
2146 } else {
2147 if (has(UNIX_LINES)) {
2148 node = new CharProperty(UNIXDOT);
2149 } else {
2150 node = new CharProperty(DOT);
2151 }
2152 }
2153 break;
2154 case '|':
2155 case ')':
2156 break LOOP;
2157 case ']': // Now interpreting dangling ] and } as literals
2158 case '}':
2159 node = atom();
2160 break;
2161 case '?':
2162 case '*':
2163 case '+':
2164 next();
2165 throw error("Dangling meta character '" + ((char)ch) + "'");
2166 case 0:
2167 if (cursor >= patternLength) {
2168 break LOOP;
2169 }
2170 // Fall through
2171 default:
2172 node = atom();
2173 break;
2174 }
2175
2176 node = closure(node);
2177 /* save the top dot-greedy nodes (.*, .+) as well
2178 if (node instanceof GreedyCharProperty &&
2179 ((GreedyCharProperty)node).cp instanceof Dot) {
2180 topClosureNodes.add(node);
2181 }
2182 */
2183 if (head == null) {
2184 head = tail = node;
2185 } else {
2186 tail.next = node;
2187 tail = node;
2188 }
2189 }
2190 if (head == null) {
2191 return end;
2192 }
2193 tail.next = end;
2194 root = tail; //double return
2195 return head;
2196 }
2197
2198 @SuppressWarnings("fallthrough")
2199 /**
2200 * Parse and add a new Single or Slice.
2201 */
2202 private Node atom() {
2220 case '^':
2221 case '(':
2222 case '[':
2223 case '|':
2224 case ')':
2225 break;
2226 case '\\':
2227 ch = nextEscaped();
2228 if (ch == 'p' || ch == 'P') { // Property
2229 if (first > 0) { // Slice is waiting; handle it first
2230 unread();
2231 break;
2232 } else { // No slice; just return the family node
2233 boolean comp = (ch == 'P');
2234 boolean oneLetter = true;
2235 ch = next(); // Consume { if present
2236 if (ch != '{')
2237 unread();
2238 else
2239 oneLetter = false;
2240 if (has(CANON_EQ) && !has(LITERAL))
2241 return new NFCCharProperty(family(oneLetter, comp));
2242 else
2243 return newCharProperty(family(oneLetter, comp));
2244 }
2245 }
2246 unread();
2247 prev = cursor;
2248 ch = escape(false, first == 0, false);
2249 if (ch >= 0) {
2250 append(ch, first);
2251 first++;
2252 if (isSupplementary(ch)) {
2253 hasSupplementary = true;
2254 }
2255 ch = peek();
2256 continue;
2257 } else if (first == 0) {
2258 return root;
2259 }
2260 // Unwind meta escape sequence
2261 cursor = prev;
2262 break;
2263 case 0:
2264 if (cursor >= patternLength) {
2265 break;
2266 }
2267 // Fall through
2268 default:
2269 prev = cursor;
2270 append(ch, first);
2271 first++;
2272 if (isSupplementary(ch)) {
2273 hasSupplementary = true;
2274 }
2275 ch = next();
2276 continue;
2277 }
2278 break;
2279 }
2280 if (first == 1) {
2281 return newCharProperty(single(buffer[0]));
2282 } else {
2283 return newSlice(buffer, first, hasSupplementary);
2284 }
2285 }
2286
2287 private void append(int ch, int len) {
2288 if (len >= buffer.length) {
2289 int[] tmp = new int[len+len];
2290 System.arraycopy(buffer, 0, tmp, 0, len);
2291 buffer = tmp;
2292 }
2293 buffer[len] = ch;
2294 }
2295
2296 /**
2297 * Parses a backref greedily, taking as many numbers as it
2298 * can. The first digit is always treated as a backref, but
2299 * multi digit numbers are only treated as a backref if at
2300 * least that many backrefs exist at this point in the regex.
2301 */
2312 case '5':
2313 case '6':
2314 case '7':
2315 case '8':
2316 case '9':
2317 int newRefNum = (refNum * 10) + (ch - '0');
2318 // Add another number if it doesn't make a group
2319 // that doesn't exist
2320 if (capturingGroupCount - 1 < newRefNum) {
2321 done = true;
2322 break;
2323 }
2324 refNum = newRefNum;
2325 read();
2326 break;
2327 default:
2328 done = true;
2329 break;
2330 }
2331 }
2332 hasGroupRef = true;
2333 if (has(CASE_INSENSITIVE))
2334 return new CIBackRef(refNum, has(UNICODE_CASE));
2335 else
2336 return new BackRef(refNum);
2337 }
2338
2339 /**
2340 * Parses an escape sequence to determine the actual value that needs
2341 * to be matched.
2342 * If -1 is returned and create was true a new object was added to the tree
2343 * to handle the escape sequence.
2344 * If the returned value is greater than zero, it is the value that
2345 * matches the escape sequence.
2346 */
2347 private int escape(boolean inclass, boolean create, boolean isrange) {
2348 int ch = skip();
2349 switch (ch) {
2350 case '0':
2351 return o();
2352 case '1':
2357 case '6':
2358 case '7':
2359 case '8':
2360 case '9':
2361 if (inclass) break;
2362 if (create) {
2363 root = ref((ch - '0'));
2364 }
2365 return -1;
2366 case 'A':
2367 if (inclass) break;
2368 if (create) root = new Begin();
2369 return -1;
2370 case 'B':
2371 if (inclass) break;
2372 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2373 return -1;
2374 case 'C':
2375 break;
2376 case 'D':
2377 if (create) {
2378 predicate = has(UNICODE_CHARACTER_CLASS) ?
2379 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2380 predicate = predicate.negate();
2381 if (!inclass)
2382 root = newCharProperty(predicate);
2383 }
2384 return -1;
2385 case 'E':
2386 case 'F':
2387 break;
2388 case 'G':
2389 if (inclass) break;
2390 if (create) root = new LastMatch();
2391 return -1;
2392 case 'H':
2393 if (create) {
2394 predicate = HorizWS.negate();
2395 if (!inclass)
2396 root = newCharProperty(predicate);
2397 }
2398 return -1;
2399 case 'I':
2400 case 'J':
2401 case 'K':
2402 case 'L':
2403 case 'M':
2404 break;
2405 case 'N':
2406 return N();
2407 case 'O':
2408 case 'P':
2409 case 'Q':
2410 break;
2411 case 'R':
2412 if (inclass) break;
2413 if (create) root = new LineEnding();
2414 return -1;
2415 case 'S':
2416 if (create) {
2417 predicate = has(UNICODE_CHARACTER_CLASS) ?
2418 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2419 predicate = predicate.negate();
2420 if (!inclass)
2421 root = newCharProperty(predicate);
2422 }
2423 return -1;
2424 case 'T':
2425 case 'U':
2426 break;
2427 case 'V':
2428 if (create) {
2429 predicate = VertWS.negate();
2430 if (!inclass)
2431 root = newCharProperty(predicate);
2432 }
2433 return -1;
2434 case 'W':
2435 if (create) {
2436 predicate = has(UNICODE_CHARACTER_CLASS) ?
2437 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2438 predicate = predicate.negate();
2439 if (!inclass)
2440 root = newCharProperty(predicate);
2441 }
2442 return -1;
2443 case 'X':
2444 if (inclass) break;
2445 if (create) {
2446 root = new XGrapheme();
2447 }
2448 return -1;
2449 case 'Y':
2450 break;
2451 case 'Z':
2452 if (inclass) break;
2453 if (create) {
2454 if (has(UNIX_LINES))
2455 root = new UnixDollar(false);
2456 else
2457 root = new Dollar(false);
2458 }
2459 return -1;
2460 case 'a':
2461 return '\007';
2462 case 'b':
2463 if (inclass) break;
2464 if (create) {
2465 if (peek() == '{') {
2466 if (skip() == 'g') {
2467 if (read() == '}') {
2468 root = new GraphemeBound();
2469 return -1;
2470 }
2471 break; // error missing trailing }
2472 }
2473 unread(); unread();
2474 }
2475 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2476 }
2477 return -1;
2478 case 'c':
2479 return c();
2480 case 'd':
2481 if (create) {
2482 predicate = has(UNICODE_CHARACTER_CLASS) ?
2483 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
2484 if (!inclass)
2485 root = newCharProperty(predicate);
2486 }
2487 return -1;
2488 case 'e':
2489 return '\033';
2490 case 'f':
2491 return '\f';
2492 case 'g':
2493 break;
2494 case 'h':
2495 if (create) {
2496 predicate = HorizWS;
2497 if (!inclass)
2498 root = newCharProperty(predicate);
2499 }
2500 return -1;
2501 case 'i':
2502 case 'j':
2503 break;
2504 case 'k':
2505 if (inclass)
2506 break;
2507 if (read() != '<')
2508 throw error("\\k is not followed by '<' for named capturing group");
2509 String name = groupname(read());
2510 if (!namedGroups().containsKey(name))
2511 throw error("(named capturing group <"+ name+"> does not exit");
2512 if (create) {
2513 hasGroupRef = true;
2514 if (has(CASE_INSENSITIVE))
2515 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2516 else
2517 root = new BackRef(namedGroups().get(name));
2518 }
2519 return -1;
2520 case 'l':
2521 case 'm':
2522 break;
2523 case 'n':
2524 return '\n';
2525 case 'o':
2526 case 'p':
2527 case 'q':
2528 break;
2529 case 'r':
2530 return '\r';
2531 case 's':
2532 if (create) {
2533 predicate = has(UNICODE_CHARACTER_CLASS) ?
2534 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
2535 if (!inclass)
2536 root = newCharProperty(predicate);
2537 }
2538 return -1;
2539 case 't':
2540 return '\t';
2541 case 'u':
2542 return u();
2543 case 'v':
2544 // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2545 // undocumented). In JDK8 '\v' is specified as a predefined
2546 // character class for all vertical whitespace characters.
2547 // So [-1, root=VertWS node] pair is returned (instead of a
2548 // single 0x0B). This breaks the range if '\v' is used as
2549 // the start or end value, such as [\v-...] or [...-\v], in
2550 // which a single definite value (0x0B) is expected. For
2551 // compatibility concern '\013'/0x0B is returned if isrange.
2552 if (isrange)
2553 return '\013';
2554 if (create) {
2555 predicate = VertWS;
2556 if (!inclass)
2557 root = newCharProperty(predicate);
2558 }
2559 return -1;
2560 case 'w':
2561 if (create) {
2562 predicate = has(UNICODE_CHARACTER_CLASS) ?
2563 CharPredicates.WORD : CharPredicates.ASCII_WORD;
2564 if (!inclass)
2565 root = newCharProperty(predicate);
2566 }
2567 return -1;
2568 case 'x':
2569 return x();
2570 case 'y':
2571 break;
2572 case 'z':
2573 if (inclass) break;
2574 if (create) root = new End();
2575 return -1;
2576 default:
2577 return ch;
2578 }
2579 throw error("Illegal/unsupported escape sequence");
2580 }
2581
2582 /**
2583 * Parse a character class, and return the node that matches it.
2584 *
2585 * Consumes a ] on the way out if consume is true. Usually consume
2586 * is true except for the case of [abc&&def] where def is a separate
2587 * right hand node with "understood" brackets.
2588 */
2589 private CharPredicate clazz(boolean consume) {
2590 CharPredicate prev = null;
2591 CharPredicate curr = null;
2592 BitClass bits = new BitClass();
2593 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
2594
2595 boolean isNeg = false;
2596 boolean hasBits = false;
2597 int ch = next();
2598
2599 // Negates if first char in a class, otherwise literal
2600 if (ch == '^' && temp[cursor-1] == '[') {
2601 ch = next();
2602 isNeg = true;
2603 }
2604 for (;;) {
2605 switch (ch) {
2606 case '[':
2607 curr = clazz(true);
2608 if (prev == null)
2609 prev = curr;
2610 else
2611 prev = prev.union(curr);
2612 ch = peek();
2613 continue;
2614 case '&':
2615 ch = next();
2616 if (ch == '&') {
2617 ch = next();
2618 CharPredicate right = null;
2619 while (ch != ']' && ch != '&') {
2620 if (ch == '[') {
2621 if (right == null)
2622 right = clazz(true);
2623 else
2624 right = right.union(clazz(true));
2625 } else { // abc&&def
2626 unread();
2627 right = clazz(false);
2628 }
2629 ch = peek();
2630 }
2631 if (hasBits) {
2632 // bits used, union has high precedence
2633 if (prev == null) {
2634 prev = curr = bitsP;
2635 } else {
2636 prev = prev.union(bitsP);
2637 }
2638 hasBits = false;
2639 }
2640 if (right != null)
2641 curr = right;
2642 if (prev == null) {
2643 if (right == null)
2644 throw error("Bad class syntax");
2645 else
2646 prev = right;
2647 } else {
2648 prev = prev.and(curr);
2649 }
2650 } else {
2651 // treat as a literal &
2652 unread();
2653 break;
2654 }
2655 continue;
2656 case 0:
2657 if (cursor >= patternLength)
2658 throw error("Unclosed character class");
2659 break;
2660 case ']':
2661 if (prev != null || hasBits) {
2662 if (consume)
2663 next();
2664 if (prev == null)
2665 prev = bitsP;
2666 else if (hasBits)
2667 prev = prev.union(bitsP);
2668 if (isNeg)
2669 return prev.negate();
2670 return prev;
2671 }
2672 break;
2673 default:
2674 break;
2675 }
2676 curr = range(bits);
2677 if (curr == null) { // the bits used
2678 hasBits = true;
2679 } else {
2680 if (prev == null)
2681 prev = curr;
2682 else if (prev != curr)
2683 prev = prev.union(curr);
2684 }
2685 ch = peek();
2686 }
2687 }
2688
2689 private CharPredicate bitsOrSingle(BitClass bits, int ch) {
2690 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2691 Use "single" node instead of bits when dealing with unicode
2692 case folding for codepoints listed below.
2693 (1)Uppercase out of range: u+00ff, u+00b5
2694 toUpperCase(u+00ff) -> u+0178
2695 toUpperCase(u+00b5) -> u+039c
2696 (2)LatinSmallLetterLongS u+17f
2697 toUpperCase(u+017f) -> u+0053
2698 (3)LatinSmallLetterDotlessI u+131
2699 toUpperCase(u+0131) -> u+0049
2700 (4)LatinCapitalLetterIWithDotAbove u+0130
2701 toLowerCase(u+0130) -> u+0069
2702 (5)KelvinSign u+212a
2703 toLowerCase(u+212a) ==> u+006B
2704 (6)AngstromSign u+212b
2705 toLowerCase(u+212b) ==> u+00e5
2706 */
2707 int d;
2708 if (ch < 256 &&
2709 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2710 (ch == 0xff || ch == 0xb5 ||
2711 ch == 0x49 || ch == 0x69 || //I and i
2712 ch == 0x53 || ch == 0x73 || //S and s
2713 ch == 0x4b || ch == 0x6b || //K and k
2714 ch == 0xc5 || ch == 0xe5))) { //A+ring
2715 bits.add(ch, flags());
2716 return null;
2717 }
2718 return single(ch);
2719 }
2720
2721 /**
2722 * Returns a suitably optimized, single character predicate
2723 */
2724 private CharPredicate single(final int ch) {
2725 if (has(CASE_INSENSITIVE)) {
2726 int lower, upper;
2727 if (has(UNICODE_CASE)) {
2728 upper = Character.toUpperCase(ch);
2729 lower = Character.toLowerCase(upper);
2730 // Unicode case insensitive matches
2731 if (upper != lower)
2732 return SingleU(lower);
2733 } else if (ASCII.isAscii(ch)) {
2734 lower = ASCII.toLower(ch);
2735 upper = ASCII.toUpper(ch);
2736 // Case insensitive matches a given BMP character
2737 if (lower != upper)
2738 return SingleI(lower, upper);
2739 }
2740 }
2741 if (isSupplementary(ch))
2742 return SingleS(ch);
2743 return Single(ch); // Match a given BMP character
2744 }
2745
2746 /**
2747 * Parse a single character or a character range in a character class
2748 * and return its representative node.
2749 */
2750 private CharPredicate range(BitClass bits) {
2751 int ch = peek();
2752 if (ch == '\\') {
2753 ch = nextEscaped();
2754 if (ch == 'p' || ch == 'P') { // A property
2755 boolean comp = (ch == 'P');
2756 boolean oneLetter = true;
2757 // Consume { if present
2758 ch = next();
2759 if (ch != '{')
2760 unread();
2761 else
2762 oneLetter = false;
2763 return family(oneLetter, comp);
2764 } else { // ordinary escape
2765 boolean isrange = temp[cursor+1] == '-';
2766 unread();
2767 ch = escape(true, true, isrange);
2768 if (ch == -1)
2769 return predicate;
2770 }
2771 } else {
2772 next();
2773 }
2774 if (ch >= 0) {
2775 if (peek() == '-') {
2776 int endRange = temp[cursor+1];
2777 if (endRange == '[') {
2778 return bitsOrSingle(bits, ch);
2779 }
2780 if (endRange != ']') {
2781 next();
2782 int m = peek();
2783 if (m == '\\') {
2784 m = escape(true, false, true);
2785 } else {
2786 next();
2787 }
2788 if (m < ch) {
2789 throw error("Illegal character range");
2790 }
2791 if (has(CASE_INSENSITIVE)) {
2792 if (has(UNICODE_CASE))
2793 return CIRangeU(ch, m);
2794 return CIRange(ch, m);
2795 } else {
2796 return Range(ch, m);
2797 }
2798 }
2799 }
2800 return bitsOrSingle(bits, ch);
2801 }
2802 throw error("Unexpected character '"+((char)ch)+"'");
2803 }
2804
2805 /**
2806 * Parses a Unicode character family and returns its representative node.
2807 */
2808 private CharPredicate family(boolean singleLetter, boolean isComplement) {
2809 next();
2810 String name;
2811 CharPredicate p = null;
2812
2813 if (singleLetter) {
2814 int c = temp[cursor];
2815 if (!Character.isSupplementaryCodePoint(c)) {
2816 name = String.valueOf((char)c);
2817 } else {
2818 name = new String(temp, cursor, 1);
2819 }
2820 read();
2821 } else {
2822 int i = cursor;
2823 mark('}');
2824 while(read() != '}') {
2825 }
2826 mark('\000');
2827 int j = cursor;
2828 if (j > patternLength)
2829 throw error("Unclosed character family");
2830 if (i + 1 >= j)
2831 throw error("Empty character family");
2832 name = new String(temp, i, j-i-1);
2833 }
2834
2835 int i = name.indexOf('=');
2836 if (i != -1) {
2837 // property construct \p{name=value}
2838 String value = name.substring(i + 1);
2839 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2840 switch (name) {
2841 case "sc":
2842 case "script":
2843 p = CharPredicates.forUnicodeScript(value);
2844 break;
2845 case "blk":
2846 case "block":
2847 p = CharPredicates.forUnicodeBlock(value);
2848 break;
2849 case "gc":
2850 case "general_category":
2851 p = CharPredicates.forProperty(value);
2852 break;
2853 default:
2854 break;
2855 }
2856 if (p == null)
2857 throw error("Unknown Unicode property {name=<" + name + ">, "
2858 + "value=<" + value + ">}");
2859
2860 } else {
2861 if (name.startsWith("In")) {
2862 // \p{InBlockName}
2863 p = CharPredicates.forUnicodeBlock(name.substring(2));
2864 } else if (name.startsWith("Is")) {
2865 // \p{IsGeneralCategory} and \p{IsScriptName}
2866 name = name.substring(2);
2867 p = CharPredicates.forUnicodeProperty(name);
2868 if (p == null)
2869 p = CharPredicates.forProperty(name);
2870 if (p == null)
2871 p = CharPredicates.forUnicodeScript(name);
2872 } else {
2873 if (has(UNICODE_CHARACTER_CLASS)) {
2874 p = CharPredicates.forPOSIXName(name);
2875 }
2876 if (p == null)
2877 p = CharPredicates.forProperty(name);
2878 }
2879 if (p == null)
2880 throw error("Unknown character property name {In/Is" + name + "}");
2881 }
2882 if (isComplement) {
2883 // it might be too expensive to detect if a complement of
2884 // CharProperty can match "certain" supplementary. So just
2885 // go with StartS.
2886 hasSupplementary = true;
2887 p = p.negate();
2888 }
2889 return p;
2890 }
2891
2892 private CharProperty newCharProperty(CharPredicate p) {
2893 if (p == null)
2894 return null;
2895 if (p instanceof BmpCharPredicate)
2896 return new BmpCharProperty((BmpCharPredicate)p);
2897 else
2898 return new CharProperty(p);
2899 }
2900
2901 /**
2902 * Parses and returns the name of a "named capturing group", the trailing
2903 * ">" is consumed after parsing.
2904 */
2905 private String groupname(int ch) {
2906 StringBuilder sb = new StringBuilder();
2907 sb.append(Character.toChars(ch));
2908 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2909 ASCII.isDigit(ch)) {
2910 sb.append(Character.toChars(ch));
2911 }
2912 if (sb.length() == 0)
2913 throw error("named capturing group has 0 length name");
2914 if (ch != '>')
2915 throw error("named capturing group is missing trailing '>'");
2916 return sb.toString();
2917 }
2918
2919 /**
2920 * Parses a group and returns the head node of a set of nodes that process
2921 * the group. Sometimes a double return system is used where the tail is
2922 * returned in root.
2923 */
2924 private Node group0() {
2925 boolean capturingGroup = false;
2926 Node head = null;
2927 Node tail = null;
2928 int save = flags;
2929 int saveTCNCount = topClosureNodes.size();
2930 root = null;
2931 int ch = next();
2932 if (ch == '?') {
2933 ch = skip();
2934 switch (ch) {
2935 case ':': // (?:xxx) pure group
2936 head = createGroup(true);
2937 tail = root;
2938 head.next = expr(tail);
2939 break;
2940 case '=': // (?=xxx) and (?!xxx) lookahead
2941 case '!':
2942 head = createGroup(true);
2943 tail = root;
2944 head.next = expr(tail);
2945 if (ch == '=') {
2946 head = tail = new Pos(head);
2947 } else {
2948 head = tail = new Neg(head);
2949 }
2950 break;
2951 case '>': // (?>xxx) independent group
2952 head = createGroup(true);
2953 tail = root;
2954 head.next = expr(tail);
2955 head = tail = new Ques(head, Qtype.INDEPENDENT);
2956 break;
2957 case '<': // (?<xxx) look behind
2958 ch = read();
2959 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2960 // named captured group
2961 String name = groupname(ch);
2962 if (namedGroups().containsKey(name))
2963 throw error("Named capturing group <" + name
2964 + "> is already defined");
2965 capturingGroup = true;
2966 head = createGroup(false);
2967 tail = root;
2968 namedGroups().put(name, capturingGroupCount-1);
2969 head.next = expr(tail);
2970 break;
2971 }
2972 int start = cursor;
2973 head = createGroup(true);
2974 tail = root;
2975 head.next = expr(tail);
2979 if (info.maxValid == false) {
2980 throw error("Look-behind group does not have "
2981 + "an obvious maximum length");
2982 }
2983 boolean hasSupplementary = findSupplementary(start, patternLength);
2984 if (ch == '=') {
2985 head = tail = (hasSupplementary ?
2986 new BehindS(head, info.maxLength,
2987 info.minLength) :
2988 new Behind(head, info.maxLength,
2989 info.minLength));
2990 } else if (ch == '!') {
2991 head = tail = (hasSupplementary ?
2992 new NotBehindS(head, info.maxLength,
2993 info.minLength) :
2994 new NotBehind(head, info.maxLength,
2995 info.minLength));
2996 } else {
2997 throw error("Unknown look-behind group");
2998 }
2999 // clear all top-closure-nodes inside lookbehind
3000 if (saveTCNCount < topClosureNodes.size())
3001 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3002 break;
3003 case '$':
3004 case '@':
3005 throw error("Unknown group type");
3006 default: // (?xxx:) inlined match flags
3007 unread();
3008 addFlag();
3009 ch = read();
3010 if (ch == ')') {
3011 return null; // Inline modifier only
3012 }
3013 if (ch != ':') {
3014 throw error("Unknown inline modifier");
3015 }
3016 head = createGroup(true);
3017 tail = root;
3018 head.next = expr(tail);
3019 break;
3020 }
3021 } else { // (xxx) a regular group
3022 capturingGroup = true;
3023 head = createGroup(false);
3024 tail = root;
3025 head.next = expr(tail);
3026 }
3027
3028 accept(')', "Unclosed group");
3029 flags = save;
3030
3031 // Check for quantifiers
3032 Node node = closure(head);
3033 if (node == head) { // No closure
3034 root = tail;
3035 return node; // Dual return
3036 }
3037 if (head == tail) { // Zero length assertion
3038 root = node;
3039 return node; // Dual return
3040 }
3041
3042 // have group closure, clear all inner closure nodes from the
3043 // top list (no backtracking stopper optimization for inner
3044 if (saveTCNCount < topClosureNodes.size())
3045 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
3046
3047 if (node instanceof Ques) {
3048 Ques ques = (Ques) node;
3049 if (ques.type == Qtype.POSSESSIVE) {
3050 root = node;
3051 return node;
3052 }
3053 tail.next = new BranchConn();
3054 tail = tail.next;
3055 if (ques.type == Qtype.GREEDY) {
3056 head = new Branch(head, null, tail);
3057 } else { // Reluctant quantifier
3058 head = new Branch(null, head, tail);
3059 }
3060 root = tail;
3061 return head;
3062 } else if (node instanceof Curly) {
3063 Curly curly = (Curly) node;
3064 if (curly.type == Qtype.POSSESSIVE) {
3065 root = node;
3066 return node;
3067 }
3068 // Discover if the group is deterministic
3069 TreeInfo info = new TreeInfo();
3070 if (head.study(info)) { // Deterministic
3071 GroupTail temp = (GroupTail) tail;
3072 head = root = new GroupCurly(head.next, curly.cmin,
3073 curly.cmax, curly.type,
3074 ((GroupTail)tail).localIndex,
3075 ((GroupTail)tail).groupIndex,
3076 capturingGroup);
3077 return head;
3078 } else { // Non-deterministic
3079 int temp = ((GroupHead) head).localIndex;
3080 Loop loop;
3081 if (curly.type == Qtype.GREEDY) {
3082 loop = new Loop(this.localCount, temp);
3083 // add the max_reps greedy to the top-closure-node list
3084 if (curly.cmax == MAX_REPS)
3085 topClosureNodes.add(loop);
3086 } else { // Reluctant Curly
3087 loop = new LazyLoop(this.localCount, temp);
3088 }
3089 Prolog prolog = new Prolog(loop);
3090 this.localCount += 1;
3091 loop.cmin = curly.cmin;
3092 loop.cmax = curly.cmax;
3093 loop.body = head;
3094 tail.next = loop;
3095 root = loop;
3096 return prolog; // Dual return
3097 }
3098 }
3099 throw error("Internal logic error");
3100 }
3101
3102 /**
3103 * Create group head and tail nodes using double return. If the group is
3104 * created with anonymous true then it is a pure group and should not
3105 * affect group counting.
3106 */
3107 private Node createGroup(boolean anonymous) {
3108 int localIndex = localCount++;
3109 int groupIndex = 0;
3110 if (!anonymous)
3111 groupIndex = capturingGroupCount++;
3112 GroupHead head = new GroupHead(localIndex);
3113 root = new GroupTail(localIndex, groupIndex);
3114
3115 // for debug/print only, head.match does NOT need the "tail" info
3116 head.tail = (GroupTail)root;
3117
3118 if (!anonymous && groupIndex < 10)
3119 groupNodes[groupIndex] = head;
3120 return head;
3121 }
3122
3123 @SuppressWarnings("fallthrough")
3124 /**
3125 * Parses inlined match flags and set them appropriately.
3126 */
3127 private void addFlag() {
3128 int ch = peek();
3129 for (;;) {
3130 switch (ch) {
3131 case 'i':
3132 flags |= CASE_INSENSITIVE;
3133 break;
3134 case 'm':
3135 flags |= MULTILINE;
3136 break;
3137 case 's':
3186 case 'u':
3187 flags &= ~UNICODE_CASE;
3188 break;
3189 case 'c':
3190 flags &= ~CANON_EQ;
3191 break;
3192 case 'x':
3193 flags &= ~COMMENTS;
3194 break;
3195 case 'U':
3196 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3197 default:
3198 return;
3199 }
3200 ch = next();
3201 }
3202 }
3203
3204 static final int MAX_REPS = 0x7FFFFFFF;
3205
3206 static enum Qtype {
3207 GREEDY, LAZY, POSSESSIVE, INDEPENDENT
3208 }
3209
3210 private Node curly(Node prev, int cmin) {
3211 int ch = next();
3212 if (ch == '?') {
3213 next();
3214 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
3215 } else if (ch == '+') {
3216 next();
3217 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
3218 }
3219 if (prev instanceof BmpCharProperty) {
3220 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3221 } else if (prev instanceof CharProperty) {
3222 return new CharPropertyGreedy((CharProperty)prev, cmin);
3223 }
3224 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
3225 }
3226
3227 /**
3228 * Processes repetition. If the next character peeked is a quantifier
3229 * then new nodes must be appended to handle the repetition.
3230 * Prev could be a single or a group, so it could be a chain of nodes.
3231 */
3232 private Node closure(Node prev) {
3233 Node atom;
3234 int ch = peek();
3235 switch (ch) {
3236 case '?':
3237 ch = next();
3238 if (ch == '?') {
3239 next();
3240 return new Ques(prev, Qtype.LAZY);
3241 } else if (ch == '+') {
3242 next();
3243 return new Ques(prev, Qtype.POSSESSIVE);
3244 }
3245 return new Ques(prev, Qtype.GREEDY);
3246 case '*':
3247 return curly(prev, 0);
3248 case '+':
3249 return curly(prev, 1);
3250 case '{':
3251 ch = temp[cursor+1];
3252 if (ASCII.isDigit(ch)) {
3253 skip();
3254 int cmin = 0;
3255 do {
3256 cmin = cmin * 10 + (ch - '0');
3257 } while (ASCII.isDigit(ch = read()));
3258 int cmax = cmin;
3259 if (ch == ',') {
3260 ch = read();
3261 cmax = MAX_REPS;
3262 if (ch != '}') {
3263 cmax = 0;
3264 while (ASCII.isDigit(ch)) {
3265 cmax = cmax * 10 + (ch - '0');
3266 ch = read();
3267 }
3268 }
3269 }
3270 if (ch != '}')
3271 throw error("Unclosed counted closure");
3272 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3273 throw error("Illegal repetition range");
3274 Curly curly;
3275 ch = peek();
3276 if (ch == '?') {
3277 next();
3278 curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
3279 } else if (ch == '+') {
3280 next();
3281 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3282 } else {
3283 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
3284 }
3285 return curly;
3286 } else {
3287 throw error("Illegal repetition");
3288 }
3289 default:
3290 return prev;
3291 }
3292 }
3293
3294 /**
3295 * Utility method for parsing control escape sequences.
3296 */
3297 private int c() {
3298 if (cursor < patternLength) {
3299 return read() ^ 64;
3300 }
3301 throw error("Illegal control escape sequence");
3302 }
3303
3440
3441 private static final int countCodePoints(CharSequence seq) {
3442 int length = seq.length();
3443 int n = 0;
3444 for (int i = 0; i < length; ) {
3445 n++;
3446 if (Character.isHighSurrogate(seq.charAt(i++))) {
3447 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3448 i++;
3449 }
3450 }
3451 }
3452 return n;
3453 }
3454
3455 /**
3456 * Creates a bit vector for matching Latin-1 values. A normal BitClass
3457 * never matches values above Latin-1, and a complemented BitClass always
3458 * matches values above Latin-1.
3459 */
3460 static final class BitClass extends BmpCharProperty {
3461 final boolean[] bits;
3462 BitClass() {
3463 this(new boolean[256]);
3464 }
3465 private BitClass(boolean[] bits) {
3466 super( ch -> ch < 256 && bits[ch]);
3467 this.bits = bits;
3468 }
3469 BitClass add(int c, int flags) {
3470 assert c >= 0 && c <= 255;
3471 if ((flags & CASE_INSENSITIVE) != 0) {
3472 if (ASCII.isAscii(c)) {
3473 bits[ASCII.toUpper(c)] = true;
3474 bits[ASCII.toLower(c)] = true;
3475 } else if ((flags & UNICODE_CASE) != 0) {
3476 bits[Character.toLowerCase(c)] = true;
3477 bits[Character.toUpperCase(c)] = true;
3478 }
3479 }
3480 bits[c] = true;
3481 return this;
3482 }
3483 }
3484
3485 /**
3486 * Utility method for creating a string slice matcher.
3487 */
3488 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3489 int[] tmp = new int[count];
3490 if (has(CASE_INSENSITIVE)) {
3491 if (has(UNICODE_CASE)) {
3492 for (int i = 0; i < count; i++) {
3493 tmp[i] = Character.toLowerCase(
3494 Character.toUpperCase(buf[i]));
3495 }
3496 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3497 }
3498 for (int i = 0; i < count; i++) {
3499 tmp[i] = ASCII.toLower(buf[i]);
3500 }
3501 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3502 }
3870 if (i < matcher.to && seq.charAt(i) == 0x0A)
3871 i++;
3872 return next.match(matcher, i, seq);
3873 }
3874 } else {
3875 matcher.hitEnd = true;
3876 }
3877 return false;
3878 }
3879 boolean study(TreeInfo info) {
3880 info.minLength++;
3881 info.maxLength += 2;
3882 return next.study(info);
3883 }
3884 }
3885
3886 /**
3887 * Abstract node class to match one character satisfying some
3888 * boolean property.
3889 */
3890 static class CharProperty extends Node {
3891 CharPredicate predicate;
3892
3893 CharProperty (CharPredicate predicate) {
3894 this.predicate = predicate;
3895 }
3896 boolean match(Matcher matcher, int i, CharSequence seq) {
3897 if (i < matcher.to) {
3898 int ch = Character.codePointAt(seq, i);
3899 return predicate.is(ch) &&
3900 next.match(matcher, i + Character.charCount(ch), seq);
3901 } else {
3902 matcher.hitEnd = true;
3903 return false;
3904 }
3905 }
3906 boolean study(TreeInfo info) {
3907 info.minLength++;
3908 info.maxLength++;
3909 return next.study(info);
3910 }
3911 }
3912
3913 /**
3914 * Optimized version of CharProperty that works only for
3915 * properties never satisfied by Supplementary characters.
3916 */
3917 private static class BmpCharProperty extends CharProperty {
3918 BmpCharProperty (BmpCharPredicate predicate) {
3919 super(predicate);
3920 }
3921 boolean match(Matcher matcher, int i, CharSequence seq) {
3922 if (i < matcher.to) {
3923 return predicate.is(seq.charAt(i)) &&
3924 next.match(matcher, i + 1, seq);
3925 } else {
3926 matcher.hitEnd = true;
3927 return false;
3928 }
3929 }
3930 }
3931
3932 private static class NFCCharProperty extends Node {
3933 CharPredicate predicate;
3934 NFCCharProperty (CharPredicate predicate) {
3935 this.predicate = predicate;
3936 }
3937
3938 boolean match(Matcher matcher, int i, CharSequence seq) {
3939 if (i < matcher.to) {
3940 int ch0 = Character.codePointAt(seq, i);
3941 int n = Character.charCount(ch0);
3942 int j = i + n;
3943 while (j < matcher.to) {
3944 int ch1 = Character.codePointAt(seq, j);
3945 if (Grapheme.isBoundary(ch0, ch1))
3946 break;
3947 ch0 = ch1;
3948 j += Character.charCount(ch1);
3949 }
3950 if (i + n == j) { // single, assume nfc cp
3951 if (predicate.is(ch0))
3952 return next.match(matcher, j, seq);
3953 } else {
3954 while (i + n < j) {
3955 String nfc = Normalizer.normalize(
3956 seq.toString().substring(i, j), Normalizer.Form.NFC);
3957 if (nfc.codePointCount(0, nfc.length()) == 1) {
3958 if (predicate.is(nfc.codePointAt(0)) &&
3959 next.match(matcher, j, seq)) {
3960 return true;
3961 }
3962 }
3963
3964 ch0 = Character.codePointBefore(seq, j);
3965 j -= Character.charCount(ch0);
3966 }
3967 }
3968 if (j < matcher.to)
3969 return false;
3970 }
3971 matcher.hitEnd = true;
3972 return false;
3973 }
3974
3975 boolean study(TreeInfo info) {
3976 info.minLength++;
3977 info.deterministic = false;
3978 return next.study(info);
3979 }
3980 }
3981
3982 /**
3983 * Node class that matches an unicode extended grapheme cluster
3984 */
3985 static class XGrapheme extends Node {
3986 boolean match(Matcher matcher, int i, CharSequence seq) {
3987 if (i < matcher.to) {
3988 int ch0 = Character.codePointAt(seq, i);
3989 i += Character.charCount(ch0);
3990 while (i < matcher.to) {
3991 int ch1 = Character.codePointAt(seq, i);
3992 if (Grapheme.isBoundary(ch0, ch1))
3993 break;
3994 ch0 = ch1;
3995 i += Character.charCount(ch1);
3996 }
3997 return next.match(matcher, i, seq);
3998 }
4180 return false;
4181 }
4182 }
4183 return next.match(matcher, x, seq);
4184 }
4185 }
4186
4187 /**
4188 * Node class for a case insensitive sequence of literal characters.
4189 * Uses unicode case folding.
4190 */
4191 static final class SliceUS extends SliceIS {
4192 SliceUS(int[] buf) {
4193 super(buf);
4194 }
4195 int toLower(int c) {
4196 return Character.toLowerCase(Character.toUpperCase(c));
4197 }
4198 }
4199
4200 /**
4201 * The 0 or 1 quantifier. This one class implements all three types.
4202 */
4203 static final class Ques extends Node {
4204 Node atom;
4205 Qtype type;
4206 Ques(Node node, Qtype type) {
4207 this.atom = node;
4208 this.type = type;
4209 }
4210 boolean match(Matcher matcher, int i, CharSequence seq) {
4211 switch (type) {
4212 case GREEDY:
4213 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4214 || next.match(matcher, i, seq);
4215 case LAZY:
4216 return next.match(matcher, i, seq)
4217 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4218 case POSSESSIVE:
4219 if (atom.match(matcher, i, seq)) i = matcher.last;
4220 return next.match(matcher, i, seq);
4221 default:
4222 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4223 }
4224 }
4225 boolean study(TreeInfo info) {
4226 if (type != Qtype.INDEPENDENT) {
4227 int minL = info.minLength;
4228 atom.study(info);
4229 info.minLength = minL;
4230 info.deterministic = false;
4231 return next.study(info);
4232 } else {
4233 atom.study(info);
4234 return next.study(info);
4235 }
4236 }
4237 }
4238
4239 /**
4240 * Handles the greedy style repetition with the minimum either be
4241 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
4242 */
4243 static class CharPropertyGreedy extends Node {
4244 final CharPredicate predicate;
4245 final int cmin;
4246
4247 CharPropertyGreedy(CharProperty cp, int cmin) {
4248 this.predicate = cp.predicate;
4249 this.cmin = cmin;
4250 }
4251 boolean match(Matcher matcher, int i, CharSequence seq) {
4252 int n = 0;
4253 int to = matcher.to;
4254 // greedy, all the way down
4255 while (i < to) {
4256 int ch = Character.codePointAt(seq, i);
4257 if (!predicate.is(ch))
4258 break;
4259 i += Character.charCount(ch);
4260 n++;
4261 }
4262 if (i >= to) {
4263 matcher.hitEnd = true;
4264 }
4265 while (n >= cmin) {
4266 if (next.match(matcher, i, seq))
4267 return true;
4268 if (n == cmin)
4269 return false;
4270 // backing off if match fails
4271 int ch = Character.codePointBefore(seq, i);
4272 i -= Character.charCount(ch);
4273 n--;
4274 }
4275 return false;
4276 }
4277
4278 boolean study(TreeInfo info) {
4279 info.minLength += cmin;
4280 if (info.maxValid) {
4281 info.maxLength += MAX_REPS;
4282 }
4283 info.deterministic = false;
4284 return next.study(info);
4285 }
4286 }
4287
4288 static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
4289
4290 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
4291 super(bcp, cmin);
4292 }
4293
4294 boolean match(Matcher matcher, int i, CharSequence seq) {
4295 int n = 0;
4296 int to = matcher.to;
4297 while (i < to && predicate.is(seq.charAt(i))) {
4298 i++; n++;
4299 }
4300 if (i >= to) {
4301 matcher.hitEnd = true;
4302 }
4303 while (n >= cmin) {
4304 if (next.match(matcher, i, seq))
4305 return true;
4306 i--; n--; // backing off if match fails
4307 }
4308 return false;
4309 }
4310 }
4311
4312 /**
4313 * Handles the curly-brace style repetition with a specified minimum and
4314 * maximum occurrences. The * quantifier is handled as a special case.
4315 * This class handles the three types.
4316 */
4317 static final class Curly extends Node {
4318 Node atom;
4319 Qtype type;
4320 int cmin;
4321 int cmax;
4322
4323 Curly(Node node, int cmin, int cmax, Qtype type) {
4324 this.atom = node;
4325 this.type = type;
4326 this.cmin = cmin;
4327 this.cmax = cmax;
4328 }
4329 boolean match(Matcher matcher, int i, CharSequence seq) {
4330 int j;
4331 for (j = 0; j < cmin; j++) {
4332 if (atom.match(matcher, i, seq)) {
4333 i = matcher.last;
4334 continue;
4335 }
4336 return false;
4337 }
4338 if (type == Qtype.GREEDY)
4339 return match0(matcher, i, j, seq);
4340 else if (type == Qtype.LAZY)
4341 return match1(matcher, i, j, seq);
4342 else
4343 return match2(matcher, i, j, seq);
4344 }
4345 // Greedy match.
4346 // i is the index to start matching at
4347 // j is the number of atoms that have matched
4348 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4349 if (j >= cmax) {
4350 // We have matched the maximum... continue with the rest of
4351 // the regular expression
4352 return next.match(matcher, i, seq);
4353 }
4354 int backLimit = j;
4355 while (atom.match(matcher, i, seq)) {
4356 // k is the length of this match
4357 int k = matcher.last - i;
4358 if (k == 0) // Zero length match
4359 break;
4360 // Move up index and number matched
4442 }
4443
4444 if (info.deterministic && cmin == cmax)
4445 info.deterministic = detm;
4446 else
4447 info.deterministic = false;
4448 return next.study(info);
4449 }
4450 }
4451
4452 /**
4453 * Handles the curly-brace style repetition with a specified minimum and
4454 * maximum occurrences in deterministic cases. This is an iterative
4455 * optimization over the Prolog and Loop system which would handle this
4456 * in a recursive way. The * quantifier is handled as a special case.
4457 * If capture is true then this class saves group settings and ensures
4458 * that groups are unset when backing off of a group match.
4459 */
4460 static final class GroupCurly extends Node {
4461 Node atom;
4462 Qtype type;
4463 int cmin;
4464 int cmax;
4465 int localIndex;
4466 int groupIndex;
4467 boolean capture;
4468
4469 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
4470 int group, boolean capture) {
4471 this.atom = node;
4472 this.type = type;
4473 this.cmin = cmin;
4474 this.cmax = cmax;
4475 this.localIndex = local;
4476 this.groupIndex = group;
4477 this.capture = capture;
4478 }
4479 boolean match(Matcher matcher, int i, CharSequence seq) {
4480 int[] groups = matcher.groups;
4481 int[] locals = matcher.locals;
4482 int save0 = locals[localIndex];
4483 int save1 = 0;
4484 int save2 = 0;
4485
4486 if (capture) {
4487 save1 = groups[groupIndex];
4488 save2 = groups[groupIndex+1];
4489 }
4490
4491 // Notify GroupTail there is no need to setup group info
4492 // because it will be set here
4493 locals[localIndex] = -1;
4494
4495 boolean ret = true;
4496 for (int j = 0; j < cmin; j++) {
4497 if (atom.match(matcher, i, seq)) {
4498 if (capture) {
4499 groups[groupIndex] = i;
4500 groups[groupIndex+1] = matcher.last;
4501 }
4502 i = matcher.last;
4503 } else {
4504 ret = false;
4505 break;
4506 }
4507 }
4508 if (ret) {
4509 if (type == Qtype.GREEDY) {
4510 ret = match0(matcher, i, cmin, seq);
4511 } else if (type == Qtype.LAZY) {
4512 ret = match1(matcher, i, cmin, seq);
4513 } else {
4514 ret = match2(matcher, i, cmin, seq);
4515 }
4516 }
4517 if (!ret) {
4518 locals[localIndex] = save0;
4519 if (capture) {
4520 groups[groupIndex] = save1;
4521 groups[groupIndex+1] = save2;
4522 }
4523 }
4524 return ret;
4525 }
4526 // Aggressive group match
4527 boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4528 // don't back off passing the starting "j"
4529 int min = j;
4530 int[] groups = matcher.groups;
4531 int save0 = 0;
4737
4738 info.minLength += minL;
4739 info.maxLength += maxL;
4740 info.maxValid &= maxV;
4741 info.deterministic = false;
4742 return false;
4743 }
4744 }
4745
4746 /**
4747 * The GroupHead saves the location where the group begins in the locals
4748 * and restores them when the match is done.
4749 *
4750 * The matchRef is used when a reference to this group is accessed later
4751 * in the expression. The locals will have a negative value in them to
4752 * indicate that we do not want to unset the group if the reference
4753 * doesn't match.
4754 */
4755 static final class GroupHead extends Node {
4756 int localIndex;
4757 GroupTail tail; // for debug/print only, match does not need to know
4758 GroupHead(int localCount) {
4759 localIndex = localCount;
4760 }
4761 boolean match(Matcher matcher, int i, CharSequence seq) {
4762 int save = matcher.locals[localIndex];
4763 matcher.locals[localIndex] = i;
4764 boolean ret = next.match(matcher, i, seq);
4765 matcher.locals[localIndex] = save;
4766 return ret;
4767 }
4768 boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4769 int save = matcher.locals[localIndex];
4770 matcher.locals[localIndex] = ~i; // HACK
4771 boolean ret = next.match(matcher, i, seq);
4772 matcher.locals[localIndex] = save;
4773 return ret;
4774 }
4775 }
4776
4777 /**
4845 }
4846 boolean match(Matcher matcher, int i, CharSequence seq) {
4847 return loop.matchInit(matcher, i, seq);
4848 }
4849 boolean study(TreeInfo info) {
4850 return loop.study(info);
4851 }
4852 }
4853
4854 /**
4855 * Handles the repetition count for a greedy Curly. The matchInit
4856 * is called from the Prolog to save the index of where the group
4857 * beginning is stored. A zero length group check occurs in the
4858 * normal match but is skipped in the matchInit.
4859 */
4860 static class Loop extends Node {
4861 Node body;
4862 int countIndex; // local count index in matcher locals
4863 int beginIndex; // group beginning index
4864 int cmin, cmax;
4865 int posIndex;
4866 Loop(int countIndex, int beginIndex) {
4867 this.countIndex = countIndex;
4868 this.beginIndex = beginIndex;
4869 this.posIndex = -1;
4870 }
4871 boolean match(Matcher matcher, int i, CharSequence seq) {
4872 // Avoid infinite loop in zero-length case.
4873 if (i > matcher.locals[beginIndex]) {
4874 int count = matcher.locals[countIndex];
4875
4876 // This block is for before we reach the minimum
4877 // iterations required for the loop to match
4878 if (count < cmin) {
4879 matcher.locals[countIndex] = count + 1;
4880 boolean b = body.match(matcher, i, seq);
4881 // If match failed we must backtrack, so
4882 // the loop count should NOT be incremented
4883 if (!b)
4884 matcher.locals[countIndex] = count;
4885 // Return success or failure since we are under
4886 // minimum
4887 return b;
4888 }
4889 // This block is for after we have the minimum
4890 // iterations required for the loop to match
4891 if (count < cmax) {
4892 // Let's check if we have already tried and failed
4893 // at this starting position "i" in the past.
4894 // If yes, then just return false wihtout trying
4895 // again, to stop the exponential backtracking.
4896 if (posIndex != -1 &&
4897 matcher.localsPos[posIndex].contains(i)) {
4898 return next.match(matcher, i, seq);
4899 }
4900 matcher.locals[countIndex] = count + 1;
4901 boolean b = body.match(matcher, i, seq);
4902 // If match failed we must backtrack, so
4903 // the loop count should NOT be incremented
4904 if (b)
4905 return true;
4906 matcher.locals[countIndex] = count;
4907 // save the failed position
4908 if (posIndex != -1) {
4909 matcher.localsPos[posIndex].add(i);
4910 }
4911 }
4912 }
4913 return next.match(matcher, i, seq);
4914 }
4915 boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4916 int save = matcher.locals[countIndex];
4917 boolean ret = false;
4918 if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
4919 matcher.localsPos[posIndex] = new IntHashSet();
4920 }
4921 if (0 < cmin) {
4922 matcher.locals[countIndex] = 1;
4923 ret = body.match(matcher, i, seq);
4924 } else if (0 < cmax) {
4925 matcher.locals[countIndex] = 1;
4926 ret = body.match(matcher, i, seq);
4927 if (ret == false)
4928 ret = next.match(matcher, i, seq);
4929 } else {
4930 ret = next.match(matcher, i, seq);
4931 }
4932 matcher.locals[countIndex] = save;
4933 return ret;
4934 }
4935 boolean study(TreeInfo info) {
4936 info.maxValid = false;
4937 info.deterministic = false;
4938 return false;
4939 }
4940 }
5347 int startIndex = (!matcher.transparentBounds) ?
5348 matcher.from : 0;
5349 int from = Math.max(i - rmaxChars, startIndex);
5350 matcher.lookbehindTo = i;
5351 // Relax transparent region boundaries for lookbehind
5352 if (matcher.transparentBounds)
5353 matcher.from = 0;
5354 for (int j = i - rminChars;
5355 !conditionMatched && j >= from;
5356 j -= j>from ? countChars(seq, j, -1) : 1) {
5357 conditionMatched = cond.match(matcher, j, seq);
5358 }
5359 //Reinstate region boundaries
5360 matcher.from = savedFrom;
5361 matcher.lookbehindTo = savedLBT;
5362 return !conditionMatched && next.match(matcher, i, seq);
5363 }
5364 }
5365
5366 /**
5367 * Handles word boundaries. Includes a field to allow this one class to
5368 * deal with the different types of word boundaries we can match. The word
5369 * characters include underscores, letters, and digits. Non spacing marks
5370 * can are also part of a word if they have a base character, otherwise
5371 * they are ignored for purposes of finding word boundaries.
5372 */
5373 static final class Bound extends Node {
5374 static int LEFT = 0x1;
5375 static int RIGHT= 0x2;
5376 static int BOTH = 0x3;
5377 static int NONE = 0x4;
5378 int type;
5379 boolean useUWORD;
5380 Bound(int n, boolean useUWORD) {
5381 type = n;
5382 this.useUWORD = useUWORD;
5383 }
5384
5385 boolean isWord(int ch) {
5386 return useUWORD ? CharPredicates.WORD.is(ch)
5387 : (ch == '_' || Character.isLetterOrDigit(ch));
5388 }
5389
5390 int check(Matcher matcher, int i, CharSequence seq) {
5391 int ch;
5392 boolean left = false;
5393 int startIndex = matcher.from;
5394 int endIndex = matcher.to;
5395 if (matcher.transparentBounds) {
5396 startIndex = 0;
5397 endIndex = matcher.getTextLength();
5398 }
5399 if (i > startIndex) {
5400 ch = Character.codePointBefore(seq, i);
5401 left = (isWord(ch) ||
5402 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5403 && hasBaseCharacter(matcher, i-1, seq)));
5404 }
5405 boolean right = false;
5406 if (i < endIndex) {
5612 i += countChars(seq, i, n);
5613 continue NEXT;
5614 }
5615 }
5616 // Entire pattern matched starting at i
5617 matcher.first = i;
5618 boolean ret = next.match(matcher, i + lengthInChars, seq);
5619 if (ret) {
5620 matcher.first = i;
5621 matcher.groups[0] = matcher.first;
5622 matcher.groups[1] = matcher.last;
5623 return true;
5624 }
5625 i += countChars(seq, i, 1);
5626 }
5627 matcher.hitEnd = true;
5628 return false;
5629 }
5630 }
5631
5632 @FunctionalInterface
5633 static interface CharPredicate {
5634 boolean is(int ch);
5635
5636 default CharPredicate and(CharPredicate p) {
5637 return ch -> is(ch) && p.is(ch);
5638 }
5639 default CharPredicate union(CharPredicate p) {
5640 return ch -> is(ch) || p.is(ch);
5641 }
5642 default CharPredicate union(CharPredicate p1,
5643 CharPredicate p2 ) {
5644 return ch -> is(ch) || p1.is(ch) || p2.is(ch);
5645 }
5646 default CharPredicate negate() {
5647 return ch -> !is(ch);
5648 }
5649 }
5650
5651 static interface BmpCharPredicate extends CharPredicate {
5652
5653 default CharPredicate and(CharPredicate p) {
5654 if(p instanceof BmpCharPredicate)
5655 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
5656 return ch -> is(ch) && p.is(ch);
5657 }
5658 default CharPredicate union(CharPredicate p) {
5659 if (p instanceof BmpCharPredicate)
5660 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
5661 return ch -> is(ch) || p.is(ch);
5662 }
5663 static CharPredicate union(CharPredicate... predicates) {
5664 CharPredicate cp = ch -> {
5665 for (CharPredicate p : predicates) {
5666 if (!p.is(ch))
5667 return false;
5668 }
5669 return true;
5670 };
5671 for (CharPredicate p : predicates) {
5672 if (! (p instanceof BmpCharPredicate))
5673 return cp;
5674 }
5675 return (BmpCharPredicate)cp;
5676 }
5677 }
5678
5679 /**
5680 * matches a Perl vertical whitespace
5681 */
5682 static BmpCharPredicate VertWS = cp ->
5683 (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
5684
5685 /**
5686 * matches a Perl horizontal whitespace
5687 */
5688 static BmpCharPredicate HorizWS = cp ->
5689 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
5690 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
5691 cp == 0x205f || cp == 0x3000;
5692
5693 /**
5694 * for the Unicode category ALL and the dot metacharacter when
5695 * in dotall mode.
5696 */
5697 static CharPredicate ALL = ch -> true;
5698
5699 /**
5700 * for the dot metacharacter when dotall is not enabled.
5701 */
5702 static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
5703 && (ch|1) != '\u2029'
5704 && ch != '\u0085');
5705 /**
5706 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
5707 */
5708 static CharPredicate UNIXDOT = ch -> ch != '\n';
5709
5710 /**
5711 * Indicate that matches a Supplementary Unicode character
5712 */
5713 static CharPredicate SingleS(int c) {
5714 return ch -> ch == c;
5715 }
5716
5717 /**
5718 * A bmp/optimized predicate of single
5719 */
5720 static BmpCharPredicate Single(int c) {
5721 return ch -> ch == c;
5722 }
5723
5724 /**
5725 * Case insensitive matches a given BMP character
5726 */
5727 static BmpCharPredicate SingleI(int lower, int upper) {
5728 return ch -> ch == lower || ch == upper;
5729 }
5730
5731 /**
5732 * Unicode case insensitive matches a given Unicode character
5733 */
5734 static CharPredicate SingleU(int lower) {
5735 return ch -> lower == ch ||
5736 lower == Character.toLowerCase(Character.toUpperCase(ch));
5737 }
5738
5739 private static boolean inRange(int lower, int ch, int upper) {
5740 return lower <= ch && ch <= upper;
5741 }
5742
5743 /**
5744 * Charactrs within a explicit value range
5745 */
5746 static CharPredicate Range(int lower, int upper) {
5747 if (upper < Character.MIN_HIGH_SURROGATE ||
5748 lower > Character.MAX_HIGH_SURROGATE &&
5749 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
5750 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
5751 return ch -> inRange(lower, ch, upper);
5752 }
5753
5754 /**
5755 * Charactrs within a explicit value range in a case insensitive manner.
5756 */
5757 static CharPredicate CIRange(int lower, int upper) {
5758 return ch -> inRange(lower, ch, upper) ||
5759 ASCII.isAscii(ch) &&
5760 (inRange(lower, ASCII.toUpper(ch), upper) ||
5761 inRange(lower, ASCII.toLower(ch), upper));
5762 }
5763
5764 static CharPredicate CIRangeU(int lower, int upper) {
5765 return ch -> {
5766 if (inRange(lower, ch, upper))
5767 return true;
5768 int up = Character.toUpperCase(ch);
5769 return inRange(lower, up, upper) ||
5770 inRange(lower, Character.toLowerCase(up), upper);
5771 };
5772 }
5773
5774 /**
5775 * This must be the very first initializer.
5776 */
5777 static final Node accept = new Node();
5778
5779 static final Node lastAccept = new LastNode();
5780
5781 /**
5782 * Creates a predicate which can be used to match a string.
5783 *
5784 * @return The predicate which can be used for matching on a string
5785 * @since 1.8
5786 */
5787 public Predicate<String> asPredicate() {
5788 return s -> matcher(s).find();
5789 }
5790
5791 /**
5792 * Creates a stream from the given input sequence around matches of this
5793 * pattern.
5794 *
5795 * <p> The stream returned by this method contains each substring of the
5796 * input sequence that is terminated by another subsequence that matches
5797 * this pattern or is terminated by the end of the input sequence. The
5798 * substrings in the stream are in the order in which they occur in the
5799 * input. Trailing empty strings will be discarded and not encountered in
5800 * the stream.
5801 *
|