src/java.base/share/classes/java/util/regex/Pattern.java
Print this page
@@ -24,15 +24,18 @@
*/
package java.util.regex;
import java.text.Normalizer;
+import java.text.Normalizer.Form;
import java.util.Locale;
import java.util.Iterator;
import java.util.Map;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.Set;
import java.util.Arrays;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Predicate;
@@ -982,10 +985,15 @@
* Temporary storage used by parsing pattern slice.
*/
transient int[] buffer;
/**
+ * A temporary storage used for predicate for double return.
+ */
+ transient CharPredicate predicate;
+
+ /**
* Map the "name" of the "named capturing group" to its group id
* node.
*/
transient volatile Map<String, Integer> namedGroups;
@@ -1024,11 +1032,11 @@
/**
* If the Start node might possibly match supplementary characters.
* It is set to true during compiling if
* (1) There is supplementary char in pattern, or
- * (2) There is complement node of Category or Block
+ * (2) There is complement node of a "family" CharProperty
*/
private transient boolean hasSupplementary;
/**
* Compiles the given regular expression into a pattern.
@@ -1376,138 +1384,145 @@
matchRoot = lastAccept;
}
}
/**
- * The pattern is converted to normalized form ({@linkplain
- * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
- * and then a pure group is constructed to match canonical
- * equivalences of the characters.
+ * The pattern is converted to normalized form ({@link
+ * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
+ * followed by canonical composition for the character class
+ * part, and {@link java.text.Normalizer.Form.NFD NFD},
+ * canonical decomposition) for the rest), and then a pure
+ * group is constructed to match canonical equivalences of the
+ * characters.
*/
- private void normalize() {
- int lastCodePoint = -1;
-
- // Convert pattern into normalized form
- normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
- patternLength = normalizedPattern.length();
-
- // Modify pattern to match canonical equivalences
- StringBuilder newPattern = new StringBuilder(patternLength);
- for(int i=0; i<patternLength; ) {
- int c = normalizedPattern.codePointAt(i);
- StringBuilder sequenceBuffer;
- if ((Character.getType(c) == Character.NON_SPACING_MARK)
- && (lastCodePoint != -1)) {
- sequenceBuffer = new StringBuilder();
- sequenceBuffer.appendCodePoint(lastCodePoint);
- sequenceBuffer.appendCodePoint(c);
- while(Character.getType(c) == Character.NON_SPACING_MARK) {
- i += Character.charCount(c);
- if (i >= patternLength)
- break;
- c = normalizedPattern.codePointAt(i);
- sequenceBuffer.appendCodePoint(c);
+ private static String normalize(String pattern) {
+ int plen = pattern.length();
+ StringBuilder pbuf = new StringBuilder(plen);
+ char last = 0;
+ int lastStart = 0;
+ char cc = 0;
+ for (int i = 0; i < plen;) {
+ char c = pattern.charAt(i);
+ if (cc == 0 && // top level
+ c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
+ i += 2; last = 0;
+ continue;
}
- String ea = produceEquivalentAlternation(
- sequenceBuffer.toString());
- newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
- newPattern.append("(?:").append(ea).append(")");
- } else if (c == '[' && lastCodePoint != '\\') {
- i = normalizeCharClass(newPattern, i);
- } else {
- newPattern.appendCodePoint(c);
+ if (c == '[' && last != '\\') {
+ if (cc == 0) {
+ if (lastStart < i)
+ normalizeSlice(pattern, lastStart, i, pbuf);
+ lastStart = i;
+ }
+ cc++;
+ } else if (c == ']' && last != '\\') {
+ cc--;
+ if (cc == 0) {
+ normalizeClazz(pattern, lastStart, i + 1, pbuf);
+ lastStart = i + 1;
}
- lastCodePoint = c;
- i += Character.charCount(c);
}
- normalizedPattern = newPattern.toString();
+ last = c;
+ i++;
+ }
+ assert (cc == 0);
+ if (lastStart < plen)
+ normalizeSlice(pattern, lastStart, plen, pbuf);
+ return pbuf.toString();
}
- /**
- * Complete the character class being parsed and add a set
- * of alternations to it that will match the canonical equivalences
- * of the characters within the class.
- */
- private int normalizeCharClass(StringBuilder newPattern, int i) {
- StringBuilder charClass = new StringBuilder();
- StringBuilder eq = null;
- int lastCodePoint = -1;
- String result;
-
- i++;
- charClass.append("[");
- while(true) {
- int c = normalizedPattern.codePointAt(i);
- StringBuilder sequenceBuffer;
-
- if (c == ']' && lastCodePoint != '\\') {
- charClass.append((char)c);
- break;
- } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
- sequenceBuffer = new StringBuilder();
- sequenceBuffer.appendCodePoint(lastCodePoint);
- while(Character.getType(c) == Character.NON_SPACING_MARK) {
- sequenceBuffer.appendCodePoint(c);
- i += Character.charCount(c);
- if (i >= normalizedPattern.length())
+ private static void normalizeSlice(String src, int off, int limit,
+ StringBuilder dst)
+ {
+ int len = src.length();
+ int off0 = off;
+ while (off < limit && ASCII.isAscii(src.charAt(off))) {
+ off++;
+ }
+ if (off == limit) {
+ dst.append(src, off0, limit);
+ return;
+ }
+ off--;
+ if (off < off0)
+ off = off0;
+ else
+ dst.append(src, off0, off);
+ while (off < limit) {
+ int ch0 = src.codePointAt(off);
+ if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
+ dst.append((char)ch0);
+ off++;
+ continue;
+ }
+ int j = off + Character.charCount(ch0);
+ int ch1;
+ while (j < limit) {
+ ch1 = src.codePointAt(j);
+ if (Grapheme.isBoundary(ch0, ch1))
break;
- c = normalizedPattern.codePointAt(i);
+ ch0 = ch1;
+ j += Character.charCount(ch1);
}
- String ea = produceEquivalentAlternation(
- sequenceBuffer.toString());
-
- charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
- if (eq == null)
- eq = new StringBuilder();
- eq.append('|');
- eq.append(ea);
- } else {
- charClass.appendCodePoint(c);
- i++;
+ String seq = src.substring(off, j);
+ String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
+ off = j;
+ if (nfd.length() > 1) {
+ ch0 = nfd.codePointAt(0);
+ ch1 = nfd.codePointAt(Character.charCount(ch0));
+ if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
+ Set<String> altns = new LinkedHashSet<>();
+ altns.add(seq);
+ produceEquivalentAlternation(nfd, altns);
+ dst.append("(?:");
+ altns.forEach( s -> dst.append(s + "|"));
+ dst.delete(dst.length() - 1, dst.length());
+ dst.append(")");
+ continue;
}
- if (i == normalizedPattern.length())
- throw error("Unclosed character class");
- lastCodePoint = c;
}
-
- if (eq != null) {
- result = "(?:"+charClass.toString()+eq.toString()+")";
- } else {
- result = charClass.toString();
+ String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
+ if (!seq.equals(nfc) && !nfd.equals(nfc))
+ dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")");
+ else if (!seq.equals(nfd))
+ dst.append("(?:" + seq + "|" + nfd + ")");
+ else
+ dst.append(seq);
+ }
}
- newPattern.append(result);
- return i;
+ private static void normalizeClazz(String src, int off, int limit,
+ StringBuilder dst)
+ {
+ dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
}
/**
* Given a specific sequence composed of a regular character and
* combining marks that follow it, produce the alternation that will
* match all canonical equivalences of that sequence.
*/
- private String produceEquivalentAlternation(String source) {
- int len = countChars(source, 0, 1);
- if (source.length() == len)
- // source has one character.
- return source;
-
- String base = source.substring(0,len);
- String combiningMarks = source.substring(len);
-
+ private static void produceEquivalentAlternation(String src,
+ Set<String> dst)
+ {
+ int len = countChars(src, 0, 1);
+ if (src.length() == len) {
+ dst.add(src); // source has one character.
+ return;
+ }
+ String base = src.substring(0,len);
+ String combiningMarks = src.substring(len);
String[] perms = producePermutations(combiningMarks);
- StringBuilder result = new StringBuilder(source);
-
// Add combined permutations
- for(int x=0; x<perms.length; x++) {
+ for(int x = 0; x < perms.length; x++) {
String next = base + perms[x];
- if (x>0)
- result.append("|"+next);
+ dst.add(next);
next = composeOneStep(next);
- if (next != null)
- result.append("|"+produceEquivalentAlternation(next));
+ if (next != null) {
+ produceEquivalentAlternation(next, dst);
+ }
}
- return result.toString();
}
/**
* Returns an array of strings that have all the possible
* permutations of the characters in the input string.
@@ -1515,11 +1530,11 @@
* of a set of combining marks. Note that some of the permutations
* are invalid because of combining class collisions, and these
* possibilities must be removed because they are not canonically
* equivalent.
*/
- private String[] producePermutations(String input) {
+ private static String[] producePermutations(String input) {
if (input.length() == countChars(input, 0, 1))
return new String[] {input};
if (input.length() == countChars(input, 0, 2)) {
int c0 = Character.codePointAt(input, 0);
@@ -1573,26 +1588,25 @@
String[] result = new String[index];
System.arraycopy(temp, 0, result, 0, index);
return result;
}
- private int getClass(int c) {
+ private static int getClass(int c) {
return sun.text.Normalizer.getCombiningClass(c);
}
/**
* Attempts to compose input by combining the first character
* with the first combining mark following it. Returns a String
* that is the composition of the leading character with its first
* combining mark followed by the remaining combining marks. Returns
* null if the first two characters cannot be further composed.
*/
- private String composeOneStep(String input) {
+ private static String composeOneStep(String input) {
int len = countChars(input, 0, 2);
String firstTwoCharacters = input.substring(0, len);
String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
-
if (result.equals(firstTwoCharacters))
return null;
else {
String remainder = input.substring(len);
return result + remainder;
@@ -1675,11 +1689,11 @@
* of the expression which will create the object tree.
*/
private void compile() {
// Handle canonical equivalences
if (has(CANON_EQ) && !has(LITERAL)) {
- normalize();
+ normalizedPattern = normalize(pattern);
} else {
normalizedPattern = pattern;
}
patternLength = normalizedPattern.length();
@@ -1752,48 +1766,10 @@
}
return groups;
}
/**
- * Used to print out a subtree of the Pattern to help with debugging.
- */
- private static void printObjectTree(Node node) {
- while(node != null) {
- if (node instanceof Prolog) {
- System.out.println(node);
- printObjectTree(((Prolog)node).loop);
- System.out.println("**** end contents prolog loop");
- } else if (node instanceof Loop) {
- System.out.println(node);
- printObjectTree(((Loop)node).body);
- System.out.println("**** end contents Loop body");
- } else if (node instanceof Curly) {
- System.out.println(node);
- printObjectTree(((Curly)node).atom);
- System.out.println("**** end contents Curly body");
- } else if (node instanceof GroupCurly) {
- System.out.println(node);
- printObjectTree(((GroupCurly)node).atom);
- System.out.println("**** end contents GroupCurly body");
- } else if (node instanceof GroupTail) {
- System.out.println(node);
- System.out.println("Tail next is "+node.next);
- return;
- } else {
- System.out.println(node);
- }
- node = node.next;
- if (node != null)
- System.out.println("->next:");
- if (node == Pattern.accept) {
- System.out.println("Accept Node");
- node = null;
- }
- }
- }
-
- /**
* Used to accumulate information about a subtree of the object graph
* so that optimizations can be applied to the subtree.
*/
static final class TreeInfo {
int minLength;
@@ -2081,11 +2057,14 @@
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[':
- node = clazz(true);
+ if (has(CANON_EQ) && !has(LITERAL))
+ node = new NFCCharProperty(clazz(true));
+ else
+ node = newCharProperty(clazz(true));
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
boolean oneLetter = true;
@@ -2094,11 +2073,15 @@
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
- node = family(oneLetter, comp);
+ // node = newCharProperty(family(oneLetter, comp));
+ if (has(CANON_EQ) && !has(LITERAL))
+ node = new NFCCharProperty(family(oneLetter, comp));
+ else
+ node = newCharProperty(family(oneLetter, comp));
} else {
unread();
node = atom();
}
break;
@@ -2121,16 +2104,16 @@
node = new Dollar(has(MULTILINE));
break;
case '.':
next();
if (has(DOTALL)) {
- node = new All();
+ node = new CharProperty(ALL);
} else {
- if (has(UNIX_LINES))
- node = new UnixDot();
- else {
- node = new Dot();
+ if (has(UNIX_LINES)) {
+ node = new CharProperty(UNIXDOT);
+ } else {
+ node = new CharProperty(DOT);
}
}
break;
case '|':
case ')':
@@ -2153,11 +2136,10 @@
node = atom();
break;
}
node = closure(node);
-
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
@@ -2211,11 +2193,14 @@
ch = next(); // Consume { if present
if (ch != '{')
unread();
else
oneLetter = false;
- return family(oneLetter, comp);
+ if (has(CANON_EQ) && !has(LITERAL))
+ return new NFCCharProperty(family(oneLetter, comp));
+ else
+ return newCharProperty(family(oneLetter, comp));
}
}
unread();
prev = cursor;
ch = escape(false, first == 0, false);
@@ -2249,11 +2234,11 @@
continue;
}
break;
}
if (first == 1) {
- return newSingle(buffer[0]);
+ return newCharProperty(single(buffer[0]));
} else {
return newSlice(buffer, first, hasSupplementary);
}
}
@@ -2344,23 +2329,31 @@
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.DIGIT).complement()
- : new Ctype(ASCII.DIGIT).complement();
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+ predicate = predicate.negate();
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'E':
case 'F':
break;
case 'G':
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H':
- if (create) root = new HorizWS().complement();
+ if (create) {
+ predicate = HorizWS.negate();
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'I':
case 'J':
case 'K':
case 'L':
@@ -2375,24 +2368,36 @@
case 'R':
if (inclass) break;
if (create) root = new LineEnding();
return -1;
case 'S':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.WHITE_SPACE).complement()
- : new Ctype(ASCII.SPACE).complement();
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+ predicate = predicate.negate();
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'T':
case 'U':
break;
case 'V':
- if (create) root = new VertWS().complement();
+ if (create) {
+ predicate = VertWS.negate();
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'W':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.WORD).complement()
- : new Ctype(ASCII.WORD).complement();
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.WORD : CharPredicates.ASCII_WORD;
+ predicate = predicate.negate();
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'X':
if (inclass) break;
if (create) {
root = new XGrapheme();
@@ -2428,22 +2433,29 @@
}
return -1;
case 'c':
return c();
case 'd':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.DIGIT)
- : new Ctype(ASCII.DIGIT);
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
case 'g':
break;
case 'h':
- if (create) root = new HorizWS();
+ if (create) {
+ predicate = HorizWS;
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'i':
case 'j':
break;
case 'k':
@@ -2471,13 +2483,16 @@
case 'q':
break;
case 'r':
return '\r';
case 's':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.WHITE_SPACE)
- : new Ctype(ASCII.SPACE);
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 't':
return '\t';
case 'u':
return u();
@@ -2490,16 +2505,23 @@
// the start or end value, such as [\v-...] or [...-\v], in
// which a single definite value (0x0B) is expected. For
// compatibility concern '\013'/0x0B is returned if isrange.
if (isrange)
return '\013';
- if (create) root = new VertWS();
+ if (create) {
+ predicate = VertWS;
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'w':
- if (create) root = has(UNICODE_CHARACTER_CLASS)
- ? new Utype(UnicodeProp.WORD)
- : new Ctype(ASCII.WORD);
+ if (create) {
+ predicate = has(UNICODE_CHARACTER_CLASS) ?
+ CharPredicates.WORD : CharPredicates.ASCII_WORD;
+ if (!inclass)
+ root = newCharProperty(predicate);
+ }
return -1;
case 'x':
return x();
case 'y':
break;
@@ -2518,112 +2540,111 @@
*
* Consumes a ] on the way out if consume is true. Usually consume
* is true except for the case of [abc&&def] where def is a separate
* right hand node with "understood" brackets.
*/
- private CharProperty clazz(boolean consume) {
- CharProperty prev = null;
- CharProperty node = null;
+ private CharPredicate clazz(boolean consume) {
+ CharPredicate prev = null;
+ CharPredicate curr = null;
BitClass bits = new BitClass();
- boolean include = true;
- boolean firstInClass = true;
+ BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
+
+ boolean isNeg = false;
+ boolean hasBits = false;
int ch = next();
- for (;;) {
- switch (ch) {
- case '^':
+
// Negates if first char in a class, otherwise literal
- if (firstInClass) {
- if (temp[cursor-1] != '[')
- break;
+ if (ch == '^' && temp[cursor-1] == '[') {
ch = next();
- include = !include;
- continue;
- } else {
- // ^ not first in class, treat as literal
- break;
+ isNeg = true;
}
+ for (;;) {
+ switch (ch) {
case '[':
- firstInClass = false;
- node = clazz(true);
+ curr = clazz(true);
if (prev == null)
- prev = node;
+ prev = curr;
else
- prev = union(prev, node);
+ prev = prev.union(curr);
ch = peek();
continue;
case '&':
- firstInClass = false;
ch = next();
if (ch == '&') {
ch = next();
- CharProperty rightNode = null;
+ CharPredicate right = null;
while (ch != ']' && ch != '&') {
if (ch == '[') {
- if (rightNode == null)
- rightNode = clazz(true);
+ if (right == null)
+ right = clazz(true);
else
- rightNode = union(rightNode, clazz(true));
+ right = right.union(clazz(true));
} else { // abc&&def
unread();
- rightNode = clazz(false);
+ right = clazz(false);
}
ch = peek();
}
- if (rightNode != null)
- node = rightNode;
+ if (hasBits) {
+ // bits used, union has high precedence
if (prev == null) {
- if (rightNode == null)
+ prev = curr = bitsP;
+ } else {
+ prev = prev.union(bitsP);
+ }
+ hasBits = false;
+ }
+ if (right != null)
+ curr = right;
+ if (prev == null) {
+ if (right == null)
throw error("Bad class syntax");
else
- prev = rightNode;
+ prev = right;
} else {
- prev = intersection(prev, node);
+ prev = prev.and(curr);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
- firstInClass = false;
if (cursor >= patternLength)
throw error("Unclosed character class");
break;
case ']':
- firstInClass = false;
- if (prev != null) {
+ if (prev != null || hasBits) {
if (consume)
next();
+ if (prev == null)
+ prev = bitsP;
+ else if (hasBits)
+ prev = prev.union(bitsP);
+ if (isNeg)
+ return prev.negate();
return prev;
}
break;
default:
- firstInClass = false;
break;
}
- node = range(bits);
- if (include) {
- if (prev == null) {
- prev = node;
- } else {
- if (prev != node)
- prev = union(prev, node);
- }
+ curr = range(bits);
+ if (curr == null) { // the bits used
+ hasBits = true;
} else {
- if (prev == null) {
- prev = node.complement();
- } else {
- if (prev != node)
- prev = setDifference(prev, node);
- }
+ if (prev == null)
+ prev = curr;
+ else if (prev != curr)
+ prev = prev.union(curr);
}
ch = peek();
}
}
- private CharProperty bitsOrSingle(BitClass bits, int ch) {
+ private CharPredicate bitsOrSingle(BitClass bits, int ch) {
/* Bits can only handle codepoints in [u+0000-u+00ff] range.
Use "single" node instead of bits when dealing with unicode
case folding for codepoints listed below.
(1)Uppercase out of range: u+00ff, u+00b5
toUpperCase(u+00ff) -> u+0178
@@ -2644,20 +2665,47 @@
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
- ch == 0xc5 || ch == 0xe5))) //A+ring
- return bits.add(ch, flags());
- return newSingle(ch);
+ ch == 0xc5 || ch == 0xe5))) { //A+ring {
+ bits.add(ch, flags());
+ return null;
+ }
+ return single(ch);
+ }
+
+ /**
+ * Returns a suitably optimized, single character predicate
+ */
+ private CharPredicate single(final int ch) {
+ if (has(CASE_INSENSITIVE)) {
+ int lower, upper;
+ if (has(UNICODE_CASE)) {
+ upper = Character.toUpperCase(ch);
+ lower = Character.toLowerCase(upper);
+ // Unicode case insensitive matches
+ if (upper != lower)
+ return SingleU(lower);
+ } else if (ASCII.isAscii(ch)) {
+ lower = ASCII.toLower(ch);
+ upper = ASCII.toUpper(ch);
+ // Case insensitive matches a given BMP character
+ if (lower != upper)
+ return SingleI(lower, upper);
+ }
+ }
+ if (isSupplementary(ch))
+ return SingleS(ch);
+ return Single(ch); // Match a given BMP character
}
/**
* Parse a single character or a character range in a character class
* and return its representative node.
*/
- private CharProperty range(BitClass bits) {
+ private CharPredicate range(BitClass bits) {
int ch = peek();
if (ch == '\\') {
ch = nextEscaped();
if (ch == 'p' || ch == 'P') { // A property
boolean comp = (ch == 'P');
@@ -2672,11 +2720,11 @@
} else { // ordinary escape
boolean isrange = temp[cursor+1] == '-';
unread();
ch = escape(true, true, isrange);
if (ch == -1)
- return (CharProperty) root;
+ return predicate;
}
} else {
next();
}
if (ch >= 0) {
@@ -2694,30 +2742,33 @@
next();
}
if (m < ch) {
throw error("Illegal character range");
}
- if (has(CASE_INSENSITIVE))
- return caseInsensitiveRangeFor(ch, m);
- else
- return rangeFor(ch, m);
+ if (has(CASE_INSENSITIVE)) {
+ if (has(UNICODE_CASE))
+ return CIRangeU(ch, m);
+ return CIRange(ch, m);
+ } else {
+ return Range(ch, m);
+ }
}
}
return bitsOrSingle(bits, ch);
}
throw error("Unexpected character '"+((char)ch)+"'");
}
/**
* Parses a Unicode character family and returns its representative node.
*/
- private CharProperty family(boolean singleLetter,
- boolean maybeComplement)
+ private CharPredicate family(boolean singleLetter,
+ boolean isComplement)
{
next();
String name;
- CharProperty node = null;
+ CharPredicate p = null;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
@@ -2745,92 +2796,66 @@
String value = name.substring(i + 1);
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
switch (name) {
case "sc":
case "script":
- node = unicodeScriptPropertyFor(value);
+ p = CharPredicates.forUnicodeScript(value);
break;
case "blk":
case "block":
- node = unicodeBlockPropertyFor(value);
+ p = CharPredicates.forUnicodeBlock(value);
break;
case "gc":
case "general_category":
- node = charPropertyNodeFor(value);
+ p = CharPredicates.forProperty(value);
break;
default:
+ break;
+ }
+ if (p == null)
throw error("Unknown Unicode property {name=<" + name + ">, "
+ "value=<" + value + ">}");
- }
+
} else {
if (name.startsWith("In")) {
- // \p{inBlockName}
- node = unicodeBlockPropertyFor(name.substring(2));
+ // \p{InBlockName}
+ p = CharPredicates.forUnicodeBlock(name.substring(2));
} else if (name.startsWith("Is")) {
- // \p{isGeneralCategory} and \p{isScriptName}
+ // \p{IsGeneralCategory} and \p{IsScriptName}
name = name.substring(2);
- UnicodeProp uprop = UnicodeProp.forName(name);
- if (uprop != null)
- node = new Utype(uprop);
- if (node == null)
- node = CharPropertyNames.charPropertyFor(name);
- if (node == null)
- node = unicodeScriptPropertyFor(name);
+ p = CharPredicates.forUnicodeProperty(name);
+ if (p == null)
+ p = CharPredicates.forProperty(name);
+ if (p == null)
+ p = CharPredicates.forUnicodeScript(name);
} else {
if (has(UNICODE_CHARACTER_CLASS)) {
- UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
- if (uprop != null)
- node = new Utype(uprop);
+ p = CharPredicates.forPOSIXName(name);
}
- if (node == null)
- node = charPropertyNodeFor(name);
+ if (p == null)
+ p = CharPredicates.forProperty(name);
}
+ if (p == null)
+ throw error("Unknown character property name {In/Is" + name + "}");
}
- if (maybeComplement) {
- if (node instanceof Category || node instanceof Block)
+ if (isComplement) {
+ // it might be too expensive to detect if a complement of
+ // CharProperty can match "certain" supplementary. So just
+ // go with StartS.
hasSupplementary = true;
- node = node.complement();
- }
- return node;
- }
-
-
- /**
- * Returns a CharProperty matching all characters belong to
- * a UnicodeScript.
- */
- private CharProperty unicodeScriptPropertyFor(String name) {
- final Character.UnicodeScript script;
- try {
- script = Character.UnicodeScript.forName(name);
- } catch (IllegalArgumentException iae) {
- throw error("Unknown character script name {" + name + "}");
- }
- return new Script(script);
- }
-
- /**
- * Returns a CharProperty matching all characters in a UnicodeBlock.
- */
- private CharProperty unicodeBlockPropertyFor(String name) {
- final Character.UnicodeBlock block;
- try {
- block = Character.UnicodeBlock.forName(name);
- } catch (IllegalArgumentException iae) {
- throw error("Unknown character block name {" + name + "}");
+ p = p.negate();
}
- return new Block(block);
+ return p;
}
- /**
- * Returns a CharProperty matching all characters in a named property.
- */
- private CharProperty charPropertyNodeFor(String name) {
- CharProperty p = CharPropertyNames.charPropertyFor(name);
+ private CharProperty newCharProperty(CharPredicate p) {
if (p == null)
- throw error("Unknown character property name {" + name + "}");
- return p;
+ return null;
+ if (p instanceof BmpCharPredicate)
+ return new BmpCharProperty((BmpCharPredicate)p);
+ else
+ return new CharProperty(p);
}
/**
* Parses and returns the name of a "named capturing group", the trailing
* ">" is consumed after parsing.
@@ -2882,11 +2907,11 @@
break;
case '>': // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
- head = tail = new Ques(head, INDEPENDENT);
+ head = tail = new Ques(head, Qtype.INDEPENDENT);
break;
case '<': // (?<xxx) look behind
ch = read();
if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
// named captured group
@@ -2968,26 +2993,26 @@
return node; // Dual return
}
if (node instanceof Ques) {
Ques ques = (Ques) node;
- if (ques.type == POSSESSIVE) {
+ if (ques.type == Qtype.POSSESSIVE) {
root = node;
return node;
}
tail.next = new BranchConn();
tail = tail.next;
- if (ques.type == GREEDY) {
+ if (ques.type == Qtype.GREEDY) {
head = new Branch(head, null, tail);
} else { // Reluctant quantifier
head = new Branch(null, head, tail);
}
root = tail;
return head;
} else if (node instanceof Curly) {
Curly curly = (Curly) node;
- if (curly.type == POSSESSIVE) {
+ if (curly.type == Qtype.POSSESSIVE) {
root = node;
return node;
}
// Discover if the group is deterministic
TreeInfo info = new TreeInfo();
@@ -3000,11 +3025,11 @@
capturingGroup);
return head;
} else { // Non-deterministic
int temp = ((GroupHead) head).localIndex;
Loop loop;
- if (curly.type == GREEDY)
+ if (curly.type == Qtype.GREEDY)
loop = new Loop(this.localCount, temp);
else // Reluctant Curly
loop = new LazyLoop(this.localCount, temp);
Prolog prolog = new Prolog(loop);
this.localCount += 1;
@@ -3029,10 +3054,14 @@
int groupIndex = 0;
if (!anonymous)
groupIndex = capturingGroupCount++;
GroupHead head = new GroupHead(localIndex);
root = new GroupTail(localIndex, groupIndex);
+
+ // for debug/print only, head.match does NOT need the "tail" info
+ head.tail = (GroupTail)root;
+
if (!anonymous && groupIndex < 10)
groupNodes[groupIndex] = head;
return head;
}
@@ -3117,17 +3146,30 @@
}
}
static final int MAX_REPS = 0x7FFFFFFF;
- static final int GREEDY = 0;
-
- static final int LAZY = 1;
-
- static final int POSSESSIVE = 2;
+ static enum Qtype {
+ GREEDY, LAZY, POSSESSIVE, INDEPENDENT
+ }
- static final int INDEPENDENT = 3;
+ private Node curly(Node prev, int cmin) {
+ int ch = next();
+ if (ch == '?') {
+ next();
+ return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
+ }
+ if (prev instanceof BmpCharProperty) {
+ return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
+ } else if (prev instanceof CharProperty) {
+ return new CharPropertyGreedy((CharProperty)prev, cmin);
+ }
+ return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
+ }
/**
* Processes repetition. If the next character peeked is a quantifier
* then new nodes must be appended to handle the repetition.
* Prev could be a single or a group, so it could be a chain of nodes.
@@ -3138,36 +3180,20 @@
switch (ch) {
case '?':
ch = next();
if (ch == '?') {
next();
- return new Ques(prev, LAZY);
+ return new Ques(prev, Qtype.LAZY);
} else if (ch == '+') {
next();
- return new Ques(prev, POSSESSIVE);
+ return new Ques(prev, Qtype.POSSESSIVE);
}
- return new Ques(prev, GREEDY);
+ return new Ques(prev, Qtype.GREEDY);
case '*':
- ch = next();
- if (ch == '?') {
- next();
- return new Curly(prev, 0, MAX_REPS, LAZY);
- } else if (ch == '+') {
- next();
- return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
- }
- return new Curly(prev, 0, MAX_REPS, GREEDY);
+ return curly(prev, 0);
case '+':
- ch = next();
- if (ch == '?') {
- next();
- return new Curly(prev, 1, MAX_REPS, LAZY);
- } else if (ch == '+') {
- next();
- return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
- }
- return new Curly(prev, 1, MAX_REPS, GREEDY);
+ return curly(prev, 1);
case '{':
ch = temp[cursor+1];
if (ASCII.isDigit(ch)) {
skip();
int cmin = 0;
@@ -3192,16 +3218,16 @@
throw error("Illegal repetition range");
Curly curly;
ch = peek();
if (ch == '?') {
next();
- curly = new Curly(prev, cmin, cmax, LAZY);
+ curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
} else if (ch == '+') {
next();
- curly = new Curly(prev, cmin, cmax, POSSESSIVE);
+ curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
} else {
- curly = new Curly(prev, cmin, cmax, GREEDY);
+ curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
}
return curly;
} else {
throw error("Illegal repetition");
}
@@ -3374,14 +3400,19 @@
/**
* Creates a bit vector for matching Latin-1 values. A normal BitClass
* never matches values above Latin-1, and a complemented BitClass always
* matches values above Latin-1.
*/
- private static final class BitClass extends BmpCharProperty {
+ static final class BitClass extends BmpCharProperty {
final boolean[] bits;
- BitClass() { bits = new boolean[256]; }
- private BitClass(boolean[] bits) { this.bits = bits; }
+ BitClass() {
+ this(new boolean[256]);
+ }
+ private BitClass(boolean[] bits) {
+ super( ch -> ch < 256 && bits[ch]);
+ this.bits = bits;
+ }
BitClass add(int c, int flags) {
assert c >= 0 && c <= 255;
if ((flags & CASE_INSENSITIVE) != 0) {
if (ASCII.isAscii(c)) {
bits[ASCII.toUpper(c)] = true;
@@ -3392,36 +3423,10 @@
}
}
bits[c] = true;
return this;
}
- boolean isSatisfiedBy(int ch) {
- return ch < 256 && bits[ch];
- }
- }
-
- /**
- * Returns a suitably optimized, single character matcher.
- */
- private CharProperty newSingle(final int ch) {
- if (has(CASE_INSENSITIVE)) {
- int lower, upper;
- if (has(UNICODE_CASE)) {
- upper = Character.toUpperCase(ch);
- lower = Character.toLowerCase(upper);
- if (upper != lower)
- return new SingleU(lower);
- } else if (ASCII.isAscii(ch)) {
- lower = ASCII.toLower(ch);
- upper = ASCII.toUpper(ch);
- if (lower != upper)
- return new SingleI(lower, upper);
- }
- }
- if (isSupplementary(ch))
- return new SingleS(ch); // Match a given Unicode character
- return new Single(ch); // Match a given BMP character
}
/**
* Utility method for creating a string slice matcher.
*/
@@ -3825,22 +3830,21 @@
/**
* Abstract node class to match one character satisfying some
* boolean property.
*/
- private abstract static class CharProperty extends Node {
- abstract boolean isSatisfiedBy(int ch);
- CharProperty complement() {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return ! CharProperty.this.isSatisfiedBy(ch);}};
+ static class CharProperty extends Node {
+ CharPredicate predicate;
+
+ CharProperty (CharPredicate predicate) {
+ this.predicate = predicate;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i);
- return isSatisfiedBy(ch)
- && next.match(matcher, i+Character.charCount(ch), seq);
+ return predicate.is(ch) &&
+ next.match(matcher, i + Character.charCount(ch), seq);
} else {
matcher.hitEnd = true;
return false;
}
}
@@ -3853,151 +3857,72 @@
/**
* Optimized version of CharProperty that works only for
* properties never satisfied by Supplementary characters.
*/
- private abstract static class BmpCharProperty extends CharProperty {
+ private static class BmpCharProperty extends CharProperty {
+ BmpCharProperty (BmpCharPredicate predicate) {
+ super(predicate);
+ }
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
- return isSatisfiedBy(seq.charAt(i))
- && next.match(matcher, i+1, seq);
+ return predicate.is(seq.charAt(i)) &&
+ next.match(matcher, i + 1, seq);
} else {
matcher.hitEnd = true;
return false;
}
}
}
- /**
- * Node class that matches a Supplementary Unicode character
- */
- static final class SingleS extends CharProperty {
- final int c;
- SingleS(int c) { this.c = c; }
- boolean isSatisfiedBy(int ch) {
- return ch == c;
- }
+ private static class NFCCharProperty extends Node {
+ CharPredicate predicate;
+ NFCCharProperty (CharPredicate predicate) {
+ this.predicate = predicate;
}
- /**
- * Optimization -- matches a given BMP character
- */
- static final class Single extends BmpCharProperty {
- final int c;
- Single(int c) { this.c = c; }
- boolean isSatisfiedBy(int ch) {
- return ch == c;
- }
- }
-
- /**
- * Case insensitive matches a given BMP character
- */
- static final class SingleI extends BmpCharProperty {
- final int lower;
- final int upper;
- SingleI(int lower, int upper) {
- this.lower = lower;
- this.upper = upper;
- }
- boolean isSatisfiedBy(int ch) {
- return ch == lower || ch == upper;
- }
- }
-
- /**
- * Unicode case insensitive matches a given Unicode character
- */
- static final class SingleU extends CharProperty {
- final int lower;
- SingleU(int lower) {
- this.lower = lower;
- }
- boolean isSatisfiedBy(int ch) {
- return lower == ch ||
- lower == Character.toLowerCase(Character.toUpperCase(ch));
- }
- }
-
- /**
- * Node class that matches a Unicode block.
- */
- static final class Block extends CharProperty {
- final Character.UnicodeBlock block;
- Block(Character.UnicodeBlock block) {
- this.block = block;
- }
- boolean isSatisfiedBy(int ch) {
- return block == Character.UnicodeBlock.of(ch);
- }
- }
-
- /**
- * Node class that matches a Unicode script
- */
- static final class Script extends CharProperty {
- final Character.UnicodeScript script;
- Script(Character.UnicodeScript script) {
- this.script = script;
- }
- boolean isSatisfiedBy(int ch) {
- return script == Character.UnicodeScript.of(ch);
- }
- }
-
- /**
- * Node class that matches a Unicode category.
- */
- static final class Category extends CharProperty {
- final int typeMask;
- Category(int typeMask) { this.typeMask = typeMask; }
- boolean isSatisfiedBy(int ch) {
- return (typeMask & (1 << Character.getType(ch))) != 0;
- }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i < matcher.to) {
+ int ch0 = Character.codePointAt(seq, i);
+ int n = Character.charCount(ch0);
+ int j = i + n;
+ while (j < matcher.to) {
+ int ch1 = Character.codePointAt(seq, j);
+ if (Grapheme.isBoundary(ch0, ch1))
+ break;
+ ch0 = ch1;
+ j += Character.charCount(ch1);
}
-
- /**
- * Node class that matches a Unicode "type"
- */
- static final class Utype extends CharProperty {
- final UnicodeProp uprop;
- Utype(UnicodeProp uprop) { this.uprop = uprop; }
- boolean isSatisfiedBy(int ch) {
- return uprop.is(ch);
+ if (i + n == j) { // single, assume nfc cp
+ if (predicate.is(ch0))
+ return next.match(matcher, j, seq);
+ } else {
+ while (i + n < j) {
+ String nfc = Normalizer.normalize(
+ seq.toString().substring(i, j), Normalizer.Form.NFC);
+ if (nfc.codePointCount(0, nfc.length()) == 1) {
+ if (predicate.is(nfc.codePointAt(0)) &&
+ next.match(matcher, j, seq)) {
+ return true;
}
}
- /**
- * Node class that matches a POSIX type.
- */
- static final class Ctype extends BmpCharProperty {
- final int ctype;
- Ctype(int ctype) { this.ctype = ctype; }
- boolean isSatisfiedBy(int ch) {
- return ch < 128 && ASCII.isType(ch, ctype);
+ ch0 = Character.codePointBefore(seq, j);
+ j -= Character.charCount(ch0);
}
}
-
- /**
- * Node class that matches a Perl vertical whitespace
- */
- static final class VertWS extends BmpCharProperty {
- boolean isSatisfiedBy(int cp) {
- return (cp >= 0x0A && cp <= 0x0D) ||
- cp == 0x85 || cp == 0x2028 || cp == 0x2029;
+ if (j < matcher.to)
+ return false;
}
+ matcher.hitEnd = true;
+ return false;
}
- /**
- * Node class that matches a Perl horizontal whitespace
- */
- static final class HorizWS extends BmpCharProperty {
- boolean isSatisfiedBy(int cp) {
- return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
- cp == 0x1680 || cp == 0x180e ||
- cp >= 0x2000 && cp <= 0x200a ||
- cp == 0x202f || cp == 0x205f || cp == 0x3000;
+ boolean study(TreeInfo info) {
+ info.minLength++;
+ info.deterministic = false;
+ return next.study(info);
}
}
/**
* Node class that matches an unicode extended grapheme cluster
@@ -4215,85 +4140,17 @@
int toLower(int c) {
return Character.toLowerCase(Character.toUpperCase(c));
}
}
- private static boolean inRange(int lower, int ch, int upper) {
- return lower <= ch && ch <= upper;
- }
-
- /**
- * Returns node for matching characters within an explicit value range.
- */
- private static CharProperty rangeFor(final int lower,
- final int upper) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return inRange(lower, ch, upper);}};
- }
-
- /**
- * Returns node for matching characters within an explicit value
- * range in a case insensitive manner.
- */
- private CharProperty caseInsensitiveRangeFor(final int lower,
- final int upper) {
- if (has(UNICODE_CASE))
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- if (inRange(lower, ch, upper))
- return true;
- int up = Character.toUpperCase(ch);
- return inRange(lower, up, upper) ||
- inRange(lower, Character.toLowerCase(up), upper);}};
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return inRange(lower, ch, upper) ||
- ASCII.isAscii(ch) &&
- (inRange(lower, ASCII.toUpper(ch), upper) ||
- inRange(lower, ASCII.toLower(ch), upper));
- }};
- }
-
- /**
- * Implements the Unicode category ALL and the dot metacharacter when
- * in dotall mode.
- */
- static final class All extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return true;
- }
- }
-
- /**
- * Node class for the dot metacharacter when dotall is not enabled.
- */
- static final class Dot extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return (ch != '\n' && ch != '\r'
- && (ch|1) != '\u2029'
- && ch != '\u0085');
- }
- }
-
- /**
- * Node class for the dot metacharacter when dotall is not enabled
- * but UNIX_LINES is enabled.
- */
- static final class UnixDot extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return ch != '\n';
- }
- }
-
/**
* The 0 or 1 quantifier. This one class implements all three types.
*/
static final class Ques extends Node {
Node atom;
- int type;
- Ques(Node node, int type) {
+ Qtype type;
+ Ques(Node node, Qtype type) {
this.atom = node;
this.type = type;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
switch (type) {
@@ -4309,11 +4166,11 @@
default:
return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
}
}
boolean study(TreeInfo info) {
- if (type != INDEPENDENT) {
+ if (type != Qtype.INDEPENDENT) {
int minL = info.minLength;
atom.study(info);
info.minLength = minL;
info.deterministic = false;
return next.study(info);
@@ -4323,21 +4180,94 @@
}
}
}
/**
+ * Handles the greedy style repetition with the minimum either be
+ * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
+ */
+ static class CharPropertyGreedy extends Node {
+ final CharPredicate predicate;
+ final int cmin;
+
+ CharPropertyGreedy(CharProperty cp, int cmin) {
+ this.predicate = cp.predicate;
+ this.cmin = cmin;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int n = 0;
+ int to = matcher.to;
+ // greedy, all the way down
+ while (i < to) {
+ int ch = Character.codePointAt(seq, i);
+ if (!predicate.is(ch))
+ break;
+ i += Character.charCount(ch);
+ n++;
+ }
+ if (i >= to) {
+ matcher.hitEnd = true;
+ }
+ while (n >= cmin) {
+ if (next.match(matcher, i, seq))
+ return true;
+ if (n == cmin)
+ return false;
+ // backing off if match fails
+ int ch = Character.codePointBefore(seq, i);
+ i -= Character.charCount(ch);
+ n--;
+ }
+ return false;
+ }
+
+ boolean study(TreeInfo info) {
+ info.minLength += cmin;
+ if (info.maxValid) {
+ info.maxLength += MAX_REPS;
+ }
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
+
+ BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
+ super(bcp, cmin);
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int n = 0;
+ int to = matcher.to;
+ while (i < to && predicate.is(seq.charAt(i))) {
+ i++; n++;
+ }
+ if (i >= to) {
+ matcher.hitEnd = true;
+ }
+ while (n >= cmin) {
+ if (next.match(matcher, i, seq))
+ return true;
+ i--; n--; // backing off if match fails
+ }
+ return false;
+ }
+ }
+
+ /**
* Handles the curly-brace style repetition with a specified minimum and
* maximum occurrences. The * quantifier is handled as a special case.
* This class handles the three types.
*/
static final class Curly extends Node {
Node atom;
- int type;
+ Qtype type;
int cmin;
int cmax;
- Curly(Node node, int cmin, int cmax, int type) {
+ Curly(Node node, int cmin, int cmax, Qtype type) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
}
@@ -4348,13 +4278,13 @@
i = matcher.last;
continue;
}
return false;
}
- if (type == GREEDY)
+ if (type == Qtype.GREEDY)
return match0(matcher, i, j, seq);
- else if (type == LAZY)
+ else if (type == Qtype.LAZY)
return match1(matcher, i, j, seq);
else
return match2(matcher, i, j, seq);
}
// Greedy match.
@@ -4472,18 +4402,18 @@
* If capture is true then this class saves group settings and ensures
* that groups are unset when backing off of a group match.
*/
static final class GroupCurly extends Node {
Node atom;
- int type;
+ Qtype type;
int cmin;
int cmax;
int localIndex;
int groupIndex;
boolean capture;
- GroupCurly(Node node, int cmin, int cmax, int type, int local,
+ GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
int group, boolean capture) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
@@ -4519,13 +4449,13 @@
ret = false;
break;
}
}
if (ret) {
- if (type == GREEDY) {
+ if (type == Qtype.GREEDY) {
ret = match0(matcher, i, cmin, seq);
- } else if (type == LAZY) {
+ } else if (type == Qtype.LAZY) {
ret = match1(matcher, i, cmin, seq);
} else {
ret = match2(matcher, i, cmin, seq);
}
}
@@ -4767,10 +4697,11 @@
* indicate that we do not want to unset the group if the reference
* doesn't match.
*/
static final class GroupHead extends Node {
int localIndex;
+ GroupTail tail; // for debug/print only, match does not need to know
GroupHead(int localCount) {
localIndex = localCount;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[localIndex];
@@ -5360,40 +5291,10 @@
return !conditionMatched && next.match(matcher, i, seq);
}
}
/**
- * Returns the set union of two CharProperty nodes.
- */
- private static CharProperty union(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
- }
-
- /**
- * Returns the set intersection of two CharProperty nodes.
- */
- private static CharProperty intersection(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
- }
-
- /**
- * Returns the set difference of two CharProperty nodes.
- */
- private static CharProperty setDifference(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
- }
-
- /**
* Handles word boundaries. Includes a field to allow this one class to
* deal with the different types of word boundaries we can match. The word
* characters include underscores, letters, and digits. Non spacing marks
* can are also part of a word if they have a base character, otherwise
* they are ignored for purposes of finding word boundaries.
@@ -5409,11 +5310,11 @@
type = n;
this.useUWORD = useUWORD;
}
boolean isWord(int ch) {
- return useUWORD ? UnicodeProp.WORD.is(ch)
+ return useUWORD ? CharPredicates.WORD.is(ch)
: (ch == '_' || Character.isLetterOrDigit(ch));
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
@@ -5655,219 +5556,159 @@
matcher.hitEnd = true;
return false;
}
}
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
+ @FunctionalInterface
+ static interface CharPredicate {
+ boolean is(int ch);
+
+ default CharPredicate and(CharPredicate p) {
+ return ch -> is(ch) && p.is(ch);
+ }
+ default CharPredicate union(CharPredicate p) {
+ return ch -> is(ch) || p.is(ch);
+ }
+ default CharPredicate union(CharPredicate p1,
+ CharPredicate p2 ) {
+ return ch -> is(ch) || p1.is(ch) || p2.is(ch);
+ }
+ default CharPredicate negate() {
+ return ch -> !is(ch);
+ }
+ }
+
+ static interface BmpCharPredicate extends CharPredicate {
+
+ default CharPredicate and(CharPredicate p) {
+ if(p instanceof BmpCharPredicate)
+ return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch));
+ return ch -> is(ch) && p.is(ch);
+ }
+ default CharPredicate union(CharPredicate p) {
+ if (p instanceof BmpCharPredicate)
+ return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch));
+ return ch -> is(ch) || p.is(ch);
+ }
+ static CharPredicate union(CharPredicate... predicates) {
+ CharPredicate cp = ch -> {
+ for (CharPredicate p : predicates) {
+ if (!p.is(ch))
+ return false;
+ }
+ return true;
+ };
+ for (CharPredicate p : predicates) {
+ if (! (p instanceof BmpCharPredicate))
+ return cp;
+ }
+ return (BmpCharPredicate)cp;
+ }
+ }
+
+ /**
+ * matches a Perl vertical whitespace
+ */
+ static BmpCharPredicate VertWS = cp ->
+ (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
/**
- * This must be the very first initializer.
+ * matches a Perl horizontal whitespace
*/
- static Node accept = new Node();
+ static BmpCharPredicate HorizWS = cp ->
+ cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
+ cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
+ cp == 0x205f || cp == 0x3000;
- static Node lastAccept = new LastNode();
+ /**
+ * for the Unicode category ALL and the dot metacharacter when
+ * in dotall mode.
+ */
+ static CharPredicate ALL = ch -> true;
- private static class CharPropertyNames {
+ /**
+ * for the dot metacharacter when dotall is not enabled.
+ */
+ static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
+ && (ch|1) != '\u2029'
+ && ch != '\u0085');
+ /**
+ * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
+ */
+ static CharPredicate UNIXDOT = ch -> ch != '\n';
- static CharProperty charPropertyFor(String name) {
- CharPropertyFactory m = map.get(name);
- return m == null ? null : m.make();
+ /**
+ * Indicate that matches a Supplementary Unicode character
+ */
+ static CharPredicate SingleS(int c) {
+ return ch -> ch == c;
}
- private abstract static class CharPropertyFactory {
- abstract CharProperty make();
+ /**
+ * A bmp/optimized predicate of single
+ */
+ static BmpCharPredicate Single(int c) {
+ return ch -> ch == c;
}
- private static void defCategory(String name,
- final int typeMask) {
- map.put(name, new CharPropertyFactory() {
- CharProperty make() { return new Category(typeMask);}});
+ /**
+ * Case insensitive matches a given BMP character
+ */
+ static BmpCharPredicate SingleI(int lower, int upper) {
+ return ch -> ch == lower || ch == upper;
}
- private static void defRange(String name,
- final int lower, final int upper) {
- map.put(name, new CharPropertyFactory() {
- CharProperty make() { return rangeFor(lower, upper);}});
+ /**
+ * Unicode case insensitive matches a given Unicode character
+ */
+ static CharPredicate SingleU(int lower) {
+ return ch -> lower == ch ||
+ lower == Character.toLowerCase(Character.toUpperCase(ch));
}
- private static void defCtype(String name,
- final int ctype) {
- map.put(name, new CharPropertyFactory() {
- CharProperty make() { return new Ctype(ctype);}});
+ private static boolean inRange(int lower, int ch, int upper) {
+ return lower <= ch && ch <= upper;
}
- private abstract static class CloneableProperty
- extends CharProperty implements Cloneable
- {
- public CloneableProperty clone() {
- try {
- return (CloneableProperty) super.clone();
- } catch (CloneNotSupportedException e) {
- throw new AssertionError(e);
- }
- }
- }
-
- private static void defClone(String name,
- final CloneableProperty p) {
- map.put(name, new CharPropertyFactory() {
- CharProperty make() { return p.clone();}});
- }
-
- private static final HashMap<String, CharPropertyFactory> map
- = new HashMap<>();
-
- static {
- // Unicode character property aliases, defined in
- // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
- defCategory("Cn", 1<<Character.UNASSIGNED);
- defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
- defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
- defCategory("Lt", 1<<Character.TITLECASE_LETTER);
- defCategory("Lm", 1<<Character.MODIFIER_LETTER);
- defCategory("Lo", 1<<Character.OTHER_LETTER);
- defCategory("Mn", 1<<Character.NON_SPACING_MARK);
- defCategory("Me", 1<<Character.ENCLOSING_MARK);
- defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
- defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
- defCategory("Nl", 1<<Character.LETTER_NUMBER);
- defCategory("No", 1<<Character.OTHER_NUMBER);
- defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
- defCategory("Zl", 1<<Character.LINE_SEPARATOR);
- defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
- defCategory("Cc", 1<<Character.CONTROL);
- defCategory("Cf", 1<<Character.FORMAT);
- defCategory("Co", 1<<Character.PRIVATE_USE);
- defCategory("Cs", 1<<Character.SURROGATE);
- defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
- defCategory("Ps", 1<<Character.START_PUNCTUATION);
- defCategory("Pe", 1<<Character.END_PUNCTUATION);
- defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
- defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
- defCategory("Sm", 1<<Character.MATH_SYMBOL);
- defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
- defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
- defCategory("So", 1<<Character.OTHER_SYMBOL);
- defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
- defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
- defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
- (1<<Character.LOWERCASE_LETTER) |
- (1<<Character.TITLECASE_LETTER) |
- (1<<Character.MODIFIER_LETTER) |
- (1<<Character.OTHER_LETTER)));
- defCategory("M", ((1<<Character.NON_SPACING_MARK) |
- (1<<Character.ENCLOSING_MARK) |
- (1<<Character.COMBINING_SPACING_MARK)));
- defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
- (1<<Character.LETTER_NUMBER) |
- (1<<Character.OTHER_NUMBER)));
- defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
- (1<<Character.LINE_SEPARATOR) |
- (1<<Character.PARAGRAPH_SEPARATOR)));
- defCategory("C", ((1<<Character.CONTROL) |
- (1<<Character.FORMAT) |
- (1<<Character.PRIVATE_USE) |
- (1<<Character.SURROGATE))); // Other
- defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
- (1<<Character.START_PUNCTUATION) |
- (1<<Character.END_PUNCTUATION) |
- (1<<Character.CONNECTOR_PUNCTUATION) |
- (1<<Character.OTHER_PUNCTUATION) |
- (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
- (1<<Character.FINAL_QUOTE_PUNCTUATION)));
- defCategory("S", ((1<<Character.MATH_SYMBOL) |
- (1<<Character.CURRENCY_SYMBOL) |
- (1<<Character.MODIFIER_SYMBOL) |
- (1<<Character.OTHER_SYMBOL)));
- defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
- (1<<Character.LOWERCASE_LETTER) |
- (1<<Character.TITLECASE_LETTER)));
- defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
- (1<<Character.LOWERCASE_LETTER) |
- (1<<Character.TITLECASE_LETTER) |
- (1<<Character.MODIFIER_LETTER) |
- (1<<Character.OTHER_LETTER) |
- (1<<Character.DECIMAL_DIGIT_NUMBER)));
- defRange("L1", 0x00, 0xFF); // Latin-1
- map.put("all", new CharPropertyFactory() {
- CharProperty make() { return new All(); }});
-
- // Posix regular expression character classes, defined in
- // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
- defRange("ASCII", 0x00, 0x7F); // ASCII
- defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
- defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
- defCtype("Blank", ASCII.BLANK); // Space and tab characters
- defCtype("Cntrl", ASCII.CNTRL); // Control characters
- defRange("Digit", '0', '9'); // Numeric characters
- defCtype("Graph", ASCII.GRAPH); // printable and visible
- defRange("Lower", 'a', 'z'); // Lower-case alphabetic
- defRange("Print", 0x20, 0x7E); // Printable characters
- defCtype("Punct", ASCII.PUNCT); // Punctuation characters
- defCtype("Space", ASCII.SPACE); // Space characters
- defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
- defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
-
- // Java character properties, defined by methods in Character.java
- defClone("javaLowerCase", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isLowerCase(ch);}});
- defClone("javaUpperCase", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isUpperCase(ch);}});
- defClone("javaAlphabetic", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isAlphabetic(ch);}});
- defClone("javaIdeographic", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isIdeographic(ch);}});
- defClone("javaTitleCase", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isTitleCase(ch);}});
- defClone("javaDigit", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isDigit(ch);}});
- defClone("javaDefined", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isDefined(ch);}});
- defClone("javaLetter", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isLetter(ch);}});
- defClone("javaLetterOrDigit", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isLetterOrDigit(ch);}});
- defClone("javaJavaIdentifierStart", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isJavaIdentifierStart(ch);}});
- defClone("javaJavaIdentifierPart", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isJavaIdentifierPart(ch);}});
- defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isUnicodeIdentifierStart(ch);}});
- defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isUnicodeIdentifierPart(ch);}});
- defClone("javaIdentifierIgnorable", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isIdentifierIgnorable(ch);}});
- defClone("javaSpaceChar", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isSpaceChar(ch);}});
- defClone("javaWhitespace", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isWhitespace(ch);}});
- defClone("javaISOControl", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isISOControl(ch);}});
- defClone("javaMirrored", new CloneableProperty() {
- boolean isSatisfiedBy(int ch) {
- return Character.isMirrored(ch);}});
+ /**
+ * Charactrs within a explicit value range
+ */
+ static CharPredicate Range(int lower, int upper) {
+ if (upper < Character.MIN_HIGH_SURROGATE ||
+ lower > Character.MAX_HIGH_SURROGATE &&
+ upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
+ return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
+ return ch -> inRange(lower, ch, upper);
}
+
+ /**
+ * Charactrs within a explicit value range in a case insensitive manner.
+ */
+ static CharPredicate CIRange(int lower, int upper) {
+ return ch -> inRange(lower, ch, upper) ||
+ ASCII.isAscii(ch) &&
+ (inRange(lower, ASCII.toUpper(ch), upper) ||
+ inRange(lower, ASCII.toLower(ch), upper));
}
+ static CharPredicate CIRangeU(int lower, int upper) {
+ return ch -> {
+ if (inRange(lower, ch, upper))
+ return true;
+ int up = Character.toUpperCase(ch);
+ return inRange(lower, up, upper) ||
+ inRange(lower, Character.toLowerCase(up), upper);
+ };
+ }
+
+ /**
+ * This must be the very first initializer.
+ */
+ static Node accept = new Node();
+
+ static Node lastAccept = new LastNode();
+
/**
* Creates a predicate which can be used to match a string.
*
* @return The predicate which can be used for matching on a string
* @since 1.8