src/java.base/share/classes/java/util/regex/Pattern.java
Print this page
*** 982,991 ****
--- 982,996 ----
* Temporary storage used by parsing pattern slice.
*/
transient int[] buffer;
/**
+ * Temporary storage for the predicate that escape() hands back as a second ("double") return value.
+ */
+ transient CharPredicate predicate;
+
+ /**
* Map the "name" of the "named capturing group" to its group id
* node.
*/
transient volatile Map<String, Integer> namedGroups;
*** 1024,1034 ****
/**
* If the Start node might possibly match supplementary characters.
* It is set to true during compiling if
* (1) There is supplementary char in pattern, or
! * (2) There is complement node of Category or Block
*/
private transient boolean hasSupplementary;
/**
* Compiles the given regular expression into a pattern.
--- 1029,1039 ----
/**
* If the Start node might possibly match supplementary characters.
* It is set to true during compiling if
* (1) There is supplementary char in pattern, or
! * (2) There is complement node of a "family" CharProperty
*/
private transient boolean hasSupplementary;
/**
* Compiles the given regular expression into a pattern.
*** 1752,1799 ****
}
return groups;
}
/**
- * Used to print out a subtree of the Pattern to help with debugging.
- */
- private static void printObjectTree(Node node) {
- while(node != null) {
- if (node instanceof Prolog) {
- System.out.println(node);
- printObjectTree(((Prolog)node).loop);
- System.out.println("**** end contents prolog loop");
- } else if (node instanceof Loop) {
- System.out.println(node);
- printObjectTree(((Loop)node).body);
- System.out.println("**** end contents Loop body");
- } else if (node instanceof Curly) {
- System.out.println(node);
- printObjectTree(((Curly)node).atom);
- System.out.println("**** end contents Curly body");
- } else if (node instanceof GroupCurly) {
- System.out.println(node);
- printObjectTree(((GroupCurly)node).atom);
- System.out.println("**** end contents GroupCurly body");
- } else if (node instanceof GroupTail) {
- System.out.println(node);
- System.out.println("Tail next is "+node.next);
- return;
- } else {
- System.out.println(node);
- }
- node = node.next;
- if (node != null)
- System.out.println("->next:");
- if (node == Pattern.accept) {
- System.out.println("Accept Node");
- node = null;
- }
- }
- }
-
- /**
* Used to accumulate information about a subtree of the object graph
* so that optimizations can be applied to the subtree.
*/
static final class TreeInfo {
int minLength;
--- 1757,1766 ----
*** 2081,2091 ****
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[':
! node = clazz(true);
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
boolean oneLetter = true;
--- 2048,2058 ----
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[':
! node = newCharProperty(clazz(true));
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
boolean oneLetter = true;
*** 2094,2104 ****
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
! node = family(oneLetter, comp);
} else {
unread();
node = atom();
}
break;
--- 2061,2071 ----
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
! node = newCharProperty(family(oneLetter, comp));
} else {
unread();
node = atom();
}
break;
*** 2121,2136 ****
node = new Dollar(has(MULTILINE));
break;
case '.':
next();
if (has(DOTALL)) {
! node = new All();
} else {
! if (has(UNIX_LINES))
! node = new UnixDot();
! else {
! node = new Dot();
}
}
break;
case '|':
case ')':
--- 2088,2103 ----
node = new Dollar(has(MULTILINE));
break;
case '.':
next();
if (has(DOTALL)) {
! node = new CharProperty(ALL);
} else {
! if (has(UNIX_LINES)) {
! node = new CharProperty(UNIXDOT);
! } else {
! node = new CharProperty(DOT);
}
}
break;
case '|':
case ')':
*** 2153,2163 ****
node = atom();
break;
}
node = closure(node);
-
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
--- 2120,2129 ----
*** 2211,2221 ****
ch = next(); // Consume { if present
if (ch != '{')
unread();
else
oneLetter = false;
! return family(oneLetter, comp);
}
}
unread();
prev = cursor;
ch = escape(false, first == 0, false);
--- 2177,2187 ----
ch = next(); // Consume { if present
if (ch != '{')
unread();
else
oneLetter = false;
! return newCharProperty(family(oneLetter, comp));
}
}
unread();
prev = cursor;
ch = escape(false, first == 0, false);
*** 2249,2259 ****
continue;
}
break;
}
if (first == 1) {
! return newSingle(buffer[0]);
} else {
return newSlice(buffer, first, hasSupplementary);
}
}
--- 2215,2225 ----
continue;
}
break;
}
if (first == 1) {
! return newCharProperty(single(buffer[0]));
} else {
return newSlice(buffer, first, hasSupplementary);
}
}
*** 2344,2366 ****
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.DIGIT).complement()
! : new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
break;
case 'G':
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H':
! if (create) root = new HorizWS().complement();
return -1;
case 'I':
case 'J':
case 'K':
case 'L':
--- 2310,2340 ----
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
! predicate = predicate.negate();
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'E':
case 'F':
break;
case 'G':
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H':
! if (create) {
! predicate = HorizWS.negate();
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'I':
case 'J':
case 'K':
case 'L':
*** 2375,2398 ****
case 'R':
if (inclass) break;
if (create) root = new LineEnding();
return -1;
case 'S':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WHITE_SPACE).complement()
! : new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
break;
case 'V':
! if (create) root = new VertWS().complement();
return -1;
case 'W':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WORD).complement()
! : new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
if (inclass) break;
if (create) {
root = new XGrapheme();
--- 2349,2384 ----
case 'R':
if (inclass) break;
if (create) root = new LineEnding();
return -1;
case 'S':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
! predicate = predicate.negate();
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'T':
case 'U':
break;
case 'V':
! if (create) {
! predicate = VertWS.negate();
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'W':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.WORD : CharPredicates.ASCII_WORD;
! predicate = predicate.negate();
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'X':
if (inclass) break;
if (create) {
root = new XGrapheme();
*** 2428,2449 ****
}
return -1;
case 'c':
return c();
case 'd':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.DIGIT)
! : new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
case 'g':
break;
case 'h':
! if (create) root = new HorizWS();
return -1;
case 'i':
case 'j':
break;
case 'k':
--- 2414,2442 ----
}
return -1;
case 'c':
return c();
case 'd':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
case 'g':
break;
case 'h':
! if (create) {
! predicate = HorizWS;
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'i':
case 'j':
break;
case 'k':
*** 2471,2483 ****
case 'q':
break;
case 'r':
return '\r';
case 's':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WHITE_SPACE)
! : new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
case 'u':
return u();
--- 2464,2479 ----
case 'q':
break;
case 'r':
return '\r';
case 's':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 't':
return '\t';
case 'u':
return u();
*** 2490,2505 ****
// the start or end value, such as [\v-...] or [...-\v], in
// which a single definite value (0x0B) is expected. For
// compatibility concern '\013'/0x0B is returned if isrange.
if (isrange)
return '\013';
! if (create) root = new VertWS();
return -1;
case 'w':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WORD)
! : new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
case 'y':
break;
--- 2486,2508 ----
// the start or end value, such as [\v-...] or [...-\v], in
// which a single definite value (0x0B) is expected. For
// compatibility concern '\013'/0x0B is returned if isrange.
if (isrange)
return '\013';
! if (create) {
! predicate = VertWS;
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'w':
! if (create) {
! predicate = has(UNICODE_CHARACTER_CLASS) ?
! CharPredicates.WORD : CharPredicates.ASCII_WORD;
! if (!inclass)
! root = newCharProperty(predicate);
! }
return -1;
case 'x':
return x();
case 'y':
break;
*** 2518,2629 ****
*
* Consumes a ] on the way out if consume is true. Usually consume
* is true except for the case of [abc&&def] where def is a separate
* right hand node with "understood" brackets.
*/
! private CharProperty clazz(boolean consume) {
! CharProperty prev = null;
! CharProperty node = null;
BitClass bits = new BitClass();
! boolean include = true;
! boolean firstInClass = true;
int ch = next();
! for (;;) {
! switch (ch) {
! case '^':
// Negates if first char in a class, otherwise literal
! if (firstInClass) {
! if (temp[cursor-1] != '[')
! break;
ch = next();
! include = !include;
! continue;
! } else {
! // ^ not first in class, treat as literal
! break;
}
case '[':
! firstInClass = false;
! node = clazz(true);
if (prev == null)
! prev = node;
else
! prev = union(prev, node);
ch = peek();
continue;
case '&':
- firstInClass = false;
ch = next();
if (ch == '&') {
ch = next();
! CharProperty rightNode = null;
while (ch != ']' && ch != '&') {
if (ch == '[') {
! if (rightNode == null)
! rightNode = clazz(true);
else
! rightNode = union(rightNode, clazz(true));
} else { // abc&&def
unread();
! rightNode = clazz(false);
}
ch = peek();
}
! if (rightNode != null)
! node = rightNode;
if (prev == null) {
! if (rightNode == null)
throw error("Bad class syntax");
else
! prev = rightNode;
} else {
! prev = intersection(prev, node);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
- firstInClass = false;
if (cursor >= patternLength)
throw error("Unclosed character class");
break;
case ']':
! firstInClass = false;
! if (prev != null) {
if (consume)
next();
return prev;
}
break;
default:
- firstInClass = false;
break;
}
! node = range(bits);
! if (include) {
! if (prev == null) {
! prev = node;
! } else {
! if (prev != node)
! prev = union(prev, node);
! }
! } else {
! if (prev == null) {
! prev = node.complement();
} else {
! if (prev != node)
! prev = setDifference(prev, node);
! }
}
ch = peek();
}
}
! private CharProperty bitsOrSingle(BitClass bits, int ch) {
/* Bits can only handle codepoints in [u+0000-u+00ff] range.
Use "single" node instead of bits when dealing with unicode
case folding for codepoints listed below.
(1)Uppercase out of range: u+00ff, u+00b5
toUpperCase(u+00ff) -> u+0178
--- 2521,2631 ----
*
* Consumes a ] on the way out if consume is true. Usually consume
* is true except for the case of [abc&&def] where def is a separate
* right hand node with "understood" brackets.
*/
! private CharPredicate clazz(boolean consume) {
! CharPredicate prev = null;
! CharPredicate curr = null;
BitClass bits = new BitClass();
! BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
!
! boolean isNeg = false;
! boolean hasBits = false;
int ch = next();
!
// Negates if first char in a class, otherwise literal
! if (ch == '^' && temp[cursor-1] == '[') {
ch = next();
! isNeg = true;
}
+ for (;;) {
+ switch (ch) {
case '[':
! curr = clazz(true);
if (prev == null)
! prev = curr;
else
! prev = prev.union(curr);
ch = peek();
continue;
case '&':
ch = next();
if (ch == '&') {
ch = next();
! CharPredicate right = null;
while (ch != ']' && ch != '&') {
if (ch == '[') {
! if (right == null)
! right = clazz(true);
else
! right = right.union(clazz(true));
} else { // abc&&def
unread();
! right = clazz(false);
}
ch = peek();
}
! if (hasBits) {
! // bits were used; union them into prev first, since union has higher precedence than &&
if (prev == null) {
! prev = curr = bitsP;
! } else {
! prev = prev.union(bitsP);
! }
! hasBits = false;
! }
! if (right != null)
! curr = right;
! if (prev == null) {
! if (right == null)
throw error("Bad class syntax");
else
! prev = right;
} else {
! prev = prev.and(curr);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
if (cursor >= patternLength)
throw error("Unclosed character class");
break;
case ']':
! if (prev != null || hasBits) {
if (consume)
next();
+ if (prev == null)
+ prev = bitsP;
+ else if (hasBits)
+ prev = prev.union(bitsP);
+ if (isNeg)
+ return prev.negate();
return prev;
}
break;
default:
break;
}
! curr = range(bits);
! if (curr == null) { // the bits used
! hasBits = true;
} else {
! if (prev == null)
! prev = curr;
! else if (prev != curr)
! prev = prev.union(curr);
}
ch = peek();
}
}
! private CharPredicate bitsOrSingle(BitClass bits, int ch) {
/* Bits can only handle codepoints in [u+0000-u+00ff] range.
Use "single" node instead of bits when dealing with unicode
case folding for codepoints listed below.
(1)Uppercase out of range: u+00ff, u+00b5
toUpperCase(u+00ff) -> u+0178
*** 2644,2663 ****
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
! ch == 0xc5 || ch == 0xe5))) //A+ring
! return bits.add(ch, flags());
! return newSingle(ch);
}
/**
* Parse a single character or a character range in a character class
* and return its representative node.
*/
! private CharProperty range(BitClass bits) {
int ch = peek();
if (ch == '\\') {
ch = nextEscaped();
if (ch == 'p' || ch == 'P') { // A property
boolean comp = (ch == 'P');
--- 2646,2692 ----
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
! ch == 0xc5 || ch == 0xe5))) { //A+ring
! bits.add(ch, flags());
! return null;
! }
! return single(ch);
! }
!
! /**
! * Returns a suitably optimized, single character predicate.
! *
! * When CASE_INSENSITIVE is set together with UNICODE_CASE, the code
! * point is folded via Character.toUpperCase/toLowerCase and, if the
! * two forms differ, SingleU(lower) is returned. When CASE_INSENSITIVE
! * is set without UNICODE_CASE and ch is ASCII with distinct lower and
! * upper case forms, SingleI(lower, upper) is returned. In every other
! * case SingleS(ch) is returned for a supplementary code point and
! * Single(ch) otherwise.
! *
! * @param ch the code point to match
! * @return the predicate chosen for ch according to the flags tested by has()
! */
! private CharPredicate single(final int ch) {
! if (has(CASE_INSENSITIVE)) {
! int lower, upper;
! if (has(UNICODE_CASE)) {
! upper = Character.toUpperCase(ch);
! lower = Character.toLowerCase(upper);
! // Unicode case insensitive matches
! if (upper != lower)
! return SingleU(lower);
! } else if (ASCII.isAscii(ch)) {
! lower = ASCII.toLower(ch);
! upper = ASCII.toUpper(ch);
! // Case insensitive matches a given BMP character
! if (lower != upper)
! return SingleI(lower, upper);
! }
! }
! if (isSupplementary(ch))
! return SingleS(ch);
! return Single(ch); // Match a given BMP character
}
/**
* Parse a single character or a character range in a character class
* and return its representative node.
*/
! private CharPredicate range(BitClass bits) {
int ch = peek();
if (ch == '\\') {
ch = nextEscaped();
if (ch == 'p' || ch == 'P') { // A property
boolean comp = (ch == 'P');
*** 2672,2682 ****
} else { // ordinary escape
boolean isrange = temp[cursor+1] == '-';
unread();
ch = escape(true, true, isrange);
if (ch == -1)
! return (CharProperty) root;
}
} else {
next();
}
if (ch >= 0) {
--- 2701,2711 ----
} else { // ordinary escape
boolean isrange = temp[cursor+1] == '-';
unread();
ch = escape(true, true, isrange);
if (ch == -1)
! return predicate;
}
} else {
next();
}
if (ch >= 0) {
*** 2694,2723 ****
next();
}
if (m < ch) {
throw error("Illegal character range");
}
! if (has(CASE_INSENSITIVE))
! return caseInsensitiveRangeFor(ch, m);
! else
! return rangeFor(ch, m);
}
}
return bitsOrSingle(bits, ch);
}
throw error("Unexpected character '"+((char)ch)+"'");
}
/**
* Parses a Unicode character family and returns its representative node.
*/
! private CharProperty family(boolean singleLetter,
! boolean maybeComplement)
{
next();
String name;
! CharProperty node = null;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
--- 2723,2755 ----
next();
}
if (m < ch) {
throw error("Illegal character range");
}
! if (has(CASE_INSENSITIVE)) {
! if (has(UNICODE_CASE))
! return CIRangeU(ch, m);
! return CIRange(ch, m);
! } else {
! return Range(ch, m);
! }
}
}
return bitsOrSingle(bits, ch);
}
throw error("Unexpected character '"+((char)ch)+"'");
}
/**
* Parses a Unicode character family and returns its representative node.
*/
! private CharPredicate family(boolean singleLetter,
! boolean isComplement)
{
next();
String name;
! CharPredicate p = null;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
*** 2745,2836 ****
String value = name.substring(i + 1);
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
switch (name) {
case "sc":
case "script":
! node = unicodeScriptPropertyFor(value);
break;
case "blk":
case "block":
! node = unicodeBlockPropertyFor(value);
break;
case "gc":
case "general_category":
! node = charPropertyNodeFor(value);
break;
default:
throw error("Unknown Unicode property {name=<" + name + ">, "
+ "value=<" + value + ">}");
! }
} else {
if (name.startsWith("In")) {
! // \p{inBlockName}
! node = unicodeBlockPropertyFor(name.substring(2));
} else if (name.startsWith("Is")) {
! // \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2);
! UnicodeProp uprop = UnicodeProp.forName(name);
! if (uprop != null)
! node = new Utype(uprop);
! if (node == null)
! node = CharPropertyNames.charPropertyFor(name);
! if (node == null)
! node = unicodeScriptPropertyFor(name);
} else {
if (has(UNICODE_CHARACTER_CLASS)) {
! UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
! if (uprop != null)
! node = new Utype(uprop);
}
! if (node == null)
! node = charPropertyNodeFor(name);
}
}
! if (maybeComplement) {
! if (node instanceof Category || node instanceof Block)
hasSupplementary = true;
! node = node.complement();
}
! return node;
! }
!
!
! /**
! * Returns a CharProperty matching all characters belong to
! * a UnicodeScript.
! */
! private CharProperty unicodeScriptPropertyFor(String name) {
! final Character.UnicodeScript script;
! try {
! script = Character.UnicodeScript.forName(name);
! } catch (IllegalArgumentException iae) {
! throw error("Unknown character script name {" + name + "}");
! }
! return new Script(script);
! }
!
! /**
! * Returns a CharProperty matching all characters in a UnicodeBlock.
! */
! private CharProperty unicodeBlockPropertyFor(String name) {
! final Character.UnicodeBlock block;
! try {
! block = Character.UnicodeBlock.forName(name);
! } catch (IllegalArgumentException iae) {
! throw error("Unknown character block name {" + name + "}");
! }
! return new Block(block);
}
! /**
! * Returns a CharProperty matching all characters in a named property.
! */
! private CharProperty charPropertyNodeFor(String name) {
! CharProperty p = CharPropertyNames.charPropertyFor(name);
if (p == null)
! throw error("Unknown character property name {" + name + "}");
! return p;
}
/**
* Parses and returns the name of a "named capturing group", the trailing
* ">" is consumed after parsing.
--- 2777,2842 ----
String value = name.substring(i + 1);
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
switch (name) {
case "sc":
case "script":
! p = CharPredicates.forUnicodeScript(value);
break;
case "blk":
case "block":
! p = CharPredicates.forUnicodeBlock(value);
break;
case "gc":
case "general_category":
! p = CharPredicates.forProperty(value);
break;
default:
+ break;
+ }
+ if (p == null)
throw error("Unknown Unicode property {name=<" + name + ">, "
+ "value=<" + value + ">}");
!
} else {
if (name.startsWith("In")) {
! // \p{InBlockName}
! p = CharPredicates.forUnicodeBlock(name.substring(2));
} else if (name.startsWith("Is")) {
! // \p{IsGeneralCategory} and \p{IsScriptName}
name = name.substring(2);
! p = CharPredicates.forUnicodeProperty(name);
! if (p == null)
! p = CharPredicates.forProperty(name);
! if (p == null)
! p = CharPredicates.forUnicodeScript(name);
} else {
if (has(UNICODE_CHARACTER_CLASS)) {
! p = CharPredicates.forPOSIXName(name);
}
! if (p == null)
! p = CharPredicates.forProperty(name);
}
+ if (p == null)
+ throw error("Unknown character property name {In/Is" + name + "}");
}
! if (isComplement) {
! // It might be too expensive to detect whether the complement of
! // a CharProperty can match any supplementary character, so just
! // go with StartS.
hasSupplementary = true;
! p = p.negate();
}
! return p;
}
! private CharProperty newCharProperty(CharPredicate p) {
if (p == null)
! return null;
! if (p instanceof BmpCharPredicate)
! return new BmpCharProperty((BmpCharPredicate)p);
! else
! return new CharProperty(p);
}
/**
* Parses and returns the name of a "named capturing group", the trailing
* ">" is consumed after parsing.
*** 2882,2892 ****
break;
case '>': // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
! head = tail = new Ques(head, INDEPENDENT);
break;
case '<': // (?<xxx) look behind
ch = read();
if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
// named captured group
--- 2888,2898 ----
break;
case '>': // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
! head = tail = new Ques(head, Qtype.INDEPENDENT);
break;
case '<': // (?<xxx) look behind
ch = read();
if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
// named captured group
*** 2968,2993 ****
return node; // Dual return
}
if (node instanceof Ques) {
Ques ques = (Ques) node;
! if (ques.type == POSSESSIVE) {
root = node;
return node;
}
tail.next = new BranchConn();
tail = tail.next;
! if (ques.type == GREEDY) {
head = new Branch(head, null, tail);
} else { // Reluctant quantifier
head = new Branch(null, head, tail);
}
root = tail;
return head;
} else if (node instanceof Curly) {
Curly curly = (Curly) node;
! if (curly.type == POSSESSIVE) {
root = node;
return node;
}
// Discover if the group is deterministic
TreeInfo info = new TreeInfo();
--- 2974,2999 ----
return node; // Dual return
}
if (node instanceof Ques) {
Ques ques = (Ques) node;
! if (ques.type == Qtype.POSSESSIVE) {
root = node;
return node;
}
tail.next = new BranchConn();
tail = tail.next;
! if (ques.type == Qtype.GREEDY) {
head = new Branch(head, null, tail);
} else { // Reluctant quantifier
head = new Branch(null, head, tail);
}
root = tail;
return head;
} else if (node instanceof Curly) {
Curly curly = (Curly) node;
! if (curly.type == Qtype.POSSESSIVE) {
root = node;
return node;
}
// Discover if the group is deterministic
TreeInfo info = new TreeInfo();
*** 3000,3010 ****
capturingGroup);
return head;
} else { // Non-deterministic
int temp = ((GroupHead) head).localIndex;
Loop loop;
! if (curly.type == GREEDY)
loop = new Loop(this.localCount, temp);
else // Reluctant Curly
loop = new LazyLoop(this.localCount, temp);
Prolog prolog = new Prolog(loop);
this.localCount += 1;
--- 3006,3016 ----
capturingGroup);
return head;
} else { // Non-deterministic
int temp = ((GroupHead) head).localIndex;
Loop loop;
! if (curly.type == Qtype.GREEDY)
loop = new Loop(this.localCount, temp);
else // Reluctant Curly
loop = new LazyLoop(this.localCount, temp);
Prolog prolog = new Prolog(loop);
this.localCount += 1;
*** 3029,3038 ****
--- 3035,3048 ----
int groupIndex = 0;
if (!anonymous)
groupIndex = capturingGroupCount++;
GroupHead head = new GroupHead(localIndex);
root = new GroupTail(localIndex, groupIndex);
+
+ // for debug/print only, head.match does NOT need the "tail" info
+ head.tail = (GroupTail)root;
+
if (!anonymous && groupIndex < 10)
groupNodes[groupIndex] = head;
return head;
}
*** 3117,3133 ****
}
}
static final int MAX_REPS = 0x7FFFFFFF;
! static final int GREEDY = 0;
!
! static final int LAZY = 1;
!
! static final int POSSESSIVE = 2;
! static final int INDEPENDENT = 3;
/**
* Processes repetition. If the next character peeked is a quantifier
* then new nodes must be appended to handle the repetition.
* Prev could be a single or a group, so it could be a chain of nodes.
--- 3127,3156 ----
}
}
static final int MAX_REPS = 0x7FFFFFFF;
! static enum Qtype {
! GREEDY, LAZY, POSSESSIVE, INDEPENDENT
! }
! private Node curly(Node prev, int cmin) {
! int ch = next();
! if (ch == '?') {
! next();
! return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
! } else if (ch == '+') {
! next();
! return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
! }
! if (prev instanceof BmpCharProperty) {
! return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
! } else if (prev instanceof CharProperty) {
! return new CharPropertyGreedy((CharProperty)prev, cmin);
! }
! return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
! }
/**
* Processes repetition. If the next character peeked is a quantifier
* then new nodes must be appended to handle the repetition.
* Prev could be a single or a group, so it could be a chain of nodes.
*** 3138,3173 ****
switch (ch) {
case '?':
ch = next();
if (ch == '?') {
next();
! return new Ques(prev, LAZY);
} else if (ch == '+') {
next();
! return new Ques(prev, POSSESSIVE);
}
! return new Ques(prev, GREEDY);
case '*':
! ch = next();
! if (ch == '?') {
! next();
! return new Curly(prev, 0, MAX_REPS, LAZY);
! } else if (ch == '+') {
! next();
! return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
! }
! return new Curly(prev, 0, MAX_REPS, GREEDY);
case '+':
! ch = next();
! if (ch == '?') {
! next();
! return new Curly(prev, 1, MAX_REPS, LAZY);
! } else if (ch == '+') {
! next();
! return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
! }
! return new Curly(prev, 1, MAX_REPS, GREEDY);
case '{':
ch = temp[cursor+1];
if (ASCII.isDigit(ch)) {
skip();
int cmin = 0;
--- 3161,3180 ----
switch (ch) {
case '?':
ch = next();
if (ch == '?') {
next();
! return new Ques(prev, Qtype.LAZY);
} else if (ch == '+') {
next();
! return new Ques(prev, Qtype.POSSESSIVE);
}
! return new Ques(prev, Qtype.GREEDY);
case '*':
! return curly(prev, 0);
case '+':
! return curly(prev, 1);
case '{':
ch = temp[cursor+1];
if (ASCII.isDigit(ch)) {
skip();
int cmin = 0;
*** 3192,3207 ****
throw error("Illegal repetition range");
Curly curly;
ch = peek();
if (ch == '?') {
next();
! curly = new Curly(prev, cmin, cmax, LAZY);
} else if (ch == '+') {
next();
! curly = new Curly(prev, cmin, cmax, POSSESSIVE);
} else {
! curly = new Curly(prev, cmin, cmax, GREEDY);
}
return curly;
} else {
throw error("Illegal repetition");
}
--- 3199,3214 ----
throw error("Illegal repetition range");
Curly curly;
ch = peek();
if (ch == '?') {
next();
! curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
} else if (ch == '+') {
next();
! curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
} else {
! curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
}
return curly;
} else {
throw error("Illegal repetition");
}
*** 3374,3387 ****
/**
* Creates a bit vector for matching Latin-1 values. A normal BitClass
* never matches values above Latin-1, and a complemented BitClass always
* matches values above Latin-1.
*/
! private static final class BitClass extends BmpCharProperty {
final boolean[] bits;
! BitClass() { bits = new boolean[256]; }
! private BitClass(boolean[] bits) { this.bits = bits; }
BitClass add(int c, int flags) {
assert c >= 0 && c <= 255;
if ((flags & CASE_INSENSITIVE) != 0) {
if (ASCII.isAscii(c)) {
bits[ASCII.toUpper(c)] = true;
--- 3381,3399 ----
/**
* Creates a bit vector for matching Latin-1 values. A normal BitClass
* never matches values above Latin-1, and a complemented BitClass always
* matches values above Latin-1.
*/
! static final class BitClass extends BmpCharProperty {
final boolean[] bits;
! BitClass() {
! this(new boolean[256]);
! }
! private BitClass(boolean[] bits) {
! super( ch -> ch < 256 && bits[ch]);
! this.bits = bits;
! }
BitClass add(int c, int flags) {
assert c >= 0 && c <= 255;
if ((flags & CASE_INSENSITIVE) != 0) {
if (ASCII.isAscii(c)) {
bits[ASCII.toUpper(c)] = true;
*** 3392,3427 ****
}
}
bits[c] = true;
return this;
}
- boolean isSatisfiedBy(int ch) {
- return ch < 256 && bits[ch];
- }
- }
-
- /**
- * Returns a suitably optimized, single character matcher.
- */
- private CharProperty newSingle(final int ch) {
- if (has(CASE_INSENSITIVE)) {
- int lower, upper;
- if (has(UNICODE_CASE)) {
- upper = Character.toUpperCase(ch);
- lower = Character.toLowerCase(upper);
- if (upper != lower)
- return new SingleU(lower);
- } else if (ASCII.isAscii(ch)) {
- lower = ASCII.toLower(ch);
- upper = ASCII.toUpper(ch);
- if (lower != upper)
- return new SingleI(lower, upper);
- }
- }
- if (isSupplementary(ch))
- return new SingleS(ch); // Match a given Unicode character
- return new Single(ch); // Match a given BMP character
}
/**
* Utility method for creating a string slice matcher.
*/
--- 3404,3413 ----
*** 3825,3846 ****
/**
* Abstract node class to match one character satisfying some
* boolean property.
*/
! private abstract static class CharProperty extends Node {
! abstract boolean isSatisfiedBy(int ch);
! CharProperty complement() {
! return new CharProperty() {
! boolean isSatisfiedBy(int ch) {
! return ! CharProperty.this.isSatisfiedBy(ch);}};
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i);
! return isSatisfiedBy(ch)
! && next.match(matcher, i+Character.charCount(ch), seq);
} else {
matcher.hitEnd = true;
return false;
}
}
--- 3811,3831 ----
/**
* Abstract node class to match one character satisfying some
* boolean property.
*/
! static class CharProperty extends Node {
! CharPredicate predicate;
!
! CharProperty (CharPredicate predicate) {
! this.predicate = predicate;
}
boolean match(Matcher matcher, int i, CharSequence seq) { // matches one code point at index i, then hands over to next
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i); // decode a full code point so a surrogate pair is tested as one character
! return predicate.is(ch) &&
! next.match(matcher, i + Character.charCount(ch), seq); // advance by 1 or 2 chars depending on the code point
} else {
matcher.hitEnd = true; // nothing left before matcher.to
return false;
}
}
*** 3853,4007 ****
/**
* Optimized version of CharProperty that works only for
* properties never satisfied by Supplementary characters.
*/
! private abstract static class BmpCharProperty extends CharProperty {
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
! return isSatisfiedBy(seq.charAt(i))
! && next.match(matcher, i+1, seq);
} else {
matcher.hitEnd = true;
return false;
}
}
}
/**
- * Node class that matches a Supplementary Unicode character
- */
- static final class SingleS extends CharProperty {
- final int c;
- SingleS(int c) { this.c = c; }
- boolean isSatisfiedBy(int ch) {
- return ch == c;
- }
- }
-
- /**
- * Optimization -- matches a given BMP character
- */
- static final class Single extends BmpCharProperty {
- final int c;
- Single(int c) { this.c = c; }
- boolean isSatisfiedBy(int ch) {
- return ch == c;
- }
- }
-
- /**
- * Case insensitive matches a given BMP character
- */
- static final class SingleI extends BmpCharProperty {
- final int lower;
- final int upper;
- SingleI(int lower, int upper) {
- this.lower = lower;
- this.upper = upper;
- }
- boolean isSatisfiedBy(int ch) {
- return ch == lower || ch == upper;
- }
- }
-
- /**
- * Unicode case insensitive matches a given Unicode character
- */
- static final class SingleU extends CharProperty {
- final int lower;
- SingleU(int lower) {
- this.lower = lower;
- }
- boolean isSatisfiedBy(int ch) {
- return lower == ch ||
- lower == Character.toLowerCase(Character.toUpperCase(ch));
- }
- }
-
- /**
- * Node class that matches a Unicode block.
- */
- static final class Block extends CharProperty {
- final Character.UnicodeBlock block;
- Block(Character.UnicodeBlock block) {
- this.block = block;
- }
- boolean isSatisfiedBy(int ch) {
- return block == Character.UnicodeBlock.of(ch);
- }
- }
-
- /**
- * Node class that matches a Unicode script
- */
- static final class Script extends CharProperty {
- final Character.UnicodeScript script;
- Script(Character.UnicodeScript script) {
- this.script = script;
- }
- boolean isSatisfiedBy(int ch) {
- return script == Character.UnicodeScript.of(ch);
- }
- }
-
- /**
- * Node class that matches a Unicode category.
- */
- static final class Category extends CharProperty {
- final int typeMask;
- Category(int typeMask) { this.typeMask = typeMask; }
- boolean isSatisfiedBy(int ch) {
- return (typeMask & (1 << Character.getType(ch))) != 0;
- }
- }
-
- /**
- * Node class that matches a Unicode "type"
- */
- static final class Utype extends CharProperty {
- final UnicodeProp uprop;
- Utype(UnicodeProp uprop) { this.uprop = uprop; }
- boolean isSatisfiedBy(int ch) {
- return uprop.is(ch);
- }
- }
-
- /**
- * Node class that matches a POSIX type.
- */
- static final class Ctype extends BmpCharProperty {
- final int ctype;
- Ctype(int ctype) { this.ctype = ctype; }
- boolean isSatisfiedBy(int ch) {
- return ch < 128 && ASCII.isType(ch, ctype);
- }
- }
-
- /**
- * Node class that matches a Perl vertical whitespace
- */
- static final class VertWS extends BmpCharProperty {
- boolean isSatisfiedBy(int cp) {
- return (cp >= 0x0A && cp <= 0x0D) ||
- cp == 0x85 || cp == 0x2028 || cp == 0x2029;
- }
- }
-
- /**
- * Node class that matches a Perl horizontal whitespace
- */
- static final class HorizWS extends BmpCharProperty {
- boolean isSatisfiedBy(int cp) {
- return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
- cp == 0x1680 || cp == 0x180e ||
- cp >= 0x2000 && cp <= 0x200a ||
- cp == 0x202f || cp == 0x205f || cp == 0x3000;
- }
- }
-
- /**
* Node class that matches an unicode extended grapheme cluster
*/
static class XGrapheme extends Node {
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
--- 3838,3863 ----
/**
* Optimized version of CharProperty that works only for
* properties never satisfied by Supplementary characters.
*/
! private static class BmpCharProperty extends CharProperty {
! BmpCharProperty (BmpCharPredicate predicate) { // only BMP-marked predicates are accepted by this node
! super(predicate);
! }
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
! return predicate.is(seq.charAt(i)) && // tests a single char; no code point decoding
! next.match(matcher, i + 1, seq); // always advances by exactly one char
} else {
matcher.hitEnd = true; // nothing left before matcher.to
return false;
}
}
}
/**
* Node class that matches an unicode extended grapheme cluster
*/
static class XGrapheme extends Node {
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
*** 4215,4299 ****
int toLower(int c) {
return Character.toLowerCase(Character.toUpperCase(c));
}
}
- private static boolean inRange(int lower, int ch, int upper) {
- return lower <= ch && ch <= upper;
- }
-
- /**
- * Returns node for matching characters within an explicit value range.
- */
- private static CharProperty rangeFor(final int lower,
- final int upper) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return inRange(lower, ch, upper);}};
- }
-
- /**
- * Returns node for matching characters within an explicit value
- * range in a case insensitive manner.
- */
- private CharProperty caseInsensitiveRangeFor(final int lower,
- final int upper) {
- if (has(UNICODE_CASE))
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- if (inRange(lower, ch, upper))
- return true;
- int up = Character.toUpperCase(ch);
- return inRange(lower, up, upper) ||
- inRange(lower, Character.toLowerCase(up), upper);}};
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return inRange(lower, ch, upper) ||
- ASCII.isAscii(ch) &&
- (inRange(lower, ASCII.toUpper(ch), upper) ||
- inRange(lower, ASCII.toLower(ch), upper));
- }};
- }
-
- /**
- * Implements the Unicode category ALL and the dot metacharacter when
- * in dotall mode.
- */
- static final class All extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return true;
- }
- }
-
- /**
- * Node class for the dot metacharacter when dotall is not enabled.
- */
- static final class Dot extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return (ch != '\n' && ch != '\r'
- && (ch|1) != '\u2029'
- && ch != '\u0085');
- }
- }
-
- /**
- * Node class for the dot metacharacter when dotall is not enabled
- * but UNIX_LINES is enabled.
- */
- static final class UnixDot extends CharProperty {
- boolean isSatisfiedBy(int ch) {
- return ch != '\n';
- }
- }
-
/**
* The 0 or 1 quantifier. This one class implements all three types.
*/
static final class Ques extends Node {
Node atom;
! int type;
! Ques(Node node, int type) {
this.atom = node;
this.type = type;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
switch (type) {
--- 4071,4087 ----
int toLower(int c) {
return Character.toLowerCase(Character.toUpperCase(c));
}
}
/**
* The 0 or 1 quantifier. This one class implements all three types.
*/
static final class Ques extends Node {
Node atom;
! Qtype type;
! Ques(Node node, Qtype type) {
this.atom = node;
this.type = type;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
switch (type) {
*** 4309,4319 ****
default:
return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
}
}
boolean study(TreeInfo info) {
! if (type != INDEPENDENT) {
int minL = info.minLength;
atom.study(info);
info.minLength = minL;
info.deterministic = false;
return next.study(info);
--- 4097,4107 ----
default:
return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
}
}
boolean study(TreeInfo info) {
! if (type != Qtype.INDEPENDENT) {
int minL = info.minLength;
atom.study(info);
info.minLength = minL;
info.deterministic = false;
return next.study(info);
*** 4323,4343 ****
}
}
}
/**
* Handles the curly-brace style repetition with a specified minimum and
* maximum occurrences. The * quantifier is handled as a special case.
* This class handles the three types.
*/
static final class Curly extends Node {
Node atom;
! int type;
int cmin;
int cmax;
! Curly(Node node, int cmin, int cmax, int type) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
}
--- 4111,4204 ----
}
}
}
/**
+ * Handles greedy repetition of a single-character predicate with a
+ * minimum of cmin occurrences (either 0 or 1, for the * and +
+ * quantifiers) and a maximum of MAX_REPS.
+ */
+ static class CharPropertyGreedy extends Node {
+ final CharPredicate predicate; // test applied to each code point
+ final int cmin; // minimum number of repetitions required
+
+ CharPropertyGreedy(CharProperty cp, int cmin) {
+ this.predicate = cp.predicate; // take over the predicate of the given CharProperty node
+ this.cmin = cmin;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int n = 0; // number of code points consumed so far
+ int to = matcher.to;
+ // greedy, all the way down
+ while (i < to) {
+ int ch = Character.codePointAt(seq, i);
+ if (!predicate.is(ch))
+ break;
+ i += Character.charCount(ch);
+ n++;
+ }
+ if (i >= to) {
+ matcher.hitEnd = true; // the greedy scan reached matcher.to
+ }
+ while (n >= cmin) {
+ if (next.match(matcher, i, seq))
+ return true;
+ if (n == cmin)
+ return false; // giving back more would drop below the minimum
+ // backing off if match fails
+ int ch = Character.codePointBefore(seq, i);
+ i -= Character.charCount(ch); // step back one whole code point
+ n--;
+ }
+ return false;
+ }
+
+ boolean study(TreeInfo info) {
+ info.minLength += cmin; // at least cmin characters are required
+ if (info.maxValid) {
+ info.maxLength += MAX_REPS;
+ }
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ static final class BmpCharPropertyGreedy extends CharPropertyGreedy { // char-at-a-time variant of CharPropertyGreedy
+
+ BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
+ super(bcp, cmin);
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int n = 0; // number of chars consumed so far
+ int to = matcher.to;
+ while (i < to && predicate.is(seq.charAt(i))) { // greedy: take every char that satisfies the predicate
+ i++; n++;
+ }
+ if (i >= to) {
+ matcher.hitEnd = true; // the greedy scan reached matcher.to
+ }
+ while (n >= cmin) { // try the longest run first, then shorter ones down to cmin
+ if (next.match(matcher, i, seq))
+ return true;
+ i--; n--; // backing off if match fails
+ }
+ return false;
+ }
+ }
+
+ /**
* Handles the curly-brace style repetition with a specified minimum and
* maximum occurrences. The * quantifier is handled as a special case.
* This class handles the three types.
*/
static final class Curly extends Node {
Node atom;
! Qtype type;
int cmin;
int cmax;
! Curly(Node node, int cmin, int cmax, Qtype type) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
}
*** 4348,4360 ****
i = matcher.last;
continue;
}
return false;
}
! if (type == GREEDY)
return match0(matcher, i, j, seq);
! else if (type == LAZY)
return match1(matcher, i, j, seq);
else
return match2(matcher, i, j, seq);
}
// Greedy match.
--- 4209,4221 ----
i = matcher.last;
continue;
}
return false;
}
! if (type == Qtype.GREEDY)
return match0(matcher, i, j, seq);
! else if (type == Qtype.LAZY)
return match1(matcher, i, j, seq);
else
return match2(matcher, i, j, seq);
}
// Greedy match.
*** 4472,4489 ****
* If capture is true then this class saves group settings and ensures
* that groups are unset when backing off of a group match.
*/
static final class GroupCurly extends Node {
Node atom;
! int type;
int cmin;
int cmax;
int localIndex;
int groupIndex;
boolean capture;
! GroupCurly(Node node, int cmin, int cmax, int type, int local,
int group, boolean capture) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
--- 4333,4350 ----
* If capture is true then this class saves group settings and ensures
* that groups are unset when backing off of a group match.
*/
static final class GroupCurly extends Node {
Node atom;
! Qtype type;
int cmin;
int cmax;
int localIndex;
int groupIndex;
boolean capture;
! GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
int group, boolean capture) {
this.atom = node;
this.type = type;
this.cmin = cmin;
this.cmax = cmax;
*** 4519,4531 ****
ret = false;
break;
}
}
if (ret) {
! if (type == GREEDY) {
ret = match0(matcher, i, cmin, seq);
! } else if (type == LAZY) {
ret = match1(matcher, i, cmin, seq);
} else {
ret = match2(matcher, i, cmin, seq);
}
}
--- 4380,4392 ----
ret = false;
break;
}
}
if (ret) {
! if (type == Qtype.GREEDY) {
ret = match0(matcher, i, cmin, seq);
! } else if (type == Qtype.LAZY) {
ret = match1(matcher, i, cmin, seq);
} else {
ret = match2(matcher, i, cmin, seq);
}
}
*** 4767,4776 ****
--- 4628,4638 ----
* indicate that we do not want to unset the group if the reference
* doesn't match.
*/
static final class GroupHead extends Node {
int localIndex;
+ GroupTail tail; // for debug/print only, match does not need to know
GroupHead(int localCount) {
localIndex = localCount;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int save = matcher.locals[localIndex];
*** 5360,5399 ****
return !conditionMatched && next.match(matcher, i, seq);
}
}
/**
- * Returns the set union of two CharProperty nodes.
- */
- private static CharProperty union(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
- }
-
- /**
- * Returns the set intersection of two CharProperty nodes.
- */
- private static CharProperty intersection(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
- }
-
- /**
- * Returns the set difference of two CharProperty nodes.
- */
- private static CharProperty setDifference(final CharProperty lhs,
- final CharProperty rhs) {
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
- }
-
- /**
* Handles word boundaries. Includes a field to allow this one class to
* deal with the different types of word boundaries we can match. The word
* characters include underscores, letters, and digits. Non spacing marks
* can also be part of a word if they have a base character, otherwise
* they are ignored for purposes of finding word boundaries.
--- 5222,5231 ----
*** 5409,5419 ****
type = n;
this.useUWORD = useUWORD;
}
boolean isWord(int ch) {
! return useUWORD ? UnicodeProp.WORD.is(ch)
: (ch == '_' || Character.isLetterOrDigit(ch));
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
--- 5241,5251 ----
type = n;
this.useUWORD = useUWORD;
}
boolean isWord(int ch) { // decides whether ch counts as a word character for boundary checks
! return useUWORD ? CharPredicates.WORD.is(ch) // useUWORD set: defer to the CharPredicates.WORD predicate
: (ch == '_' || Character.isLetterOrDigit(ch)); // otherwise: underscore, letter or digit
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
*** 5655,5874 ****
matcher.hitEnd = true;
return false;
}
}
! ///////////////////////////////////////////////////////////////////////////////
! ///////////////////////////////////////////////////////////////////////////////
/**
! * This must be the very first initializer.
*/
! static Node accept = new Node();
! static Node lastAccept = new LastNode();
! private static class CharPropertyNames {
! static CharProperty charPropertyFor(String name) {
! CharPropertyFactory m = map.get(name);
! return m == null ? null : m.make();
}
! private abstract static class CharPropertyFactory {
! abstract CharProperty make();
}
! private static void defCategory(String name,
! final int typeMask) {
! map.put(name, new CharPropertyFactory() {
! CharProperty make() { return new Category(typeMask);}});
}
! private static void defRange(String name,
! final int lower, final int upper) {
! map.put(name, new CharPropertyFactory() {
! CharProperty make() { return rangeFor(lower, upper);}});
}
! private static void defCtype(String name,
! final int ctype) {
! map.put(name, new CharPropertyFactory() {
! CharProperty make() { return new Ctype(ctype);}});
}
! private abstract static class CloneableProperty
! extends CharProperty implements Cloneable
! {
! public CloneableProperty clone() {
! try {
! return (CloneableProperty) super.clone();
! } catch (CloneNotSupportedException e) {
! throw new AssertionError(e);
! }
! }
! }
!
! private static void defClone(String name,
! final CloneableProperty p) {
! map.put(name, new CharPropertyFactory() {
! CharProperty make() { return p.clone();}});
! }
!
! private static final HashMap<String, CharPropertyFactory> map
! = new HashMap<>();
!
! static {
! // Unicode character property aliases, defined in
! // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
! defCategory("Cn", 1<<Character.UNASSIGNED);
! defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
! defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
! defCategory("Lt", 1<<Character.TITLECASE_LETTER);
! defCategory("Lm", 1<<Character.MODIFIER_LETTER);
! defCategory("Lo", 1<<Character.OTHER_LETTER);
! defCategory("Mn", 1<<Character.NON_SPACING_MARK);
! defCategory("Me", 1<<Character.ENCLOSING_MARK);
! defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
! defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
! defCategory("Nl", 1<<Character.LETTER_NUMBER);
! defCategory("No", 1<<Character.OTHER_NUMBER);
! defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
! defCategory("Zl", 1<<Character.LINE_SEPARATOR);
! defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
! defCategory("Cc", 1<<Character.CONTROL);
! defCategory("Cf", 1<<Character.FORMAT);
! defCategory("Co", 1<<Character.PRIVATE_USE);
! defCategory("Cs", 1<<Character.SURROGATE);
! defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
! defCategory("Ps", 1<<Character.START_PUNCTUATION);
! defCategory("Pe", 1<<Character.END_PUNCTUATION);
! defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
! defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
! defCategory("Sm", 1<<Character.MATH_SYMBOL);
! defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
! defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
! defCategory("So", 1<<Character.OTHER_SYMBOL);
! defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
! defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
! defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER) |
! (1<<Character.MODIFIER_LETTER) |
! (1<<Character.OTHER_LETTER)));
! defCategory("M", ((1<<Character.NON_SPACING_MARK) |
! (1<<Character.ENCLOSING_MARK) |
! (1<<Character.COMBINING_SPACING_MARK)));
! defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
! (1<<Character.LETTER_NUMBER) |
! (1<<Character.OTHER_NUMBER)));
! defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
! (1<<Character.LINE_SEPARATOR) |
! (1<<Character.PARAGRAPH_SEPARATOR)));
! defCategory("C", ((1<<Character.CONTROL) |
! (1<<Character.FORMAT) |
! (1<<Character.PRIVATE_USE) |
! (1<<Character.SURROGATE))); // Other
! defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
! (1<<Character.START_PUNCTUATION) |
! (1<<Character.END_PUNCTUATION) |
! (1<<Character.CONNECTOR_PUNCTUATION) |
! (1<<Character.OTHER_PUNCTUATION) |
! (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
! (1<<Character.FINAL_QUOTE_PUNCTUATION)));
! defCategory("S", ((1<<Character.MATH_SYMBOL) |
! (1<<Character.CURRENCY_SYMBOL) |
! (1<<Character.MODIFIER_SYMBOL) |
! (1<<Character.OTHER_SYMBOL)));
! defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER)));
! defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER) |
! (1<<Character.MODIFIER_LETTER) |
! (1<<Character.OTHER_LETTER) |
! (1<<Character.DECIMAL_DIGIT_NUMBER)));
! defRange("L1", 0x00, 0xFF); // Latin-1
! map.put("all", new CharPropertyFactory() {
! CharProperty make() { return new All(); }});
!
! // Posix regular expression character classes, defined in
! // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
! defRange("ASCII", 0x00, 0x7F); // ASCII
! defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
! defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
! defCtype("Blank", ASCII.BLANK); // Space and tab characters
! defCtype("Cntrl", ASCII.CNTRL); // Control characters
! defRange("Digit", '0', '9'); // Numeric characters
! defCtype("Graph", ASCII.GRAPH); // printable and visible
! defRange("Lower", 'a', 'z'); // Lower-case alphabetic
! defRange("Print", 0x20, 0x7E); // Printable characters
! defCtype("Punct", ASCII.PUNCT); // Punctuation characters
! defCtype("Space", ASCII.SPACE); // Space characters
! defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
! defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
!
! // Java character properties, defined by methods in Character.java
! defClone("javaLowerCase", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isLowerCase(ch);}});
! defClone("javaUpperCase", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isUpperCase(ch);}});
! defClone("javaAlphabetic", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isAlphabetic(ch);}});
! defClone("javaIdeographic", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isIdeographic(ch);}});
! defClone("javaTitleCase", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isTitleCase(ch);}});
! defClone("javaDigit", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isDigit(ch);}});
! defClone("javaDefined", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isDefined(ch);}});
! defClone("javaLetter", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isLetter(ch);}});
! defClone("javaLetterOrDigit", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isLetterOrDigit(ch);}});
! defClone("javaJavaIdentifierStart", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isJavaIdentifierStart(ch);}});
! defClone("javaJavaIdentifierPart", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isJavaIdentifierPart(ch);}});
! defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isUnicodeIdentifierStart(ch);}});
! defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isUnicodeIdentifierPart(ch);}});
! defClone("javaIdentifierIgnorable", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isIdentifierIgnorable(ch);}});
! defClone("javaSpaceChar", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isSpaceChar(ch);}});
! defClone("javaWhitespace", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isWhitespace(ch);}});
! defClone("javaISOControl", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isISOControl(ch);}});
! defClone("javaMirrored", new CloneableProperty() {
! boolean isSatisfiedBy(int ch) {
! return Character.isMirrored(ch);}});
}
}
/**
* Creates a predicate which can be used to match a string.
*
* @return The predicate which can be used for matching on a string
* @since 1.8
*/
--- 5487,5646 ----
matcher.hitEnd = true;
return false;
}
}
! @FunctionalInterface
! static interface CharPredicate {
! boolean is(int ch); // true when ch satisfies this predicate
!
! default CharPredicate and(CharPredicate p) { // intersection: both this and p must accept ch
! return ch -> is(ch) && p.is(ch);
! }
! default CharPredicate union(CharPredicate p) { // union: this or p accepts ch
! return ch -> is(ch) || p.is(ch);
! }
! default CharPredicate union(CharPredicate p1,
! CharPredicate p2 ) { // three-way union of this, p1 and p2
! return ch -> is(ch) || p1.is(ch) || p2.is(ch);
! }
! default CharPredicate negate() { // complement of this predicate
! return ch -> !is(ch);
! }
! }
!
! /**
! * Marker sub-interface for predicates that the BMP-optimized nodes
! * accept (BmpCharProperty takes a BmpCharPredicate and tests one char
! * at a time). The combinators below keep the marker only when every
! * operand carries it.
! */
! static interface BmpCharPredicate extends CharPredicate {
!
! /**
! * Returns a predicate accepting ch when both this predicate and p
! * accept it; the result is a BmpCharPredicate only if p is one too.
! */
! default CharPredicate and(CharPredicate p) {
! if (p instanceof BmpCharPredicate)
! return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch));
! return ch -> is(ch) && p.is(ch);
! }
! /**
! * Returns a predicate accepting ch when this predicate or p accepts
! * it; the result is a BmpCharPredicate only if p is one too.
! */
! default CharPredicate union(CharPredicate p) {
! if (p instanceof BmpCharPredicate)
! return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch));
! return ch -> is(ch) || p.is(ch);
! }
! /**
! * Returns the union of all given predicates: the result accepts ch
! * when at least one of them does, and accepts nothing when none are
! * given. The result is a BmpCharPredicate only if every operand is one.
! */
! static CharPredicate union(CharPredicate... predicates) {
! CharPredicate any = ch -> {
! for (CharPredicate p : predicates) {
! if (p.is(ch))
! return true;
! }
! return false;
! };
! for (CharPredicate p : predicates) {
! if (! (p instanceof BmpCharPredicate))
! return any;
! }
! // Casting the object 'any' would throw ClassCastException, because
! // the lambda instance only implements CharPredicate. Re-target it
! // through a method reference so the result really is BMP-marked.
! return (BmpCharPredicate)any::is;
! }
! }
/**
! * matches a Perl vertical whitespace
*/
! static BmpCharPredicate VertWS = cp ->
! (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
! /**
! * matches a Perl horizontal whitespace
! */
! static BmpCharPredicate HorizWS = cp ->
! cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
! cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f ||
! cp == 0x205f || cp == 0x3000;
! /**
! * for the Unicode category ALL and the dot metacharacter when
! * in dotall mode.
! */
! static CharPredicate ALL = ch -> true;
! /**
! * for the dot metacharacter when dotall is not enabled.
! */
! static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
! && (ch|1) != '\u2029'
! && ch != '\u0085');
! /**
! * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
! */
! static CharPredicate UNIXDOT = ch -> ch != '\n';
!
! /**
! * Returns a predicate that matches exactly the code point c. The
! * result is a plain CharPredicate (not BMP-marked), which is what a
! * supplementary Unicode character needs.
! */
! static CharPredicate SingleS(int c) {
! return ch -> ch == c;
}
! /**
! * Returns a BMP-marked (optimized) predicate that matches exactly the
! * single character c.
! */
! static BmpCharPredicate Single(int c) {
! return ch -> ch == c;
}
! /**
! * Returns a BMP-marked predicate for a case-insensitive match of a
! * given BMP character: it accepts ch when it equals either the lower
! * or the upper form passed in.
! */
! static BmpCharPredicate SingleI(int lower, int upper) {
! return ch -> ch == lower || ch == upper;
}
! /**
! * Returns a predicate for a Unicode case-insensitive match of a given
! * Unicode character: it accepts ch when ch equals lower, or when ch
! * folded through toUpperCase and then toLowerCase equals lower.
! */
! static CharPredicate SingleU(int lower) {
! return ch -> lower == ch ||
! lower == Character.toLowerCase(Character.toUpperCase(ch));
}
! private static boolean inRange(int lower, int ch, int upper) { // true when lower <= ch <= upper
! return lower <= ch && ch <= upper; // both bounds are inclusive
}
! /**
! * Returns a predicate for characters within an explicit, inclusive
! * value range. The result is BMP-marked when the whole range lies
! * below the first high surrogate, or lies above the last high
! * surrogate and below the first supplementary code point.
! */
! static CharPredicate Range(int lower, int upper) {
! if (upper < Character.MIN_HIGH_SURROGATE ||
! lower > Character.MAX_HIGH_SURROGATE &&
! upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
! return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); // cast gives the lambda the BMP target type
! return ch -> inRange(lower, ch, upper);
! }
!
! /**
! * Returns a predicate for characters within an explicit value range,
! * compared in an ASCII case-insensitive manner: ch is accepted when it
! * is in range itself, or when it is ASCII and its ASCII upper- or
! * lower-case form is in range.
! */
! static CharPredicate CIRange(int lower, int upper) {
! return ch -> inRange(lower, ch, upper) ||
! ASCII.isAscii(ch) &&
! (inRange(lower, ASCII.toUpper(ch), upper) ||
! inRange(lower, ASCII.toLower(ch), upper));
}
+
+ static CharPredicate CIRangeU(int lower, int upper) { // range test using Character case mappings instead of ASCII ones
+ return ch -> {
+ if (inRange(lower, ch, upper))
+ return true; // ch itself is in range
+ int up = Character.toUpperCase(ch);
+ return inRange(lower, up, upper) || // upper-case form is in range
+ inRange(lower, Character.toLowerCase(up), upper); // lower-case of the upper-case form is in range
+ };
}
/**
+ * This must be the very first initializer.
+ */
+ static Node accept = new Node();
+
+ static Node lastAccept = new LastNode();
+
+ /**
* Creates a predicate which can be used to match a string.
*
* @return The predicate which can be used for matching on a string
* @since 1.8
*/