src/java.base/share/classes/java/util/regex/Pattern.java

Print this page

        

*** 982,991 **** --- 982,996 ---- * Temporary storage used by parsing pattern slice. */ transient int[] buffer; /** + * Temporary storage for the predicate passed back via double return. + */ + transient CharPredicate predicate; + + /** * Map the "name" of the "named capturing group" to its group id * node. */ transient volatile Map<String, Integer> namedGroups;
*** 1024,1034 **** /** * If the Start node might possibly match supplementary characters. * It is set to true during compiling if * (1) There is supplementary char in pattern, or ! * (2) There is complement node of Category or Block */ private transient boolean hasSupplementary; /** * Compiles the given regular expression into a pattern. --- 1029,1039 ---- /** * If the Start node might possibly match supplementary characters. * It is set to true during compiling if * (1) There is supplementary char in pattern, or ! * (2) There is a complement node of a "family" CharProperty */ private transient boolean hasSupplementary; /** * Compiles the given regular expression into a pattern.
*** 1752,1799 **** } return groups; } /** - * Used to print out a subtree of the Pattern to help with debugging. - */ - private static void printObjectTree(Node node) { - while(node != null) { - if (node instanceof Prolog) { - System.out.println(node); - printObjectTree(((Prolog)node).loop); - System.out.println("**** end contents prolog loop"); - } else if (node instanceof Loop) { - System.out.println(node); - printObjectTree(((Loop)node).body); - System.out.println("**** end contents Loop body"); - } else if (node instanceof Curly) { - System.out.println(node); - printObjectTree(((Curly)node).atom); - System.out.println("**** end contents Curly body"); - } else if (node instanceof GroupCurly) { - System.out.println(node); - printObjectTree(((GroupCurly)node).atom); - System.out.println("**** end contents GroupCurly body"); - } else if (node instanceof GroupTail) { - System.out.println(node); - System.out.println("Tail next is "+node.next); - return; - } else { - System.out.println(node); - } - node = node.next; - if (node != null) - System.out.println("->next:"); - if (node == Pattern.accept) { - System.out.println("Accept Node"); - node = null; - } - } - } - - /** * Used to accumulate information about a subtree of the object graph * so that optimizations can be applied to the subtree. */ static final class TreeInfo { int minLength; --- 1757,1766 ----
*** 2081,2091 **** tail.next = node; // Double return: Tail was returned in root tail = root; continue; case '[': ! node = clazz(true); break; case '\\': ch = nextEscaped(); if (ch == 'p' || ch == 'P') { boolean oneLetter = true; --- 2048,2058 ---- tail.next = node; // Double return: Tail was returned in root tail = root; continue; case '[': ! node = newCharProperty(clazz(true)); break; case '\\': ch = nextEscaped(); if (ch == 'p' || ch == 'P') { boolean oneLetter = true;
*** 2094,2104 **** if (ch != '{') { unread(); } else { oneLetter = false; } ! node = family(oneLetter, comp); } else { unread(); node = atom(); } break; --- 2061,2071 ---- if (ch != '{') { unread(); } else { oneLetter = false; } ! node = newCharProperty(family(oneLetter, comp)); } else { unread(); node = atom(); } break;
*** 2121,2136 **** node = new Dollar(has(MULTILINE)); break; case '.': next(); if (has(DOTALL)) { ! node = new All(); } else { ! if (has(UNIX_LINES)) ! node = new UnixDot(); ! else { ! node = new Dot(); } } break; case '|': case ')': --- 2088,2103 ---- node = new Dollar(has(MULTILINE)); break; case '.': next(); if (has(DOTALL)) { ! node = new CharProperty(ALL); } else { ! if (has(UNIX_LINES)) { ! node = new CharProperty(UNIXDOT); ! } else { ! node = new CharProperty(DOT); } } break; case '|': case ')':
*** 2153,2163 **** node = atom(); break; } node = closure(node); - if (head == null) { head = tail = node; } else { tail.next = node; tail = node; --- 2120,2129 ----
*** 2211,2221 **** ch = next(); // Consume { if present if (ch != '{') unread(); else oneLetter = false; ! return family(oneLetter, comp); } } unread(); prev = cursor; ch = escape(false, first == 0, false); --- 2177,2187 ---- ch = next(); // Consume { if present if (ch != '{') unread(); else oneLetter = false; ! return newCharProperty(family(oneLetter, comp)); } } unread(); prev = cursor; ch = escape(false, first == 0, false);
*** 2249,2259 **** continue; } break; } if (first == 1) { ! return newSingle(buffer[0]); } else { return newSlice(buffer, first, hasSupplementary); } } --- 2215,2225 ---- continue; } break; } if (first == 1) { ! return newCharProperty(single(buffer[0])); } else { return newSlice(buffer, first, hasSupplementary); } }
*** 2344,2366 **** if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT).complement() ! : new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': break; case 'G': if (inclass) break; if (create) root = new LastMatch(); return -1; case 'H': ! if (create) root = new HorizWS().complement(); return -1; case 'I': case 'J': case 'K': case 'L': --- 2310,2340 ---- if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'E': case 'F': break; case 'G': if (inclass) break; if (create) root = new LastMatch(); return -1; case 'H': ! if (create) { ! predicate = HorizWS.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'I': case 'J': case 'K': case 'L':
*** 2375,2398 **** case 'R': if (inclass) break; if (create) root = new LineEnding(); return -1; case 'S': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE).complement() ! : new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': break; case 'V': ! if (create) root = new VertWS().complement(); return -1; case 'W': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD).complement() ! : new Ctype(ASCII.WORD).complement(); return -1; case 'X': if (inclass) break; if (create) { root = new XGrapheme(); --- 2349,2384 ---- case 'R': if (inclass) break; if (create) root = new LineEnding(); return -1; case 'S': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'T': case 'U': break; case 'V': ! if (create) { ! predicate = VertWS.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'W': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WORD : CharPredicates.ASCII_WORD; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'X': if (inclass) break; if (create) { root = new XGrapheme();
*** 2428,2449 **** } return -1; case 'c': return c(); case 'd': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT) ! : new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; case 'f': return '\f'; case 'g': break; case 'h': ! if (create) root = new HorizWS(); return -1; case 'i': case 'j': break; case 'k': --- 2414,2442 ---- } return -1; case 'c': return c(); case 'd': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'e': return '\033'; case 'f': return '\f'; case 'g': break; case 'h': ! if (create) { ! predicate = HorizWS; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'i': case 'j': break; case 'k':
*** 2471,2483 **** case 'q': break; case 'r': return '\r'; case 's': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE) ! : new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; case 'u': return u(); --- 2464,2479 ---- case 'q': break; case 'r': return '\r'; case 's': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 't': return '\t'; case 'u': return u();
*** 2490,2505 **** // the start or end value, such as [\v-...] or [...-\v], in // which a single definite value (0x0B) is expected. For // compatibility concern '\013'/0x0B is returned if isrange. if (isrange) return '\013'; ! if (create) root = new VertWS(); return -1; case 'w': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD) ! : new Ctype(ASCII.WORD); return -1; case 'x': return x(); case 'y': break; --- 2486,2508 ---- // the start or end value, such as [\v-...] or [...-\v], in // which a single definite value (0x0B) is expected. For // compatibility concern '\013'/0x0B is returned if isrange. if (isrange) return '\013'; ! if (create) { ! predicate = VertWS; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'w': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WORD : CharPredicates.ASCII_WORD; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'x': return x(); case 'y': break;
*** 2518,2629 **** * * Consumes a ] on the way out if consume is true. Usually consume * is true except for the case of [abc&&def] where def is a separate * right hand node with "understood" brackets. */ ! private CharProperty clazz(boolean consume) { ! CharProperty prev = null; ! CharProperty node = null; BitClass bits = new BitClass(); ! boolean include = true; ! boolean firstInClass = true; int ch = next(); ! for (;;) { ! switch (ch) { ! case '^': // Negates if first char in a class, otherwise literal ! if (firstInClass) { ! if (temp[cursor-1] != '[') ! break; ch = next(); ! include = !include; ! continue; ! } else { ! // ^ not first in class, treat as literal ! break; } case '[': ! firstInClass = false; ! node = clazz(true); if (prev == null) ! prev = node; else ! prev = union(prev, node); ch = peek(); continue; case '&': - firstInClass = false; ch = next(); if (ch == '&') { ch = next(); ! CharProperty rightNode = null; while (ch != ']' && ch != '&') { if (ch == '[') { ! if (rightNode == null) ! rightNode = clazz(true); else ! rightNode = union(rightNode, clazz(true)); } else { // abc&&def unread(); ! rightNode = clazz(false); } ch = peek(); } ! if (rightNode != null) ! node = rightNode; if (prev == null) { ! if (rightNode == null) throw error("Bad class syntax"); else ! prev = rightNode; } else { ! prev = intersection(prev, node); } } else { // treat as a literal & unread(); break; } continue; case 0: - firstInClass = false; if (cursor >= patternLength) throw error("Unclosed character class"); break; case ']': ! firstInClass = false; ! if (prev != null) { if (consume) next(); return prev; } break; default: - firstInClass = false; break; } ! node = range(bits); ! if (include) { ! if (prev == null) { ! prev = node; ! } else { ! if (prev != node) ! prev = union(prev, node); ! } ! } else { ! if (prev == null) { ! prev = node.complement(); } else { ! if (prev != node) ! prev = setDifference(prev, node); ! } } ch = peek(); } } ! 
private CharProperty bitsOrSingle(BitClass bits, int ch) { /* Bits can only handle codepoints in [u+0000-u+00ff] range. Use "single" node instead of bits when dealing with unicode case folding for codepoints listed below. (1)Uppercase out of range: u+00ff, u+00b5 toUpperCase(u+00ff) -> u+0178 --- 2521,2631 ---- * * Consumes a ] on the way out if consume is true. Usually consume * is true except for the case of [abc&&def] where def is a separate * right hand node with "understood" brackets. */ ! private CharPredicate clazz(boolean consume) { ! CharPredicate prev = null; ! CharPredicate curr = null; BitClass bits = new BitClass(); ! BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch]; ! ! boolean isNeg = false; ! boolean hasBits = false; int ch = next(); ! // Negates if first char in a class, otherwise literal ! if (ch == '^' && temp[cursor-1] == '[') { ch = next(); ! isNeg = true; } + for (;;) { + switch (ch) { case '[': ! curr = clazz(true); if (prev == null) ! prev = curr; else ! prev = prev.union(curr); ch = peek(); continue; case '&': ch = next(); if (ch == '&') { ch = next(); ! CharPredicate right = null; while (ch != ']' && ch != '&') { if (ch == '[') { ! if (right == null) ! right = clazz(true); else ! right = right.union(clazz(true)); } else { // abc&&def unread(); ! right = clazz(false); } ch = peek(); } ! if (hasBits) { ! // bits used, union has high precedence if (prev == null) { ! prev = curr = bitsP; ! } else { ! prev = prev.union(bitsP); ! } ! hasBits = false; ! } ! if (right != null) ! curr = right; ! if (prev == null) { ! if (right == null) throw error("Bad class syntax"); else ! prev = right; } else { ! prev = prev.and(curr); } } else { // treat as a literal & unread(); break; } continue; case 0: if (cursor >= patternLength) throw error("Unclosed character class"); break; case ']': ! 
if (prev != null || hasBits) { if (consume) next(); + if (prev == null) + prev = bitsP; + else if (hasBits) + prev = prev.union(bitsP); + if (isNeg) + return prev.negate(); return prev; } break; default: break; } ! curr = range(bits); ! if (curr == null) { // the bits used ! hasBits = true; } else { ! if (prev == null) ! prev = curr; ! else if (prev != curr) ! prev = prev.union(curr); } ch = peek(); } } ! private CharPredicate bitsOrSingle(BitClass bits, int ch) { /* Bits can only handle codepoints in [u+0000-u+00ff] range. Use "single" node instead of bits when dealing with unicode case folding for codepoints listed below. (1)Uppercase out of range: u+00ff, u+00b5 toUpperCase(u+00ff) -> u+0178
*** 2644,2663 **** !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && (ch == 0xff || ch == 0xb5 || ch == 0x49 || ch == 0x69 || //I and i ch == 0x53 || ch == 0x73 || //S and s ch == 0x4b || ch == 0x6b || //K and k ! ch == 0xc5 || ch == 0xe5))) //A+ring ! return bits.add(ch, flags()); ! return newSingle(ch); } /** * Parse a single character or a character range in a character class * and return its representative node. */ ! private CharProperty range(BitClass bits) { int ch = peek(); if (ch == '\\') { ch = nextEscaped(); if (ch == 'p' || ch == 'P') { // A property boolean comp = (ch == 'P'); --- 2646,2692 ---- !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && (ch == 0xff || ch == 0xb5 || ch == 0x49 || ch == 0x69 || //I and i ch == 0x53 || ch == 0x73 || //S and s ch == 0x4b || ch == 0x6b || //K and k ! ch == 0xc5 || ch == 0xe5))) { //A+ring { ! bits.add(ch, flags()); ! return null; ! } ! return single(ch); ! } ! ! /** ! * Returns a suitably optimized, single character predicate ! */ ! private CharPredicate single(final int ch) { ! if (has(CASE_INSENSITIVE)) { ! int lower, upper; ! if (has(UNICODE_CASE)) { ! upper = Character.toUpperCase(ch); ! lower = Character.toLowerCase(upper); ! // Unicode case insensitive matches ! if (upper != lower) ! return SingleU(lower); ! } else if (ASCII.isAscii(ch)) { ! lower = ASCII.toLower(ch); ! upper = ASCII.toUpper(ch); ! // Case insensitive matches a given BMP character ! if (lower != upper) ! return SingleI(lower, upper); ! } ! } ! if (isSupplementary(ch)) ! return SingleS(ch); ! return Single(ch); // Match a given BMP character } /** * Parse a single character or a character range in a character class * and return its representative node. */ ! private CharPredicate range(BitClass bits) { int ch = peek(); if (ch == '\\') { ch = nextEscaped(); if (ch == 'p' || ch == 'P') { // A property boolean comp = (ch == 'P');
*** 2672,2682 **** } else { // ordinary escape boolean isrange = temp[cursor+1] == '-'; unread(); ch = escape(true, true, isrange); if (ch == -1) ! return (CharProperty) root; } } else { next(); } if (ch >= 0) { --- 2701,2711 ---- } else { // ordinary escape boolean isrange = temp[cursor+1] == '-'; unread(); ch = escape(true, true, isrange); if (ch == -1) ! return predicate; } } else { next(); } if (ch >= 0) {
*** 2694,2723 **** next(); } if (m < ch) { throw error("Illegal character range"); } ! if (has(CASE_INSENSITIVE)) ! return caseInsensitiveRangeFor(ch, m); ! else ! return rangeFor(ch, m); } } return bitsOrSingle(bits, ch); } throw error("Unexpected character '"+((char)ch)+"'"); } /** * Parses a Unicode character family and returns its representative node. */ ! private CharProperty family(boolean singleLetter, ! boolean maybeComplement) { next(); String name; ! CharProperty node = null; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c); --- 2723,2755 ---- next(); } if (m < ch) { throw error("Illegal character range"); } ! if (has(CASE_INSENSITIVE)) { ! if (has(UNICODE_CASE)) ! return CIRangeU(ch, m); ! return CIRange(ch, m); ! } else { ! return Range(ch, m); ! } } } return bitsOrSingle(bits, ch); } throw error("Unexpected character '"+((char)ch)+"'"); } /** * Parses a Unicode character family and returns its representative node. */ ! private CharPredicate family(boolean singleLetter, ! boolean isComplement) { next(); String name; ! CharPredicate p = null; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c);
*** 2745,2836 **** String value = name.substring(i + 1); name = name.substring(0, i).toLowerCase(Locale.ENGLISH); switch (name) { case "sc": case "script": ! node = unicodeScriptPropertyFor(value); break; case "blk": case "block": ! node = unicodeBlockPropertyFor(value); break; case "gc": case "general_category": ! node = charPropertyNodeFor(value); break; default: throw error("Unknown Unicode property {name=<" + name + ">, " + "value=<" + value + ">}"); ! } } else { if (name.startsWith("In")) { ! // \p{inBlockName} ! node = unicodeBlockPropertyFor(name.substring(2)); } else if (name.startsWith("Is")) { ! // \p{isGeneralCategory} and \p{isScriptName} name = name.substring(2); ! UnicodeProp uprop = UnicodeProp.forName(name); ! if (uprop != null) ! node = new Utype(uprop); ! if (node == null) ! node = CharPropertyNames.charPropertyFor(name); ! if (node == null) ! node = unicodeScriptPropertyFor(name); } else { if (has(UNICODE_CHARACTER_CLASS)) { ! UnicodeProp uprop = UnicodeProp.forPOSIXName(name); ! if (uprop != null) ! node = new Utype(uprop); } ! if (node == null) ! node = charPropertyNodeFor(name); } } ! if (maybeComplement) { ! if (node instanceof Category || node instanceof Block) hasSupplementary = true; ! node = node.complement(); } ! return node; ! } ! ! ! /** ! * Returns a CharProperty matching all characters belong to ! * a UnicodeScript. ! */ ! private CharProperty unicodeScriptPropertyFor(String name) { ! final Character.UnicodeScript script; ! try { ! script = Character.UnicodeScript.forName(name); ! } catch (IllegalArgumentException iae) { ! throw error("Unknown character script name {" + name + "}"); ! } ! return new Script(script); ! } ! ! /** ! * Returns a CharProperty matching all characters in a UnicodeBlock. ! */ ! private CharProperty unicodeBlockPropertyFor(String name) { ! final Character.UnicodeBlock block; ! try { ! block = Character.UnicodeBlock.forName(name); ! } catch (IllegalArgumentException iae) { ! 
throw error("Unknown character block name {" + name + "}"); ! } ! return new Block(block); } ! /** ! * Returns a CharProperty matching all characters in a named property. ! */ ! private CharProperty charPropertyNodeFor(String name) { ! CharProperty p = CharPropertyNames.charPropertyFor(name); if (p == null) ! throw error("Unknown character property name {" + name + "}"); ! return p; } /** * Parses and returns the name of a "named capturing group", the trailing * ">" is consumed after parsing. --- 2777,2842 ---- String value = name.substring(i + 1); name = name.substring(0, i).toLowerCase(Locale.ENGLISH); switch (name) { case "sc": case "script": ! p = CharPredicates.forUnicodeScript(value); break; case "blk": case "block": ! p = CharPredicates.forUnicodeBlock(value); break; case "gc": case "general_category": ! p = CharPredicates.forProperty(value); break; default: + break; + } + if (p == null) throw error("Unknown Unicode property {name=<" + name + ">, " + "value=<" + value + ">}"); ! } else { if (name.startsWith("In")) { ! // \p{InBlockName} ! p = CharPredicates.forUnicodeBlock(name.substring(2)); } else if (name.startsWith("Is")) { ! // \p{IsGeneralCategory} and \p{IsScriptName} name = name.substring(2); ! p = CharPredicates.forUnicodeProperty(name); ! if (p == null) ! p = CharPredicates.forProperty(name); ! if (p == null) ! p = CharPredicates.forUnicodeScript(name); } else { if (has(UNICODE_CHARACTER_CLASS)) { ! p = CharPredicates.forPOSIXName(name); } ! if (p == null) ! p = CharPredicates.forProperty(name); } + if (p == null) + throw error("Unknown character property name {In/Is" + name + "}"); } ! if (isComplement) { ! // it might be too expensive to detect if a complement of ! // CharProperty can match "certain" supplementary. So just ! // go with StartS. hasSupplementary = true; ! p = p.negate(); } ! return p; } ! private CharProperty newCharProperty(CharPredicate p) { if (p == null) ! return null; ! if (p instanceof BmpCharPredicate) ! 
return new BmpCharProperty((BmpCharPredicate)p); ! else ! return new CharProperty(p); } /** * Parses and returns the name of a "named capturing group", the trailing * ">" is consumed after parsing.
*** 2882,2892 **** break; case '>': // (?>xxx) independent group head = createGroup(true); tail = root; head.next = expr(tail); ! head = tail = new Ques(head, INDEPENDENT); break; case '<': // (?<xxx) look behind ch = read(); if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { // named captured group --- 2888,2898 ---- break; case '>': // (?>xxx) independent group head = createGroup(true); tail = root; head.next = expr(tail); ! head = tail = new Ques(head, Qtype.INDEPENDENT); break; case '<': // (?<xxx) look behind ch = read(); if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { // named captured group
*** 2968,2993 **** return node; // Dual return } if (node instanceof Ques) { Ques ques = (Ques) node; ! if (ques.type == POSSESSIVE) { root = node; return node; } tail.next = new BranchConn(); tail = tail.next; ! if (ques.type == GREEDY) { head = new Branch(head, null, tail); } else { // Reluctant quantifier head = new Branch(null, head, tail); } root = tail; return head; } else if (node instanceof Curly) { Curly curly = (Curly) node; ! if (curly.type == POSSESSIVE) { root = node; return node; } // Discover if the group is deterministic TreeInfo info = new TreeInfo(); --- 2974,2999 ---- return node; // Dual return } if (node instanceof Ques) { Ques ques = (Ques) node; ! if (ques.type == Qtype.POSSESSIVE) { root = node; return node; } tail.next = new BranchConn(); tail = tail.next; ! if (ques.type == Qtype.GREEDY) { head = new Branch(head, null, tail); } else { // Reluctant quantifier head = new Branch(null, head, tail); } root = tail; return head; } else if (node instanceof Curly) { Curly curly = (Curly) node; ! if (curly.type == Qtype.POSSESSIVE) { root = node; return node; } // Discover if the group is deterministic TreeInfo info = new TreeInfo();
*** 3000,3010 **** capturingGroup); return head; } else { // Non-deterministic int temp = ((GroupHead) head).localIndex; Loop loop; ! if (curly.type == GREEDY) loop = new Loop(this.localCount, temp); else // Reluctant Curly loop = new LazyLoop(this.localCount, temp); Prolog prolog = new Prolog(loop); this.localCount += 1; --- 3006,3016 ---- capturingGroup); return head; } else { // Non-deterministic int temp = ((GroupHead) head).localIndex; Loop loop; ! if (curly.type == Qtype.GREEDY) loop = new Loop(this.localCount, temp); else // Reluctant Curly loop = new LazyLoop(this.localCount, temp); Prolog prolog = new Prolog(loop); this.localCount += 1;
*** 3029,3038 **** --- 3035,3048 ---- int groupIndex = 0; if (!anonymous) groupIndex = capturingGroupCount++; GroupHead head = new GroupHead(localIndex); root = new GroupTail(localIndex, groupIndex); + + // for debug/print only, head.match does NOT need the "tail" info + head.tail = (GroupTail)root; + if (!anonymous && groupIndex < 10) groupNodes[groupIndex] = head; return head; }
*** 3117,3133 **** } } static final int MAX_REPS = 0x7FFFFFFF; ! static final int GREEDY = 0; ! ! static final int LAZY = 1; ! ! static final int POSSESSIVE = 2; ! static final int INDEPENDENT = 3; /** * Processes repetition. If the next character peeked is a quantifier * then new nodes must be appended to handle the repetition. * Prev could be a single or a group, so it could be a chain of nodes. --- 3127,3156 ---- } } static final int MAX_REPS = 0x7FFFFFFF; ! static enum Qtype { ! GREEDY, LAZY, POSSESSIVE, INDEPENDENT ! } ! private Node curly(Node prev, int cmin) { ! int ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE); ! } ! if (prev instanceof BmpCharProperty) { ! return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); ! } else if (prev instanceof CharProperty) { ! return new CharPropertyGreedy((CharProperty)prev, cmin); ! } ! return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY); ! } /** * Processes repetition. If the next character peeked is a quantifier * then new nodes must be appended to handle the repetition. * Prev could be a single or a group, so it could be a chain of nodes.
*** 3138,3173 **** switch (ch) { case '?': ch = next(); if (ch == '?') { next(); ! return new Ques(prev, LAZY); } else if (ch == '+') { next(); ! return new Ques(prev, POSSESSIVE); } ! return new Ques(prev, GREEDY); case '*': ! ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, 0, MAX_REPS, LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, 0, MAX_REPS, POSSESSIVE); ! } ! return new Curly(prev, 0, MAX_REPS, GREEDY); case '+': ! ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, 1, MAX_REPS, LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, 1, MAX_REPS, POSSESSIVE); ! } ! return new Curly(prev, 1, MAX_REPS, GREEDY); case '{': ch = temp[cursor+1]; if (ASCII.isDigit(ch)) { skip(); int cmin = 0; --- 3161,3180 ---- switch (ch) { case '?': ch = next(); if (ch == '?') { next(); ! return new Ques(prev, Qtype.LAZY); } else if (ch == '+') { next(); ! return new Ques(prev, Qtype.POSSESSIVE); } ! return new Ques(prev, Qtype.GREEDY); case '*': ! return curly(prev, 0); case '+': ! return curly(prev, 1); case '{': ch = temp[cursor+1]; if (ASCII.isDigit(ch)) { skip(); int cmin = 0;
*** 3192,3207 **** throw error("Illegal repetition range"); Curly curly; ch = peek(); if (ch == '?') { next(); ! curly = new Curly(prev, cmin, cmax, LAZY); } else if (ch == '+') { next(); ! curly = new Curly(prev, cmin, cmax, POSSESSIVE); } else { ! curly = new Curly(prev, cmin, cmax, GREEDY); } return curly; } else { throw error("Illegal repetition"); } --- 3199,3214 ---- throw error("Illegal repetition range"); Curly curly; ch = peek(); if (ch == '?') { next(); ! curly = new Curly(prev, cmin, cmax, Qtype.LAZY); } else if (ch == '+') { next(); ! curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE); } else { ! curly = new Curly(prev, cmin, cmax, Qtype.GREEDY); } return curly; } else { throw error("Illegal repetition"); }
*** 3374,3387 **** /** * Creates a bit vector for matching Latin-1 values. A normal BitClass * never matches values above Latin-1, and a complemented BitClass always * matches values above Latin-1. */ ! private static final class BitClass extends BmpCharProperty { final boolean[] bits; ! BitClass() { bits = new boolean[256]; } ! private BitClass(boolean[] bits) { this.bits = bits; } BitClass add(int c, int flags) { assert c >= 0 && c <= 255; if ((flags & CASE_INSENSITIVE) != 0) { if (ASCII.isAscii(c)) { bits[ASCII.toUpper(c)] = true; --- 3381,3399 ---- /** * Creates a bit vector for matching Latin-1 values. A normal BitClass * never matches values above Latin-1, and a complemented BitClass always * matches values above Latin-1. */ ! static final class BitClass extends BmpCharProperty { final boolean[] bits; ! BitClass() { ! this(new boolean[256]); ! } ! private BitClass(boolean[] bits) { ! super( ch -> ch < 256 && bits[ch]); ! this.bits = bits; ! } BitClass add(int c, int flags) { assert c >= 0 && c <= 255; if ((flags & CASE_INSENSITIVE) != 0) { if (ASCII.isAscii(c)) { bits[ASCII.toUpper(c)] = true;
*** 3392,3427 **** } } bits[c] = true; return this; } - boolean isSatisfiedBy(int ch) { - return ch < 256 && bits[ch]; - } - } - - /** - * Returns a suitably optimized, single character matcher. - */ - private CharProperty newSingle(final int ch) { - if (has(CASE_INSENSITIVE)) { - int lower, upper; - if (has(UNICODE_CASE)) { - upper = Character.toUpperCase(ch); - lower = Character.toLowerCase(upper); - if (upper != lower) - return new SingleU(lower); - } else if (ASCII.isAscii(ch)) { - lower = ASCII.toLower(ch); - upper = ASCII.toUpper(ch); - if (lower != upper) - return new SingleI(lower, upper); - } - } - if (isSupplementary(ch)) - return new SingleS(ch); // Match a given Unicode character - return new Single(ch); // Match a given BMP character } /** * Utility method for creating a string slice matcher. */ --- 3404,3413 ----
*** 3825,3846 **** /** * Abstract node class to match one character satisfying some * boolean property. */ ! private abstract static class CharProperty extends Node { ! abstract boolean isSatisfiedBy(int ch); ! CharProperty complement() { ! return new CharProperty() { ! boolean isSatisfiedBy(int ch) { ! return ! CharProperty.this.isSatisfiedBy(ch);}}; } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { int ch = Character.codePointAt(seq, i); ! return isSatisfiedBy(ch) ! && next.match(matcher, i+Character.charCount(ch), seq); } else { matcher.hitEnd = true; return false; } } --- 3811,3831 ---- /** * Abstract node class to match one character satisfying some * boolean property. */ ! static class CharProperty extends Node { ! CharPredicate predicate; ! ! CharProperty (CharPredicate predicate) { ! this.predicate = predicate; } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { int ch = Character.codePointAt(seq, i); ! return predicate.is(ch) && ! next.match(matcher, i + Character.charCount(ch), seq); } else { matcher.hitEnd = true; return false; } }
*** 3853,4007 **** /** * Optimized version of CharProperty that works only for * properties never satisfied by Supplementary characters. */ ! private abstract static class BmpCharProperty extends CharProperty { boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { ! return isSatisfiedBy(seq.charAt(i)) ! && next.match(matcher, i+1, seq); } else { matcher.hitEnd = true; return false; } } } /** - * Node class that matches a Supplementary Unicode character - */ - static final class SingleS extends CharProperty { - final int c; - SingleS(int c) { this.c = c; } - boolean isSatisfiedBy(int ch) { - return ch == c; - } - } - - /** - * Optimization -- matches a given BMP character - */ - static final class Single extends BmpCharProperty { - final int c; - Single(int c) { this.c = c; } - boolean isSatisfiedBy(int ch) { - return ch == c; - } - } - - /** - * Case insensitive matches a given BMP character - */ - static final class SingleI extends BmpCharProperty { - final int lower; - final int upper; - SingleI(int lower, int upper) { - this.lower = lower; - this.upper = upper; - } - boolean isSatisfiedBy(int ch) { - return ch == lower || ch == upper; - } - } - - /** - * Unicode case insensitive matches a given Unicode character - */ - static final class SingleU extends CharProperty { - final int lower; - SingleU(int lower) { - this.lower = lower; - } - boolean isSatisfiedBy(int ch) { - return lower == ch || - lower == Character.toLowerCase(Character.toUpperCase(ch)); - } - } - - /** - * Node class that matches a Unicode block. 
- */ - static final class Block extends CharProperty { - final Character.UnicodeBlock block; - Block(Character.UnicodeBlock block) { - this.block = block; - } - boolean isSatisfiedBy(int ch) { - return block == Character.UnicodeBlock.of(ch); - } - } - - /** - * Node class that matches a Unicode script - */ - static final class Script extends CharProperty { - final Character.UnicodeScript script; - Script(Character.UnicodeScript script) { - this.script = script; - } - boolean isSatisfiedBy(int ch) { - return script == Character.UnicodeScript.of(ch); - } - } - - /** - * Node class that matches a Unicode category. - */ - static final class Category extends CharProperty { - final int typeMask; - Category(int typeMask) { this.typeMask = typeMask; } - boolean isSatisfiedBy(int ch) { - return (typeMask & (1 << Character.getType(ch))) != 0; - } - } - - /** - * Node class that matches a Unicode "type" - */ - static final class Utype extends CharProperty { - final UnicodeProp uprop; - Utype(UnicodeProp uprop) { this.uprop = uprop; } - boolean isSatisfiedBy(int ch) { - return uprop.is(ch); - } - } - - /** - * Node class that matches a POSIX type. 
- */ - static final class Ctype extends BmpCharProperty { - final int ctype; - Ctype(int ctype) { this.ctype = ctype; } - boolean isSatisfiedBy(int ch) { - return ch < 128 && ASCII.isType(ch, ctype); - } - } - - /** - * Node class that matches a Perl vertical whitespace - */ - static final class VertWS extends BmpCharProperty { - boolean isSatisfiedBy(int cp) { - return (cp >= 0x0A && cp <= 0x0D) || - cp == 0x85 || cp == 0x2028 || cp == 0x2029; - } - } - - /** - * Node class that matches a Perl horizontal whitespace - */ - static final class HorizWS extends BmpCharProperty { - boolean isSatisfiedBy(int cp) { - return cp == 0x09 || cp == 0x20 || cp == 0xa0 || - cp == 0x1680 || cp == 0x180e || - cp >= 0x2000 && cp <= 0x200a || - cp == 0x202f || cp == 0x205f || cp == 0x3000; - } - } - - /** * Node class that matches an unicode extended grapheme cluster */ static class XGrapheme extends Node { boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { --- 3838,3863 ---- /** * Optimized version of CharProperty that works only for * properties never satisfied by Supplementary characters. */ ! private static class BmpCharProperty extends CharProperty { ! BmpCharProperty (BmpCharPredicate predicate) { ! super(predicate); ! } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { ! return predicate.is(seq.charAt(i)) && ! next.match(matcher, i + 1, seq); } else { matcher.hitEnd = true; return false; } } } /** * Node class that matches an unicode extended grapheme cluster */ static class XGrapheme extends Node { boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) {
*** 4215,4299 **** int toLower(int c) { return Character.toLowerCase(Character.toUpperCase(c)); } } - private static boolean inRange(int lower, int ch, int upper) { - return lower <= ch && ch <= upper; - } - - /** - * Returns node for matching characters within an explicit value range. - */ - private static CharProperty rangeFor(final int lower, - final int upper) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return inRange(lower, ch, upper);}}; - } - - /** - * Returns node for matching characters within an explicit value - * range in a case insensitive manner. - */ - private CharProperty caseInsensitiveRangeFor(final int lower, - final int upper) { - if (has(UNICODE_CASE)) - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - if (inRange(lower, ch, upper)) - return true; - int up = Character.toUpperCase(ch); - return inRange(lower, up, upper) || - inRange(lower, Character.toLowerCase(up), upper);}}; - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return inRange(lower, ch, upper) || - ASCII.isAscii(ch) && - (inRange(lower, ASCII.toUpper(ch), upper) || - inRange(lower, ASCII.toLower(ch), upper)); - }}; - } - - /** - * Implements the Unicode category ALL and the dot metacharacter when - * in dotall mode. - */ - static final class All extends CharProperty { - boolean isSatisfiedBy(int ch) { - return true; - } - } - - /** - * Node class for the dot metacharacter when dotall is not enabled. - */ - static final class Dot extends CharProperty { - boolean isSatisfiedBy(int ch) { - return (ch != '\n' && ch != '\r' - && (ch|1) != '\u2029' - && ch != '\u0085'); - } - } - - /** - * Node class for the dot metacharacter when dotall is not enabled - * but UNIX_LINES is enabled. - */ - static final class UnixDot extends CharProperty { - boolean isSatisfiedBy(int ch) { - return ch != '\n'; - } - } - /** * The 0 or 1 quantifier. This one class implements all three types. */ static final class Ques extends Node { Node atom; ! 
int type; ! Ques(Node node, int type) { this.atom = node; this.type = type; } boolean match(Matcher matcher, int i, CharSequence seq) { switch (type) { --- 4071,4087 ---- int toLower(int c) { return Character.toLowerCase(Character.toUpperCase(c)); } } /** * The 0 or 1 quantifier. This one class implements all three types. */ static final class Ques extends Node { Node atom; ! Qtype type; ! Ques(Node node, Qtype type) { this.atom = node; this.type = type; } boolean match(Matcher matcher, int i, CharSequence seq) { switch (type) {
*** 4309,4319 **** default: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); } } boolean study(TreeInfo info) { ! if (type != INDEPENDENT) { int minL = info.minLength; atom.study(info); info.minLength = minL; info.deterministic = false; return next.study(info); --- 4097,4107 ---- default: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); } } boolean study(TreeInfo info) { ! if (type != Qtype.INDEPENDENT) { int minL = info.minLength; atom.study(info); info.minLength = minL; info.deterministic = false; return next.study(info);
*** 4323,4343 **** } } } /** * Handles the curly-brace style repetition with a specified minimum and * maximum occurrences. The * quantifier is handled as a special case. * This class handles the three types. */ static final class Curly extends Node { Node atom; ! int type; int cmin; int cmax; ! Curly(Node node, int cmin, int cmax, int type) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; } --- 4111,4204 ---- } } } /** + * Handles the greedy style repetition with the minimum either be + * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier. + */ + static class CharPropertyGreedy extends Node { + final CharPredicate predicate; + final int cmin; + + CharPropertyGreedy(CharProperty cp, int cmin) { + this.predicate = cp.predicate; + this.cmin = cmin; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int n = 0; + int to = matcher.to; + // greedy, all the way down + while (i < to) { + int ch = Character.codePointAt(seq, i); + if (!predicate.is(ch)) + break; + i += Character.charCount(ch); + n++; + } + if (i >= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + if (n == cmin) + return false; + // backing off if match fails + int ch = Character.codePointBefore(seq, i); + i -= Character.charCount(ch); + n--; + } + return false; + } + + boolean study(TreeInfo info) { + info.minLength += cmin; + if (info.maxValid) { + info.maxLength += MAX_REPS; + } + info.deterministic = false; + return next.study(info); + } + } + + static final class BmpCharPropertyGreedy extends CharPropertyGreedy { + + BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { + super(bcp, cmin); + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + int n = 0; + int to = matcher.to; + while (i < to && predicate.is(seq.charAt(i))) { + i++; n++; + } + if (i >= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + i--; n--; // backing off 
if match fails + } + return false; + } + } + + /** * Handles the curly-brace style repetition with a specified minimum and * maximum occurrences. The * quantifier is handled as a special case. * This class handles the three types. */ static final class Curly extends Node { Node atom; ! Qtype type; int cmin; int cmax; ! Curly(Node node, int cmin, int cmax, Qtype type) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; }
*** 4348,4360 **** i = matcher.last; continue; } return false; } ! if (type == GREEDY) return match0(matcher, i, j, seq); ! else if (type == LAZY) return match1(matcher, i, j, seq); else return match2(matcher, i, j, seq); } // Greedy match. --- 4209,4221 ---- i = matcher.last; continue; } return false; } ! if (type == Qtype.GREEDY) return match0(matcher, i, j, seq); ! else if (type == Qtype.LAZY) return match1(matcher, i, j, seq); else return match2(matcher, i, j, seq); } // Greedy match.
*** 4472,4489 **** * If capture is true then this class saves group settings and ensures * that groups are unset when backing off of a group match. */ static final class GroupCurly extends Node { Node atom; ! int type; int cmin; int cmax; int localIndex; int groupIndex; boolean capture; ! GroupCurly(Node node, int cmin, int cmax, int type, int local, int group, boolean capture) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; --- 4333,4350 ---- * If capture is true then this class saves group settings and ensures * that groups are unset when backing off of a group match. */ static final class GroupCurly extends Node { Node atom; ! Qtype type; int cmin; int cmax; int localIndex; int groupIndex; boolean capture; ! GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, int group, boolean capture) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax;
*** 4519,4531 **** ret = false; break; } } if (ret) { ! if (type == GREEDY) { ret = match0(matcher, i, cmin, seq); ! } else if (type == LAZY) { ret = match1(matcher, i, cmin, seq); } else { ret = match2(matcher, i, cmin, seq); } } --- 4380,4392 ---- ret = false; break; } } if (ret) { ! if (type == Qtype.GREEDY) { ret = match0(matcher, i, cmin, seq); ! } else if (type == Qtype.LAZY) { ret = match1(matcher, i, cmin, seq); } else { ret = match2(matcher, i, cmin, seq); } }
*** 4767,4776 **** --- 4628,4638 ---- * indicate that we do not want to unset the group if the reference * doesn't match. */ static final class GroupHead extends Node { int localIndex; + GroupTail tail; // for debug/print only, match does not need to know GroupHead(int localCount) { localIndex = localCount; } boolean match(Matcher matcher, int i, CharSequence seq) { int save = matcher.locals[localIndex];
*** 5360,5399 **** return !conditionMatched && next.match(matcher, i, seq); } } /** - * Returns the set union of two CharProperty nodes. - */ - private static CharProperty union(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; - } - - /** - * Returns the set intersection of two CharProperty nodes. - */ - private static CharProperty intersection(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; - } - - /** - * Returns the set difference of two CharProperty nodes. - */ - private static CharProperty setDifference(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; - } - - /** * Handles word boundaries. Includes a field to allow this one class to * deal with the different types of word boundaries we can match. The word * characters include underscores, letters, and digits. Non spacing marks * can are also part of a word if they have a base character, otherwise * they are ignored for purposes of finding word boundaries. --- 5222,5231 ----
*** 5409,5419 **** type = n; this.useUWORD = useUWORD; } boolean isWord(int ch) { ! return useUWORD ? UnicodeProp.WORD.is(ch) : (ch == '_' || Character.isLetterOrDigit(ch)); } int check(Matcher matcher, int i, CharSequence seq) { int ch; --- 5241,5251 ---- type = n; this.useUWORD = useUWORD; } boolean isWord(int ch) { ! return useUWORD ? CharPredicates.WORD.is(ch) : (ch == '_' || Character.isLetterOrDigit(ch)); } int check(Matcher matcher, int i, CharSequence seq) { int ch;
*** 5655,5874 **** matcher.hitEnd = true; return false; } } ! /////////////////////////////////////////////////////////////////////////////// ! /////////////////////////////////////////////////////////////////////////////// /** ! * This must be the very first initializer. */ ! static Node accept = new Node(); ! static Node lastAccept = new LastNode(); ! private static class CharPropertyNames { ! static CharProperty charPropertyFor(String name) { ! CharPropertyFactory m = map.get(name); ! return m == null ? null : m.make(); } ! private abstract static class CharPropertyFactory { ! abstract CharProperty make(); } ! private static void defCategory(String name, ! final int typeMask) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return new Category(typeMask);}}); } ! private static void defRange(String name, ! final int lower, final int upper) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return rangeFor(lower, upper);}}); } ! private static void defCtype(String name, ! final int ctype) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return new Ctype(ctype);}}); } ! private abstract static class CloneableProperty ! extends CharProperty implements Cloneable ! { ! public CloneableProperty clone() { ! try { ! return (CloneableProperty) super.clone(); ! } catch (CloneNotSupportedException e) { ! throw new AssertionError(e); ! } ! } ! } ! ! private static void defClone(String name, ! final CloneableProperty p) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return p.clone();}}); ! } ! ! private static final HashMap<String, CharPropertyFactory> map ! = new HashMap<>(); ! ! static { ! // Unicode character property aliases, defined in ! // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt ! defCategory("Cn", 1<<Character.UNASSIGNED); ! defCategory("Lu", 1<<Character.UPPERCASE_LETTER); ! defCategory("Ll", 1<<Character.LOWERCASE_LETTER); ! 
defCategory("Lt", 1<<Character.TITLECASE_LETTER); ! defCategory("Lm", 1<<Character.MODIFIER_LETTER); ! defCategory("Lo", 1<<Character.OTHER_LETTER); ! defCategory("Mn", 1<<Character.NON_SPACING_MARK); ! defCategory("Me", 1<<Character.ENCLOSING_MARK); ! defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK); ! defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER); ! defCategory("Nl", 1<<Character.LETTER_NUMBER); ! defCategory("No", 1<<Character.OTHER_NUMBER); ! defCategory("Zs", 1<<Character.SPACE_SEPARATOR); ! defCategory("Zl", 1<<Character.LINE_SEPARATOR); ! defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR); ! defCategory("Cc", 1<<Character.CONTROL); ! defCategory("Cf", 1<<Character.FORMAT); ! defCategory("Co", 1<<Character.PRIVATE_USE); ! defCategory("Cs", 1<<Character.SURROGATE); ! defCategory("Pd", 1<<Character.DASH_PUNCTUATION); ! defCategory("Ps", 1<<Character.START_PUNCTUATION); ! defCategory("Pe", 1<<Character.END_PUNCTUATION); ! defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION); ! defCategory("Po", 1<<Character.OTHER_PUNCTUATION); ! defCategory("Sm", 1<<Character.MATH_SYMBOL); ! defCategory("Sc", 1<<Character.CURRENCY_SYMBOL); ! defCategory("Sk", 1<<Character.MODIFIER_SYMBOL); ! defCategory("So", 1<<Character.OTHER_SYMBOL); ! defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION); ! defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION); ! defCategory("L", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER) | ! (1<<Character.MODIFIER_LETTER) | ! (1<<Character.OTHER_LETTER))); ! defCategory("M", ((1<<Character.NON_SPACING_MARK) | ! (1<<Character.ENCLOSING_MARK) | ! (1<<Character.COMBINING_SPACING_MARK))); ! defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) | ! (1<<Character.LETTER_NUMBER) | ! (1<<Character.OTHER_NUMBER))); ! defCategory("Z", ((1<<Character.SPACE_SEPARATOR) | ! (1<<Character.LINE_SEPARATOR) | ! (1<<Character.PARAGRAPH_SEPARATOR))); ! 
defCategory("C", ((1<<Character.CONTROL) | ! (1<<Character.FORMAT) | ! (1<<Character.PRIVATE_USE) | ! (1<<Character.SURROGATE))); // Other ! defCategory("P", ((1<<Character.DASH_PUNCTUATION) | ! (1<<Character.START_PUNCTUATION) | ! (1<<Character.END_PUNCTUATION) | ! (1<<Character.CONNECTOR_PUNCTUATION) | ! (1<<Character.OTHER_PUNCTUATION) | ! (1<<Character.INITIAL_QUOTE_PUNCTUATION) | ! (1<<Character.FINAL_QUOTE_PUNCTUATION))); ! defCategory("S", ((1<<Character.MATH_SYMBOL) | ! (1<<Character.CURRENCY_SYMBOL) | ! (1<<Character.MODIFIER_SYMBOL) | ! (1<<Character.OTHER_SYMBOL))); ! defCategory("LC", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER))); ! defCategory("LD", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER) | ! (1<<Character.MODIFIER_LETTER) | ! (1<<Character.OTHER_LETTER) | ! (1<<Character.DECIMAL_DIGIT_NUMBER))); ! defRange("L1", 0x00, 0xFF); // Latin-1 ! map.put("all", new CharPropertyFactory() { ! CharProperty make() { return new All(); }}); ! ! // Posix regular expression character classes, defined in ! // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html ! defRange("ASCII", 0x00, 0x7F); // ASCII ! defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters ! defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters ! defCtype("Blank", ASCII.BLANK); // Space and tab characters ! defCtype("Cntrl", ASCII.CNTRL); // Control characters ! defRange("Digit", '0', '9'); // Numeric characters ! defCtype("Graph", ASCII.GRAPH); // printable and visible ! defRange("Lower", 'a', 'z'); // Lower-case alphabetic ! defRange("Print", 0x20, 0x7E); // Printable characters ! defCtype("Punct", ASCII.PUNCT); // Punctuation characters ! defCtype("Space", ASCII.SPACE); // Space characters ! defRange("Upper", 'A', 'Z'); // Upper-case alphabetic ! defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits ! ! 
// Java character properties, defined by methods in Character.java ! defClone("javaLowerCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLowerCase(ch);}}); ! defClone("javaUpperCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUpperCase(ch);}}); ! defClone("javaAlphabetic", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isAlphabetic(ch);}}); ! defClone("javaIdeographic", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isIdeographic(ch);}}); ! defClone("javaTitleCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isTitleCase(ch);}}); ! defClone("javaDigit", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isDigit(ch);}}); ! defClone("javaDefined", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isDefined(ch);}}); ! defClone("javaLetter", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLetter(ch);}}); ! defClone("javaLetterOrDigit", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLetterOrDigit(ch);}}); ! defClone("javaJavaIdentifierStart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isJavaIdentifierStart(ch);}}); ! defClone("javaJavaIdentifierPart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isJavaIdentifierPart(ch);}}); ! defClone("javaUnicodeIdentifierStart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUnicodeIdentifierStart(ch);}}); ! defClone("javaUnicodeIdentifierPart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUnicodeIdentifierPart(ch);}}); ! defClone("javaIdentifierIgnorable", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isIdentifierIgnorable(ch);}}); ! 
defClone("javaSpaceChar", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isSpaceChar(ch);}}); ! defClone("javaWhitespace", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isWhitespace(ch);}}); ! defClone("javaISOControl", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isISOControl(ch);}}); ! defClone("javaMirrored", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isMirrored(ch);}}); } } /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */ --- 5487,5646 ---- matcher.hitEnd = true; return false; } } ! @FunctionalInterface ! static interface CharPredicate { ! boolean is(int ch); ! ! default CharPredicate and(CharPredicate p) { ! return ch -> is(ch) && p.is(ch); ! } ! default CharPredicate union(CharPredicate p) { ! return ch -> is(ch) || p.is(ch); ! } ! default CharPredicate union(CharPredicate p1, ! CharPredicate p2 ) { ! return ch -> is(ch) || p1.is(ch) || p2.is(ch); ! } ! default CharPredicate negate() { ! return ch -> !is(ch); ! } ! } ! ! static interface BmpCharPredicate extends CharPredicate { ! ! default CharPredicate and(CharPredicate p) { ! if(p instanceof BmpCharPredicate) ! return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch)); ! return ch -> is(ch) && p.is(ch); ! } ! default CharPredicate union(CharPredicate p) { ! if (p instanceof BmpCharPredicate) ! return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch)); ! return ch -> is(ch) || p.is(ch); ! } ! static CharPredicate union(CharPredicate... predicates) { ! CharPredicate cp = ch -> { ! for (CharPredicate p : predicates) { ! if (!p.is(ch)) ! return false; ! } ! return true; ! }; ! for (CharPredicate p : predicates) { ! if (! (p instanceof BmpCharPredicate)) ! return cp; ! } ! return (BmpCharPredicate)cp; ! } ! } /** ! * matches a Perl vertical whitespace */ ! 
static BmpCharPredicate VertWS = cp -> ! (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029; ! /** ! * matches a Perl horizontal whitespace ! */ ! static BmpCharPredicate HorizWS = cp -> ! cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || ! cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || ! cp == 0x205f || cp == 0x3000; ! /** ! * for the Unicode category ALL and the dot metacharacter when ! * in dotall mode. ! */ ! static CharPredicate ALL = ch -> true; ! /** ! * for the dot metacharacter when dotall is not enabled. ! */ ! static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r' ! && (ch|1) != '\u2029' ! && ch != '\u0085'); ! /** ! * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. ! */ ! static CharPredicate UNIXDOT = ch -> ch != '\n'; ! ! /** ! * Indicate that matches a Supplementary Unicode character ! */ ! static CharPredicate SingleS(int c) { ! return ch -> ch == c; } ! /** ! * A bmp/optimized predicate of single ! */ ! static BmpCharPredicate Single(int c) { ! return ch -> ch == c; } ! /** ! * Case insensitive matches a given BMP character ! */ ! static BmpCharPredicate SingleI(int lower, int upper) { ! return ch -> ch == lower || ch == upper; } ! /** ! * Unicode case insensitive matches a given Unicode character ! */ ! static CharPredicate SingleU(int lower) { ! return ch -> lower == ch || ! lower == Character.toLowerCase(Character.toUpperCase(ch)); } ! private static boolean inRange(int lower, int ch, int upper) { ! return lower <= ch && ch <= upper; } ! /** ! * Charactrs within a explicit value range ! */ ! static CharPredicate Range(int lower, int upper) { ! if (upper < Character.MIN_HIGH_SURROGATE || ! lower > Character.MAX_HIGH_SURROGATE && ! upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) ! return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); ! return ch -> inRange(lower, ch, upper); ! } ! ! /** ! * Charactrs within a explicit value range in a case insensitive manner. ! 
*/ ! static CharPredicate CIRange(int lower, int upper) { ! return ch -> inRange(lower, ch, upper) || ! ASCII.isAscii(ch) && ! (inRange(lower, ASCII.toUpper(ch), upper) || ! inRange(lower, ASCII.toLower(ch), upper)); } + + static CharPredicate CIRangeU(int lower, int upper) { + return ch -> { + if (inRange(lower, ch, upper)) + return true; + int up = Character.toUpperCase(ch); + return inRange(lower, up, upper) || + inRange(lower, Character.toLowerCase(up), upper); + }; } /** + * This must be the very first initializer. + */ + static Node accept = new Node(); + + static Node lastAccept = new LastNode(); + + /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */