src/java.base/share/classes/java/util/regex/Pattern.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 24,38 **** --- 24,42 ---- */ package java.util.regex; import java.text.Normalizer; + import java.text.Normalizer.Form; import java.util.Locale; import java.util.Iterator; import java.util.Map; import java.util.ArrayList; import java.util.HashMap; + import java.util.LinkedHashSet; + import java.util.List; + import java.util.Set; import java.util.Arrays; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.function.Predicate;
*** 982,991 **** --- 986,1000 ---- * Temporary storage used by parsing pattern slice. */ transient int[] buffer; /** + * A temporary storage used for predicate for double return. + */ + transient CharPredicate predicate; + + /** * Map the "name" of the "named capturing group" to its group id * node. */ transient volatile Map<String, Integer> namedGroups;
*** 993,1002 **** --- 1002,1029 ---- * Temporary storage used while parsing group references. */ transient GroupHead[] groupNodes; /** + * Temporary storage used to store the top level closure nodes. + */ + transient List<Node> topClosureNodes; + + /** + * The number of top greedy closure nodes in this Pattern. Used by + * matchers to allocate storage needed for a IntHashSet to keep the + * beginning pos {@code i} of all failed match. + */ + transient int localTCNCount; + + /* + * Turn off the stop-exponential-backtracking optimization if there + * is a group ref in the pattern. + */ + transient boolean hasGroupRef; + + /** * Temporary null terminated code point array used by pattern compiling. */ private transient int[] temp; /**
*** 1024,1034 **** /** * If the Start node might possibly match supplementary characters. * It is set to true during compiling if * (1) There is supplementary char in pattern, or ! * (2) There is complement node of Category or Block */ private transient boolean hasSupplementary; /** * Compiles the given regular expression into a pattern. --- 1051,1061 ---- /** * If the Start node might possibly match supplementary characters. * It is set to true during compiling if * (1) There is supplementary char in pattern, or ! * (2) There is complement node of a "family" CharProperty */ private transient boolean hasSupplementary; /** * Compiles the given regular expression into a pattern.
*** 1336,1345 **** --- 1363,1373 ---- s.defaultReadObject(); // Initialize counts capturingGroupCount = 1; localCount = 0; + localTCNCount = 0; // if length > 0, the Pattern is lazily compiled if (pattern.length() == 0) { root = new Start(lastAccept); matchRoot = lastAccept;
*** 1366,1513 **** flags |= UNICODE_CASE; // Reset group index count capturingGroupCount = 1; localCount = 0; if (pattern.length() > 0) { compile(); } else { root = new Start(lastAccept); matchRoot = lastAccept; } } /** ! * The pattern is converted to normalized form ({@linkplain ! * java.text.Normalizer.Form.NFD NFD}, canonical decomposition) ! * and then a pure group is constructed to match canonical ! * equivalences of the characters. */ ! private void normalize() { ! int lastCodePoint = -1; ! ! // Convert pattern into normalized form ! normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); ! patternLength = normalizedPattern.length(); ! ! // Modify pattern to match canonical equivalences ! StringBuilder newPattern = new StringBuilder(patternLength); ! for(int i=0; i<patternLength; ) { ! int c = normalizedPattern.codePointAt(i); ! StringBuilder sequenceBuffer; ! if ((Character.getType(c) == Character.NON_SPACING_MARK) ! && (lastCodePoint != -1)) { ! sequenceBuffer = new StringBuilder(); ! sequenceBuffer.appendCodePoint(lastCodePoint); ! sequenceBuffer.appendCodePoint(c); ! while(Character.getType(c) == Character.NON_SPACING_MARK) { ! i += Character.charCount(c); ! if (i >= patternLength) ! break; ! c = normalizedPattern.codePointAt(i); ! sequenceBuffer.appendCodePoint(c); } ! String ea = produceEquivalentAlternation( ! sequenceBuffer.toString()); ! newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); ! newPattern.append("(?:").append(ea).append(")"); ! } else if (c == '[' && lastCodePoint != '\\') { ! i = normalizeCharClass(newPattern, i); ! } else { ! newPattern.appendCodePoint(c); } - lastCodePoint = c; - i += Character.charCount(c); } ! normalizedPattern = newPattern.toString(); } ! /** ! * Complete the character class being parsed and add a set ! * of alternations to it that will match the canonical equivalences ! * of the characters within the class. ! */ ! private int normalizeCharClass(StringBuilder newPattern, int i) { ! StringBuilder charClass = new StringBuilder(); ! StringBuilder eq = null; ! int lastCodePoint = -1; ! String result; ! ! i++; ! charClass.append("["); ! while(true) { ! int c = normalizedPattern.codePointAt(i); ! StringBuilder sequenceBuffer; ! ! if (c == ']' && lastCodePoint != '\\') { ! charClass.append((char)c); ! break; ! } else if (Character.getType(c) == Character.NON_SPACING_MARK) { ! sequenceBuffer = new StringBuilder(); ! sequenceBuffer.appendCodePoint(lastCodePoint); ! while(Character.getType(c) == Character.NON_SPACING_MARK) { ! sequenceBuffer.appendCodePoint(c); ! i += Character.charCount(c); ! if (i >= normalizedPattern.length()) break; ! c = normalizedPattern.codePointAt(i); } ! String ea = produceEquivalentAlternation( ! sequenceBuffer.toString()); ! ! charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); ! if (eq == null) ! eq = new StringBuilder(); ! eq.append('|'); ! eq.append(ea); ! } else { ! charClass.appendCodePoint(c); ! i++; } - if (i == normalizedPattern.length()) - throw error("Unclosed character class"); - lastCodePoint = c; } ! ! if (eq != null) { ! result = "(?:"+charClass.toString()+eq.toString()+")"; ! } else { ! result = charClass.toString(); } ! newPattern.append(result); ! return i; } /** * Given a specific sequence composed of a regular character and * combining marks that follow it, produce the alternation that will * match all canonical equivalences of that sequence. */ ! private String produceEquivalentAlternation(String source) { ! int len = countChars(source, 0, 1); ! if (source.length() == len) ! // source has one character. ! return source; ! ! String base = source.substring(0,len); ! String combiningMarks = source.substring(len); ! String[] perms = producePermutations(combiningMarks); - StringBuilder result = new StringBuilder(source); - // Add combined permutations ! for(int x=0; x<perms.length; x++) { String next = base + perms[x]; ! if (x>0) ! result.append("|"+next); next = composeOneStep(next); ! if (next != null) ! result.append("|"+produceEquivalentAlternation(next)); } - return result.toString(); } /** * Returns an array of strings that have all the possible * permutations of the characters in the input string. --- 1394,1549 ---- flags |= UNICODE_CASE; // Reset group index count capturingGroupCount = 1; localCount = 0; + localTCNCount = 0; if (pattern.length() > 0) { compile(); } else { root = new Start(lastAccept); matchRoot = lastAccept; } } /** ! * The pattern is converted to normalized form ({@link ! * java.text.Normalizer.Form.NFC NFC}, canonical decomposition, ! * followed by canonical composition for the character class ! * part, and {@link java.text.Normalizer.Form.NFD NFD}, ! * canonical decomposition) for the rest), and then a pure ! * group is constructed to match canonical equivalences of the ! * characters. */ ! private static String normalize(String pattern) { ! int plen = pattern.length(); ! StringBuilder pbuf = new StringBuilder(plen); ! char last = 0; ! int lastStart = 0; ! char cc = 0; ! for (int i = 0; i < plen;) { ! char c = pattern.charAt(i); ! if (cc == 0 && // top level ! c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { ! i += 2; last = 0; ! continue; } ! if (c == '[' && last != '\\') { ! if (cc == 0) { ! if (lastStart < i) ! normalizeSlice(pattern, lastStart, i, pbuf); ! lastStart = i; ! } ! cc++; ! } else if (c == ']' && last != '\\') { ! cc--; ! if (cc == 0) { ! normalizeClazz(pattern, lastStart, i + 1, pbuf); ! lastStart = i + 1; } } ! last = c; ! i++; ! } ! assert (cc == 0); ! if (lastStart < plen) ! normalizeSlice(pattern, lastStart, plen, pbuf); ! return pbuf.toString(); } ! private static void normalizeSlice(String src, int off, int limit, ! StringBuilder dst) ! { ! int len = src.length(); ! int off0 = off; ! while (off < limit && ASCII.isAscii(src.charAt(off))) { ! off++; ! } ! if (off == limit) { ! dst.append(src, off0, limit); ! return; ! } ! off--; ! if (off < off0) ! off = off0; ! else ! dst.append(src, off0, off); ! while (off < limit) { ! int ch0 = src.codePointAt(off); ! if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { ! dst.append((char)ch0); ! off++; ! continue; ! } ! int j = off + Character.charCount(ch0); ! int ch1; ! while (j < limit) { ! ch1 = src.codePointAt(j); ! if (Grapheme.isBoundary(ch0, ch1)) break; ! ch0 = ch1; ! j += Character.charCount(ch1); } ! String seq = src.substring(off, j); ! String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); ! off = j; ! if (nfd.length() > 1) { ! ch0 = nfd.codePointAt(0); ! ch1 = nfd.codePointAt(Character.charCount(ch0)); ! if (Character.getType(ch1) == Character.NON_SPACING_MARK) { ! Set<String> altns = new LinkedHashSet<>(); ! altns.add(seq); ! produceEquivalentAlternation(nfd, altns); ! dst.append("(?:"); ! altns.forEach( s -> dst.append(s + "|")); ! dst.delete(dst.length() - 1, dst.length()); ! dst.append(")"); ! continue; } } ! String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); ! if (!seq.equals(nfc) && !nfd.equals(nfc)) ! dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); ! else if (!seq.equals(nfd)) ! dst.append("(?:" + seq + "|" + nfd + ")"); ! else ! dst.append(seq); ! } } ! private static void normalizeClazz(String src, int off, int limit, ! StringBuilder dst) ! { ! dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); } /** * Given a specific sequence composed of a regular character and * combining marks that follow it, produce the alternation that will * match all canonical equivalences of that sequence. */ ! private static void produceEquivalentAlternation(String src, ! Set<String> dst) ! { ! int len = countChars(src, 0, 1); ! if (src.length() == len) { ! dst.add(src); // source has one character. ! return; ! } ! String base = src.substring(0,len); ! String combiningMarks = src.substring(len); String[] perms = producePermutations(combiningMarks); // Add combined permutations ! for(int x = 0; x < perms.length; x++) { String next = base + perms[x]; ! dst.add(next); next = composeOneStep(next); ! if (next != null) { ! produceEquivalentAlternation(next, dst); ! } } } /** * Returns an array of strings that have all the possible * permutations of the characters in the input string.
*** 1515,1525 **** * of a set of combining marks. Note that some of the permutations * are invalid because of combining class collisions, and these * possibilities must be removed because they are not canonically * equivalent. */ ! private String[] producePermutations(String input) { if (input.length() == countChars(input, 0, 1)) return new String[] {input}; if (input.length() == countChars(input, 0, 2)) { int c0 = Character.codePointAt(input, 0); --- 1551,1561 ---- * of a set of combining marks. Note that some of the permutations * are invalid because of combining class collisions, and these * possibilities must be removed because they are not canonically * equivalent. */ ! private static String[] producePermutations(String input) { if (input.length() == countChars(input, 0, 1)) return new String[] {input}; if (input.length() == countChars(input, 0, 2)) { int c0 = Character.codePointAt(input, 0);
*** 1573,1598 **** String[] result = new String[index]; System.arraycopy(temp, 0, result, 0, index); return result; } ! private int getClass(int c) { return sun.text.Normalizer.getCombiningClass(c); } /** * Attempts to compose input by combining the first character * with the first combining mark following it. Returns a String * that is the composition of the leading character with its first * combining mark followed by the remaining combining marks. Returns * null if the first two characters cannot be further composed. */ ! private String composeOneStep(String input) { int len = countChars(input, 0, 2); String firstTwoCharacters = input.substring(0, len); String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); - if (result.equals(firstTwoCharacters)) return null; else { String remainder = input.substring(len); return result + remainder; --- 1609,1633 ---- String[] result = new String[index]; System.arraycopy(temp, 0, result, 0, index); return result; } ! private static int getClass(int c) { return sun.text.Normalizer.getCombiningClass(c); } /** * Attempts to compose input by combining the first character * with the first combining mark following it. Returns a String * that is the composition of the leading character with its first * combining mark followed by the remaining combining marks. Returns * null if the first two characters cannot be further composed. */ ! private static String composeOneStep(String input) { int len = countChars(input, 0, 2); String firstTwoCharacters = input.substring(0, len); String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); if (result.equals(firstTwoCharacters)) return null; else { String remainder = input.substring(len); return result + remainder;
*** 1675,1685 **** * of the expression which will create the object tree. */ private void compile() { // Handle canonical equivalences if (has(CANON_EQ) && !has(LITERAL)) { ! normalize(); } else { normalizedPattern = pattern; } patternLength = normalizedPattern.length(); --- 1710,1720 ---- * of the expression which will create the object tree. */ private void compile() { // Handle canonical equivalences if (has(CANON_EQ) && !has(LITERAL)) { ! normalizedPattern = normalize(pattern); } else { normalizedPattern = pattern; } patternLength = normalizedPattern.length();
*** 1705,1714 **** --- 1740,1750 ---- // Allocate all temporary objects here. buffer = new int[32]; groupNodes = new GroupHead[10]; namedGroups = null; + topClosureNodes = new ArrayList<>(10); if (has(LITERAL)) { // Literal pattern handling matchRoot = newSlice(temp, patternLength, hasSupplementary); matchRoot.next = lastAccept;
*** 1735,1750 **** --- 1771,1800 ---- root = matchRoot; } else { root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); } + // Optimize the greedy Loop to prevent exponential backtracking, IF there + // is no group ref in this pattern. With a non-negative localTCNCount value, + // the greedy type Loop, Curly will skip the backtracking for any starting + // position "i" that failed in the past. + if (!hasGroupRef) { + for (Node node : topClosureNodes) { + if (node instanceof Loop) { + // non-deterministic-greedy-group + ((Loop)node).posIndex = localTCNCount++; + } + } + } + // Release temporary storage temp = null; buffer = null; groupNodes = null; patternLength = 0; compiled = true; + topClosureNodes = null; } Map<String, Integer> namedGroups() { Map<String, Integer> groups = namedGroups; if (groups == null) {
*** 1752,1799 **** } return groups; } /** - * Used to print out a subtree of the Pattern to help with debugging. - */ - private static void printObjectTree(Node node) { - while(node != null) { - if (node instanceof Prolog) { - System.out.println(node); - printObjectTree(((Prolog)node).loop); - System.out.println("**** end contents prolog loop"); - } else if (node instanceof Loop) { - System.out.println(node); - printObjectTree(((Loop)node).body); - System.out.println("**** end contents Loop body"); - } else if (node instanceof Curly) { - System.out.println(node); - printObjectTree(((Curly)node).atom); - System.out.println("**** end contents Curly body"); - } else if (node instanceof GroupCurly) { - System.out.println(node); - printObjectTree(((GroupCurly)node).atom); - System.out.println("**** end contents GroupCurly body"); - } else if (node instanceof GroupTail) { - System.out.println(node); - System.out.println("Tail next is "+node.next); - return; - } else { - System.out.println(node); - } - node = node.next; - if (node != null) - System.out.println("->next:"); - if (node == Pattern.accept) { - System.out.println("Accept Node"); - node = null; - } - } - } - - /** * Used to accumulate information about a subtree of the object graph * so that optimizations can be applied to the subtree. */ static final class TreeInfo { int minLength; --- 1802,1811 ----
*** 2081,2091 **** tail.next = node; // Double return: Tail was returned in root tail = root; continue; case '[': ! node = clazz(true); break; case '\\': ch = nextEscaped(); if (ch == 'p' || ch == 'P') { boolean oneLetter = true; --- 2093,2106 ---- tail.next = node; // Double return: Tail was returned in root tail = root; continue; case '[': ! if (has(CANON_EQ) && !has(LITERAL)) ! node = new NFCCharProperty(clazz(true)); ! else ! node = newCharProperty(clazz(true)); break; case '\\': ch = nextEscaped(); if (ch == 'p' || ch == 'P') { boolean oneLetter = true;
*** 2094,2104 **** if (ch != '{') { unread(); } else { oneLetter = false; } ! node = family(oneLetter, comp); } else { unread(); node = atom(); } break; --- 2109,2123 ---- if (ch != '{') { unread(); } else { oneLetter = false; } ! // node = newCharProperty(family(oneLetter, comp)); ! if (has(CANON_EQ) && !has(LITERAL)) ! node = new NFCCharProperty(family(oneLetter, comp)); ! else ! node = newCharProperty(family(oneLetter, comp)); } else { unread(); node = atom(); } break;
*** 2121,2136 **** node = new Dollar(has(MULTILINE)); break; case '.': next(); if (has(DOTALL)) { ! node = new All(); } else { ! if (has(UNIX_LINES)) ! node = new UnixDot(); ! else { ! node = new Dot(); } } break; case '|': case ')': --- 2140,2155 ---- node = new Dollar(has(MULTILINE)); break; case '.': next(); if (has(DOTALL)) { ! node = new CharProperty(ALL); } else { ! if (has(UNIX_LINES)) { ! node = new CharProperty(UNIXDOT); ! } else { ! node = new CharProperty(DOT); } } break; case '|': case ')':
*** 2153,2163 **** node = atom(); break; } node = closure(node); ! if (head == null) { head = tail = node; } else { tail.next = node; tail = node; --- 2172,2187 ---- node = atom(); break; } node = closure(node); ! /* save the top dot-greedy nodes (.*, .+) as well ! if (node instanceof GreedyCharProperty && ! ((GreedyCharProperty)node).cp instanceof Dot) { ! topClosureNodes.add(node); ! } ! */ if (head == null) { head = tail = node; } else { tail.next = node; tail = node;
*** 2211,2221 **** ch = next(); // Consume { if present if (ch != '{') unread(); else oneLetter = false; ! return family(oneLetter, comp); } } unread(); prev = cursor; ch = escape(false, first == 0, false); --- 2235,2248 ---- ch = next(); // Consume { if present if (ch != '{') unread(); else oneLetter = false; ! if (has(CANON_EQ) && !has(LITERAL)) ! return new NFCCharProperty(family(oneLetter, comp)); ! else ! return newCharProperty(family(oneLetter, comp)); } } unread(); prev = cursor; ch = escape(false, first == 0, false);
*** 2249,2259 **** continue; } break; } if (first == 1) { ! return newSingle(buffer[0]); } else { return newSlice(buffer, first, hasSupplementary); } } --- 2276,2286 ---- continue; } break; } if (first == 1) { ! return newCharProperty(single(buffer[0])); } else { return newSlice(buffer, first, hasSupplementary); } }
*** 2300,2309 **** --- 2327,2337 ---- default: done = true; break; } } + hasGroupRef = true; if (has(CASE_INSENSITIVE)) return new CIBackRef(refNum, has(UNICODE_CASE)); else return new BackRef(refNum); }
*** 2344,2366 **** if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT).complement() ! : new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': break; case 'G': if (inclass) break; if (create) root = new LastMatch(); return -1; case 'H': ! if (create) root = new HorizWS().complement(); return -1; case 'I': case 'J': case 'K': case 'L': --- 2372,2402 ---- if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'E': case 'F': break; case 'G': if (inclass) break; if (create) root = new LastMatch(); return -1; case 'H': ! if (create) { ! predicate = HorizWS.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'I': case 'J': case 'K': case 'L':
*** 2375,2398 **** case 'R': if (inclass) break; if (create) root = new LineEnding(); return -1; case 'S': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE).complement() ! : new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': break; case 'V': ! if (create) root = new VertWS().complement(); return -1; case 'W': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD).complement() ! : new Ctype(ASCII.WORD).complement(); return -1; case 'X': if (inclass) break; if (create) { root = new XGrapheme(); --- 2411,2446 ---- case 'R': if (inclass) break; if (create) root = new LineEnding(); return -1; case 'S': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'T': case 'U': break; case 'V': ! if (create) { ! predicate = VertWS.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'W': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WORD : CharPredicates.ASCII_WORD; ! predicate = predicate.negate(); ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'X': if (inclass) break; if (create) { root = new XGrapheme();
*** 2428,2449 **** } return -1; case 'c': return c(); case 'd': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT) ! : new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; case 'f': return '\f'; case 'g': break; case 'h': ! if (create) root = new HorizWS(); return -1; case 'i': case 'j': break; case 'k': --- 2476,2504 ---- } return -1; case 'c': return c(); case 'd': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'e': return '\033'; case 'f': return '\f'; case 'g': break; case 'h': ! if (create) { ! predicate = HorizWS; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'i': case 'j': break; case 'k':
*** 2453,2462 **** --- 2508,2518 ---- throw error("\\k is not followed by '<' for named capturing group"); String name = groupname(read()); if (!namedGroups().containsKey(name)) throw error("(named capturing group <"+ name+"> does not exit"); if (create) { + hasGroupRef = true; if (has(CASE_INSENSITIVE)) root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); else root = new BackRef(namedGroups().get(name)); }
*** 2471,2483 **** case 'q': break; case 'r': return '\r'; case 's': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE) ! : new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; case 'u': return u(); --- 2527,2542 ---- case 'q': break; case 'r': return '\r'; case 's': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 't': return '\t'; case 'u': return u();
*** 2490,2505 **** // the start or end value, such as [\v-...] or [...-\v], in // which a single definite value (0x0B) is expected. For // compatibility concern '\013'/0x0B is returned if isrange. if (isrange) return '\013'; ! if (create) root = new VertWS(); return -1; case 'w': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD) ! : new Ctype(ASCII.WORD); return -1; case 'x': return x(); case 'y': break; --- 2549,2571 ---- // the start or end value, such as [\v-...] or [...-\v], in // which a single definite value (0x0B) is expected. For // compatibility concern '\013'/0x0B is returned if isrange. if (isrange) return '\013'; ! if (create) { ! predicate = VertWS; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'w': ! if (create) { ! predicate = has(UNICODE_CHARACTER_CLASS) ? ! CharPredicates.WORD : CharPredicates.ASCII_WORD; ! if (!inclass) ! root = newCharProperty(predicate); ! } return -1; case 'x': return x(); case 'y': break;
*** 2518,2629 **** * * Consumes a ] on the way out if consume is true. Usually consume * is true except for the case of [abc&&def] where def is a separate * right hand node with "understood" brackets. */ ! private CharProperty clazz(boolean consume) { ! CharProperty prev = null; ! CharProperty node = null; BitClass bits = new BitClass(); ! boolean include = true; ! boolean firstInClass = true; int ch = next(); ! for (;;) { ! switch (ch) { ! case '^': // Negates if first char in a class, otherwise literal ! if (firstInClass) { ! if (temp[cursor-1] != '[') ! break; ch = next(); ! include = !include; ! continue; ! } else { ! // ^ not first in class, treat as literal ! break; } case '[': ! firstInClass = false; ! node = clazz(true); if (prev == null) ! prev = node; else ! prev = union(prev, node); ch = peek(); continue; case '&': - firstInClass = false; ch = next(); if (ch == '&') { ch = next(); ! CharProperty rightNode = null; while (ch != ']' && ch != '&') { if (ch == '[') { ! if (rightNode == null) ! rightNode = clazz(true); else ! rightNode = union(rightNode, clazz(true)); } else { // abc&&def unread(); ! rightNode = clazz(false); } ch = peek(); } ! if (rightNode != null) ! node = rightNode; if (prev == null) { ! if (rightNode == null) throw error("Bad class syntax"); else ! prev = rightNode; } else { ! prev = intersection(prev, node); } } else { // treat as a literal & unread(); break; } continue; case 0: - firstInClass = false; if (cursor >= patternLength) throw error("Unclosed character class"); break; case ']': ! firstInClass = false; ! if (prev != null) { if (consume) next(); return prev; } break; default: - firstInClass = false; break; } ! node = range(bits); ! if (include) { ! if (prev == null) { ! prev = node; ! } else { ! if (prev != node) ! prev = union(prev, node); ! } ! } else { ! if (prev == null) { ! prev = node.complement(); } else { ! if (prev != node) ! prev = setDifference(prev, node); ! } } ch = peek(); } } ! private CharProperty bitsOrSingle(BitClass bits, int ch) { /* Bits can only handle codepoints in [u+0000-u+00ff] range. Use "single" node instead of bits when dealing with unicode case folding for codepoints listed below. (1)Uppercase out of range: u+00ff, u+00b5 toUpperCase(u+00ff) -> u+0178 --- 2584,2694 ---- * * Consumes a ] on the way out if consume is true. Usually consume * is true except for the case of [abc&&def] where def is a separate * right hand node with "understood" brackets. */ ! private CharPredicate clazz(boolean consume) { ! CharPredicate prev = null; ! CharPredicate curr = null; BitClass bits = new BitClass(); ! BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch]; ! ! boolean isNeg = false; ! boolean hasBits = false; int ch = next(); ! // Negates if first char in a class, otherwise literal ! if (ch == '^' && temp[cursor-1] == '[') { ch = next(); ! isNeg = true; } + for (;;) { + switch (ch) { case '[': ! curr = clazz(true); if (prev == null) ! prev = curr; else ! prev = prev.union(curr); ch = peek(); continue; case '&': ch = next(); if (ch == '&') { ch = next(); ! CharPredicate right = null; while (ch != ']' && ch != '&') { if (ch == '[') { ! if (right == null) ! right = clazz(true); else ! right = right.union(clazz(true)); } else { // abc&&def unread(); ! right = clazz(false); } ch = peek(); } ! if (hasBits) { ! // bits used, union has high precedence if (prev == null) { ! prev = curr = bitsP; ! } else { ! prev = prev.union(bitsP); ! } ! hasBits = false; ! } ! if (right != null) ! curr = right; ! if (prev == null) { ! if (right == null) throw error("Bad class syntax"); else ! prev = right; } else { ! prev = prev.and(curr); } } else { // treat as a literal & unread(); break; } continue; case 0: if (cursor >= patternLength) throw error("Unclosed character class"); break; case ']': ! if (prev != null || hasBits) { if (consume) next(); + if (prev == null) + prev = bitsP; + else if (hasBits) + prev = prev.union(bitsP); + if (isNeg) + return prev.negate(); return prev; } break; default: break; } ! curr = range(bits); ! if (curr == null) { // the bits used ! hasBits = true; } else { ! if (prev == null) ! prev = curr; ! else if (prev != curr) ! prev = prev.union(curr); } ch = peek(); } } ! private CharPredicate bitsOrSingle(BitClass bits, int ch) { /* Bits can only handle codepoints in [u+0000-u+00ff] range. Use "single" node instead of bits when dealing with unicode case folding for codepoints listed below. (1)Uppercase out of range: u+00ff, u+00b5 toUpperCase(u+00ff) -> u+0178
*** 2644,2663 **** !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && (ch == 0xff || ch == 0xb5 || ch == 0x49 || ch == 0x69 || //I and i ch == 0x53 || ch == 0x73 || //S and s ch == 0x4b || ch == 0x6b || //K and k ! ch == 0xc5 || ch == 0xe5))) //A+ring ! return bits.add(ch, flags()); ! return newSingle(ch); } /** * Parse a single character or a character range in a character class * and return its representative node. */ ! private CharProperty range(BitClass bits) { int ch = peek(); if (ch == '\\') { ch = nextEscaped(); if (ch == 'p' || ch == 'P') { // A property boolean comp = (ch == 'P'); --- 2709,2755 ---- !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && (ch == 0xff || ch == 0xb5 || ch == 0x49 || ch == 0x69 || //I and i ch == 0x53 || ch == 0x73 || //S and s ch == 0x4b || ch == 0x6b || //K and k ! ch == 0xc5 || ch == 0xe5))) { //A+ring ! bits.add(ch, flags()); ! return null; ! } ! return single(ch); ! } ! ! /** ! * Returns a suitably optimized, single character predicate ! */ ! private CharPredicate single(final int ch) { ! if (has(CASE_INSENSITIVE)) { ! int lower, upper; ! if (has(UNICODE_CASE)) { ! upper = Character.toUpperCase(ch); ! lower = Character.toLowerCase(upper); ! // Unicode case insensitive matches ! if (upper != lower) ! return SingleU(lower); ! } else if (ASCII.isAscii(ch)) { ! lower = ASCII.toLower(ch); ! upper = ASCII.toUpper(ch); ! // Case insensitive matches a given BMP character ! if (lower != upper) ! return SingleI(lower, upper); ! } ! } ! if (isSupplementary(ch)) ! return SingleS(ch); ! return Single(ch); // Match a given BMP character } /** * Parse a single character or a character range in a character class * and return its representative node. */ ! private CharPredicate range(BitClass bits) { int ch = peek(); if (ch == '\\') { ch = nextEscaped(); if (ch == 'p' || ch == 'P') { // A property boolean comp = (ch == 'P');
*** 2672,2682 **** } else { // ordinary escape boolean isrange = temp[cursor+1] == '-'; unread(); ch = escape(true, true, isrange); if (ch == -1) ! return (CharProperty) root; } } else { next(); } if (ch >= 0) { --- 2764,2774 ---- } else { // ordinary escape boolean isrange = temp[cursor+1] == '-'; unread(); ch = escape(true, true, isrange); if (ch == -1) ! return predicate; } } else { next(); } if (ch >= 0) {
*** 2694,2723 **** next(); } if (m < ch) { throw error("Illegal character range"); } ! if (has(CASE_INSENSITIVE)) ! return caseInsensitiveRangeFor(ch, m); ! else ! return rangeFor(ch, m); } } return bitsOrSingle(bits, ch); } throw error("Unexpected character '"+((char)ch)+"'"); } /** * Parses a Unicode character family and returns its representative node. */ ! private CharProperty family(boolean singleLetter, ! boolean maybeComplement) ! { next(); String name; ! CharProperty node = null; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c); --- 2786,2816 ---- next(); } if (m < ch) { throw error("Illegal character range"); } ! if (has(CASE_INSENSITIVE)) { ! if (has(UNICODE_CASE)) ! return CIRangeU(ch, m); ! return CIRange(ch, m); ! } else { ! return Range(ch, m); ! } } } return bitsOrSingle(bits, ch); } throw error("Unexpected character '"+((char)ch)+"'"); } /** * Parses a Unicode character family and returns its representative node. */ ! private CharPredicate family(boolean singleLetter, boolean isComplement) { next(); String name; ! CharPredicate p = null; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c);
*** 2745,2836 **** String value = name.substring(i + 1); name = name.substring(0, i).toLowerCase(Locale.ENGLISH); switch (name) { case "sc": case "script": ! node = unicodeScriptPropertyFor(value); break; case "blk": case "block": ! node = unicodeBlockPropertyFor(value); break; case "gc": case "general_category": ! node = charPropertyNodeFor(value); break; default: throw error("Unknown Unicode property {name=<" + name + ">, " + "value=<" + value + ">}"); ! } } else { if (name.startsWith("In")) { ! // \p{inBlockName} ! node = unicodeBlockPropertyFor(name.substring(2)); } else if (name.startsWith("Is")) { ! // \p{isGeneralCategory} and \p{isScriptName} name = name.substring(2); ! UnicodeProp uprop = UnicodeProp.forName(name); ! if (uprop != null) ! node = new Utype(uprop); ! if (node == null) ! node = CharPropertyNames.charPropertyFor(name); ! if (node == null) ! node = unicodeScriptPropertyFor(name); } else { if (has(UNICODE_CHARACTER_CLASS)) { ! UnicodeProp uprop = UnicodeProp.forPOSIXName(name); ! if (uprop != null) ! node = new Utype(uprop); } ! if (node == null) ! node = charPropertyNodeFor(name); } } ! if (maybeComplement) { ! if (node instanceof Category || node instanceof Block) hasSupplementary = true; ! node = node.complement(); ! } ! return node; ! } ! ! ! /** ! * Returns a CharProperty matching all characters belong to ! * a UnicodeScript. ! */ ! private CharProperty unicodeScriptPropertyFor(String name) { ! final Character.UnicodeScript script; ! try { ! script = Character.UnicodeScript.forName(name); ! } catch (IllegalArgumentException iae) { ! throw error("Unknown character script name {" + name + "}"); ! } ! return new Script(script); ! } ! ! /** ! * Returns a CharProperty matching all characters in a UnicodeBlock. ! */ ! private CharProperty unicodeBlockPropertyFor(String name) { ! final Character.UnicodeBlock block; ! try { ! block = Character.UnicodeBlock.forName(name); ! } catch (IllegalArgumentException iae) { ! throw error("Unknown character block name {" + name + "}"); } ! return new Block(block); } ! /** ! * Returns a CharProperty matching all characters in a named property. ! */ ! private CharProperty charPropertyNodeFor(String name) { ! CharProperty p = CharPropertyNames.charPropertyFor(name); if (p == null) ! throw error("Unknown character property name {" + name + "}"); ! return p; } /** * Parses and returns the name of a "named capturing group", the trailing * ">" is consumed after parsing. --- 2838,2903 ---- String value = name.substring(i + 1); name = name.substring(0, i).toLowerCase(Locale.ENGLISH); switch (name) { case "sc": case "script": ! p = CharPredicates.forUnicodeScript(value); break; case "blk": case "block": ! p = CharPredicates.forUnicodeBlock(value); break; case "gc": case "general_category": ! p = CharPredicates.forProperty(value); break; default: + break; + } + if (p == null) throw error("Unknown Unicode property {name=<" + name + ">, " + "value=<" + value + ">}"); ! } else { if (name.startsWith("In")) { ! // \p{InBlockName} ! p = CharPredicates.forUnicodeBlock(name.substring(2)); } else if (name.startsWith("Is")) { ! // \p{IsGeneralCategory} and \p{IsScriptName} name = name.substring(2); ! p = CharPredicates.forUnicodeProperty(name); ! if (p == null) ! p = CharPredicates.forProperty(name); ! if (p == null) ! p = CharPredicates.forUnicodeScript(name); } else { if (has(UNICODE_CHARACTER_CLASS)) { ! p = CharPredicates.forPOSIXName(name); } ! if (p == null) ! p = CharPredicates.forProperty(name); } + if (p == null) + throw error("Unknown character property name {In/Is" + name + "}"); } ! if (isComplement) { ! // it might be too expensive to detect if a complement of ! // CharProperty can match "certain" supplementary. So just ! // go with StartS. hasSupplementary = true; ! p = p.negate(); } ! return p; } ! private CharProperty newCharProperty(CharPredicate p) { if (p == null) ! return null; ! if (p instanceof BmpCharPredicate) ! return new BmpCharProperty((BmpCharPredicate)p); ! else ! return new CharProperty(p); } /** * Parses and returns the name of a "named capturing group", the trailing * ">" is consumed after parsing.
*** 2857,2866 **** --- 2924,2934 ---- private Node group0() { boolean capturingGroup = false; Node head = null; Node tail = null; int save = flags; + int saveTCNCount = topClosureNodes.size(); root = null; int ch = next(); if (ch == '?') { ch = skip(); switch (ch) {
*** 2882,2892 **** break; case '>': // (?>xxx) independent group head = createGroup(true); tail = root; head.next = expr(tail); ! head = tail = new Ques(head, INDEPENDENT); break; case '<': // (?<xxx) look behind ch = read(); if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { // named captured group --- 2950,2960 ---- break; case '>': // (?>xxx) independent group head = createGroup(true); tail = root; head.next = expr(tail); ! head = tail = new Ques(head, Qtype.INDEPENDENT); break; case '<': // (?<xxx) look behind ch = read(); if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { // named captured group
*** 2926,2935 **** --- 2994,3006 ---- new NotBehind(head, info.maxLength, info.minLength)); } else { throw error("Unknown look-behind group"); } + // clear all top-closure-nodes inside lookbehind + if (saveTCNCount < topClosureNodes.size()) + topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); break; case '$': case '@': throw error("Unknown group type"); default: // (?xxx:) inlined match flags
*** 2966,2993 **** if (head == tail) { // Zero length assertion root = node; return node; // Dual return } if (node instanceof Ques) { Ques ques = (Ques) node; ! if (ques.type == POSSESSIVE) { root = node; return node; } tail.next = new BranchConn(); tail = tail.next; ! if (ques.type == GREEDY) { head = new Branch(head, null, tail); } else { // Reluctant quantifier head = new Branch(null, head, tail); } root = tail; return head; } else if (node instanceof Curly) { Curly curly = (Curly) node; ! if (curly.type == POSSESSIVE) { root = node; return node; } // Discover if the group is deterministic TreeInfo info = new TreeInfo(); --- 3037,3069 ---- if (head == tail) { // Zero length assertion root = node; return node; // Dual return } + // have group closure, clear all inner closure nodes from the + // top list (no backtracking stopper optimization for inner + if (saveTCNCount < topClosureNodes.size()) + topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); + if (node instanceof Ques) { Ques ques = (Ques) node; ! if (ques.type == Qtype.POSSESSIVE) { root = node; return node; } tail.next = new BranchConn(); tail = tail.next; ! if (ques.type == Qtype.GREEDY) { head = new Branch(head, null, tail); } else { // Reluctant quantifier head = new Branch(null, head, tail); } root = tail; return head; } else if (node instanceof Curly) { Curly curly = (Curly) node; ! if (curly.type == Qtype.POSSESSIVE) { root = node; return node; } // Discover if the group is deterministic TreeInfo info = new TreeInfo();
*** 3000,3013 **** capturingGroup); return head; } else { // Non-deterministic int temp = ((GroupHead) head).localIndex; Loop loop; ! if (curly.type == GREEDY) loop = new Loop(this.localCount, temp); ! else // Reluctant Curly loop = new LazyLoop(this.localCount, temp); Prolog prolog = new Prolog(loop); this.localCount += 1; loop.cmin = curly.cmin; loop.cmax = curly.cmax; loop.body = head; --- 3076,3093 ---- capturingGroup); return head; } else { // Non-deterministic int temp = ((GroupHead) head).localIndex; Loop loop; ! if (curly.type == Qtype.GREEDY) { loop = new Loop(this.localCount, temp); ! // add the max_reps greedy to the top-closure-node list ! if (curly.cmax == MAX_REPS) ! topClosureNodes.add(loop); ! } else { // Reluctant Curly loop = new LazyLoop(this.localCount, temp); + } Prolog prolog = new Prolog(loop); this.localCount += 1; loop.cmin = curly.cmin; loop.cmax = curly.cmax; loop.body = head;
*** 3029,3038 **** --- 3109,3122 ---- int groupIndex = 0; if (!anonymous) groupIndex = capturingGroupCount++; GroupHead head = new GroupHead(localIndex); root = new GroupTail(localIndex, groupIndex); + + // for debug/print only, head.match does NOT need the "tail" info + head.tail = (GroupTail)root; + if (!anonymous && groupIndex < 10) groupNodes[groupIndex] = head; return head; }
*** 3117,3133 **** } } static final int MAX_REPS = 0x7FFFFFFF; ! static final int GREEDY = 0; ! ! static final int LAZY = 1; ! ! static final int POSSESSIVE = 2; ! static final int INDEPENDENT = 3; /** * Processes repetition. If the next character peeked is a quantifier * then new nodes must be appended to handle the repetition. * Prev could be a single or a group, so it could be a chain of nodes. --- 3201,3230 ---- } } static final int MAX_REPS = 0x7FFFFFFF; ! static enum Qtype { ! GREEDY, LAZY, POSSESSIVE, INDEPENDENT ! } ! private Node curly(Node prev, int cmin) { ! int ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE); ! } ! if (prev instanceof BmpCharProperty) { ! return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); ! } else if (prev instanceof CharProperty) { ! return new CharPropertyGreedy((CharProperty)prev, cmin); ! } ! return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY); ! } /** * Processes repetition. If the next character peeked is a quantifier * then new nodes must be appended to handle the repetition. * Prev could be a single or a group, so it could be a chain of nodes.
*** 3138,3173 **** switch (ch) { case '?': ch = next(); if (ch == '?') { next(); ! return new Ques(prev, LAZY); } else if (ch == '+') { next(); ! return new Ques(prev, POSSESSIVE); } ! return new Ques(prev, GREEDY); case '*': ! ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, 0, MAX_REPS, LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, 0, MAX_REPS, POSSESSIVE); ! } ! return new Curly(prev, 0, MAX_REPS, GREEDY); case '+': ! ch = next(); ! if (ch == '?') { ! next(); ! return new Curly(prev, 1, MAX_REPS, LAZY); ! } else if (ch == '+') { ! next(); ! return new Curly(prev, 1, MAX_REPS, POSSESSIVE); ! } ! return new Curly(prev, 1, MAX_REPS, GREEDY); case '{': ch = temp[cursor+1]; if (ASCII.isDigit(ch)) { skip(); int cmin = 0; --- 3235,3254 ---- switch (ch) { case '?': ch = next(); if (ch == '?') { next(); ! return new Ques(prev, Qtype.LAZY); } else if (ch == '+') { next(); ! return new Ques(prev, Qtype.POSSESSIVE); } ! return new Ques(prev, Qtype.GREEDY); case '*': ! return curly(prev, 0); case '+': ! return curly(prev, 1); case '{': ch = temp[cursor+1]; if (ASCII.isDigit(ch)) { skip(); int cmin = 0;
*** 3192,3207 **** throw error("Illegal repetition range"); Curly curly; ch = peek(); if (ch == '?') { next(); ! curly = new Curly(prev, cmin, cmax, LAZY); } else if (ch == '+') { next(); ! curly = new Curly(prev, cmin, cmax, POSSESSIVE); } else { ! curly = new Curly(prev, cmin, cmax, GREEDY); } return curly; } else { throw error("Illegal repetition"); } --- 3273,3288 ---- throw error("Illegal repetition range"); Curly curly; ch = peek(); if (ch == '?') { next(); ! curly = new Curly(prev, cmin, cmax, Qtype.LAZY); } else if (ch == '+') { next(); ! curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE); } else { ! curly = new Curly(prev, cmin, cmax, Qtype.GREEDY); } return curly; } else { throw error("Illegal repetition"); }
*** 3374,3387 **** /** * Creates a bit vector for matching Latin-1 values. A normal BitClass * never matches values above Latin-1, and a complemented BitClass always * matches values above Latin-1. */ ! private static final class BitClass extends BmpCharProperty { final boolean[] bits; ! BitClass() { bits = new boolean[256]; } ! private BitClass(boolean[] bits) { this.bits = bits; } BitClass add(int c, int flags) { assert c >= 0 && c <= 255; if ((flags & CASE_INSENSITIVE) != 0) { if (ASCII.isAscii(c)) { bits[ASCII.toUpper(c)] = true; --- 3455,3473 ---- /** * Creates a bit vector for matching Latin-1 values. A normal BitClass * never matches values above Latin-1, and a complemented BitClass always * matches values above Latin-1. */ ! static final class BitClass extends BmpCharProperty { final boolean[] bits; ! BitClass() { ! this(new boolean[256]); ! } ! private BitClass(boolean[] bits) { ! super( ch -> ch < 256 && bits[ch]); ! this.bits = bits; ! } BitClass add(int c, int flags) { assert c >= 0 && c <= 255; if ((flags & CASE_INSENSITIVE) != 0) { if (ASCII.isAscii(c)) { bits[ASCII.toUpper(c)] = true;
*** 3392,3427 **** } } bits[c] = true; return this; } - boolean isSatisfiedBy(int ch) { - return ch < 256 && bits[ch]; - } - } - - /** - * Returns a suitably optimized, single character matcher. - */ - private CharProperty newSingle(final int ch) { - if (has(CASE_INSENSITIVE)) { - int lower, upper; - if (has(UNICODE_CASE)) { - upper = Character.toUpperCase(ch); - lower = Character.toLowerCase(upper); - if (upper != lower) - return new SingleU(lower); - } else if (ASCII.isAscii(ch)) { - lower = ASCII.toLower(ch); - upper = ASCII.toUpper(ch); - if (lower != upper) - return new SingleI(lower, upper); - } - } - if (isSupplementary(ch)) - return new SingleS(ch); // Match a given Unicode character - return new Single(ch); // Match a given BMP character } /** * Utility method for creating a string slice matcher. */ --- 3478,3487 ----
*** 3825,3846 **** /** * Abstract node class to match one character satisfying some * boolean property. */ ! private abstract static class CharProperty extends Node { ! abstract boolean isSatisfiedBy(int ch); ! CharProperty complement() { ! return new CharProperty() { ! boolean isSatisfiedBy(int ch) { ! return ! CharProperty.this.isSatisfiedBy(ch);}}; } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { int ch = Character.codePointAt(seq, i); ! return isSatisfiedBy(ch) ! && next.match(matcher, i+Character.charCount(ch), seq); } else { matcher.hitEnd = true; return false; } } --- 3885,3905 ---- /** * Abstract node class to match one character satisfying some * boolean property. */ ! static class CharProperty extends Node { ! CharPredicate predicate; ! ! CharProperty (CharPredicate predicate) { ! this.predicate = predicate; } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { int ch = Character.codePointAt(seq, i); ! return predicate.is(ch) && ! next.match(matcher, i + Character.charCount(ch), seq); } else { matcher.hitEnd = true; return false; } }
*** 3853,4003 **** /** * Optimized version of CharProperty that works only for * properties never satisfied by Supplementary characters. */ ! private abstract static class BmpCharProperty extends CharProperty { boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { ! return isSatisfiedBy(seq.charAt(i)) ! && next.match(matcher, i+1, seq); } else { matcher.hitEnd = true; return false; } } } ! /** ! * Node class that matches a Supplementary Unicode character ! */ ! static final class SingleS extends CharProperty { ! final int c; ! SingleS(int c) { this.c = c; } ! boolean isSatisfiedBy(int ch) { ! return ch == c; ! } ! } ! ! /** ! * Optimization -- matches a given BMP character ! */ ! static final class Single extends BmpCharProperty { ! final int c; ! Single(int c) { this.c = c; } ! boolean isSatisfiedBy(int ch) { ! return ch == c; ! } ! } ! ! /** ! * Case insensitive matches a given BMP character ! */ ! static final class SingleI extends BmpCharProperty { ! final int lower; ! final int upper; ! SingleI(int lower, int upper) { ! this.lower = lower; ! this.upper = upper; ! } ! boolean isSatisfiedBy(int ch) { ! return ch == lower || ch == upper; ! } ! } ! ! /** ! * Unicode case insensitive matches a given Unicode character ! */ ! static final class SingleU extends CharProperty { ! final int lower; ! SingleU(int lower) { ! this.lower = lower; ! } ! boolean isSatisfiedBy(int ch) { ! return lower == ch || ! lower == Character.toLowerCase(Character.toUpperCase(ch)); ! } ! } ! ! /** ! * Node class that matches a Unicode block. ! */ ! static final class Block extends CharProperty { ! final Character.UnicodeBlock block; ! Block(Character.UnicodeBlock block) { ! this.block = block; ! } ! boolean isSatisfiedBy(int ch) { ! return block == Character.UnicodeBlock.of(ch); ! } ! } ! ! /** ! * Node class that matches a Unicode script ! */ ! static final class Script extends CharProperty { ! final Character.UnicodeScript script; ! Script(Character.UnicodeScript script) { ! this.script = script; ! } ! boolean isSatisfiedBy(int ch) { ! return script == Character.UnicodeScript.of(ch); ! } } ! /** ! * Node class that matches a Unicode category. ! */ ! static final class Category extends CharProperty { ! final int typeMask; ! Category(int typeMask) { this.typeMask = typeMask; } ! boolean isSatisfiedBy(int ch) { ! return (typeMask & (1 << Character.getType(ch))) != 0; ! } } ! ! /** ! * Node class that matches a Unicode "type" ! */ ! static final class Utype extends CharProperty { ! final UnicodeProp uprop; ! Utype(UnicodeProp uprop) { this.uprop = uprop; } ! boolean isSatisfiedBy(int ch) { ! return uprop.is(ch); } } ! /** ! * Node class that matches a POSIX type. ! */ ! static final class Ctype extends BmpCharProperty { ! final int ctype; ! Ctype(int ctype) { this.ctype = ctype; } ! boolean isSatisfiedBy(int ch) { ! return ch < 128 && ASCII.isType(ch, ctype); } } ! ! /** ! * Node class that matches a Perl vertical whitespace ! */ ! static final class VertWS extends BmpCharProperty { ! boolean isSatisfiedBy(int cp) { ! return (cp >= 0x0A && cp <= 0x0D) || ! cp == 0x85 || cp == 0x2028 || cp == 0x2029; } } ! /** ! * Node class that matches a Perl horizontal whitespace ! */ ! static final class HorizWS extends BmpCharProperty { ! boolean isSatisfiedBy(int cp) { ! return cp == 0x09 || cp == 0x20 || cp == 0xa0 || ! cp == 0x1680 || cp == 0x180e || ! cp >= 0x2000 && cp <= 0x200a || ! cp == 0x202f || cp == 0x205f || cp == 0x3000; } } /** * Node class that matches an unicode extended grapheme cluster --- 3912,3983 ---- /** * Optimized version of CharProperty that works only for * properties never satisfied by Supplementary characters. */ ! private static class BmpCharProperty extends CharProperty { ! BmpCharProperty (BmpCharPredicate predicate) { ! super(predicate); ! } boolean match(Matcher matcher, int i, CharSequence seq) { if (i < matcher.to) { ! return predicate.is(seq.charAt(i)) && ! next.match(matcher, i + 1, seq); } else { matcher.hitEnd = true; return false; } } } ! private static class NFCCharProperty extends Node { ! CharPredicate predicate; ! NFCCharProperty (CharPredicate predicate) { ! this.predicate = predicate; } ! boolean match(Matcher matcher, int i, CharSequence seq) { ! if (i < matcher.to) { ! int ch0 = Character.codePointAt(seq, i); ! int n = Character.charCount(ch0); ! int j = i + n; ! while (j < matcher.to) { ! int ch1 = Character.codePointAt(seq, j); ! if (Grapheme.isBoundary(ch0, ch1)) ! break; ! ch0 = ch1; ! j += Character.charCount(ch1); } ! if (i + n == j) { // single, assume nfc cp ! if (predicate.is(ch0)) ! return next.match(matcher, j, seq); ! } else { ! while (i + n < j) { ! String nfc = Normalizer.normalize( ! seq.toString().substring(i, j), Normalizer.Form.NFC); ! if (nfc.codePointCount(0, nfc.length()) == 1) { ! if (predicate.is(nfc.codePointAt(0)) && ! next.match(matcher, j, seq)) { ! return true; } } ! ch0 = Character.codePointBefore(seq, j); ! j -= Character.charCount(ch0); } } ! if (j < matcher.to) ! return false; } + matcher.hitEnd = true; + return false; } ! boolean study(TreeInfo info) { ! info.minLength++; ! info.deterministic = false; ! return next.study(info); } } /** * Node class that matches an unicode extended grapheme cluster
*** 4215,4299 **** int toLower(int c) { return Character.toLowerCase(Character.toUpperCase(c)); } } - private static boolean inRange(int lower, int ch, int upper) { - return lower <= ch && ch <= upper; - } - - /** - * Returns node for matching characters within an explicit value range. - */ - private static CharProperty rangeFor(final int lower, - final int upper) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return inRange(lower, ch, upper);}}; - } - - /** - * Returns node for matching characters within an explicit value - * range in a case insensitive manner. - */ - private CharProperty caseInsensitiveRangeFor(final int lower, - final int upper) { - if (has(UNICODE_CASE)) - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - if (inRange(lower, ch, upper)) - return true; - int up = Character.toUpperCase(ch); - return inRange(lower, up, upper) || - inRange(lower, Character.toLowerCase(up), upper);}}; - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return inRange(lower, ch, upper) || - ASCII.isAscii(ch) && - (inRange(lower, ASCII.toUpper(ch), upper) || - inRange(lower, ASCII.toLower(ch), upper)); - }}; - } - - /** - * Implements the Unicode category ALL and the dot metacharacter when - * in dotall mode. - */ - static final class All extends CharProperty { - boolean isSatisfiedBy(int ch) { - return true; - } - } - - /** - * Node class for the dot metacharacter when dotall is not enabled. - */ - static final class Dot extends CharProperty { - boolean isSatisfiedBy(int ch) { - return (ch != '\n' && ch != '\r' - && (ch|1) != '\u2029' - && ch != '\u0085'); - } - } - - /** - * Node class for the dot metacharacter when dotall is not enabled - * but UNIX_LINES is enabled. - */ - static final class UnixDot extends CharProperty { - boolean isSatisfiedBy(int ch) { - return ch != '\n'; - } - } - /** * The 0 or 1 quantifier. This one class implements all three types. */ static final class Ques extends Node { Node atom; ! int type; ! Ques(Node node, int type) { this.atom = node; this.type = type; } boolean match(Matcher matcher, int i, CharSequence seq) { switch (type) { --- 4195,4211 ---- int toLower(int c) { return Character.toLowerCase(Character.toUpperCase(c)); } } /** * The 0 or 1 quantifier. This one class implements all three types. */ static final class Ques extends Node { Node atom; ! Qtype type; ! Ques(Node node, Qtype type) { this.atom = node; this.type = type; } boolean match(Matcher matcher, int i, CharSequence seq) { switch (type) {
*** 4309,4319 **** default: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); } } boolean study(TreeInfo info) { ! if (type != INDEPENDENT) { int minL = info.minLength; atom.study(info); info.minLength = minL; info.deterministic = false; return next.study(info); --- 4221,4231 ---- default: return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); } } boolean study(TreeInfo info) { ! if (type != Qtype.INDEPENDENT) { int minL = info.minLength; atom.study(info); info.minLength = minL; info.deterministic = false; return next.study(info);
*** 4323,4343 **** } } } /** * Handles the curly-brace style repetition with a specified minimum and * maximum occurrences. The * quantifier is handled as a special case. * This class handles the three types. */ static final class Curly extends Node { Node atom; ! int type; int cmin; int cmax; ! Curly(Node node, int cmin, int cmax, int type) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; } --- 4235,4328 ---- } } } /** + * Handles the greedy style repetition with the minimum either be + * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier. + */ + static class CharPropertyGreedy extends Node { + final CharPredicate predicate; + final int cmin; + + CharPropertyGreedy(CharProperty cp, int cmin) { + this.predicate = cp.predicate; + this.cmin = cmin; + } + boolean match(Matcher matcher, int i, CharSequence seq) { + int n = 0; + int to = matcher.to; + // greedy, all the way down + while (i < to) { + int ch = Character.codePointAt(seq, i); + if (!predicate.is(ch)) + break; + i += Character.charCount(ch); + n++; + } + if (i >= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + if (n == cmin) + return false; + // backing off if match fails + int ch = Character.codePointBefore(seq, i); + i -= Character.charCount(ch); + n--; + } + return false; + } + + boolean study(TreeInfo info) { + info.minLength += cmin; + if (info.maxValid) { + info.maxLength += MAX_REPS; + } + info.deterministic = false; + return next.study(info); + } + } + + static final class BmpCharPropertyGreedy extends CharPropertyGreedy { + + BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { + super(bcp, cmin); + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + int n = 0; + int to = matcher.to; + while (i < to && predicate.is(seq.charAt(i))) { + i++; n++; + } + if (i >= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + i--; n--; // backing off if match fails + } + return false; + } + } + + /** * Handles the curly-brace style repetition with a specified minimum and * maximum occurrences. The * quantifier is handled as a special case. * This class handles the three types. */ static final class Curly extends Node { Node atom; ! Qtype type; int cmin; int cmax; ! Curly(Node node, int cmin, int cmax, Qtype type) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; }
*** 4348,4360 **** i = matcher.last; continue; } return false; } ! if (type == GREEDY) return match0(matcher, i, j, seq); ! else if (type == LAZY) return match1(matcher, i, j, seq); else return match2(matcher, i, j, seq); } // Greedy match. --- 4333,4345 ---- i = matcher.last; continue; } return false; } ! if (type == Qtype.GREEDY) return match0(matcher, i, j, seq); ! else if (type == Qtype.LAZY) return match1(matcher, i, j, seq); else return match2(matcher, i, j, seq); } // Greedy match.
*** 4472,4489 **** * If capture is true then this class saves group settings and ensures * that groups are unset when backing off of a group match. */ static final class GroupCurly extends Node { Node atom; ! int type; int cmin; int cmax; int localIndex; int groupIndex; boolean capture; ! GroupCurly(Node node, int cmin, int cmax, int type, int local, int group, boolean capture) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax; --- 4457,4474 ---- * If capture is true then this class saves group settings and ensures * that groups are unset when backing off of a group match. */ static final class GroupCurly extends Node { Node atom; ! Qtype type; int cmin; int cmax; int localIndex; int groupIndex; boolean capture; ! GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, int group, boolean capture) { this.atom = node; this.type = type; this.cmin = cmin; this.cmax = cmax;
*** 4519,4531 **** ret = false; break; } } if (ret) { ! if (type == GREEDY) { ret = match0(matcher, i, cmin, seq); ! } else if (type == LAZY) { ret = match1(matcher, i, cmin, seq); } else { ret = match2(matcher, i, cmin, seq); } } --- 4504,4516 ---- ret = false; break; } } if (ret) { ! if (type == Qtype.GREEDY) { ret = match0(matcher, i, cmin, seq); ! } else if (type == Qtype.LAZY) { ret = match1(matcher, i, cmin, seq); } else { ret = match2(matcher, i, cmin, seq); } }
*** 4767,4776 **** --- 4752,4762 ---- * indicate that we do not want to unset the group if the reference * doesn't match. */ static final class GroupHead extends Node { int localIndex; + GroupTail tail; // for debug/print only, match does not need to know GroupHead(int localCount) { localIndex = localCount; } boolean match(Matcher matcher, int i, CharSequence seq) { int save = matcher.locals[localIndex];
*** 4874,4886 **** --- 4860,4874 ---- static class Loop extends Node { Node body; int countIndex; // local count index in matcher locals int beginIndex; // group beginning index int cmin, cmax; + int posIndex; Loop(int countIndex, int beginIndex) { this.countIndex = countIndex; this.beginIndex = beginIndex; + this.posIndex = -1; } boolean match(Matcher matcher, int i, CharSequence seq) { // Avoid infinite loop in zero-length case. if (i > matcher.locals[beginIndex]) { int count = matcher.locals[countIndex];
*** 4899,4923 **** return b; } // This block is for after we have the minimum // iterations required for the loop to match if (count < cmax) { matcher.locals[countIndex] = count + 1; boolean b = body.match(matcher, i, seq); // If match failed we must backtrack, so // the loop count should NOT be incremented ! if (!b) ! matcher.locals[countIndex] = count; ! else return true; } } return next.match(matcher, i, seq); } boolean matchInit(Matcher matcher, int i, CharSequence seq) { int save = matcher.locals[countIndex]; boolean ret = false; if (0 < cmin) { matcher.locals[countIndex] = 1; ret = body.match(matcher, i, seq); } else if (0 < cmax) { matcher.locals[countIndex] = 1; --- 4887,4925 ---- return b; } // This block is for after we have the minimum // iterations required for the loop to match if (count < cmax) { + // Let's check if we have already tried and failed + // at this starting position "i" in the past. + // If yes, then just return false wihtout trying + // again, to stop the exponential backtracking. + if (posIndex != -1 && + matcher.localsPos[posIndex].contains(i)) { + return next.match(matcher, i, seq); + } matcher.locals[countIndex] = count + 1; boolean b = body.match(matcher, i, seq); // If match failed we must backtrack, so // the loop count should NOT be incremented ! if (b) return true; + matcher.locals[countIndex] = count; + // save the failed position + if (posIndex != -1) { + matcher.localsPos[posIndex].add(i); + } } } return next.match(matcher, i, seq); } boolean matchInit(Matcher matcher, int i, CharSequence seq) { int save = matcher.locals[countIndex]; boolean ret = false; + if (posIndex != -1 && matcher.localsPos[posIndex] == null) { + matcher.localsPos[posIndex] = new IntHashSet(); + } if (0 < cmin) { matcher.locals[countIndex] = 1; ret = body.match(matcher, i, seq); } else if (0 < cmax) { matcher.locals[countIndex] = 1;
*** 5360,5399 **** return !conditionMatched && next.match(matcher, i, seq); } } /** - * Returns the set union of two CharProperty nodes. - */ - private static CharProperty union(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; - } - - /** - * Returns the set intersection of two CharProperty nodes. - */ - private static CharProperty intersection(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; - } - - /** - * Returns the set difference of two CharProperty nodes. - */ - private static CharProperty setDifference(final CharProperty lhs, - final CharProperty rhs) { - return new CharProperty() { - boolean isSatisfiedBy(int ch) { - return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; - } - - /** * Handles word boundaries. Includes a field to allow this one class to * deal with the different types of word boundaries we can match. The word * characters include underscores, letters, and digits. Non spacing marks * can are also part of a word if they have a base character, otherwise * they are ignored for purposes of finding word boundaries. --- 5362,5371 ----
*** 5409,5419 **** type = n; this.useUWORD = useUWORD; } boolean isWord(int ch) { ! return useUWORD ? UnicodeProp.WORD.is(ch) : (ch == '_' || Character.isLetterOrDigit(ch)); } int check(Matcher matcher, int i, CharSequence seq) { int ch; --- 5381,5391 ---- type = n; this.useUWORD = useUWORD; } boolean isWord(int ch) { ! return useUWORD ? CharPredicates.WORD.is(ch) : (ch == '_' || Character.isLetterOrDigit(ch)); } int check(Matcher matcher, int i, CharSequence seq) { int ch;
*** 5655,5874 **** matcher.hitEnd = true; return false; } } ! /////////////////////////////////////////////////////////////////////////////// ! /////////////////////////////////////////////////////////////////////////////// /** ! * This must be the very first initializer. */ ! static Node accept = new Node(); ! static Node lastAccept = new LastNode(); ! private static class CharPropertyNames { ! static CharProperty charPropertyFor(String name) { ! CharPropertyFactory m = map.get(name); ! return m == null ? null : m.make(); } ! private abstract static class CharPropertyFactory { ! abstract CharProperty make(); } ! private static void defCategory(String name, ! final int typeMask) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return new Category(typeMask);}}); } ! private static void defRange(String name, ! final int lower, final int upper) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return rangeFor(lower, upper);}}); } ! private static void defCtype(String name, ! final int ctype) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return new Ctype(ctype);}}); } ! private abstract static class CloneableProperty ! extends CharProperty implements Cloneable ! { ! public CloneableProperty clone() { ! try { ! return (CloneableProperty) super.clone(); ! } catch (CloneNotSupportedException e) { ! throw new AssertionError(e); ! } ! } ! } ! ! private static void defClone(String name, ! final CloneableProperty p) { ! map.put(name, new CharPropertyFactory() { ! CharProperty make() { return p.clone();}}); ! } ! ! private static final HashMap<String, CharPropertyFactory> map ! = new HashMap<>(); ! ! static { ! // Unicode character property aliases, defined in ! // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt ! defCategory("Cn", 1<<Character.UNASSIGNED); ! defCategory("Lu", 1<<Character.UPPERCASE_LETTER); ! defCategory("Ll", 1<<Character.LOWERCASE_LETTER); ! defCategory("Lt", 1<<Character.TITLECASE_LETTER); ! defCategory("Lm", 1<<Character.MODIFIER_LETTER); ! defCategory("Lo", 1<<Character.OTHER_LETTER); ! defCategory("Mn", 1<<Character.NON_SPACING_MARK); ! defCategory("Me", 1<<Character.ENCLOSING_MARK); ! defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK); ! defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER); ! defCategory("Nl", 1<<Character.LETTER_NUMBER); ! defCategory("No", 1<<Character.OTHER_NUMBER); ! defCategory("Zs", 1<<Character.SPACE_SEPARATOR); ! defCategory("Zl", 1<<Character.LINE_SEPARATOR); ! defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR); ! defCategory("Cc", 1<<Character.CONTROL); ! defCategory("Cf", 1<<Character.FORMAT); ! defCategory("Co", 1<<Character.PRIVATE_USE); ! defCategory("Cs", 1<<Character.SURROGATE); ! defCategory("Pd", 1<<Character.DASH_PUNCTUATION); ! defCategory("Ps", 1<<Character.START_PUNCTUATION); ! defCategory("Pe", 1<<Character.END_PUNCTUATION); ! defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION); ! defCategory("Po", 1<<Character.OTHER_PUNCTUATION); ! defCategory("Sm", 1<<Character.MATH_SYMBOL); ! defCategory("Sc", 1<<Character.CURRENCY_SYMBOL); ! defCategory("Sk", 1<<Character.MODIFIER_SYMBOL); ! defCategory("So", 1<<Character.OTHER_SYMBOL); ! defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION); ! defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION); ! defCategory("L", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER) | ! (1<<Character.MODIFIER_LETTER) | ! (1<<Character.OTHER_LETTER))); ! defCategory("M", ((1<<Character.NON_SPACING_MARK) | ! (1<<Character.ENCLOSING_MARK) | ! (1<<Character.COMBINING_SPACING_MARK))); ! defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) | ! (1<<Character.LETTER_NUMBER) | ! (1<<Character.OTHER_NUMBER))); ! defCategory("Z", ((1<<Character.SPACE_SEPARATOR) | ! (1<<Character.LINE_SEPARATOR) | ! (1<<Character.PARAGRAPH_SEPARATOR))); ! defCategory("C", ((1<<Character.CONTROL) | ! (1<<Character.FORMAT) | ! (1<<Character.PRIVATE_USE) | ! (1<<Character.SURROGATE))); // Other ! defCategory("P", ((1<<Character.DASH_PUNCTUATION) | ! (1<<Character.START_PUNCTUATION) | ! (1<<Character.END_PUNCTUATION) | ! (1<<Character.CONNECTOR_PUNCTUATION) | ! (1<<Character.OTHER_PUNCTUATION) | ! (1<<Character.INITIAL_QUOTE_PUNCTUATION) | ! (1<<Character.FINAL_QUOTE_PUNCTUATION))); ! defCategory("S", ((1<<Character.MATH_SYMBOL) | ! (1<<Character.CURRENCY_SYMBOL) | ! (1<<Character.MODIFIER_SYMBOL) | ! (1<<Character.OTHER_SYMBOL))); ! defCategory("LC", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER))); ! defCategory("LD", ((1<<Character.UPPERCASE_LETTER) | ! (1<<Character.LOWERCASE_LETTER) | ! (1<<Character.TITLECASE_LETTER) | ! (1<<Character.MODIFIER_LETTER) | ! (1<<Character.OTHER_LETTER) | ! (1<<Character.DECIMAL_DIGIT_NUMBER))); ! defRange("L1", 0x00, 0xFF); // Latin-1 ! map.put("all", new CharPropertyFactory() { ! CharProperty make() { return new All(); }}); ! ! // Posix regular expression character classes, defined in ! // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html ! defRange("ASCII", 0x00, 0x7F); // ASCII ! defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters ! defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters ! defCtype("Blank", ASCII.BLANK); // Space and tab characters ! defCtype("Cntrl", ASCII.CNTRL); // Control characters ! defRange("Digit", '0', '9'); // Numeric characters ! defCtype("Graph", ASCII.GRAPH); // printable and visible ! defRange("Lower", 'a', 'z'); // Lower-case alphabetic ! defRange("Print", 0x20, 0x7E); // Printable characters ! defCtype("Punct", ASCII.PUNCT); // Punctuation characters ! defCtype("Space", ASCII.SPACE); // Space characters ! defRange("Upper", 'A', 'Z'); // Upper-case alphabetic ! defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits ! ! // Java character properties, defined by methods in Character.java ! defClone("javaLowerCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLowerCase(ch);}}); ! defClone("javaUpperCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUpperCase(ch);}}); ! defClone("javaAlphabetic", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isAlphabetic(ch);}}); ! defClone("javaIdeographic", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isIdeographic(ch);}}); ! defClone("javaTitleCase", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isTitleCase(ch);}}); ! defClone("javaDigit", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isDigit(ch);}}); ! defClone("javaDefined", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isDefined(ch);}}); ! defClone("javaLetter", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLetter(ch);}}); ! defClone("javaLetterOrDigit", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isLetterOrDigit(ch);}}); ! defClone("javaJavaIdentifierStart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isJavaIdentifierStart(ch);}}); ! defClone("javaJavaIdentifierPart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isJavaIdentifierPart(ch);}}); ! defClone("javaUnicodeIdentifierStart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUnicodeIdentifierStart(ch);}}); ! defClone("javaUnicodeIdentifierPart", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isUnicodeIdentifierPart(ch);}}); ! defClone("javaIdentifierIgnorable", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isIdentifierIgnorable(ch);}}); ! defClone("javaSpaceChar", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isSpaceChar(ch);}}); ! defClone("javaWhitespace", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isWhitespace(ch);}}); ! defClone("javaISOControl", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isISOControl(ch);}}); ! defClone("javaMirrored", new CloneableProperty() { ! boolean isSatisfiedBy(int ch) { ! return Character.isMirrored(ch);}}); } } /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */ --- 5627,5786 ---- matcher.hitEnd = true; return false; } } ! @FunctionalInterface ! static interface CharPredicate { ! boolean is(int ch); ! ! default CharPredicate and(CharPredicate p) { ! return ch -> is(ch) && p.is(ch); ! } ! default CharPredicate union(CharPredicate p) { ! return ch -> is(ch) || p.is(ch); ! } ! default CharPredicate union(CharPredicate p1, ! CharPredicate p2 ) { ! return ch -> is(ch) || p1.is(ch) || p2.is(ch); ! } ! default CharPredicate negate() { ! return ch -> !is(ch); ! } ! } ! ! static interface BmpCharPredicate extends CharPredicate { ! ! default CharPredicate and(CharPredicate p) { ! if(p instanceof BmpCharPredicate) ! return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); ! return ch -> is(ch) && p.is(ch); ! } ! default CharPredicate union(CharPredicate p) { ! if (p instanceof BmpCharPredicate) ! return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); ! return ch -> is(ch) || p.is(ch); ! } ! static CharPredicate union(CharPredicate... predicates) { ! CharPredicate cp = ch -> { ! for (CharPredicate p : predicates) { ! if (!p.is(ch)) ! return false; ! } ! return true; ! }; ! for (CharPredicate p : predicates) { ! if (! (p instanceof BmpCharPredicate)) ! return cp; ! } ! return (BmpCharPredicate)cp; ! } ! } ! ! /** ! * matches a Perl vertical whitespace ! */ ! static BmpCharPredicate VertWS = cp -> ! (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029; /** ! * matches a Perl horizontal whitespace */ ! static BmpCharPredicate HorizWS = cp -> ! cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || ! cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || ! cp == 0x205f || cp == 0x3000; ! /** ! * for the Unicode category ALL and the dot metacharacter when ! * in dotall mode. ! */ ! static CharPredicate ALL = ch -> true; ! ! /** ! * for the dot metacharacter when dotall is not enabled. ! */ ! static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r' ! && (ch|1) != '\u2029' ! && ch != '\u0085'); ! /** ! * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. ! */ ! static CharPredicate UNIXDOT = ch -> ch != '\n'; ! /** ! * Indicate that matches a Supplementary Unicode character ! */ ! static CharPredicate SingleS(int c) { ! return ch -> ch == c; ! } ! /** ! * A bmp/optimized predicate of single ! */ ! static BmpCharPredicate Single(int c) { ! return ch -> ch == c; } ! /** ! * Case insensitive matches a given BMP character ! */ ! static BmpCharPredicate SingleI(int lower, int upper) { ! return ch -> ch == lower || ch == upper; } ! /** ! * Unicode case insensitive matches a given Unicode character ! */ ! static CharPredicate SingleU(int lower) { ! return ch -> lower == ch || ! lower == Character.toLowerCase(Character.toUpperCase(ch)); } ! private static boolean inRange(int lower, int ch, int upper) { ! return lower <= ch && ch <= upper; } ! /** ! * Charactrs within a explicit value range ! */ ! static CharPredicate Range(int lower, int upper) { ! if (upper < Character.MIN_HIGH_SURROGATE || ! lower > Character.MAX_HIGH_SURROGATE && ! upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) ! return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); ! return ch -> inRange(lower, ch, upper); } ! /** ! * Charactrs within a explicit value range in a case insensitive manner. ! */ ! static CharPredicate CIRange(int lower, int upper) { ! return ch -> inRange(lower, ch, upper) || ! ASCII.isAscii(ch) && ! (inRange(lower, ASCII.toUpper(ch), upper) || ! inRange(lower, ASCII.toLower(ch), upper)); } + + static CharPredicate CIRangeU(int lower, int upper) { + return ch -> { + if (inRange(lower, ch, upper)) + return true; + int up = Character.toUpperCase(ch); + return inRange(lower, up, upper) || + inRange(lower, Character.toLowerCase(up), upper); + }; } /** + * This must be the very first initializer. + */ + static final Node accept = new Node(); + + static final Node lastAccept = new LastNode(); + + /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */