src/java.base/share/classes/java/util/regex/Pattern.java

Print this page

        

@@ -24,15 +24,18 @@
  */
 
 package java.util.regex;
 
 import java.text.Normalizer;
+import java.text.Normalizer.Form;
 import java.util.Locale;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.Set;
 import java.util.Arrays;
 import java.util.NoSuchElementException;
 import java.util.Spliterator;
 import java.util.Spliterators;
 import java.util.function.Predicate;

@@ -982,10 +985,15 @@
      * Temporary storage used by parsing pattern slice.
      */
     transient int[] buffer;
 
     /**
+     * A temporary storage used for predicate for double return.
+     */
+    transient CharPredicate predicate;
+
+    /**
      * Map the "name" of the "named capturing group" to its group id
      * node.
      */
     transient volatile Map<String, Integer> namedGroups;
 

@@ -1024,11 +1032,11 @@
 
     /**
      * If the Start node might possibly match supplementary characters.
      * It is set to true during compiling if
      * (1) There is supplementary char in pattern, or
-     * (2) There is complement node of Category or Block
+     * (2) There is complement node of a "family" CharProperty
      */
     private transient boolean hasSupplementary;
 
     /**
      * Compiles the given regular expression into a pattern.

@@ -1376,138 +1384,145 @@
             matchRoot = lastAccept;
         }
     }
 
     /**
-     * The pattern is converted to normalized form ({@linkplain
-     * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
-     * and then a pure group is constructed to match canonical
-     * equivalences of the characters.
+     * The pattern is converted to normalized form ({@link
+     * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
+     * followed by canonical composition for the character class
+     * part, and {@link java.text.Normalizer.Form.NFD NFD},
+     * canonical decomposition) for the rest), and then a pure
+     * group is constructed to match canonical equivalences of the
+     * characters.
      */
-    private void normalize() {
-        int lastCodePoint = -1;
-
-        // Convert pattern into normalized form
-        normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
-        patternLength = normalizedPattern.length();
-
-        // Modify pattern to match canonical equivalences
-        StringBuilder newPattern = new StringBuilder(patternLength);
-        for(int i=0; i<patternLength; ) {
-            int c = normalizedPattern.codePointAt(i);
-            StringBuilder sequenceBuffer;
-            if ((Character.getType(c) == Character.NON_SPACING_MARK)
-                && (lastCodePoint != -1)) {
-                sequenceBuffer = new StringBuilder();
-                sequenceBuffer.appendCodePoint(lastCodePoint);
-                sequenceBuffer.appendCodePoint(c);
-                while(Character.getType(c) == Character.NON_SPACING_MARK) {
-                    i += Character.charCount(c);
-                    if (i >= patternLength)
-                        break;
-                    c = normalizedPattern.codePointAt(i);
-                    sequenceBuffer.appendCodePoint(c);
+    private static String normalize(String pattern) {
+        int plen = pattern.length();
+        StringBuilder pbuf = new StringBuilder(plen);
+        char last = 0;
+        int lastStart = 0;
+        char cc = 0;
+        for (int i = 0; i < plen;) {
+            char c = pattern.charAt(i);
+            if (cc == 0 &&    // top level
+                c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
+                i += 2; last = 0;
+                continue;
                 }
-                String ea = produceEquivalentAlternation(
-                                               sequenceBuffer.toString());
-                newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
-                newPattern.append("(?:").append(ea).append(")");
-            } else if (c == '[' && lastCodePoint != '\\') {
-                i = normalizeCharClass(newPattern, i);
-            } else {
-                newPattern.appendCodePoint(c);
+            if (c == '[' && last != '\\') {
+                if (cc == 0) {
+                    if (lastStart < i)
+                        normalizeSlice(pattern, lastStart, i, pbuf);
+                    lastStart = i;
+                }
+                cc++;
+            } else if (c == ']' && last != '\\') {
+                cc--;
+                if (cc == 0) {
+                    normalizeClazz(pattern, lastStart, i + 1, pbuf);
+                    lastStart = i + 1;
             }
-            lastCodePoint = c;
-            i += Character.charCount(c);
         }
-        normalizedPattern = newPattern.toString();
+            last = c;
+            i++;
+        }
+        assert (cc == 0);
+        if (lastStart < plen)
+            normalizeSlice(pattern, lastStart, plen, pbuf);
+        return pbuf.toString();
     }
 
-    /**
-     * Complete the character class being parsed and add a set
-     * of alternations to it that will match the canonical equivalences
-     * of the characters within the class.
-     */
-    private int normalizeCharClass(StringBuilder newPattern, int i) {
-        StringBuilder charClass = new StringBuilder();
-        StringBuilder eq = null;
-        int lastCodePoint = -1;
-        String result;
-
-        i++;
-        charClass.append("[");
-        while(true) {
-            int c = normalizedPattern.codePointAt(i);
-            StringBuilder sequenceBuffer;
-
-            if (c == ']' && lastCodePoint != '\\') {
-                charClass.append((char)c);
-                break;
-            } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
-                sequenceBuffer = new StringBuilder();
-                sequenceBuffer.appendCodePoint(lastCodePoint);
-                while(Character.getType(c) == Character.NON_SPACING_MARK) {
-                    sequenceBuffer.appendCodePoint(c);
-                    i += Character.charCount(c);
-                    if (i >= normalizedPattern.length())
+    private static void normalizeSlice(String src, int off, int limit,
+                                       StringBuilder dst)
+    {
+        int len = src.length();
+        int off0 = off;
+        while (off < limit && ASCII.isAscii(src.charAt(off))) {
+            off++;
+        }
+        if (off == limit) {
+            dst.append(src, off0, limit);
+            return;
+        }
+        off--;
+        if (off < off0)
+            off = off0;
+        else
+            dst.append(src, off0, off);
+        while (off < limit) {
+            int ch0 = src.codePointAt(off);
+            if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
+                dst.append((char)ch0);
+                off++;
+                continue;
+            }
+            int j = off + Character.charCount(ch0);
+            int ch1;
+            while (j < limit) {
+                ch1 = src.codePointAt(j);
+                if (Grapheme.isBoundary(ch0, ch1))
                         break;
-                    c = normalizedPattern.codePointAt(i);
+                ch0 = ch1;
+                j += Character.charCount(ch1);
                 }
-                String ea = produceEquivalentAlternation(
-                                                  sequenceBuffer.toString());
-
-                charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
-                if (eq == null)
-                    eq = new StringBuilder();
-                eq.append('|');
-                eq.append(ea);
-            } else {
-                charClass.appendCodePoint(c);
-                i++;
+            String seq = src.substring(off, j);
+            String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
+            off = j;
+            if (nfd.length() > 1) {
+                ch0 = nfd.codePointAt(0);
+                ch1 = nfd.codePointAt(Character.charCount(ch0));
+                if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
+                    Set<String> altns = new LinkedHashSet<>();
+                    altns.add(seq);
+                    produceEquivalentAlternation(nfd, altns);
+                    dst.append("(?:");
+                    altns.forEach( s -> dst.append(s + "|"));
+                    dst.delete(dst.length() - 1, dst.length());
+                    dst.append(")");
+                    continue;
             }
-            if (i == normalizedPattern.length())
-                throw error("Unclosed character class");
-            lastCodePoint = c;
         }
-
-        if (eq != null) {
-            result = "(?:"+charClass.toString()+eq.toString()+")";
-        } else {
-            result = charClass.toString();
+            String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
+            if (!seq.equals(nfc) && !nfd.equals(nfc))
+                dst.append("(?:" + seq + "|" + nfd  + "|" + nfc + ")");
+            else if (!seq.equals(nfd))
+                dst.append("(?:" + seq + "|" + nfd + ")");
+            else
+                dst.append(seq);
+        }
         }
 
-        newPattern.append(result);
-        return i;
+    private static void normalizeClazz(String src, int off, int limit,
+                                       StringBuilder dst)
+    {
+        dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
     }
 
     /**
      * Given a specific sequence composed of a regular character and
      * combining marks that follow it, produce the alternation that will
      * match all canonical equivalences of that sequence.
      */
-    private String produceEquivalentAlternation(String source) {
-        int len = countChars(source, 0, 1);
-        if (source.length() == len)
-            // source has one character.
-            return source;
-
-        String base = source.substring(0,len);
-        String combiningMarks = source.substring(len);
-
+    private static void produceEquivalentAlternation(String src,
+                                                     Set<String> dst)
+    {
+        int len = countChars(src, 0, 1);
+        if (src.length() == len) {
+            dst.add(src);  // source has one character.
+            return;
+        }
+        String base = src.substring(0,len);
+        String combiningMarks = src.substring(len);
         String[] perms = producePermutations(combiningMarks);
-        StringBuilder result = new StringBuilder(source);
-
         // Add combined permutations
-        for(int x=0; x<perms.length; x++) {
+        for(int x = 0; x < perms.length; x++) {
             String next = base + perms[x];
-            if (x>0)
-                result.append("|"+next);
+            dst.add(next);
             next = composeOneStep(next);
-            if (next != null)
-                result.append("|"+produceEquivalentAlternation(next));
+            if (next != null) {
+                produceEquivalentAlternation(next, dst);
+            }
         }
-        return result.toString();
     }
 
     /**
      * Returns an array of strings that have all the possible
      * permutations of the characters in the input string.

@@ -1515,11 +1530,11 @@
      * of a set of combining marks. Note that some of the permutations
      * are invalid because of combining class collisions, and these
      * possibilities must be removed because they are not canonically
      * equivalent.
      */
-    private String[] producePermutations(String input) {
+    private static String[] producePermutations(String input) {
         if (input.length() == countChars(input, 0, 1))
             return new String[] {input};
 
         if (input.length() == countChars(input, 0, 2)) {
             int c0 = Character.codePointAt(input, 0);

@@ -1573,26 +1588,25 @@
         String[] result = new String[index];
         System.arraycopy(temp, 0, result, 0, index);
         return result;
     }
 
-    private int getClass(int c) {
+    private static int getClass(int c) {
         return sun.text.Normalizer.getCombiningClass(c);
     }
 
     /**
      * Attempts to compose input by combining the first character
      * with the first combining mark following it. Returns a String
      * that is the composition of the leading character with its first
      * combining mark followed by the remaining combining marks. Returns
      * null if the first two characters cannot be further composed.
      */
-    private String composeOneStep(String input) {
+    private static String composeOneStep(String input) {
         int len = countChars(input, 0, 2);
         String firstTwoCharacters = input.substring(0, len);
         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
-
         if (result.equals(firstTwoCharacters))
             return null;
         else {
             String remainder = input.substring(len);
             return result + remainder;

@@ -1675,11 +1689,11 @@
      * of the expression which will create the object tree.
      */
     private void compile() {
         // Handle canonical equivalences
         if (has(CANON_EQ) && !has(LITERAL)) {
-            normalize();
+            normalizedPattern = normalize(pattern);
         } else {
             normalizedPattern = pattern;
         }
         patternLength = normalizedPattern.length();
 

@@ -1752,48 +1766,10 @@
         }
         return groups;
     }
 
     /**
-     * Used to print out a subtree of the Pattern to help with debugging.
-     */
-    private static void printObjectTree(Node node) {
-        while(node != null) {
-            if (node instanceof Prolog) {
-                System.out.println(node);
-                printObjectTree(((Prolog)node).loop);
-                System.out.println("**** end contents prolog loop");
-            } else if (node instanceof Loop) {
-                System.out.println(node);
-                printObjectTree(((Loop)node).body);
-                System.out.println("**** end contents Loop body");
-            } else if (node instanceof Curly) {
-                System.out.println(node);
-                printObjectTree(((Curly)node).atom);
-                System.out.println("**** end contents Curly body");
-            } else if (node instanceof GroupCurly) {
-                System.out.println(node);
-                printObjectTree(((GroupCurly)node).atom);
-                System.out.println("**** end contents GroupCurly body");
-            } else if (node instanceof GroupTail) {
-                System.out.println(node);
-                System.out.println("Tail next is "+node.next);
-                return;
-            } else {
-                System.out.println(node);
-            }
-            node = node.next;
-            if (node != null)
-                System.out.println("->next:");
-            if (node == Pattern.accept) {
-                System.out.println("Accept Node");
-                node = null;
-            }
-       }
-    }
-
-    /**
      * Used to accumulate information about a subtree of the object graph
      * so that optimizations can be applied to the subtree.
      */
     static final class TreeInfo {
         int minLength;

@@ -2081,11 +2057,14 @@
                     tail.next = node;
                 // Double return: Tail was returned in root
                 tail = root;
                 continue;
             case '[':
-                node = clazz(true);
+                if (has(CANON_EQ) && !has(LITERAL))
+                    node = new NFCCharProperty(clazz(true));
+                else
+                    node = newCharProperty(clazz(true));
                 break;
             case '\\':
                 ch = nextEscaped();
                 if (ch == 'p' || ch == 'P') {
                     boolean oneLetter = true;

@@ -2094,11 +2073,15 @@
                     if (ch != '{') {
                         unread();
                     } else {
                         oneLetter = false;
                     }
-                    node = family(oneLetter, comp);
+                    // node = newCharProperty(family(oneLetter, comp));
+                    if (has(CANON_EQ) && !has(LITERAL))
+                        node = new NFCCharProperty(family(oneLetter, comp));
+                    else
+                        node = newCharProperty(family(oneLetter, comp));
                 } else {
                     unread();
                     node = atom();
                 }
                 break;

@@ -2121,16 +2104,16 @@
                     node = new Dollar(has(MULTILINE));
                 break;
             case '.':
                 next();
                 if (has(DOTALL)) {
-                    node = new All();
+                    node = new CharProperty(ALL);
                 } else {
-                    if (has(UNIX_LINES))
-                        node = new UnixDot();
-                    else {
-                        node = new Dot();
+                    if (has(UNIX_LINES)) {
+                        node = new CharProperty(UNIXDOT);
+                    } else {
+                        node = new CharProperty(DOT);
                     }
                 }
                 break;
             case '|':
             case ')':

@@ -2153,11 +2136,10 @@
                 node = atom();
                 break;
             }
 
             node = closure(node);
-
             if (head == null) {
                 head = tail = node;
             } else {
                 tail.next = node;
                 tail = node;

@@ -2211,11 +2193,14 @@
                         ch = next(); // Consume { if present
                         if (ch != '{')
                             unread();
                         else
                             oneLetter = false;
-                        return family(oneLetter, comp);
+                        if (has(CANON_EQ) && !has(LITERAL))
+                            return new NFCCharProperty(family(oneLetter, comp));
+                        else
+                            return newCharProperty(family(oneLetter, comp));
                     }
                 }
                 unread();
                 prev = cursor;
                 ch = escape(false, first == 0, false);

@@ -2249,11 +2234,11 @@
                 continue;
             }
             break;
         }
         if (first == 1) {
-            return newSingle(buffer[0]);
+            return newCharProperty(single(buffer[0]));
         } else {
             return newSlice(buffer, first, hasSupplementary);
         }
     }
 

@@ -2344,23 +2329,31 @@
             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
             return -1;
         case 'C':
             break;
         case 'D':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.DIGIT).complement()
-                               : new Ctype(ASCII.DIGIT).complement();
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+                predicate = predicate.negate();
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'E':
         case 'F':
             break;
         case 'G':
             if (inclass) break;
             if (create) root = new LastMatch();
             return -1;
         case 'H':
-            if (create) root = new HorizWS().complement();
+            if (create) {
+                predicate = HorizWS.negate();
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'I':
         case 'J':
         case 'K':
         case 'L':

@@ -2375,24 +2368,36 @@
         case 'R':
             if (inclass) break;
             if (create) root = new LineEnding();
             return -1;
         case 'S':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.WHITE_SPACE).complement()
-                               : new Ctype(ASCII.SPACE).complement();
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+                predicate = predicate.negate();
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'T':
         case 'U':
             break;
         case 'V':
-            if (create) root = new VertWS().complement();
+            if (create) {
+                predicate = VertWS.negate();
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'W':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.WORD).complement()
-                               : new Ctype(ASCII.WORD).complement();
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.WORD : CharPredicates.ASCII_WORD;
+                predicate = predicate.negate();
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'X':
             if (inclass) break;
             if (create) {
                 root = new XGrapheme();

@@ -2428,22 +2433,29 @@
             }
             return -1;
         case 'c':
             return c();
         case 'd':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.DIGIT)
-                               : new Ctype(ASCII.DIGIT);
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'e':
             return '\033';
         case 'f':
             return '\f';
         case 'g':
             break;
         case 'h':
-            if (create) root = new HorizWS();
+            if (create) {
+                predicate = HorizWS;
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'i':
         case 'j':
             break;
         case 'k':

@@ -2471,13 +2483,16 @@
         case 'q':
             break;
         case 'r':
             return '\r';
         case 's':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.WHITE_SPACE)
-                               : new Ctype(ASCII.SPACE);
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 't':
             return '\t';
         case 'u':
             return u();

@@ -2490,16 +2505,23 @@
             // the start or end value, such as [\v-...] or [...-\v], in
             // which a single definite value (0x0B) is expected. For
             // compatibility concern '\013'/0x0B is returned if isrange.
             if (isrange)
                 return '\013';
-            if (create) root = new VertWS();
+            if (create) {
+                predicate = VertWS;
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'w':
-            if (create) root = has(UNICODE_CHARACTER_CLASS)
-                               ? new Utype(UnicodeProp.WORD)
-                               : new Ctype(ASCII.WORD);
+            if (create) {
+                predicate = has(UNICODE_CHARACTER_CLASS) ?
+                            CharPredicates.WORD : CharPredicates.ASCII_WORD;
+                if (!inclass)
+                    root = newCharProperty(predicate);
+            }
             return -1;
         case 'x':
             return x();
         case 'y':
             break;

@@ -2518,112 +2540,111 @@
      *
      * Consumes a ] on the way out if consume is true. Usually consume
      * is true except for the case of [abc&&def] where def is a separate
      * right hand node with "understood" brackets.
      */
-    private CharProperty clazz(boolean consume) {
-        CharProperty prev = null;
-        CharProperty node = null;
+    private CharPredicate clazz(boolean consume) {
+        CharPredicate prev = null;
+        CharPredicate curr = null;
         BitClass bits = new BitClass();
-        boolean include = true;
-        boolean firstInClass = true;
+        BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
+
+        boolean isNeg = false;
+        boolean hasBits = false;
         int ch = next();
-        for (;;) {
-            switch (ch) {
-                case '^':
+
                     // Negates if first char in a class, otherwise literal
-                    if (firstInClass) {
-                        if (temp[cursor-1] != '[')
-                            break;
+        if (ch == '^' && temp[cursor-1] == '[') {
                         ch = next();
-                        include = !include;
-                        continue;
-                    } else {
-                        // ^ not first in class, treat as literal
-                        break;
+            isNeg = true;
                     }
+        for (;;) {
+            switch (ch) {
                 case '[':
-                    firstInClass = false;
-                    node = clazz(true);
+                    curr = clazz(true);
                     if (prev == null)
-                        prev = node;
+                        prev = curr;
                     else
-                        prev = union(prev, node);
+                        prev = prev.union(curr);
                     ch = peek();
                     continue;
                 case '&':
-                    firstInClass = false;
                     ch = next();
                     if (ch == '&') {
                         ch = next();
-                        CharProperty rightNode = null;
+                        CharPredicate right = null;
                         while (ch != ']' && ch != '&') {
                             if (ch == '[') {
-                                if (rightNode == null)
-                                    rightNode = clazz(true);
+                                if (right == null)
+                                    right = clazz(true);
                                 else
-                                    rightNode = union(rightNode, clazz(true));
+                                    right = right.union(clazz(true));
                             } else { // abc&&def
                                 unread();
-                                rightNode = clazz(false);
+                                right = clazz(false);
                             }
                             ch = peek();
                         }
-                        if (rightNode != null)
-                            node = rightNode;
+                        if (hasBits) {
+                            // bits used, union has high precedence
                         if (prev == null) {
-                            if (rightNode == null)
+                                prev = curr = bitsP;
+                            } else {
+                                prev = prev.union(bitsP);
+                            }
+                            hasBits = false;
+                        }
+                        if (right != null)
+                            curr = right;
+                        if (prev == null) {
+                            if (right == null)
                                 throw error("Bad class syntax");
                             else
-                                prev = rightNode;
+                                prev = right;
                         } else {
-                            prev = intersection(prev, node);
+                            prev = prev.and(curr);
                         }
                     } else {
                         // treat as a literal &
                         unread();
                         break;
                     }
                     continue;
                 case 0:
-                    firstInClass = false;
                     if (cursor >= patternLength)
                         throw error("Unclosed character class");
                     break;
                 case ']':
-                    firstInClass = false;
-                    if (prev != null) {
+                    if (prev != null || hasBits) {
                         if (consume)
                             next();
+                        if (prev == null)
+                            prev = bitsP;
+                        else if (hasBits)
+                            prev = prev.union(bitsP);
+                        if (isNeg)
+                            return prev.negate();
                         return prev;
                     }
                     break;
                 default:
-                    firstInClass = false;
                     break;
             }
-            node = range(bits);
-            if (include) {
-                if (prev == null) {
-                    prev = node;
-                } else {
-                    if (prev != node)
-                        prev = union(prev, node);
-                }
+            curr = range(bits);
+            if (curr == null) {    // the bits used
+                hasBits = true;
             } else {
-                if (prev == null) {
-                    prev = node.complement();
-                } else {
-                    if (prev != node)
-                        prev = setDifference(prev, node);
-                }
+                if (prev == null)
+                    prev = curr;
+                else if (prev != curr)
+                    prev = prev.union(curr);
             }
             ch = peek();
         }
     }
 
-    private CharProperty bitsOrSingle(BitClass bits, int ch) {
+    private CharPredicate bitsOrSingle(BitClass bits, int ch) {
         /* Bits can only handle codepoints in [u+0000-u+00ff] range.
            Use "single" node instead of bits when dealing with unicode
            case folding for codepoints listed below.
            (1)Uppercase out of range: u+00ff, u+00b5
               toUpperCase(u+00ff) -> u+0178

@@ -2644,20 +2665,47 @@
             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
               (ch == 0xff || ch == 0xb5 ||
                ch == 0x49 || ch == 0x69 ||  //I and i
                ch == 0x53 || ch == 0x73 ||  //S and s
                ch == 0x4b || ch == 0x6b ||  //K and k
-               ch == 0xc5 || ch == 0xe5)))  //A+ring
-            return bits.add(ch, flags());
-        return newSingle(ch);
+               ch == 0xc5 || ch == 0xe5))) {  //A+ring {
+            bits.add(ch, flags());
+            return null;
+        }
+        return single(ch);
+    }
+
+    /**
+     *  Returns a suitably optimized, single character predicate
+     */
+    private CharPredicate single(final int ch) {
+        if (has(CASE_INSENSITIVE)) {
+            int lower, upper;
+            if (has(UNICODE_CASE)) {
+                upper = Character.toUpperCase(ch);
+                lower = Character.toLowerCase(upper);
+                // Unicode case insensitive matches
+                if (upper != lower)
+                    return SingleU(lower);
+            } else if (ASCII.isAscii(ch)) {
+                lower = ASCII.toLower(ch);
+                upper = ASCII.toUpper(ch);
+                // Case insensitive matches a given BMP character
+                if (lower != upper)
+                    return SingleI(lower, upper);
+            }
+        }
+        if (isSupplementary(ch))
+            return SingleS(ch);
+        return Single(ch);  // Match a given BMP character
     }
 
     /**
      * Parse a single character or a character range in a character class
      * and return its representative node.
      */
-    private CharProperty range(BitClass bits) {
+    private CharPredicate range(BitClass bits) {
         int ch = peek();
         if (ch == '\\') {
             ch = nextEscaped();
             if (ch == 'p' || ch == 'P') { // A property
                 boolean comp = (ch == 'P');

@@ -2672,11 +2720,11 @@
             } else { // ordinary escape
                 boolean isrange = temp[cursor+1] == '-';
                 unread();
                 ch = escape(true, true, isrange);
                 if (ch == -1)
-                    return (CharProperty) root;
+                    return predicate;
             }
         } else {
             next();
         }
         if (ch >= 0) {

@@ -2694,30 +2742,33 @@
                         next();
                     }
                     if (m < ch) {
                         throw error("Illegal character range");
                     }
-                    if (has(CASE_INSENSITIVE))
-                        return caseInsensitiveRangeFor(ch, m);
-                    else
-                        return rangeFor(ch, m);
+                    if (has(CASE_INSENSITIVE)) {
+                        if (has(UNICODE_CASE))
+                            return CIRangeU(ch, m);
+                        return CIRange(ch, m);
+                    } else {
+                        return Range(ch, m);
+                    }
                 }
             }
             return bitsOrSingle(bits, ch);
         }
         throw error("Unexpected character '"+((char)ch)+"'");
     }
 
     /**
      * Parses a Unicode character family and returns its representative node.
      */
-    private CharProperty family(boolean singleLetter,
-                                boolean maybeComplement)
+    private CharPredicate family(boolean singleLetter,
+                                     boolean isComplement)
     {
         next();
         String name;
-        CharProperty node = null;
+        CharPredicate p = null;
 
         if (singleLetter) {
             int c = temp[cursor];
             if (!Character.isSupplementaryCodePoint(c)) {
                 name = String.valueOf((char)c);

@@ -2745,92 +2796,66 @@
             String value = name.substring(i + 1);
             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
             switch (name) {
                 case "sc":
                 case "script":
-                    node = unicodeScriptPropertyFor(value);
+                    p = CharPredicates.forUnicodeScript(value);
                     break;
                 case "blk":
                 case "block":
-                    node = unicodeBlockPropertyFor(value);
+                    p = CharPredicates.forUnicodeBlock(value);
                     break;
                 case "gc":
                 case "general_category":
-                    node = charPropertyNodeFor(value);
+                    p = CharPredicates.forProperty(value);
                     break;
                 default:
+                    break;
+            }
+            if (p == null)
                     throw error("Unknown Unicode property {name=<" + name + ">, "
                                 + "value=<" + value + ">}");
-            }
+
         } else {
             if (name.startsWith("In")) {
-                // \p{inBlockName}
-                node = unicodeBlockPropertyFor(name.substring(2));
+                // \p{InBlockName}
+                p = CharPredicates.forUnicodeBlock(name.substring(2));
             } else if (name.startsWith("Is")) {
-                // \p{isGeneralCategory} and \p{isScriptName}
+                // \p{IsGeneralCategory} and \p{IsScriptName}
                 name = name.substring(2);
-                UnicodeProp uprop = UnicodeProp.forName(name);
-                if (uprop != null)
-                    node = new Utype(uprop);
-                if (node == null)
-                    node = CharPropertyNames.charPropertyFor(name);
-                if (node == null)
-                    node = unicodeScriptPropertyFor(name);
+                p = CharPredicates.forUnicodeProperty(name);
+                if (p == null)
+                    p = CharPredicates.forProperty(name);
+                if (p == null)
+                    p = CharPredicates.forUnicodeScript(name);
             } else {
                 if (has(UNICODE_CHARACTER_CLASS)) {
-                    UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
-                    if (uprop != null)
-                        node = new Utype(uprop);
+                    p = CharPredicates.forPOSIXName(name);
                 }
-                if (node == null)
-                    node = charPropertyNodeFor(name);
+                if (p == null)
+                    p = CharPredicates.forProperty(name);
             }
+            if (p == null)
+                throw error("Unknown character property name {In/Is" + name + "}");
         }
-        if (maybeComplement) {
-            if (node instanceof Category || node instanceof Block)
+        if (isComplement) {
+            // it might be too expensive to detect if a complement of
+            // CharProperty can match "certain" supplementary. So just
+            // go with StartS.
                 hasSupplementary = true;
-            node = node.complement();
-        }
-        return node;
-    }
-
-
-    /**
-     * Returns a CharProperty matching all characters belong to
-     * a UnicodeScript.
-     */
-    private CharProperty unicodeScriptPropertyFor(String name) {
-        final Character.UnicodeScript script;
-        try {
-            script = Character.UnicodeScript.forName(name);
-        } catch (IllegalArgumentException iae) {
-            throw error("Unknown character script name {" + name + "}");
-        }
-        return new Script(script);
-    }
-
-    /**
-     * Returns a CharProperty matching all characters in a UnicodeBlock.
-     */
-    private CharProperty unicodeBlockPropertyFor(String name) {
-        final Character.UnicodeBlock block;
-        try {
-            block = Character.UnicodeBlock.forName(name);
-        } catch (IllegalArgumentException iae) {
-            throw error("Unknown character block name {" + name + "}");
+            p = p.negate();
         }
-        return new Block(block);
+        return p;
     }
 
-    /**
-     * Returns a CharProperty matching all characters in a named property.
-     */
-    private CharProperty charPropertyNodeFor(String name) {
-        CharProperty p = CharPropertyNames.charPropertyFor(name);
+    private CharProperty newCharProperty(CharPredicate p) {
         if (p == null)
-            throw error("Unknown character property name {" + name + "}");
-        return p;
+            return null;
+        if (p instanceof BmpCharPredicate)
+            return new BmpCharProperty((BmpCharPredicate)p);
+        else
+            return new CharProperty(p);
     }
 
     /**
      * Parses and returns the name of a "named capturing group", the trailing
      * ">" is consumed after parsing.

@@ -2882,11 +2907,11 @@
                 break;
             case '>':   // (?>xxx)  independent group
                 head = createGroup(true);
                 tail = root;
                 head.next = expr(tail);
-                head = tail = new Ques(head, INDEPENDENT);
+                head = tail = new Ques(head, Qtype.INDEPENDENT);
                 break;
             case '<':   // (?<xxx)  look behind
                 ch = read();
                 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
                     // named captured group

@@ -2968,26 +2993,26 @@
             return node;    // Dual return
         }
 
         if (node instanceof Ques) {
             Ques ques = (Ques) node;
-            if (ques.type == POSSESSIVE) {
+            if (ques.type == Qtype.POSSESSIVE) {
                 root = node;
                 return node;
             }
             tail.next = new BranchConn();
             tail = tail.next;
-            if (ques.type == GREEDY) {
+            if (ques.type == Qtype.GREEDY) {
                 head = new Branch(head, null, tail);
             } else { // Reluctant quantifier
                 head = new Branch(null, head, tail);
             }
             root = tail;
             return head;
         } else if (node instanceof Curly) {
             Curly curly = (Curly) node;
-            if (curly.type == POSSESSIVE) {
+            if (curly.type == Qtype.POSSESSIVE) {
                 root = node;
                 return node;
             }
             // Discover if the group is deterministic
             TreeInfo info = new TreeInfo();

@@ -3000,11 +3025,11 @@
                                              capturingGroup);
                 return head;
             } else { // Non-deterministic
                 int temp = ((GroupHead) head).localIndex;
                 Loop loop;
-                if (curly.type == GREEDY)
+                if (curly.type == Qtype.GREEDY)
                     loop = new Loop(this.localCount, temp);
                 else  // Reluctant Curly
                     loop = new LazyLoop(this.localCount, temp);
                 Prolog prolog = new Prolog(loop);
                 this.localCount += 1;

@@ -3029,10 +3054,14 @@
         int groupIndex = 0;
         if (!anonymous)
             groupIndex = capturingGroupCount++;
         GroupHead head = new GroupHead(localIndex);
         root = new GroupTail(localIndex, groupIndex);
+
+        // for debug/print only, head.match does NOT need the "tail" info
+        head.tail = (GroupTail)root;
+
         if (!anonymous && groupIndex < 10)
             groupNodes[groupIndex] = head;
         return head;
     }
 

@@ -3117,17 +3146,30 @@
         }
     }
 
     static final int MAX_REPS   = 0x7FFFFFFF;
 
-    static final int GREEDY     = 0;
-
-    static final int LAZY       = 1;
-
-    static final int POSSESSIVE = 2;
+    static enum Qtype {
+        GREEDY, LAZY, POSSESSIVE, INDEPENDENT
+    }
 
-    static final int INDEPENDENT = 3;
+    private Node curly(Node prev, int cmin) {
+        int ch = next();
+        if (ch == '?') {
+            next();
+            return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
+        } else if (ch == '+') {
+            next();
+            return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
+        }
+        if (prev instanceof BmpCharProperty) {
+            return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
+        } else if (prev instanceof CharProperty) {
+            return new CharPropertyGreedy((CharProperty)prev, cmin);
+        }
+        return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
+    }
 
     /**
      * Processes repetition. If the next character peeked is a quantifier
      * then new nodes must be appended to handle the repetition.
      * Prev could be a single or a group, so it could be a chain of nodes.

@@ -3138,36 +3180,20 @@
         switch (ch) {
         case '?':
             ch = next();
             if (ch == '?') {
                 next();
-                return new Ques(prev, LAZY);
+                return new Ques(prev, Qtype.LAZY);
             } else if (ch == '+') {
                 next();
-                return new Ques(prev, POSSESSIVE);
+                return new Ques(prev, Qtype.POSSESSIVE);
             }
-            return new Ques(prev, GREEDY);
+            return new Ques(prev, Qtype.GREEDY);
         case '*':
-            ch = next();
-            if (ch == '?') {
-                next();
-                return new Curly(prev, 0, MAX_REPS, LAZY);
-            } else if (ch == '+') {
-                next();
-                return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
-            }
-            return new Curly(prev, 0, MAX_REPS, GREEDY);
+            return curly(prev, 0);
         case '+':
-            ch = next();
-            if (ch == '?') {
-                next();
-                return new Curly(prev, 1, MAX_REPS, LAZY);
-            } else if (ch == '+') {
-                next();
-                return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
-            }
-            return new Curly(prev, 1, MAX_REPS, GREEDY);
+            return curly(prev, 1);
         case '{':
             ch = temp[cursor+1];
             if (ASCII.isDigit(ch)) {
                 skip();
                 int cmin = 0;

@@ -3192,16 +3218,16 @@
                     throw error("Illegal repetition range");
                 Curly curly;
                 ch = peek();
                 if (ch == '?') {
                     next();
-                    curly = new Curly(prev, cmin, cmax, LAZY);
+                    curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
                 } else if (ch == '+') {
                     next();
-                    curly = new Curly(prev, cmin, cmax, POSSESSIVE);
+                    curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
                 } else {
-                    curly = new Curly(prev, cmin, cmax, GREEDY);
+                    curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
                 }
                 return curly;
             } else {
                 throw error("Illegal repetition");
             }

@@ -3374,14 +3400,19 @@
     /**
      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
      *  never matches values above Latin-1, and a complemented BitClass always
      *  matches values above Latin-1.
      */
-    private static final class BitClass extends BmpCharProperty {
+    static final class BitClass extends BmpCharProperty {
         final boolean[] bits;
-        BitClass() { bits = new boolean[256]; }
-        private BitClass(boolean[] bits) { this.bits = bits; }
+        BitClass() {
+            this(new boolean[256]);
+        }
+        private BitClass(boolean[] bits) {
+            super( ch -> ch < 256 && bits[ch]);
+            this.bits = bits;
+        }
         BitClass add(int c, int flags) {
             assert c >= 0 && c <= 255;
             if ((flags & CASE_INSENSITIVE) != 0) {
                 if (ASCII.isAscii(c)) {
                     bits[ASCII.toUpper(c)] = true;

@@ -3392,36 +3423,10 @@
                 }
             }
             bits[c] = true;
             return this;
         }
-        boolean isSatisfiedBy(int ch) {
-            return ch < 256 && bits[ch];
-        }
-    }
-
-    /**
-     *  Returns a suitably optimized, single character matcher.
-     */
-    private CharProperty newSingle(final int ch) {
-        if (has(CASE_INSENSITIVE)) {
-            int lower, upper;
-            if (has(UNICODE_CASE)) {
-                upper = Character.toUpperCase(ch);
-                lower = Character.toLowerCase(upper);
-                if (upper != lower)
-                    return new SingleU(lower);
-            } else if (ASCII.isAscii(ch)) {
-                lower = ASCII.toLower(ch);
-                upper = ASCII.toUpper(ch);
-                if (lower != upper)
-                    return new SingleI(lower, upper);
-            }
-        }
-        if (isSupplementary(ch))
-            return new SingleS(ch);    // Match a given Unicode character
-        return new Single(ch);         // Match a given BMP character
     }
 
     /**
      *  Utility method for creating a string slice matcher.
      */

@@ -3825,22 +3830,21 @@
 
     /**
      * Abstract node class to match one character satisfying some
      * boolean property.
      */
-    private abstract static class CharProperty extends Node {
-        abstract boolean isSatisfiedBy(int ch);
-        CharProperty complement() {
-            return new CharProperty() {
-                    boolean isSatisfiedBy(int ch) {
-                        return ! CharProperty.this.isSatisfiedBy(ch);}};
+    static class CharProperty extends Node {
+        CharPredicate predicate;
+
+        CharProperty (CharPredicate predicate) {
+            this.predicate = predicate;
         }
         boolean match(Matcher matcher, int i, CharSequence seq) {
             if (i < matcher.to) {
                 int ch = Character.codePointAt(seq, i);
-                return isSatisfiedBy(ch)
-                    && next.match(matcher, i+Character.charCount(ch), seq);
+                return predicate.is(ch) &&
+                       next.match(matcher, i + Character.charCount(ch), seq);
             } else {
                 matcher.hitEnd = true;
                 return false;
             }
         }

@@ -3853,151 +3857,72 @@
 
     /**
      * Optimized version of CharProperty that works only for
      * properties never satisfied by Supplementary characters.
      */
-    private abstract static class BmpCharProperty extends CharProperty {
+    private static class BmpCharProperty extends CharProperty {
+        BmpCharProperty (BmpCharPredicate predicate) {
+            super(predicate);
+        }
         boolean match(Matcher matcher, int i, CharSequence seq) {
             if (i < matcher.to) {
-                return isSatisfiedBy(seq.charAt(i))
-                    && next.match(matcher, i+1, seq);
+                return predicate.is(seq.charAt(i)) &&
+                       next.match(matcher, i + 1, seq);
             } else {
                 matcher.hitEnd = true;
                 return false;
             }
         }
     }
 
-    /**
-     * Node class that matches a Supplementary Unicode character
-     */
-    static final class SingleS extends CharProperty {
-        final int c;
-        SingleS(int c) { this.c = c; }
-        boolean isSatisfiedBy(int ch) {
-            return ch == c;
-        }
+    private static class NFCCharProperty extends Node {
+        CharPredicate predicate;
+        NFCCharProperty (CharPredicate predicate) {
+            this.predicate = predicate;
     }
 
-    /**
-     * Optimization -- matches a given BMP character
-     */
-    static final class Single extends BmpCharProperty {
-        final int c;
-        Single(int c) { this.c = c; }
-        boolean isSatisfiedBy(int ch) {
-            return ch == c;
-        }
-    }
-
-    /**
-     * Case insensitive matches a given BMP character
-     */
-    static final class SingleI extends BmpCharProperty {
-        final int lower;
-        final int upper;
-        SingleI(int lower, int upper) {
-            this.lower = lower;
-            this.upper = upper;
-        }
-        boolean isSatisfiedBy(int ch) {
-            return ch == lower || ch == upper;
-        }
-    }
-
-    /**
-     * Unicode case insensitive matches a given Unicode character
-     */
-    static final class SingleU extends CharProperty {
-        final int lower;
-        SingleU(int lower) {
-            this.lower = lower;
-        }
-        boolean isSatisfiedBy(int ch) {
-            return lower == ch ||
-                lower == Character.toLowerCase(Character.toUpperCase(ch));
-        }
-    }
-
-    /**
-     * Node class that matches a Unicode block.
-     */
-    static final class Block extends CharProperty {
-        final Character.UnicodeBlock block;
-        Block(Character.UnicodeBlock block) {
-            this.block = block;
-        }
-        boolean isSatisfiedBy(int ch) {
-            return block == Character.UnicodeBlock.of(ch);
-        }
-    }
-
-    /**
-     * Node class that matches a Unicode script
-     */
-    static final class Script extends CharProperty {
-        final Character.UnicodeScript script;
-        Script(Character.UnicodeScript script) {
-            this.script = script;
-        }
-        boolean isSatisfiedBy(int ch) {
-            return script == Character.UnicodeScript.of(ch);
-        }
-    }
-
-    /**
-     * Node class that matches a Unicode category.
-     */
-    static final class Category extends CharProperty {
-        final int typeMask;
-        Category(int typeMask) { this.typeMask = typeMask; }
-        boolean isSatisfiedBy(int ch) {
-            return (typeMask & (1 << Character.getType(ch))) != 0;
-        }
+        boolean match(Matcher matcher, int i, CharSequence seq) {
+            if (i < matcher.to) {
+                int ch0 = Character.codePointAt(seq, i);
+                int n = Character.charCount(ch0);
+                int j = i + n;
+                while (j < matcher.to) {
+                    int ch1 = Character.codePointAt(seq, j);
+                    if (Grapheme.isBoundary(ch0, ch1))
+                        break;
+                    ch0 = ch1;
+                    j += Character.charCount(ch1);
     }
-
-    /**
-     * Node class that matches a Unicode "type"
-     */
-    static final class Utype extends CharProperty {
-        final UnicodeProp uprop;
-        Utype(UnicodeProp uprop) { this.uprop = uprop; }
-        boolean isSatisfiedBy(int ch) {
-            return uprop.is(ch);
+                if (i + n == j) {    // single, assume nfc cp
+                    if (predicate.is(ch0))
+                        return next.match(matcher, j, seq);
+                } else {
+                    while (i + n < j) {
+                        String nfc = Normalizer.normalize(
+                            seq.toString().substring(i, j), Normalizer.Form.NFC);
+                        if (nfc.codePointCount(0, nfc.length()) == 1) {
+                            if (predicate.is(nfc.codePointAt(0)) &&
+                                next.match(matcher, j, seq)) {
+                                return true;
         }
     }
 
-    /**
-     * Node class that matches a POSIX type.
-     */
-    static final class Ctype extends BmpCharProperty {
-        final int ctype;
-        Ctype(int ctype) { this.ctype = ctype; }
-        boolean isSatisfiedBy(int ch) {
-            return ch < 128 && ASCII.isType(ch, ctype);
+                        ch0 = Character.codePointBefore(seq, j);
+                        j -= Character.charCount(ch0);
         }
     }
-
-    /**
-     * Node class that matches a Perl vertical whitespace
-     */
-    static final class VertWS extends BmpCharProperty {
-        boolean isSatisfiedBy(int cp) {
-            return (cp >= 0x0A && cp <= 0x0D) ||
-                   cp == 0x85 || cp == 0x2028 || cp == 0x2029;
+                if (j < matcher.to)
+                    return false;
         }
+            matcher.hitEnd = true;
+            return false;
     }
 
-    /**
-     * Node class that matches a Perl horizontal whitespace
-     */
-    static final class HorizWS extends BmpCharProperty {
-        boolean isSatisfiedBy(int cp) {
-            return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
-                   cp == 0x1680 || cp == 0x180e ||
-                   cp >= 0x2000 && cp <= 0x200a ||
-                   cp == 0x202f || cp == 0x205f || cp == 0x3000;
+        boolean study(TreeInfo info) {
+            info.minLength++;
+            info.deterministic = false;
+            return next.study(info);
         }
     }
 
     /**
      * Node class that matches an unicode extended grapheme cluster

@@ -4215,85 +4140,17 @@
         int toLower(int c) {
             return Character.toLowerCase(Character.toUpperCase(c));
         }
     }
 
-    private static boolean inRange(int lower, int ch, int upper) {
-        return lower <= ch && ch <= upper;
-    }
-
-    /**
-     * Returns node for matching characters within an explicit value range.
-     */
-    private static CharProperty rangeFor(final int lower,
-                                         final int upper) {
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return inRange(lower, ch, upper);}};
-    }
-
-    /**
-     * Returns node for matching characters within an explicit value
-     * range in a case insensitive manner.
-     */
-    private CharProperty caseInsensitiveRangeFor(final int lower,
-                                                 final int upper) {
-        if (has(UNICODE_CASE))
-            return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    if (inRange(lower, ch, upper))
-                        return true;
-                    int up = Character.toUpperCase(ch);
-                    return inRange(lower, up, upper) ||
-                           inRange(lower, Character.toLowerCase(up), upper);}};
-        return new CharProperty() {
-            boolean isSatisfiedBy(int ch) {
-                return inRange(lower, ch, upper) ||
-                    ASCII.isAscii(ch) &&
-                        (inRange(lower, ASCII.toUpper(ch), upper) ||
-                         inRange(lower, ASCII.toLower(ch), upper));
-            }};
-    }
-
-    /**
-     * Implements the Unicode category ALL and the dot metacharacter when
-     * in dotall mode.
-     */
-    static final class All extends CharProperty {
-        boolean isSatisfiedBy(int ch) {
-            return true;
-        }
-    }
-
-    /**
-     * Node class for the dot metacharacter when dotall is not enabled.
-     */
-    static final class Dot extends CharProperty {
-        boolean isSatisfiedBy(int ch) {
-            return (ch != '\n' && ch != '\r'
-                    && (ch|1) != '\u2029'
-                    && ch != '\u0085');
-        }
-    }
-
-    /**
-     * Node class for the dot metacharacter when dotall is not enabled
-     * but UNIX_LINES is enabled.
-     */
-    static final class UnixDot extends CharProperty {
-        boolean isSatisfiedBy(int ch) {
-            return ch != '\n';
-        }
-    }
-
     /**
      * The 0 or 1 quantifier. This one class implements all three types.
      */
     static final class Ques extends Node {
         Node atom;
-        int type;
-        Ques(Node node, int type) {
+        Qtype type;
+        Ques(Node node, Qtype type) {
             this.atom = node;
             this.type = type;
         }
         boolean match(Matcher matcher, int i, CharSequence seq) {
             switch (type) {

@@ -4309,11 +4166,11 @@
             default:
                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
             }
         }
         boolean study(TreeInfo info) {
-            if (type != INDEPENDENT) {
+            if (type != Qtype.INDEPENDENT) {
                 int minL = info.minLength;
                 atom.study(info);
                 info.minLength = minL;
                 info.deterministic = false;
                 return next.study(info);

@@ -4323,21 +4180,94 @@
             }
         }
     }
 
     /**
+     * Handles the greedy style repetition with the minimum either be
+     * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
+     */
+    static class CharPropertyGreedy extends Node {
+        final CharPredicate predicate;
+        final int cmin;
+
+        CharPropertyGreedy(CharProperty cp, int cmin) {
+            this.predicate = cp.predicate;
+            this.cmin = cmin;
+        }
+        boolean match(Matcher matcher, int i,  CharSequence seq) {
+            int n = 0;
+            int to = matcher.to;
+            // greedy, all the way down
+            while (i < to) {
+                int ch = Character.codePointAt(seq, i);
+                if (!predicate.is(ch))
+                   break;
+                i += Character.charCount(ch);
+                n++;
+            }
+            if (i >= to) {
+                matcher.hitEnd = true;
+            }
+            while (n >= cmin) {
+                if (next.match(matcher, i, seq))
+                    return true;
+                if (n == cmin)
+                    return false;
+                 // backing off if match fails
+                int ch = Character.codePointBefore(seq, i);
+                i -= Character.charCount(ch);
+                n--;
+            }
+            return false; 
+        }
+
+        boolean study(TreeInfo info) {
+            info.minLength += cmin;
+            if (info.maxValid) {
+                info.maxLength += MAX_REPS;
+            }
+            info.deterministic = false;
+            return next.study(info);
+        }
+    }
+
+    static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
+
+        BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
+            super(bcp, cmin);
+        }
+
+        boolean match(Matcher matcher, int i,  CharSequence seq) {
+            int n = 0;
+            int to = matcher.to;
+            while (i < to && predicate.is(seq.charAt(i))) {
+                i++; n++;
+            }
+            if (i >= to) {
+                matcher.hitEnd = true;
+            }
+            while (n >= cmin) {
+                if (next.match(matcher, i, seq))
+                    return true;
+                i--; n--;  // backing off if match fails
+            }
+            return false; 
+        }
+    }
+
+    /**
      * Handles the curly-brace style repetition with a specified minimum and
      * maximum occurrences. The * quantifier is handled as a special case.
      * This class handles the three types.
      */
     static final class Curly extends Node {
         Node atom;
-        int type;
+        Qtype type;
         int cmin;
         int cmax;
 
-        Curly(Node node, int cmin, int cmax, int type) {
+        Curly(Node node, int cmin, int cmax, Qtype type) {
             this.atom = node;
             this.type = type;
             this.cmin = cmin;
             this.cmax = cmax;
         }

@@ -4348,13 +4278,13 @@
                     i = matcher.last;
                     continue;
                 }
                 return false;
             }
-            if (type == GREEDY)
+            if (type == Qtype.GREEDY)
                 return match0(matcher, i, j, seq);
-            else if (type == LAZY)
+            else if (type == Qtype.LAZY)
                 return match1(matcher, i, j, seq);
             else
                 return match2(matcher, i, j, seq);
         }
         // Greedy match.

@@ -4472,18 +4402,18 @@
      * If capture is true then this class saves group settings and ensures
      * that groups are unset when backing off of a group match.
      */
     static final class GroupCurly extends Node {
         Node atom;
-        int type;
+        Qtype type;
         int cmin;
         int cmax;
         int localIndex;
         int groupIndex;
         boolean capture;
 
-        GroupCurly(Node node, int cmin, int cmax, int type, int local,
+        GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
                    int group, boolean capture) {
             this.atom = node;
             this.type = type;
             this.cmin = cmin;
             this.cmax = cmax;

@@ -4519,13 +4449,13 @@
                     ret = false;
                     break;
                 }
             }
             if (ret) {
-                if (type == GREEDY) {
+                if (type == Qtype.GREEDY) {
                     ret = match0(matcher, i, cmin, seq);
-                } else if (type == LAZY) {
+                } else if (type == Qtype.LAZY) {
                     ret = match1(matcher, i, cmin, seq);
                 } else {
                     ret = match2(matcher, i, cmin, seq);
                 }
             }

@@ -4767,10 +4697,11 @@
      * indicate that we do not want to unset the group if the reference
      * doesn't match.
      */
     static final class GroupHead extends Node {
         int localIndex;
+        GroupTail tail;    // for debug/print only, match does not need to know
         GroupHead(int localCount) {
             localIndex = localCount;
         }
         boolean match(Matcher matcher, int i, CharSequence seq) {
             int save = matcher.locals[localIndex];

@@ -5360,40 +5291,10 @@
             return !conditionMatched && next.match(matcher, i, seq);
         }
     }
 
     /**
-     * Returns the set union of two CharProperty nodes.
-     */
-    private static CharProperty union(final CharProperty lhs,
-                                      final CharProperty rhs) {
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
-    }
-
-    /**
-     * Returns the set intersection of two CharProperty nodes.
-     */
-    private static CharProperty intersection(final CharProperty lhs,
-                                             final CharProperty rhs) {
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
-    }
-
-    /**
-     * Returns the set difference of two CharProperty nodes.
-     */
-    private static CharProperty setDifference(final CharProperty lhs,
-                                              final CharProperty rhs) {
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
-    }
-
-    /**
      * Handles word boundaries. Includes a field to allow this one class to
      * deal with the different types of word boundaries we can match. The word
      * characters include underscores, letters, and digits. Non spacing marks
      * can are also part of a word if they have a base character, otherwise
      * they are ignored for purposes of finding word boundaries.

@@ -5409,11 +5310,11 @@
             type = n;
             this.useUWORD = useUWORD;
         }
 
         boolean isWord(int ch) {
-            return useUWORD ? UnicodeProp.WORD.is(ch)
+            return useUWORD ? CharPredicates.WORD.is(ch)
                             : (ch == '_' || Character.isLetterOrDigit(ch));
         }
 
         int check(Matcher matcher, int i, CharSequence seq) {
             int ch;

@@ -5655,219 +5556,159 @@
             matcher.hitEnd = true;
             return false;
         }
     }
 
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
+    @FunctionalInterface
+    static interface CharPredicate {
+        boolean is(int ch);
+
+        default CharPredicate and(CharPredicate p) {
+            return ch -> is(ch) && p.is(ch);
+        }
+        default CharPredicate union(CharPredicate p) {
+            return ch -> is(ch) || p.is(ch);
+        }
+        default CharPredicate union(CharPredicate p1,
+                                    CharPredicate p2 ) {
+            return ch -> is(ch) || p1.is(ch) || p2.is(ch);
+        }
+        default CharPredicate negate() {
+            return ch -> !is(ch);
+        }
+    }
+
+    static interface BmpCharPredicate extends CharPredicate {
+
+        default CharPredicate and(CharPredicate p) {
+            if(p instanceof BmpCharPredicate)
+                return (BmpCharPredicate)((ch) -> is(ch) && p.is(ch));
+            return ch -> is(ch) && p.is(ch);
+        }
+        default CharPredicate union(CharPredicate p) {
+            if (p instanceof BmpCharPredicate)
+                return (BmpCharPredicate)((ch) -> is(ch) || p.is(ch));
+            return ch -> is(ch) || p.is(ch);
+        }
+        static CharPredicate union(CharPredicate... predicates) {
+            CharPredicate cp = ch -> {
+                for (CharPredicate p : predicates) {
+                    if (!p.is(ch))
+                        return false;
+                }
+                return true;
+            };
+            for (CharPredicate p : predicates) {
+                if (! (p instanceof BmpCharPredicate))
+                    return cp;
+            }
+            return (BmpCharPredicate)cp;
+        }
+    }
+
+    /**
+     * matches a Perl vertical whitespace
+     */
+    static BmpCharPredicate VertWS = cp ->
+        (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
 
     /**
-     *  This must be the very first initializer.
+     * matches a Perl horizontal whitespace
      */
-    static Node accept = new Node();
+    static BmpCharPredicate HorizWS = cp ->
+        cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
+        cp == 0x180e || cp >= 0x2000 && cp <= 0x200a ||  cp == 0x202f ||
+        cp == 0x205f || cp == 0x3000;
 
-    static Node lastAccept = new LastNode();
+    /**
+     *  for the Unicode category ALL and the dot metacharacter when
+     *  in dotall mode.
+     */
+    static CharPredicate ALL = ch -> true;
 
-    private static class CharPropertyNames {
+    /**
+     * for the dot metacharacter when dotall is not enabled.
+     */
+    static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
+                                          && (ch|1) != '\u2029'
+                                          && ch != '\u0085');
+    /**
+     *  the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
+     */
+    static CharPredicate UNIXDOT = ch ->  ch != '\n';
 
-        static CharProperty charPropertyFor(String name) {
-            CharPropertyFactory m = map.get(name);
-            return m == null ? null : m.make();
+    /**
+     * Indicate that matches a Supplementary Unicode character
+     */
+    static CharPredicate SingleS(int c) {
+        return ch -> ch == c;
         }
 
-        private abstract static class CharPropertyFactory {
-            abstract CharProperty make();
+    /**
+     * A bmp/optimized predicate of single
+     */
+    static BmpCharPredicate Single(int c) {
+        return ch -> ch == c;
         }
 
-        private static void defCategory(String name,
-                                        final int typeMask) {
-            map.put(name, new CharPropertyFactory() {
-                    CharProperty make() { return new Category(typeMask);}});
+    /**
+     * Case insensitive matches a given BMP character
+     */
+    static BmpCharPredicate SingleI(int lower, int upper) {
+        return ch -> ch == lower || ch == upper;
         }
 
-        private static void defRange(String name,
-                                     final int lower, final int upper) {
-            map.put(name, new CharPropertyFactory() {
-                    CharProperty make() { return rangeFor(lower, upper);}});
+    /**
+     * Unicode case insensitive matches a given Unicode character
+     */
+    static CharPredicate SingleU(int lower) {
+        return ch -> lower == ch ||
+                     lower == Character.toLowerCase(Character.toUpperCase(ch));
         }
 
-        private static void defCtype(String name,
-                                     final int ctype) {
-            map.put(name, new CharPropertyFactory() {
-                    CharProperty make() { return new Ctype(ctype);}});
+    private static boolean inRange(int lower, int ch, int upper) {
+        return lower <= ch && ch <= upper;
         }
 
-        private abstract static class CloneableProperty
-            extends CharProperty implements Cloneable
-        {
-            public CloneableProperty clone() {
-                try {
-                    return (CloneableProperty) super.clone();
-                } catch (CloneNotSupportedException e) {
-                    throw new AssertionError(e);
-                }
-            }
-        }
-
-        private static void defClone(String name,
-                                     final CloneableProperty p) {
-            map.put(name, new CharPropertyFactory() {
-                    CharProperty make() { return p.clone();}});
-        }
-
-        private static final HashMap<String, CharPropertyFactory> map
-            = new HashMap<>();
-
-        static {
-            // Unicode character property aliases, defined in
-            // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
-            defCategory("Cn", 1<<Character.UNASSIGNED);
-            defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
-            defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
-            defCategory("Lt", 1<<Character.TITLECASE_LETTER);
-            defCategory("Lm", 1<<Character.MODIFIER_LETTER);
-            defCategory("Lo", 1<<Character.OTHER_LETTER);
-            defCategory("Mn", 1<<Character.NON_SPACING_MARK);
-            defCategory("Me", 1<<Character.ENCLOSING_MARK);
-            defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
-            defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
-            defCategory("Nl", 1<<Character.LETTER_NUMBER);
-            defCategory("No", 1<<Character.OTHER_NUMBER);
-            defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
-            defCategory("Zl", 1<<Character.LINE_SEPARATOR);
-            defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
-            defCategory("Cc", 1<<Character.CONTROL);
-            defCategory("Cf", 1<<Character.FORMAT);
-            defCategory("Co", 1<<Character.PRIVATE_USE);
-            defCategory("Cs", 1<<Character.SURROGATE);
-            defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
-            defCategory("Ps", 1<<Character.START_PUNCTUATION);
-            defCategory("Pe", 1<<Character.END_PUNCTUATION);
-            defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
-            defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
-            defCategory("Sm", 1<<Character.MATH_SYMBOL);
-            defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
-            defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
-            defCategory("So", 1<<Character.OTHER_SYMBOL);
-            defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
-            defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
-            defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
-                              (1<<Character.LOWERCASE_LETTER) |
-                              (1<<Character.TITLECASE_LETTER) |
-                              (1<<Character.MODIFIER_LETTER)  |
-                              (1<<Character.OTHER_LETTER)));
-            defCategory("M", ((1<<Character.NON_SPACING_MARK) |
-                              (1<<Character.ENCLOSING_MARK)   |
-                              (1<<Character.COMBINING_SPACING_MARK)));
-            defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
-                              (1<<Character.LETTER_NUMBER)        |
-                              (1<<Character.OTHER_NUMBER)));
-            defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
-                              (1<<Character.LINE_SEPARATOR)  |
-                              (1<<Character.PARAGRAPH_SEPARATOR)));
-            defCategory("C", ((1<<Character.CONTROL)     |
-                              (1<<Character.FORMAT)      |
-                              (1<<Character.PRIVATE_USE) |
-                              (1<<Character.SURROGATE))); // Other
-            defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
-                              (1<<Character.START_PUNCTUATION)     |
-                              (1<<Character.END_PUNCTUATION)       |
-                              (1<<Character.CONNECTOR_PUNCTUATION) |
-                              (1<<Character.OTHER_PUNCTUATION)     |
-                              (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
-                              (1<<Character.FINAL_QUOTE_PUNCTUATION)));
-            defCategory("S", ((1<<Character.MATH_SYMBOL)     |
-                              (1<<Character.CURRENCY_SYMBOL) |
-                              (1<<Character.MODIFIER_SYMBOL) |
-                              (1<<Character.OTHER_SYMBOL)));
-            defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
-                               (1<<Character.LOWERCASE_LETTER) |
-                               (1<<Character.TITLECASE_LETTER)));
-            defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
-                               (1<<Character.LOWERCASE_LETTER) |
-                               (1<<Character.TITLECASE_LETTER) |
-                               (1<<Character.MODIFIER_LETTER)  |
-                               (1<<Character.OTHER_LETTER)     |
-                               (1<<Character.DECIMAL_DIGIT_NUMBER)));
-            defRange("L1", 0x00, 0xFF); // Latin-1
-            map.put("all", new CharPropertyFactory() {
-                    CharProperty make() { return new All(); }});
-
-            // Posix regular expression character classes, defined in
-            // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
-            defRange("ASCII", 0x00, 0x7F);   // ASCII
-            defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
-            defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
-            defCtype("Blank", ASCII.BLANK);  // Space and tab characters
-            defCtype("Cntrl", ASCII.CNTRL);  // Control characters
-            defRange("Digit", '0', '9');     // Numeric characters
-            defCtype("Graph", ASCII.GRAPH);  // printable and visible
-            defRange("Lower", 'a', 'z');     // Lower-case alphabetic
-            defRange("Print", 0x20, 0x7E);   // Printable characters
-            defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
-            defCtype("Space", ASCII.SPACE);  // Space characters
-            defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
-            defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
-
-            // Java character properties, defined by methods in Character.java
-            defClone("javaLowerCase", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isLowerCase(ch);}});
-            defClone("javaUpperCase", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isUpperCase(ch);}});
-            defClone("javaAlphabetic", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isAlphabetic(ch);}});
-            defClone("javaIdeographic", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isIdeographic(ch);}});
-            defClone("javaTitleCase", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isTitleCase(ch);}});
-            defClone("javaDigit", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isDigit(ch);}});
-            defClone("javaDefined", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isDefined(ch);}});
-            defClone("javaLetter", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isLetter(ch);}});
-            defClone("javaLetterOrDigit", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isLetterOrDigit(ch);}});
-            defClone("javaJavaIdentifierStart", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isJavaIdentifierStart(ch);}});
-            defClone("javaJavaIdentifierPart", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isJavaIdentifierPart(ch);}});
-            defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isUnicodeIdentifierStart(ch);}});
-            defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isUnicodeIdentifierPart(ch);}});
-            defClone("javaIdentifierIgnorable", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isIdentifierIgnorable(ch);}});
-            defClone("javaSpaceChar", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isSpaceChar(ch);}});
-            defClone("javaWhitespace", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isWhitespace(ch);}});
-            defClone("javaISOControl", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isISOControl(ch);}});
-            defClone("javaMirrored", new CloneableProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return Character.isMirrored(ch);}});
+    /**
+     * Charactrs within a explicit value range
+     */
+    static CharPredicate Range(int lower, int upper) {
+        if (upper < Character.MIN_HIGH_SURROGATE ||
+            lower > Character.MAX_HIGH_SURROGATE &&
+            upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
+            return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
+        return ch -> inRange(lower, ch, upper);
         }
+
+   /**
+    * Charactrs within a explicit value range in a case insensitive manner.
+    */
+    static CharPredicate CIRange(int lower, int upper) {
+        return ch -> inRange(lower, ch, upper) ||
+                     ASCII.isAscii(ch) &&
+                     (inRange(lower, ASCII.toUpper(ch), upper) ||
+                      inRange(lower, ASCII.toLower(ch), upper));
     }
 
+    static CharPredicate CIRangeU(int lower, int upper) {
+        return ch -> {
+            if (inRange(lower, ch, upper))
+                return true;
+            int up = Character.toUpperCase(ch);
+            return inRange(lower, up, upper) ||
+                   inRange(lower, Character.toLowerCase(up), upper);
+        };
+    }
+
+    /**
+     *  This must be the very first initializer.
+     */
+    static Node accept = new Node();
+
+    static Node lastAccept = new LastNode();
+
     /**
      * Creates a predicate which can be used to match a string.
      *
      * @return  The predicate which can be used for matching on a string
      * @since   1.8