< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java

Print this page

        

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this

@@ -20,33 +20,35 @@
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  */
+
 /*
  *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
- *                                                                             *
- * The original version of this source code and documentation is copyrighted   *
- * and owned by IBM, These materials are provided under terms of a License     *
- * Agreement between IBM and Sun. This technology is protected by multiple     *
- * US and International patents. This notice and attribution to IBM may not    *
- * to removed.                                                                 *
+ * Copyright (C) 1996-2015, International Business Machines Corporation and
+ * others. All Rights Reserved.
  *******************************************************************************
  */
-
 package sun.text.normalizer;
 
+import java.io.IOException;
 import java.text.ParsePosition;
-import java.util.Iterator;
+import java.util.ArrayList;
 import java.util.TreeSet;
 
 /**
- * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
- * represent <em>character classes</em> used in regular expressions.
- * A character specifies a subset of Unicode code points.  Legal
- * code points are U+0000 to U+10FFFF, inclusive.
+ * A mutable set of Unicode characters and multicharacter strings.
+ * Objects of this class represent <em>character classes</em> used
+ * in regular expressions. A character specifies a subset of Unicode
+ * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
+ *
+ * <p>Note: method freeze() not only makes the set immutable, but
+ * also makes important methods perform much faster:
+ * contains(c), containsNone(...), span(...), spanBack(...) etc.
+ * After the object is frozen, any subsequent call that wants to change
+ * the object will throw UnsupportedOperationException.
  *
  * <p>The UnicodeSet class is not designed to be subclassed.
  *
  * <p><code>UnicodeSet</code> supports two APIs. The first is the
  * <em>operand</em> API that allows the caller to modify the value of

@@ -116,11 +118,11 @@
  *     </tr>
  *   </table>
  * </blockquote>
  *
  * Any character may be preceded by a backslash in order to remove any special
- * meaning.  White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
+ * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
  * ignored, unless they are escaped.
  *
  * <p>Property patterns specify a set of characters having a certain
  * property as defined by the Unicode standard.  Both the POSIX-like
  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a

@@ -265,18 +267,24 @@
  *       </table>
  *       </td>
  *     </tr>
  *   </table>
  * </blockquote>
- * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
+ * <p>To iterate over contents of UnicodeSet, the following are available:
+ * <ul><li>{@link #ranges()} to iterate through the ranges</li>
+ * <li>{@link #strings()} to iterate through the strings</li>
+ * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
+ * That method is, however, not particularly efficient, since it "boxes" each code point into a String.</li>
+ * </ul>
+ * All of the above can be used in <b>for</b> loops.
+ * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
  *
  * @author Alan Liu
  * @stable ICU 2.0
- * @see UnicodeSetIterator
  */
-@SuppressWarnings("deprecation")
-public class UnicodeSet implements UnicodeMatcher {
+class UnicodeSet {
 
     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
                                              // 110000 for codepoints
 

@@ -297,50 +305,54 @@
     private int[] rangeList; // internal buffer
     private int[] buffer; // internal buffer
 
     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
     // is not private so that UnicodeSetIterator can get access
-    TreeSet<String> strings = new TreeSet<>();
+    TreeSet<String> strings = new TreeSet<String>();
 
     /**
      * The pattern representation of this set.  This may not be the
      * most economical pattern.  It is the pattern supplied to
      * applyPattern(), with variables substituted and whitespace
      * removed.  For sets constructed without applyPattern(), or
      * modified using the non-pattern API, this string will be null,
      * indicating that toPattern() must generate a pattern
      * representation from the inversion list.
      */
-    private String pat = null;
 
     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
 
-    /**
-     * A set of all characters _except_ the second through last characters of
-     * certain ranges.  These ranges are ranges of characters whose
-     * properties are all exactly alike, e.g. CJK Ideographs from
-     * U+4E00 to U+9FA5.
-     */
-    private static UnicodeSet INCLUSIONS[] = null;
+    private static UnicodeSet INCLUSION = null;
+
+    private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
+    private volatile UnicodeSetStringSpan stringSpan;
 
     //----------------------------------------------------------------
     // Public API
     //----------------------------------------------------------------
 
     /**
      * Constructs an empty set.
      * @stable ICU 2.0
      */
-    public UnicodeSet() {
+    private UnicodeSet() {
         list = new int[1 + START_EXTRA];
         list[len++] = HIGH;
     }
 
     /**
-     * Constructs a set containing the given range.
-     * If {@code end > start} then an empty set is created.
+     * Constructs a copy of an existing set.
+     * @stable ICU 2.0
+     */
+    private UnicodeSet(UnicodeSet other) {
+        set(other);
+    }
+
+    /**
+     * Constructs a set containing the given range. If <code>end &lt;
+     * start</code> then an empty set is created.
      *
      * @param start first character, inclusive, of range
      * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */

@@ -357,185 +369,42 @@
      * a syntax error.
      * @stable ICU 2.0
      */
     public UnicodeSet(String pattern) {
         this();
-        applyPattern(pattern, null, null, IGNORE_SPACE);
+        applyPattern(pattern, null);
     }
 
     /**
      * Make this object represent the same set as <code>other</code>.
      * @param other a <code>UnicodeSet</code> whose value will be
      * copied to this object
      * @stable ICU 2.0
      */
-    @SuppressWarnings("unchecked") // Casting result of clone of a collection
     public UnicodeSet set(UnicodeSet other) {
+        checkFrozen();
         list = other.list.clone();
         len = other.len;
-        pat = other.pat;
-        strings = (TreeSet)other.strings.clone();
+        strings = new TreeSet<String>(other.strings);
         return this;
     }
 
     /**
-     * Modifies this set to represent the set specified by the given pattern.
-     * See the class description for the syntax of the pattern language.
-     * Whitespace is ignored.
-     * @param pattern a string specifying what characters are in the set
-     * @exception java.lang.IllegalArgumentException if the pattern
-     * contains a syntax error.
+     * Returns the number of elements in this set (its cardinality).
+     * Note that the elements of a set may include both individual
+     * code points and strings.
+     *
+     * @return the number of elements in this set (its cardinality).
      * @stable ICU 2.0
      */
-    public final UnicodeSet applyPattern(String pattern) {
-        return applyPattern(pattern, null, null, IGNORE_SPACE);
-    }
-
-    /**
-     * Append the <code>toPattern()</code> representation of a
-     * string to the given <code>StringBuffer</code>.
-     */
-    private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
-        for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
-            _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
-        }
-    }
-
-    /**
-     * Append the <code>toPattern()</code> representation of a
-     * character to the given <code>StringBuffer</code>.
-     */
-    private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
-        if (escapeUnprintable && Utility.isUnprintable(c)) {
-            // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
-            // unprintable
-            if (Utility.escapeUnprintable(buf, c)) {
-                return;
-            }
-        }
-        // Okay to let ':' pass through
-        switch (c) {
-        case '[': // SET_OPEN:
-        case ']': // SET_CLOSE:
-        case '-': // HYPHEN:
-        case '^': // COMPLEMENT:
-        case '&': // INTERSECTION:
-        case '\\': //BACKSLASH:
-        case '{':
-        case '}':
-        case '$':
-        case ':':
-            buf.append('\\');
-            break;
-        default:
-            // Escape whitespace
-            if (UCharacterProperty.isRuleWhiteSpace(c)) {
-                buf.append('\\');
-            }
-            break;
-        }
-        UTF16.append(buf, c);
-    }
-
-    /**
-     * Append a string representation of this set to result.  This will be
-     * a cleaned version of the string passed to applyPattern(), if there
-     * is one.  Otherwise it will be generated.
-     */
-    private StringBuffer _toPattern(StringBuffer result,
-                                    boolean escapeUnprintable) {
-        if (pat != null) {
-            int i;
-            int backslashCount = 0;
-            for (i=0; i<pat.length(); ) {
-                int c = UTF16.charAt(pat, i);
-                i += UTF16.getCharCount(c);
-                if (escapeUnprintable && Utility.isUnprintable(c)) {
-                    // If the unprintable character is preceded by an odd
-                    // number of backslashes, then it has been escaped.
-                    // Before unescaping it, we delete the final
-                    // backslash.
-                    if ((backslashCount % 2) == 1) {
-                        result.setLength(result.length() - 1);
-                    }
-                    Utility.escapeUnprintable(result, c);
-                    backslashCount = 0;
-                } else {
-                    UTF16.append(result, c);
-                    if (c == '\\') {
-                        ++backslashCount;
-                    } else {
-                        backslashCount = 0;
-                    }
-                }
-            }
-            return result;
-        }
-
-        return _generatePattern(result, escapeUnprintable, true);
-    }
-
-    /**
-     * Generate and append a string representation of this set to result.
-     * This does not use this.pat, the cleaned up copy of the string
-     * passed to applyPattern().
-     * @param includeStrings if false, doesn't include the strings.
-     * @stable ICU 3.8
-     */
-    public StringBuffer _generatePattern(StringBuffer result,
-                                         boolean escapeUnprintable, boolean includeStrings) {
-        result.append('[');
-
+    public int size() {
+        int n = 0;
         int count = getRangeCount();
-
-        // If the set contains at least 2 intervals and includes both
-        // MIN_VALUE and MAX_VALUE, then the inverse representation will
-        // be more economical.
-        if (count > 1 &&
-            getRangeStart(0) == MIN_VALUE &&
-            getRangeEnd(count-1) == MAX_VALUE) {
-
-            // Emit the inverse
-            result.append('^');
-
-            for (int i = 1; i < count; ++i) {
-                int start = getRangeEnd(i-1)+1;
-                int end = getRangeStart(i)-1;
-                _appendToPat(result, start, escapeUnprintable);
-                if (start != end) {
-                    if ((start+1) != end) {
-                        result.append('-');
-                    }
-                    _appendToPat(result, end, escapeUnprintable);
-                }
-            }
-        }
-
-        // Default; emit the ranges as pairs
-        else {
             for (int i = 0; i < count; ++i) {
-                int start = getRangeStart(i);
-                int end = getRangeEnd(i);
-                _appendToPat(result, start, escapeUnprintable);
-                if (start != end) {
-                    if ((start+1) != end) {
-                        result.append('-');
-                    }
-                    _appendToPat(result, end, escapeUnprintable);
-                }
+            n += getRangeEnd(i) - getRangeStart(i) + 1;
             }
-        }
-
-        if (includeStrings && strings.size() > 0) {
-            Iterator<String> it = strings.iterator();
-            while (it.hasNext()) {
-                result.append('{');
-                _appendToPat(result, it.next(), escapeUnprintable);
-                result.append('}');
-            }
-        }
-        return result.append(']');
+        return n + strings.size();
     }
 
     // for internal use, after checkFrozen has been called
     private UnicodeSet add_unchecked(int start, int end) {
         if (start < MIN_VALUE || start > MAX_VALUE) {

@@ -557,10 +426,11 @@
      * present.  If this set already contains the specified character,
      * the call leaves this set unchanged.
      * @stable ICU 2.0
      */
     public final UnicodeSet add(int c) {
+        checkFrozen();
         return add_unchecked(c);
     }
 
     // for internal use only, after checkFrozen has been called
     private final UnicodeSet add_unchecked(int c) {

@@ -641,11 +511,10 @@
             list[i] = c;
             list[i+1] = c+1;
             len += 2;
         }
 
-        pat = null;
         return this;
     }
 
     /**
      * Adds the specified multicharacter to this set if it is not already

@@ -655,27 +524,29 @@
      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
      * @param s the source string
      * @return this object, for chaining
      * @stable ICU 2.0
      */
-    public final UnicodeSet add(String s) {
+    public final UnicodeSet add(CharSequence s) {
+        checkFrozen();
         int cp = getSingleCP(s);
         if (cp < 0) {
-            strings.add(s);
-            pat = null;
+            strings.add(s.toString());
         } else {
             add_unchecked(cp, cp);
         }
         return this;
     }
 
     /**
+     * Utility for getting the code point from a CharSequence that consists of a single code point.
+     * See the public UTF16.getSingleCodePoint().
      * @return a code point IF the string consists of a single one.
      * otherwise returns -1.
-     * @param string to test
+     * @param s the character sequence to test
      */
-    private static int getSingleCP(String s) {
+    private static int getSingleCP(CharSequence s) {
         if (s.length() < 1) {
             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
         }
         if (s.length() > 2) return -1;
         if (s.length() == 1) return s.charAt(0);

@@ -699,39 +570,20 @@
      * @param end last character, inclusive, of range to be removed
      * from this set.
      * @stable ICU 2.0
      */
     public UnicodeSet complement(int start, int end) {
+        checkFrozen();
         if (start < MIN_VALUE || start > MAX_VALUE) {
             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
         }
         if (end < MIN_VALUE || end > MAX_VALUE) {
             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
         }
         if (start <= end) {
             xor(range(start, end), 2, 0);
         }
-        pat = null;
-        return this;
-    }
-
-    /**
-     * This is equivalent to
-     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
-     * @stable ICU 2.0
-     */
-    public UnicodeSet complement() {
-        if (list[0] == LOW) {
-            System.arraycopy(list, 1, list, 0, len-1);
-            --len;
-        } else {
-            ensureCapacity(len+1);
-            System.arraycopy(list, 0, list, 1, len);
-            list[0] = LOW;
-            ++len;
-        }
-        pat = null;
         return this;
     }
 
     /**
      * Returns true if this set contains the given character.

@@ -741,10 +593,16 @@
      */
     public boolean contains(int c) {
         if (c < MIN_VALUE || c > MAX_VALUE) {
             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
         }
+        if (bmpSet != null) {
+            return bmpSet.contains(c);
+        }
+        if (stringSpan != null) {
+            return stringSpan.contains(c);
+        }
 
         /*
         // Set i to the index of the start item greater than ch
         // We know we will terminate without length test!
         int i = -1;

@@ -798,66 +656,35 @@
             }
         }
     }
 
     /**
-     * Adds all of the elements in the specified set to this set if
-     * they're not already present.  This operation effectively
-     * modifies this set so that its value is the <i>union</i> of the two
-     * sets.  The behavior of this operation is unspecified if the specified
-     * collection is modified while the operation is in progress.
-     *
-     * @param c set whose elements are to be added to this set.
-     * @stable ICU 2.0
-     */
-    public UnicodeSet addAll(UnicodeSet c) {
-        add(c.list, c.len, 0);
-        strings.addAll(c.strings);
-        return this;
-    }
-
-    /**
      * Retains only the elements in this set that are contained in the
      * specified set.  In other words, removes from this set all of
      * its elements that are not contained in the specified set.  This
      * operation effectively modifies this set so that its value is
      * the <i>intersection</i> of the two sets.
      *
      * @param c set that defines which elements this set will retain.
      * @stable ICU 2.0
      */
     public UnicodeSet retainAll(UnicodeSet c) {
+        checkFrozen();
         retain(c.list, c.len, 0);
         strings.retainAll(c.strings);
         return this;
     }
 
     /**
-     * Removes from this set all of its elements that are contained in the
-     * specified set.  This operation effectively modifies this
-     * set so that its value is the <i>asymmetric set difference</i> of
-     * the two sets.
-     *
-     * @param c set that defines which elements will be removed from
-     *          this set.
-     * @stable ICU 2.0
-     */
-    public UnicodeSet removeAll(UnicodeSet c) {
-        retain(c.list, c.len, 2);
-        strings.removeAll(c.strings);
-        return this;
-    }
-
-    /**
      * Removes all of the elements from this set.  This set will be
      * empty after this call returns.
      * @stable ICU 2.0
      */
     public UnicodeSet clear() {
+        checkFrozen();
         list[0] = HIGH;
         len = 1;
-        pat = null;
         strings.clear();
         return this;
     }
 
     /**

@@ -921,409 +748,22 @@
      * is the last character of the pattern string.
      * @return an inversion list for the parsed substring
      * of <code>pattern</code>
      * @exception java.lang.IllegalArgumentException if the parse fails.
      */
-    UnicodeSet applyPattern(String pattern,
-                      ParsePosition pos,
-                      SymbolTable symbols,
-                      int options) {
-
-        // Need to build the pattern in a temporary string because
-        // _applyPattern calls add() etc., which set pat to empty.
-        boolean parsePositionWasNull = pos == null;
-        if (parsePositionWasNull) {
-            pos = new ParsePosition(0);
-        }
-
-        StringBuffer rebuiltPat = new StringBuffer();
-        RuleCharacterIterator chars =
-            new RuleCharacterIterator(pattern, symbols, pos);
-        applyPattern(chars, symbols, rebuiltPat, options);
-        if (chars.inVariable()) {
-            syntaxError(chars, "Extra chars in variable value");
-        }
-        pat = rebuiltPat.toString();
-        if (parsePositionWasNull) {
-            int i = pos.getIndex();
-
-            // Skip over trailing whitespace
-            if ((options & IGNORE_SPACE) != 0) {
-                i = Utility.skipWhitespace(pattern, i);
-            }
-
-            if (i != pattern.length()) {
-                throw new IllegalArgumentException("Parse of \"" + pattern +
-                                                   "\" failed at " + i);
-            }
-        }
-        return this;
-    }
-
-    /**
-     * Parse the pattern from the given RuleCharacterIterator.  The
-     * iterator is advanced over the parsed pattern.
-     * @param chars iterator over the pattern characters.  Upon return
-     * it will be advanced to the first character after the parsed
-     * pattern, or the end of the iteration if all characters are
-     * parsed.
-     * @param symbols symbol table to use to parse and dereference
-     * variables, or null if none.
-     * @param rebuiltPat the pattern that was parsed, rebuilt or
-     * copied from the input pattern, as appropriate.
-     * @param options a bit mask of zero or more of the following:
-     * IGNORE_SPACE, CASE.
-     */
-    void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
-                      StringBuffer rebuiltPat, int options) {
-        // Syntax characters: [ ] ^ - & { }
-
-        // Recognized special forms for chars, sets: c-c s-s s&s
-
-        int opts = RuleCharacterIterator.PARSE_VARIABLES |
-                   RuleCharacterIterator.PARSE_ESCAPES;
-        if ((options & IGNORE_SPACE) != 0) {
-            opts |= RuleCharacterIterator.SKIP_WHITESPACE;
-        }
-
-        StringBuffer patBuf = new StringBuffer(), buf = null;
-        boolean usePat = false;
-        UnicodeSet scratch = null;
-        Object backup = null;
-
-        // mode: 0=before [, 1=between [...], 2=after ]
-        // lastItem: 0=none, 1=char, 2=set
-        int lastItem = 0, lastChar = 0, mode = 0;
-        char op = 0;
-
-        boolean invert = false;
-
-        clear();
-
-        while (mode != 2 && !chars.atEnd()) {
-            if (false) {
-                // Debugging assertion
-                if (!((lastItem == 0 && op == 0) ||
-                      (lastItem == 1 && (op == 0 || op == '-')) ||
-                      (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
-                    throw new IllegalArgumentException();
-                }
-            }
-
-            int c = 0;
-            boolean literal = false;
-            UnicodeSet nested = null;
-
-            // -------- Check for property pattern
-
-            // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
-            int setMode = 0;
-            if (resemblesPropertyPattern(chars, opts)) {
-                setMode = 2;
-            }
-
-            // -------- Parse '[' of opening delimiter OR nested set.
-            // If there is a nested set, use `setMode' to define how
-            // the set should be parsed.  If the '[' is part of the
-            // opening delimiter for this pattern, parse special
-            // strings "[", "[^", "[-", and "[^-".  Check for stand-in
-            // characters representing a nested set in the symbol
-            // table.
-
-            else {
-                // Prepare to backup if necessary
-                backup = chars.getPos(backup);
-                c = chars.next(opts);
-                literal = chars.isEscaped();
-
-                if (c == '[' && !literal) {
-                    if (mode == 1) {
-                        chars.setPos(backup); // backup
-                        setMode = 1;
-                    } else {
-                        // Handle opening '[' delimiter
-                        mode = 1;
-                        patBuf.append('[');
-                        backup = chars.getPos(backup); // prepare to backup
-                        c = chars.next(opts);
-                        literal = chars.isEscaped();
-                        if (c == '^' && !literal) {
-                            invert = true;
-                            patBuf.append('^');
-                            backup = chars.getPos(backup); // prepare to backup
-                            c = chars.next(opts);
-                            literal = chars.isEscaped();
-                        }
-                        // Fall through to handle special leading '-';
-                        // otherwise restart loop for nested [], \p{}, etc.
-                        if (c == '-') {
-                            literal = true;
-                            // Fall through to handle literal '-' below
-                        } else {
-                            chars.setPos(backup); // backup
-                            continue;
-                        }
-                    }
-                } else if (symbols != null) {
-                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
-                     if (m != null) {
-                         try {
-                             nested = (UnicodeSet) m;
-                             setMode = 3;
-                         } catch (ClassCastException e) {
-                             syntaxError(chars, "Syntax error");
-                         }
-                     }
-                }
-            }
-
-            // -------- Handle a nested set.  This either is inline in
-            // the pattern or represented by a stand-in that has
-            // previously been parsed and was looked up in the symbol
-            // table.
-
-            if (setMode != 0) {
-                if (lastItem == 1) {
-                    if (op != 0) {
-                        syntaxError(chars, "Char expected after operator");
-                    }
-                    add_unchecked(lastChar, lastChar);
-                    _appendToPat(patBuf, lastChar, false);
-                    lastItem = op = 0;
-                }
-
-                if (op == '-' || op == '&') {
-                    patBuf.append(op);
-                }
-
-                if (nested == null) {
-                    if (scratch == null) scratch = new UnicodeSet();
-                    nested = scratch;
-                }
-                switch (setMode) {
-                case 1:
-                    nested.applyPattern(chars, symbols, patBuf, options);
-                    break;
-                case 2:
-                    chars.skipIgnored(opts);
-                    nested.applyPropertyPattern(chars, patBuf, symbols);
-                    break;
-                case 3: // `nested' already parsed
-                    nested._toPattern(patBuf, false);
-                    break;
-                }
-
-                usePat = true;
-
-                if (mode == 0) {
-                    // Entire pattern is a category; leave parse loop
-                    set(nested);
-                    mode = 2;
-                    break;
-                }
-
-                switch (op) {
-                case '-':
-                    removeAll(nested);
-                    break;
-                case '&':
-                    retainAll(nested);
-                    break;
-                case 0:
-                    addAll(nested);
-                    break;
-                }
-
-                op = 0;
-                lastItem = 2;
-
-                continue;
-            }
-
-            if (mode == 0) {
-                syntaxError(chars, "Missing '['");
-            }
-
-            // -------- Parse special (syntax) characters.  If the
-            // current character is not special, or if it is escaped,
-            // then fall through and handle it below.
-
-            if (!literal) {
-                switch (c) {
-                case ']':
-                    if (lastItem == 1) {
-                        add_unchecked(lastChar, lastChar);
-                        _appendToPat(patBuf, lastChar, false);
-                    }
-                    // Treat final trailing '-' as a literal
-                    if (op == '-') {
-                        add_unchecked(op, op);
-                        patBuf.append(op);
-                    } else if (op == '&') {
-                        syntaxError(chars, "Trailing '&'");
-                    }
-                    patBuf.append(']');
-                    mode = 2;
-                    continue;
-                case '-':
-                    if (op == 0) {
-                        if (lastItem != 0) {
-                            op = (char) c;
-                            continue;
-                        } else {
-                            // Treat final trailing '-' as a literal
-                            add_unchecked(c, c);
-                            c = chars.next(opts);
-                            literal = chars.isEscaped();
-                            if (c == ']' && !literal) {
-                                patBuf.append("-]");
-                                mode = 2;
-                                continue;
-                            }
-                        }
-                    }
-                    syntaxError(chars, "'-' not after char or set");
-                    break;
-                case '&':
-                    if (lastItem == 2 && op == 0) {
-                        op = (char) c;
-                        continue;
-                    }
-                    syntaxError(chars, "'&' not after set");
-                    break;
-                case '^':
-                    syntaxError(chars, "'^' not after '['");
-                    break;
-                case '{':
-                    if (op != 0) {
-                        syntaxError(chars, "Missing operand after operator");
-                    }
-                    if (lastItem == 1) {
-                        add_unchecked(lastChar, lastChar);
-                        _appendToPat(patBuf, lastChar, false);
-                    }
-                    lastItem = 0;
-                    if (buf == null) {
-                        buf = new StringBuffer();
-                    } else {
-                        buf.setLength(0);
-                    }
-                    boolean ok = false;
-                    while (!chars.atEnd()) {
-                        c = chars.next(opts);
-                        literal = chars.isEscaped();
-                        if (c == '}' && !literal) {
-                            ok = true;
-                            break;
-                        }
-                        UTF16.append(buf, c);
-                    }
-                    if (buf.length() < 1 || !ok) {
-                        syntaxError(chars, "Invalid multicharacter string");
-                    }
-                    // We have new string. Add it to set and continue;
-                    // we don't need to drop through to the further
-                    // processing
-                    add(buf.toString());
-                    patBuf.append('{');
-                    _appendToPat(patBuf, buf.toString(), false);
-                    patBuf.append('}');
-                    continue;
-                case SymbolTable.SYMBOL_REF:
-                    //         symbols  nosymbols
-                    // [a-$]   error    error (ambiguous)
-                    // [a$]    anchor   anchor
-                    // [a-$x]  var "x"* literal '$'
-                    // [a-$.]  error    literal '$'
-                    // *We won't get here in the case of var "x"
-                    backup = chars.getPos(backup);
-                    c = chars.next(opts);
-                    literal = chars.isEscaped();
-                    boolean anchor = (c == ']' && !literal);
-                    if (symbols == null && !anchor) {
-                        c = SymbolTable.SYMBOL_REF;
-                        chars.setPos(backup);
-                        break; // literal '$'
-                    }
-                    if (anchor && op == 0) {
-                        if (lastItem == 1) {
-                            add_unchecked(lastChar, lastChar);
-                            _appendToPat(patBuf, lastChar, false);
-                        }
-                        add_unchecked(UnicodeMatcher.ETHER);
-                        usePat = true;
-                        patBuf.append(SymbolTable.SYMBOL_REF).append(']');
-                        mode = 2;
-                        continue;
-                    }
-                    syntaxError(chars, "Unquoted '$'");
-                    break;
-                default:
-                    break;
-                }
-            }
-
-            // -------- Parse literal characters.  This includes both
-            // escaped chars ("\u4E01") and non-syntax characters
-            // ("a").
-
-            switch (lastItem) {
-            case 0:
-                lastItem = 1;
-                lastChar = c;
-                break;
-            case 1:
-                if (op == '-') {
-                    if (lastChar >= c) {
-                        // Don't allow redundant (a-a) or empty (b-a) ranges;
-                        // these are most likely typos.
-                        syntaxError(chars, "Invalid range");
-                    }
-                    add_unchecked(lastChar, c);
-                    _appendToPat(patBuf, lastChar, false);
-                    patBuf.append(op);
-                    _appendToPat(patBuf, c, false);
-                    lastItem = op = 0;
-                } else {
-                    add_unchecked(lastChar, lastChar);
-                    _appendToPat(patBuf, lastChar, false);
-                    lastChar = c;
-                }
-                break;
-            case 2:
-                if (op != 0) {
-                    syntaxError(chars, "Set expected after operator");
-                }
-                lastChar = c;
-                lastItem = 1;
-                break;
-            }
-        }
-
-        if (mode != 2) {
-            syntaxError(chars, "Missing ']'");
-        }
-
-        chars.skipIgnored(opts);
-
-        if (invert) {
-            complement();
-        }
-
-        // Use the rebuilt pattern (pat) only if necessary.  Prefer the
-        // generated pattern.
-        if (usePat) {
-            rebuiltPat.append(patBuf.toString());
+    private UnicodeSet applyPattern(String pattern,
+            ParsePosition pos) {
+        if ("[:age=3.2:]".equals(pattern)) {
+            checkFrozen();
+            VersionInfo version = VersionInfo.getInstance("3.2");
+            applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
         } else {
-            _generatePattern(rebuiltPat, false, true);
-        }
+            throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
+                          + pattern + ")");
     }
 
-    private static void syntaxError(RuleCharacterIterator chars, String msg) {
-        throw new IllegalArgumentException("Error: " + msg + " at \"" +
-                                           Utility.escape(chars.toString()) +
-                                           '"');
+        return this;
     }
 
     //----------------------------------------------------------------
     // Implementation: Utility methods
     //----------------------------------------------------------------

@@ -1395,11 +835,10 @@
         }
         // swap list and buffer
         int[] temp = list;
         list = buffer;
         buffer = temp;
-        pat = null;
         return this;
     }
 
     // polarity = 0 is normal: x union y
     // polarity = 2: x union ~y

@@ -1493,11 +932,10 @@
         len = k;
         // swap list and buffer
         int[] temp = list;
         list = buffer;
         buffer = temp;
-        pat = null;
         return this;
     }
 
     // polarity = 0 is normal: x intersect y
     // polarity = 2: x intersect ~y == set-minus

@@ -1564,11 +1002,10 @@
         len = k;
         // swap list and buffer
         int[] temp = list;
         list = buffer;
         buffer = temp;
-        pat = null;
         return this;
     }
 
     private static final int max(int a, int b) {
         return (a > b) ? a : b;

@@ -1580,62 +1017,50 @@
 
     private static interface Filter {
         boolean contains(int codePoint);
     }
 
-    // VersionInfo for unassigned characters
-    static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+    private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
 
     private static class VersionFilter implements Filter {
         VersionInfo version;
-
         VersionFilter(VersionInfo version) { this.version = version; }
-
         public boolean contains(int ch) {
             VersionInfo v = UCharacter.getAge(ch);
             // Reference comparison ok; VersionInfo caches and reuses
             // unique objects.
             return v != NO_VERSION &&
                    v.compareTo(version) <= 0;
         }
     }
 
     private static synchronized UnicodeSet getInclusions(int src) {
-        if (INCLUSIONS == null) {
-            INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
-        }
-        if(INCLUSIONS[src] == null) {
-            UnicodeSet incl = new UnicodeSet();
-            switch(src) {
-            case UCharacterProperty.SRC_PROPSVEC:
-                UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
-                break;
-            default:
+        if (src != UCharacterProperty.SRC_PROPSVEC) {
                 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
             }
-            INCLUSIONS[src] = incl;
+
+        if (INCLUSION == null) {
+            UnicodeSet incl = new UnicodeSet();
+            UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
+            INCLUSION = incl;
         }
-        return INCLUSIONS[src];
+        return INCLUSION;
     }
 
     /**
      * Generic filter-based scanning code for UCD property UnicodeSets.
      */
     private UnicodeSet applyFilter(Filter filter, int src) {
-        // Walk through all Unicode characters, noting the start
+        // Logically, walk through all Unicode characters, noting the start
         // and end of each range for which filter.contain(c) is
         // true.  Add each range to a set.
         //
-        // To improve performance, use the INCLUSIONS set, which
+        // To improve performance, use an inclusions set which
         // encodes information about character ranges that are known
-        // to have identical properties, such as the CJK Ideographs
-        // from U+4E00 to U+9FA5.  INCLUSIONS contains all characters
-        // except the first characters of such ranges.
-        //
-        // TODO Where possible, instead of scanning over code points,
-        // use internal property data to initialize UnicodeSets for
-        // those properties.  Scanning code points is slow.
+        // to have identical properties.
+        // getInclusions(src) contains exactly the first characters of
+        // same-value ranges for the given properties "source".
 
         clear();
 
         int startHasProperty = -1;
         UnicodeSet inclusions = getInclusions(src);

@@ -1666,206 +1091,317 @@
 
         return this;
     }
 
     /**
-     * Remove leading and trailing rule white space and compress
-     * internal rule white space to a single space character.
+     * Is this frozen, according to the Freezable interface?
      *
-     * @see UCharacterProperty#isRuleWhiteSpace
+     * @return true if this set is frozen, false otherwise
+     * @stable ICU 3.8
      */
-    private static String mungeCharName(String source) {
-        StringBuffer buf = new StringBuffer();
-        for (int i=0; i<source.length(); ) {
-            int ch = UTF16.charAt(source, i);
-            i += UTF16.getCharCount(ch);
-            if (UCharacterProperty.isRuleWhiteSpace(ch)) {
-                if (buf.length() == 0 ||
-                    buf.charAt(buf.length() - 1) == ' ') {
-                    continue;
-                }
-                ch = ' '; // convert to ' '
-            }
-            UTF16.append(buf, ch);
-        }
-        if (buf.length() != 0 &&
-            buf.charAt(buf.length() - 1) == ' ') {
-            buf.setLength(buf.length() - 1);
-        }
-        return buf.toString();
+    public boolean isFrozen() {
+        return (bmpSet != null || stringSpan != null);
     }
 
     /**
-     * Modifies this set to contain those code points which have the
-     * given value for the given property.  Prior contents of this
-     * set are lost.
-     * @param propertyAlias the property alias
-     * @param valueAlias the value alias
-     * @param symbols if not null, then symbols are first called to see if a property
-     * is available. If true, then everything else is skipped.
-     * @return this set
-     * @stable ICU 3.2
-     */
-    public UnicodeSet applyPropertyAlias(String propertyAlias,
-                                         String valueAlias, SymbolTable symbols) {
-        if (valueAlias.length() > 0) {
-            if (propertyAlias.equals("Age")) {
-                // Must munge name, since
-                // VersionInfo.getInstance() does not do
-                // 'loose' matching.
-                VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
-                applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
-                return this;
-            }
+     * Freeze this class, according to the Freezable interface.
+     *
+     * @return this
+     * @stable ICU 4.4
+     */
+    public UnicodeSet freeze() {
+        if (!isFrozen()) {
+            // Do most of what compact() does before freezing because
+            // compact() will not work when the set is frozen.
+            // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
+
+            // Delete buffer first to reduce memory fragmentation.
+            buffer = null;
+            if (list.length > (len + GROW_EXTRA)) {
+                // Make the capacity equal to len or 1.
+                // We don't want to allocate a zero-length array.
+                int capacity = (len == 0) ? 1 : len;
+                int[] oldList = list;
+                list = new int[capacity];
+                for (int i = capacity; i-- > 0;) {
+                    list[i] = oldList[i];
         }
-        throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
     }
 
-    /**
-     * Return true if the given iterator appears to point at a
-     * property pattern.  Regardless of the result, return with the
-     * iterator unchanged.
-     * @param chars iterator over the pattern characters.  Upon return
-     * it will be unchanged.
-     * @param iterOpts RuleCharacterIterator options
-     */
-    private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
-                                                    int iterOpts) {
-        boolean result = false;
-        iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
-        Object pos = chars.getPos(null);
-        int c = chars.next(iterOpts);
-        if (c == '[' || c == '\\') {
-            int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
-            result = (c == '[') ? (d == ':') :
-                     (d == 'N' || d == 'p' || d == 'P');
+            // Optimize contains() and span() and similar functions.
+            if (!strings.isEmpty()) {
+                stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
+            }
+            if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
+                // Optimize for code point spans.
+                // There are no strings, or
+                // all strings are irrelevant for span() etc. because
+                // all of each string's code points are contained in this set.
+                // However, fully contained strings are relevant for spanAndCount(),
+                // so we create both objects.
+                bmpSet = new BMPSet(list, len);
         }
-        chars.setPos(pos);
-        return result;
+        }
+        return this;
     }
 
     /**
-     * Parse the given property pattern at the given parse position.
-     * @param symbols TODO
+     * Span a string using this UnicodeSet.
+     * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+     * @param s The string to be spanned
+     * @param spanCondition The span condition
+     * @return the length of the span
+     * @stable ICU 4.4
      */
-    private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
-        int pos = ppos.getIndex();
-
-        // On entry, ppos should point to one of the following locations:
+    public int span(CharSequence s, SpanCondition spanCondition) {
+        return span(s, 0, spanCondition);
+    }
 
-        // Minimum length is 5 characters, e.g. \p{L}
-        if ((pos+5) > pattern.length()) {
-            return null;
-        }
-
-        boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
-        boolean isName = false; // true for \N{pat}, o/w false
-        boolean invert = false;
-
-        // Look for an opening [:, [:^, \p, or \P
-        if (pattern.regionMatches(pos, "[:", 0, 2)) {
-            posix = true;
-            pos = Utility.skipWhitespace(pattern, pos+2);
-            if (pos < pattern.length() && pattern.charAt(pos) == '^') {
-                ++pos;
-                invert = true;
-            }
-        } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
-                   pattern.regionMatches(pos, "\\N", 0, 2)) {
-            char c = pattern.charAt(pos+1);
-            invert = (c == 'P');
-            isName = (c == 'N');
-            pos = Utility.skipWhitespace(pattern, pos+2);
-            if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
-                // Syntax error; "\p" or "\P" not followed by "{"
-                return null;
+    /**
+     * Span a string using this UnicodeSet.
+     *   If the start index is less than 0, span will start from 0.
+     *   If the start index is greater than or equal to the string length, span returns the string length.
+     * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+     * @param s The string to be spanned
+     * @param start The index at which the span begins
+     * @param spanCondition The span condition
+     * @return the string index which ends the span (i.e. exclusive)
+     * @stable ICU 4.4
+     */
+    public int span(CharSequence s, int start, SpanCondition spanCondition) {
+        int end = s.length();
+        if (start < 0) {
+            start = 0;
+        } else if (start >= end) {
+            return end;
             }
-        } else {
-            // Open delimiter not seen
-            return null;
+        if (bmpSet != null) {
+            // Frozen set without strings, or no string is relevant for span().
+            return bmpSet.span(s, start, spanCondition, null);
+        }
+        if (stringSpan != null) {
+            return stringSpan.span(s, start, spanCondition);
+        } else if (!strings.isEmpty()) {
+            int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+                    : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+            UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+            if (strSpan.needsStringSpanUTF16()) {
+                return strSpan.span(s, start, spanCondition);
         }
-
-        // Look for the matching close delimiter, either :] or }
-        int close = pattern.indexOf(posix ? ":]" : "}", pos);
-        if (close < 0) {
-            // Syntax error; close delimiter missing
-            return null;
-        }
-
-        // Look for an '=' sign.  If this is present, we will parse a
-        // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
-        // pattern.
-        int equals = pattern.indexOf('=', pos);
-        String propName, valueName;
-        if (equals >= 0 && equals < close && !isName) {
-            // Equals seen; parse medium/long pattern
-            propName = pattern.substring(pos, equals);
-            valueName = pattern.substring(equals+1, close);
         }
 
-        else {
-            // Handle case where no '=' is seen, and \N{}
-            propName = pattern.substring(pos, close);
-            valueName = "";
+        return spanCodePointsAndCount(s, start, spanCondition, null);
+    }
 
-            // Handle \N{name}
-            if (isName) {
-                // This is a little inefficient since it means we have to
-                // parse "na" back to UProperty.NAME even though we already
-                // know it's UProperty.NAME.  If we refactor the API to
-                // support args of (int, String) then we can remove
-                // "na" and make this a little more efficient.
-                valueName = propName;
-                propName = "na";
+    /**
+     * Same as span() but also counts the smallest number of set elements on any path across the span.
+     * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+     * @param outCount An output-only object (must not be null) for returning the count.
+     * @return the limit (exclusive end) of the span
+     */
+    public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
+        if (outCount == null) {
+            throw new IllegalArgumentException("outCount must not be null");
             }
+        int end = s.length();
+        if (start < 0) {
+            start = 0;
+        } else if (start >= end) {
+            return end;
+        }
+        if (stringSpan != null) {
+            // We might also have bmpSet != null,
+            // but fully-contained strings are relevant for counting elements.
+            return stringSpan.spanAndCount(s, start, spanCondition, outCount);
+        } else if (bmpSet != null) {
+            return bmpSet.span(s, start, spanCondition, outCount);
+        } else if (!strings.isEmpty()) {
+            int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+                    : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+            which |= UnicodeSetStringSpan.WITH_COUNT;
+            UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+            return strSpan.spanAndCount(s, start, spanCondition, outCount);
         }
 
-        applyPropertyAlias(propName, valueName, symbols);
-
-        if (invert) {
-            complement();
+        return spanCodePointsAndCount(s, start, spanCondition, outCount);
         }
 
-        // Move to the limit position after the close delimiter
-        ppos.setIndex(close + (posix ? 2 : 1));
+    private int spanCodePointsAndCount(CharSequence s, int start,
+            SpanCondition spanCondition, OutputInt outCount) {
+        // Any condition other than NOT_CONTAINED is treated as "contained".
+        boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
 
-        return this;
+        int c;
+        int next = start;
+        int length = s.length();
+        int count = 0;
+        do {
+            c = Character.codePointAt(s, next);
+            if (spanContained != contains(c)) {
+                break;
+            }
+            ++count;
+            next += Character.charCount(c);
+        } while (next < length);
+        if (outCount != null) { outCount.value = count; }
+        return next;
     }
 
     /**
-     * Parse a property pattern.
-     * @param chars iterator over the pattern characters.  Upon return
-     * it will be advanced to the first character after the parsed
-     * pattern, or the end of the iteration if all characters are
-     * parsed.
-     * @param rebuiltPat the pattern that was parsed, rebuilt or
-     * copied from the input pattern, as appropriate.
-     * @param symbols TODO
-     */
-    private void applyPropertyPattern(RuleCharacterIterator chars,
-                                      StringBuffer rebuiltPat, SymbolTable symbols) {
-        String patStr = chars.lookahead();
-        ParsePosition pos = new ParsePosition(0);
-        applyPropertyPattern(patStr, pos, symbols);
-        if (pos.getIndex() == 0) {
-            syntaxError(chars, "Invalid property pattern");
+     * Span a string backwards (from the fromIndex) using this UnicodeSet.
+     * If the fromIndex is less than or equal to 0, spanBack will return 0.
+     * If fromIndex is greater than the string length, spanBack will start from the string length.
+     * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+     * @param s The string to be spanned
+     * @param fromIndex The index (exclusive) from which the string is spanned backwards
+     * @param spanCondition The span condition
+     * @return The string index which starts the span (i.e. inclusive).
+     * @stable ICU 4.4
+     */
+    public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
+        if (fromIndex <= 0) {
+            return 0;
+        }
+        if (fromIndex > s.length()) {
+            fromIndex = s.length();
+        }
+        if (bmpSet != null) {
+            // Frozen set without strings, or no string is relevant for spanBack().
+            return bmpSet.spanBack(s, fromIndex, spanCondition);
+        }
+        if (stringSpan != null) {
+            return stringSpan.spanBack(s, fromIndex, spanCondition);
+        } else if (!strings.isEmpty()) {
+            int which = (spanCondition == SpanCondition.NOT_CONTAINED)
+                    ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
+                            : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
+            UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+            if (strSpan.needsStringSpanUTF16()) {
+                return strSpan.spanBack(s, fromIndex, spanCondition);
         }
-        chars.jumpahead(pos.getIndex());
-        rebuiltPat.append(patStr, 0, pos.getIndex());
     }
 
-    //----------------------------------------------------------------
-    // Case folding API
-    //----------------------------------------------------------------
+        // Any condition other than NOT_CONTAINED is treated as "contained".
+        boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
+
+        int c;
+        int prev = fromIndex;
+        do {
+            c = Character.codePointBefore(s, prev);
+            if (spanContained != contains(c)) {
+                break;
+            }
+            prev -= Character.charCount(c);
+        } while (prev > 0);
+        return prev;
+    }
 
     /**
-     * Bitmask for constructor and applyPattern() indicating that
-     * white space should be ignored.  If set, ignore characters for
-     * which UCharacterProperty.isRuleWhiteSpace() returns true,
-     * unless they are quoted or escaped.  This may be ORed together
-     * with other selectors.
-     * @stable ICU 3.8
+     * Clone a thawed version of this class, according to the Freezable interface.
+     * @return the clone, not frozen
+     * @stable ICU 4.4
      */
-    public static final int IGNORE_SPACE = 1;
+    public UnicodeSet cloneAsThawed() {
+        UnicodeSet result = new UnicodeSet(this);
+        assert !result.isFrozen();
+        return result;
+    }
 
-}
+    // Internal helper: rejects modification attempts on a frozen set.
+    private void checkFrozen() {
+        if (isFrozen()) {
+            throw new UnsupportedOperationException("Attempt to modify frozen object");
+        }
+    }
+
+    /**
+     * Argument values for whether span() and similar functions continue while the current character is contained vs.
+     * not contained in the set.
+     * <p>
+     * The functionality is straightforward for sets with only single code points, without strings (which is the common
+     * case):
+     * <ul>
+     * <li>CONTAINED and SIMPLE work the same.
+     * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
+     * <li>span() and spanBack() partition any string the
+     * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
+     * <li>Using a
+     * complemented (inverted) set and the opposite span conditions yields the same results.
+     * </ul>
+     * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
+     * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
+     * strings:
+     * <ul>
+     * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
+     * Therefore, complementing both the set and the span conditions may yield different results.
+     * <li>When starting spans
+     * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
+     * because a set string may start before the later position.
+     * <li>span(SIMPLE) may be shorter than
+     * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
+     * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
+     * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
+     * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
+     * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
+     * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
+     * </ul>
+     * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
+     * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
+     * be used.
+     * <p>
+     * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
+     * boundaries, never in the middle of a surrogate pair.
+     *
+     * @stable ICU 4.4
+     */
+    public enum SpanCondition {
+        /**
+         * Continues a span() while there is no set element at the current position.
+         * Increments by one code point at a time.
+         * Stops before the first set element (character or string).
+         * (For code points only, this is like while contains(current)==false).
+         * <p>
+         * When span() returns, the substring between where it started and the position it returned consists only of
+         * characters that are not in the set, and none of its strings overlap with the span.
+         *
+         * @stable ICU 4.4
+         */
+        NOT_CONTAINED,
+
+        /**
+         * Spans the longest substring that is a concatenation of set elements (characters or strings).
+         * (For characters only, this is like while contains(current)==true).
+         * <p>
+         * When span() returns, the substring between where it started and the position it returned consists only of set
+         * elements (characters or strings) that are in the set.
+         * <p>
+         * If a set contains strings, then the span will be the longest substring for which there
+         * exists at least one non-overlapping concatenation of set elements (characters or strings).
+         * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
+         * (Java/ICU/Perl regex stops at the first match of an OR.)
+         *
+         * @stable ICU 4.4
+         */
+        CONTAINED,
+
+        /**
+         * Continues a span() while there is a set element at the current position.
+         * Increments by the longest matching element at each position.
+         * (For characters only, this is like while contains(current)==true).
+         * <p>
+         * When span() returns, the substring between where it started and the position it returned consists only of set
+         * elements (characters or strings) that are in the set.
+         * <p>
+         * If a set only contains single characters, then this is the same as CONTAINED.
+         * <p>
+         * If a set contains strings, then the span will be the longest substring with a match at each position with the
+         * longest single set element (character or string).
+         * <p>
+         * Use this span condition together with other longest-match algorithms, such as ICU converters
+         * (ucnv_getUnicodeSet()).
+         *
+         * @stable ICU 4.4
+         */
+        SIMPLE,
+    }
 
+}
< prev index next >