< prev index next >
jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java
Print this page
@@ -1,7 +1,7 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
@@ -20,33 +20,35 @@
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2015, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
-
package sun.text.normalizer;
+import java.io.IOException;
import java.text.ParsePosition;
-import java.util.Iterator;
+import java.util.ArrayList;
import java.util.TreeSet;
/**
- * A mutable set of Unicode characters and multicharacter strings. Objects of this class
- * represent <em>character classes</em> used in regular expressions.
- * A character specifies a subset of Unicode code points. Legal
- * code points are U+0000 to U+10FFFF, inclusive.
+ * A mutable set of Unicode characters and multicharacter strings.
+ * Objects of this class represent <em>character classes</em> used
+ * in regular expressions. A character specifies a subset of Unicode
+ * code points. Legal code points are U+0000 to U+10FFFF, inclusive.
+ *
+ * Note: calling freeze() not only makes the set immutable, but
+ * also makes important methods much faster:
+ * contains(c), containsNone(...), span(...), spanBack(...) etc.
+ * After the object is frozen, any subsequent call that wants to change
+ * the object will throw UnsupportedOperationException.
*
* <p>The UnicodeSet class is not designed to be subclassed.
*
* <p><code>UnicodeSet</code> supports two APIs. The first is the
* <em>operand</em> API that allows the caller to modify the value of
@@ -116,11 +118,11 @@
* </tr>
* </table>
* </blockquote>
*
* Any character may be preceded by a backslash in order to remove any special
- * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
+ * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
* ignored, unless they are escaped.
*
* <p>Property patterns specify a set of characters having a certain
* property as defined by the Unicode standard. Both the POSIX-like
* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
@@ -265,18 +267,24 @@
* </table>
* </td>
* </tr>
* </table>
* </blockquote>
- * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
+ * <p>To iterate over contents of UnicodeSet, the following are available:
+ * <ul><li>{@link #ranges()} to iterate through the ranges</li>
+ * <li>{@link #strings()} to iterate through the strings</li>
+ * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
+ * That method is, however, not particularly efficient, since it "boxes" each code point into a String.</li>
+ * </ul>
+ * All of the above can be used in <b>for</b> loops.
+ * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
* @stable ICU 2.0
- * @see UnicodeSetIterator
*/
-@SuppressWarnings("deprecation")
-public class UnicodeSet implements UnicodeMatcher {
+class UnicodeSet {
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
// 110000 for codepoints
@@ -297,50 +305,54 @@
private int[] rangeList; // internal buffer
private int[] buffer; // internal buffer
// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
// is not private so that UnicodeSetIterator can get access
- TreeSet<String> strings = new TreeSet<>();
+ TreeSet<String> strings = new TreeSet<String>();
/**
* The pattern representation of this set. This may not be the
* most economical pattern. It is the pattern supplied to
* applyPattern(), with variables substituted and whitespace
* removed. For sets constructed without applyPattern(), or
* modified using the non-pattern API, this string will be null,
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
- private String pat = null;
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
- /**
- * A set of all characters _except_ the second through last characters of
- * certain ranges. These ranges are ranges of characters whose
- * properties are all exactly alike, e.g. CJK Ideographs from
- * U+4E00 to U+9FA5.
- */
- private static UnicodeSet INCLUSIONS[] = null;
+ private static UnicodeSet INCLUSION = null;
+
+ private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
+ private volatile UnicodeSetStringSpan stringSpan;
//----------------------------------------------------------------
// Public API
//----------------------------------------------------------------
/**
* Constructs an empty set.
* @stable ICU 2.0
*/
- public UnicodeSet() {
+ private UnicodeSet() {
list = new int[1 + START_EXTRA];
list[len++] = HIGH;
}
/**
- * Constructs a set containing the given range.
- * If {@code end > start} then an empty set is created.
+ * Constructs a copy of an existing set.
+ * @stable ICU 2.0
+ */
+ private UnicodeSet(UnicodeSet other) {
+ set(other);
+ }
+
+ /**
+ * Constructs a set containing the given range. If <code>end &lt;
+ * start</code> then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
* @stable ICU 2.0
*/
@@ -357,185 +369,42 @@
* a syntax error.
* @stable ICU 2.0
*/
public UnicodeSet(String pattern) {
this();
- applyPattern(pattern, null, null, IGNORE_SPACE);
+ applyPattern(pattern, null);
}
/**
* Make this object represent the same set as <code>other</code>.
* @param other a <code>UnicodeSet</code> whose value will be
* copied to this object
* @stable ICU 2.0
*/
- @SuppressWarnings("unchecked") // Casting result of clone of a collection
public UnicodeSet set(UnicodeSet other) {
+ checkFrozen();
list = other.list.clone();
len = other.len;
- pat = other.pat;
- strings = (TreeSet)other.strings.clone();
+ strings = new TreeSet<String>(other.strings);
return this;
}
/**
- * Modifies this set to represent the set specified by the given pattern.
- * See the class description for the syntax of the pattern language.
- * Whitespace is ignored.
- * @param pattern a string specifying what characters are in the set
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
+ * Returns the number of elements in this set (its cardinality).
+ * Note that the elements of a set may include both individual
+ * code points and strings.
+ *
+ * @return the number of elements in this set (its cardinality).
* @stable ICU 2.0
*/
- public final UnicodeSet applyPattern(String pattern) {
- return applyPattern(pattern, null, null, IGNORE_SPACE);
- }
-
- /**
- * Append the <code>toPattern()</code> representation of a
- * string to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
- for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
- _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
- }
- }
-
- /**
- * Append the <code>toPattern()</code> representation of a
- * character to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
- // unprintable
- if (Utility.escapeUnprintable(buf, c)) {
- return;
- }
- }
- // Okay to let ':' pass through
- switch (c) {
- case '[': // SET_OPEN:
- case ']': // SET_CLOSE:
- case '-': // HYPHEN:
- case '^': // COMPLEMENT:
- case '&': // INTERSECTION:
- case '\\': //BACKSLASH:
- case '{':
- case '}':
- case '$':
- case ':':
- buf.append('\\');
- break;
- default:
- // Escape whitespace
- if (UCharacterProperty.isRuleWhiteSpace(c)) {
- buf.append('\\');
- }
- break;
- }
- UTF16.append(buf, c);
- }
-
- /**
- * Append a string representation of this set to result. This will be
- * a cleaned version of the string passed to applyPattern(), if there
- * is one. Otherwise it will be generated.
- */
- private StringBuffer _toPattern(StringBuffer result,
- boolean escapeUnprintable) {
- if (pat != null) {
- int i;
- int backslashCount = 0;
- for (i=0; i<pat.length(); ) {
- int c = UTF16.charAt(pat, i);
- i += UTF16.getCharCount(c);
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // If the unprintable character is preceded by an odd
- // number of backslashes, then it has been escaped.
- // Before unescaping it, we delete the final
- // backslash.
- if ((backslashCount % 2) == 1) {
- result.setLength(result.length() - 1);
- }
- Utility.escapeUnprintable(result, c);
- backslashCount = 0;
- } else {
- UTF16.append(result, c);
- if (c == '\\') {
- ++backslashCount;
- } else {
- backslashCount = 0;
- }
- }
- }
- return result;
- }
-
- return _generatePattern(result, escapeUnprintable, true);
- }
-
- /**
- * Generate and append a string representation of this set to result.
- * This does not use this.pat, the cleaned up copy of the string
- * passed to applyPattern().
- * @param includeStrings if false, doesn't include the strings.
- * @stable ICU 3.8
- */
- public StringBuffer _generatePattern(StringBuffer result,
- boolean escapeUnprintable, boolean includeStrings) {
- result.append('[');
-
+ public int size() {
+ int n = 0;
int count = getRangeCount();
-
- // If the set contains at least 2 intervals and includes both
- // MIN_VALUE and MAX_VALUE, then the inverse representation will
- // be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
- // Emit the inverse
- result.append('^');
-
- for (int i = 1; i < count; ++i) {
- int start = getRangeEnd(i-1)+1;
- int end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
- }
- }
- }
-
- // Default; emit the ranges as pairs
- else {
for (int i = 0; i < count; ++i) {
- int start = getRangeStart(i);
- int end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
- }
+ n += getRangeEnd(i) - getRangeStart(i) + 1;
}
- }
-
- if (includeStrings && strings.size() > 0) {
- Iterator<String> it = strings.iterator();
- while (it.hasNext()) {
- result.append('{');
- _appendToPat(result, it.next(), escapeUnprintable);
- result.append('}');
- }
- }
- return result.append(']');
+ return n + strings.size();
}
// for internal use, after checkFrozen has been called
private UnicodeSet add_unchecked(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
@@ -557,10 +426,11 @@
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
* @stable ICU 2.0
*/
public final UnicodeSet add(int c) {
+ checkFrozen();
return add_unchecked(c);
}
// for internal use only, after checkFrozen has been called
private final UnicodeSet add_unchecked(int c) {
@@ -641,11 +511,10 @@
list[i] = c;
list[i+1] = c+1;
len += 2;
}
- pat = null;
return this;
}
/**
* Adds the specified multicharacter to this set if it is not already
@@ -655,27 +524,29 @@
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.0
*/
- public final UnicodeSet add(String s) {
+ public final UnicodeSet add(CharSequence s) {
+ checkFrozen();
int cp = getSingleCP(s);
if (cp < 0) {
- strings.add(s);
- pat = null;
+ strings.add(s.toString());
} else {
add_unchecked(cp, cp);
}
return this;
}
/**
+ * Utility for getting the code point from a CharSequence that consists
+ * of a single code point. See the public UTF16.getSingleCodePoint().
* @return a code point IF the string consists of a single one.
* otherwise returns -1.
- * @param string to test
+ * @param s to test
*/
- private static int getSingleCP(String s) {
+ private static int getSingleCP(CharSequence s) {
if (s.length() < 1) {
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
}
if (s.length() > 2) return -1;
if (s.length() == 1) return s.charAt(0);
@@ -699,39 +570,20 @@
* @param end last character, inclusive, of range to be removed
* from this set.
* @stable ICU 2.0
*/
public UnicodeSet complement(int start, int end) {
+ checkFrozen();
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
if (end < MIN_VALUE || end > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
}
if (start <= end) {
xor(range(start, end), 2, 0);
}
- pat = null;
- return this;
- }
-
- /**
- * This is equivalent to
- * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
- * @stable ICU 2.0
- */
- public UnicodeSet complement() {
- if (list[0] == LOW) {
- System.arraycopy(list, 1, list, 0, len-1);
- --len;
- } else {
- ensureCapacity(len+1);
- System.arraycopy(list, 0, list, 1, len);
- list[0] = LOW;
- ++len;
- }
- pat = null;
return this;
}
/**
* Returns true if this set contains the given character.
@@ -741,10 +593,16 @@
*/
public boolean contains(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
+ if (bmpSet != null) {
+ return bmpSet.contains(c);
+ }
+ if (stringSpan != null) {
+ return stringSpan.contains(c);
+ }
/*
// Set i to the index of the start item greater than ch
// We know we will terminate without length test!
int i = -1;
@@ -798,66 +656,35 @@
}
}
}
/**
- * Adds all of the elements in the specified set to this set if
- * they're not already present. This operation effectively
- * modifies this set so that its value is the <i>union</i> of the two
- * sets. The behavior of this operation is unspecified if the specified
- * collection is modified while the operation is in progress.
- *
- * @param c set whose elements are to be added to this set.
- * @stable ICU 2.0
- */
- public UnicodeSet addAll(UnicodeSet c) {
- add(c.list, c.len, 0);
- strings.addAll(c.strings);
- return this;
- }
-
- /**
* Retains only the elements in this set that are contained in the
* specified set. In other words, removes from this set all of
* its elements that are not contained in the specified set. This
* operation effectively modifies this set so that its value is
* the <i>intersection</i> of the two sets.
*
* @param c set that defines which elements this set will retain.
* @stable ICU 2.0
*/
public UnicodeSet retainAll(UnicodeSet c) {
+ checkFrozen();
retain(c.list, c.len, 0);
strings.retainAll(c.strings);
return this;
}
/**
- * Removes from this set all of its elements that are contained in the
- * specified set. This operation effectively modifies this
- * set so that its value is the <i>asymmetric set difference</i> of
- * the two sets.
- *
- * @param c set that defines which elements will be removed from
- * this set.
- * @stable ICU 2.0
- */
- public UnicodeSet removeAll(UnicodeSet c) {
- retain(c.list, c.len, 2);
- strings.removeAll(c.strings);
- return this;
- }
-
- /**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @stable ICU 2.0
*/
public UnicodeSet clear() {
+ checkFrozen();
list[0] = HIGH;
len = 1;
- pat = null;
strings.clear();
return this;
}
/**
@@ -921,409 +748,22 @@
* is the last character of the pattern string.
* @return an inversion list for the parsed substring
* of <code>pattern</code>
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
- UnicodeSet applyPattern(String pattern,
- ParsePosition pos,
- SymbolTable symbols,
- int options) {
-
- // Need to build the pattern in a temporary string because
- // _applyPattern calls add() etc., which set pat to empty.
- boolean parsePositionWasNull = pos == null;
- if (parsePositionWasNull) {
- pos = new ParsePosition(0);
- }
-
- StringBuffer rebuiltPat = new StringBuffer();
- RuleCharacterIterator chars =
- new RuleCharacterIterator(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, options);
- if (chars.inVariable()) {
- syntaxError(chars, "Extra chars in variable value");
- }
- pat = rebuiltPat.toString();
- if (parsePositionWasNull) {
- int i = pos.getIndex();
-
- // Skip over trailing whitespace
- if ((options & IGNORE_SPACE) != 0) {
- i = Utility.skipWhitespace(pattern, i);
- }
-
- if (i != pattern.length()) {
- throw new IllegalArgumentException("Parse of \"" + pattern +
- "\" failed at " + i);
- }
- }
- return this;
- }
-
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- */
- void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
- StringBuffer rebuiltPat, int options) {
- // Syntax characters: [ ] ^ - & { }
-
- // Recognized special forms for chars, sets: c-c s-s s&s
-
- int opts = RuleCharacterIterator.PARSE_VARIABLES |
- RuleCharacterIterator.PARSE_ESCAPES;
- if ((options & IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator.SKIP_WHITESPACE;
- }
-
- StringBuffer patBuf = new StringBuffer(), buf = null;
- boolean usePat = false;
- UnicodeSet scratch = null;
- Object backup = null;
-
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int lastItem = 0, lastChar = 0, mode = 0;
- char op = 0;
-
- boolean invert = false;
-
- clear();
-
- while (mode != 2 && !chars.atEnd()) {
- if (false) {
- // Debugging assertion
- if (!((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == '-')) ||
- (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
- throw new IllegalArgumentException();
- }
- }
-
- int c = 0;
- boolean literal = false;
- UnicodeSet nested = null;
-
- // -------- Check for property pattern
-
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
-
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
-
- else {
- // Prepare to backup if necessary
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
-
- if (c == '[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patBuf.append('[');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '^' && !literal) {
- invert = true;
- patBuf.append('^');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == '-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != null) {
- UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
- if (m != null) {
- try {
- nested = (UnicodeSet) m;
- setMode = 3;
- } catch (ClassCastException e) {
- syntaxError(chars, "Syntax error");
- }
- }
- }
- }
-
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
-
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- syntaxError(chars, "Char expected after operator");
- }
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastItem = op = 0;
- }
-
- if (op == '-' || op == '&') {
- patBuf.append(op);
- }
-
- if (nested == null) {
- if (scratch == null) scratch = new UnicodeSet();
- nested = scratch;
- }
- switch (setMode) {
- case 1:
- nested.applyPattern(chars, symbols, patBuf, options);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested.applyPropertyPattern(chars, patBuf, symbols);
- break;
- case 3: // `nested' already parsed
- nested._toPattern(patBuf, false);
- break;
- }
-
- usePat = true;
-
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- set(nested);
- mode = 2;
- break;
- }
-
- switch (op) {
- case '-':
- removeAll(nested);
- break;
- case '&':
- retainAll(nested);
- break;
- case 0:
- addAll(nested);
- break;
- }
-
- op = 0;
- lastItem = 2;
-
- continue;
- }
-
- if (mode == 0) {
- syntaxError(chars, "Missing '['");
- }
-
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
-
- if (!literal) {
- switch (c) {
- case ']':
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == '-') {
- add_unchecked(op, op);
- patBuf.append(op);
- } else if (op == '&') {
- syntaxError(chars, "Trailing '&'");
- }
- patBuf.append(']');
- mode = 2;
- continue;
- case '-':
- if (op == 0) {
- if (lastItem != 0) {
- op = (char) c;
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add_unchecked(c, c);
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == ']' && !literal) {
- patBuf.append("-]");
- mode = 2;
- continue;
- }
- }
- }
- syntaxError(chars, "'-' not after char or set");
- break;
- case '&':
- if (lastItem == 2 && op == 0) {
- op = (char) c;
- continue;
- }
- syntaxError(chars, "'&' not after set");
- break;
- case '^':
- syntaxError(chars, "'^' not after '['");
- break;
- case '{':
- if (op != 0) {
- syntaxError(chars, "Missing operand after operator");
- }
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- lastItem = 0;
- if (buf == null) {
- buf = new StringBuffer();
- } else {
- buf.setLength(0);
- }
- boolean ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '}' && !literal) {
- ok = true;
- break;
- }
- UTF16.append(buf, c);
- }
- if (buf.length() < 1 || !ok) {
- syntaxError(chars, "Invalid multicharacter string");
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf.toString());
- patBuf.append('{');
- _appendToPat(patBuf, buf.toString(), false);
- patBuf.append('}');
- continue;
- case SymbolTable.SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
- boolean anchor = (c == ']' && !literal);
- if (symbols == null && !anchor) {
- c = SymbolTable.SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- add_unchecked(UnicodeMatcher.ETHER);
- usePat = true;
- patBuf.append(SymbolTable.SYMBOL_REF).append(']');
- mode = 2;
- continue;
- }
- syntaxError(chars, "Unquoted '$'");
- break;
- default:
- break;
- }
- }
-
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
-
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == '-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- syntaxError(chars, "Invalid range");
- }
- add_unchecked(lastChar, c);
- _appendToPat(patBuf, lastChar, false);
- patBuf.append(op);
- _appendToPat(patBuf, c, false);
- lastItem = op = 0;
- } else {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- syntaxError(chars, "Set expected after operator");
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
-
- if (mode != 2) {
- syntaxError(chars, "Missing ']'");
- }
-
- chars.skipIgnored(opts);
-
- if (invert) {
- complement();
- }
-
- // Use the rebuilt pattern (pat) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patBuf.toString());
+ private UnicodeSet applyPattern(String pattern,
+ ParsePosition pos) {
+ if ("[:age=3.2:]".equals(pattern)) {
+ checkFrozen();
+ VersionInfo version = VersionInfo.getInstance("3.2");
+ applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
} else {
- _generatePattern(rebuiltPat, false, true);
- }
+ throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
+ + pattern + ")");
}
- private static void syntaxError(RuleCharacterIterator chars, String msg) {
- throw new IllegalArgumentException("Error: " + msg + " at \"" +
- Utility.escape(chars.toString()) +
- '"');
+ return this;
}
//----------------------------------------------------------------
// Implementation: Utility methods
//----------------------------------------------------------------
@@ -1395,11 +835,10 @@
}
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
// polarity = 0 is normal: x union y
// polarity = 2: x union ~y
@@ -1493,11 +932,10 @@
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
// polarity = 0 is normal: x intersect y
// polarity = 2: x intersect ~y == set-minus
@@ -1564,11 +1002,10 @@
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
private static final int max(int a, int b) {
return (a > b) ? a : b;
@@ -1580,62 +1017,50 @@
private static interface Filter {
boolean contains(int codePoint);
}
- // VersionInfo for unassigned characters
- static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+ private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
private static class VersionFilter implements Filter {
VersionInfo version;
-
VersionFilter(VersionInfo version) { this.version = version; }
-
public boolean contains(int ch) {
VersionInfo v = UCharacter.getAge(ch);
// Reference comparison ok; VersionInfo caches and reuses
// unique objects.
return v != NO_VERSION &&
v.compareTo(version) <= 0;
}
}
private static synchronized UnicodeSet getInclusions(int src) {
- if (INCLUSIONS == null) {
- INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
- }
- if(INCLUSIONS[src] == null) {
- UnicodeSet incl = new UnicodeSet();
- switch(src) {
- case UCharacterProperty.SRC_PROPSVEC:
- UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
- break;
- default:
+ if (src != UCharacterProperty.SRC_PROPSVEC) {
throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
}
- INCLUSIONS[src] = incl;
+
+ if (INCLUSION == null) {
+ UnicodeSet incl = new UnicodeSet();
+ UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
+ INCLUSION = incl;
}
- return INCLUSIONS[src];
+ return INCLUSION;
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
private UnicodeSet applyFilter(Filter filter, int src) {
- // Walk through all Unicode characters, noting the start
+ // Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
//
- // To improve performance, use the INCLUSIONS set, which
+ // To improve performance, use an inclusions set which
// encodes information about character ranges that are known
- // to have identical properties, such as the CJK Ideographs
- // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
- // except the first characters of such ranges.
- //
- // TODO Where possible, instead of scanning over code points,
- // use internal property data to initialize UnicodeSets for
- // those properties. Scanning code points is slow.
+ // to have identical properties.
+ // getInclusions(src) contains exactly the first characters of
+ // same-value ranges for the given properties "source".
clear();
int startHasProperty = -1;
UnicodeSet inclusions = getInclusions(src);
@@ -1666,206 +1091,317 @@
return this;
}
/**
- * Remove leading and trailing rule white space and compress
- * internal rule white space to a single space character.
+ * Is this frozen, according to the Freezable interface?
*
- * @see UCharacterProperty#isRuleWhiteSpace
+ * @return true if this set is frozen, false otherwise
+ * @stable ICU 3.8
*/
- private static String mungeCharName(String source) {
- StringBuffer buf = new StringBuffer();
- for (int i=0; i<source.length(); ) {
- int ch = UTF16.charAt(source, i);
- i += UTF16.getCharCount(ch);
- if (UCharacterProperty.isRuleWhiteSpace(ch)) {
- if (buf.length() == 0 ||
- buf.charAt(buf.length() - 1) == ' ') {
- continue;
- }
- ch = ' '; // convert to ' '
- }
- UTF16.append(buf, ch);
- }
- if (buf.length() != 0 &&
- buf.charAt(buf.length() - 1) == ' ') {
- buf.setLength(buf.length() - 1);
- }
- return buf.toString();
+ public boolean isFrozen() {
+ return (bmpSet != null || stringSpan != null);
}
/**
- * Modifies this set to contain those code points which have the
- * given value for the given property. Prior contents of this
- * set are lost.
- * @param propertyAlias the property alias
- * @param valueAlias the value alias
- * @param symbols if not null, then symbols are first called to see if a property
- * is available. If true, then everything else is skipped.
- * @return this set
- * @stable ICU 3.2
- */
- public UnicodeSet applyPropertyAlias(String propertyAlias,
- String valueAlias, SymbolTable symbols) {
- if (valueAlias.length() > 0) {
- if (propertyAlias.equals("Age")) {
- // Must munge name, since
- // VersionInfo.getInstance() does not do
- // 'loose' matching.
- VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
- applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
- return this;
- }
+ * Freeze this class, according to the Freezable interface.
+ *
+ * @return this
+ * @stable ICU 4.4
+ */
+ public UnicodeSet freeze() {
+ if (!isFrozen()) {
+ // Do most of what compact() does before freezing because
+ // compact() will not work when the set is frozen.
+ // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
+
+ // Release the buffer first to reduce memory fragmentation.
+ buffer = null;
+ if (list.length > (len + GROW_EXTRA)) {
+ // Make the capacity equal to len or 1.
+ // We don't want to allocate a zero-length array.
+ int capacity = (len == 0) ? 1 : len;
+ int[] oldList = list;
+ list = new int[capacity];
+ for (int i = capacity; i-- > 0;) {
+ list[i] = oldList[i];
}
- throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
}
- /**
- * Return true if the given iterator appears to point at a
- * property pattern. Regardless of the result, return with the
- * iterator unchanged.
- * @param chars iterator over the pattern characters. Upon return
- * it will be unchanged.
- * @param iterOpts RuleCharacterIterator options
- */
- private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
- int iterOpts) {
- boolean result = false;
- iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
- Object pos = chars.getPos(null);
- int c = chars.next(iterOpts);
- if (c == '[' || c == '\\') {
- int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
- result = (c == '[') ? (d == ':') :
- (d == 'N' || d == 'p' || d == 'P');
+ // Optimize contains() and span() and similar functions.
+ if (!strings.isEmpty()) {
+ stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
+ }
+ if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
+ // Optimize for code point spans.
+ // There are no strings, or
+ // all strings are irrelevant for span() etc. because
+ // all of each string's code points are contained in this set.
+ // However, fully contained strings are relevant for spanAndCount(),
+ // so we create both objects.
+ bmpSet = new BMPSet(list, len);
}
- chars.setPos(pos);
- return result;
+ }
+ return this;
}
/**
- * Parse the given property pattern at the given parse position.
- * @param symbols TODO
+ * Span a string using this UnicodeSet.
+ * <p>To replace, count elements, or delete spans, see ICU4J's {@code com.ibm.icu.text.UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param spanCondition The span condition
+ * @return the length of the span
+ * @stable ICU 4.4
*/
- private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
- int pos = ppos.getIndex();
-
- // On entry, ppos should point to one of the following locations:
+ public int span(CharSequence s, SpanCondition spanCondition) {
+ return span(s, 0, spanCondition);
+ }
- // Minimum length is 5 characters, e.g. \p{L}
- if ((pos+5) > pattern.length()) {
- return null;
- }
-
- boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
- boolean isName = false; // true for \N{pat}, o/w false
- boolean invert = false;
-
- // Look for an opening [:, [:^, \p, or \P
- if (pattern.regionMatches(pos, "[:", 0, 2)) {
- posix = true;
- pos = Utility.skipWhitespace(pattern, pos+2);
- if (pos < pattern.length() && pattern.charAt(pos) == '^') {
- ++pos;
- invert = true;
- }
- } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
- pattern.regionMatches(pos, "\\N", 0, 2)) {
- char c = pattern.charAt(pos+1);
- invert = (c == 'P');
- isName = (c == 'N');
- pos = Utility.skipWhitespace(pattern, pos+2);
- if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
- // Syntax error; "\p" or "\P" not followed by "{"
- return null;
+ /**
+ * Span a string using this UnicodeSet.
+ * If the start index is less than 0, span will start from 0.
+ * If the start index is greater than or equal to the string length, span returns the string length.
+ * <p>To replace, count elements, or delete spans, see ICU4J's {@code com.ibm.icu.text.UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param start The index at which the span begins
+ * @param spanCondition The span condition
+ * @return the string index which ends the span (i.e. exclusive)
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, int start, SpanCondition spanCondition) {
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
}
- } else {
- // Open delimiter not seen
- return null;
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for span().
+ return bmpSet.span(s, start, spanCondition, null);
+ }
+ if (stringSpan != null) {
+ return stringSpan.span(s, start, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ if (strSpan.needsStringSpanUTF16()) {
+ return strSpan.span(s, start, spanCondition);
}
-
- // Look for the matching close delimiter, either :] or }
- int close = pattern.indexOf(posix ? ":]" : "}", pos);
- if (close < 0) {
- // Syntax error; close delimiter missing
- return null;
- }
-
- // Look for an '=' sign. If this is present, we will parse a
- // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
- // pattern.
- int equals = pattern.indexOf('=', pos);
- String propName, valueName;
- if (equals >= 0 && equals < close && !isName) {
- // Equals seen; parse medium/long pattern
- propName = pattern.substring(pos, equals);
- valueName = pattern.substring(equals+1, close);
}
- else {
- // Handle case where no '=' is seen, and \N{}
- propName = pattern.substring(pos, close);
- valueName = "";
+ return spanCodePointsAndCount(s, start, spanCondition, null);
+ }
- // Handle \N{name}
- if (isName) {
- // This is a little inefficient since it means we have to
- // parse "na" back to UProperty.NAME even though we already
- // know it's UProperty.NAME. If we refactor the API to
- // support args of (int, String) then we can remove
- // "na" and make this a little more efficient.
- valueName = propName;
- propName = "na";
+ /**
+ * Same as span() but also counts the smallest number of set elements on any path across the span.
+ * <p>To replace, count elements, or delete spans, see ICU4J's {@code com.ibm.icu.text.UnicodeSetSpanner}.
+ * @param outCount An output-only object (must not be null) for returning the count.
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
+ if (outCount == null) {
+ throw new IllegalArgumentException("outCount must not be null");
}
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (stringSpan != null) {
+ // We might also have bmpSet != null,
+ // but fully-contained strings are relevant for counting elements.
+ return stringSpan.spanAndCount(s, start, spanCondition, outCount);
+ } else if (bmpSet != null) {
+ return bmpSet.span(s, start, spanCondition, outCount);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ which |= UnicodeSetStringSpan.WITH_COUNT;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ return strSpan.spanAndCount(s, start, spanCondition, outCount);
}
- applyPropertyAlias(propName, valueName, symbols);
-
- if (invert) {
- complement();
+ return spanCodePointsAndCount(s, start, spanCondition, outCount);
}
- // Move to the limit position after the close delimiter
- ppos.setIndex(close + (posix ? 2 : 1));
+ private int spanCodePointsAndCount(CharSequence s, int start,
+ SpanCondition spanCondition, OutputInt outCount) {
+ // Reduce the span condition to a boolean: true for CONTAINED and SIMPLE, false for NOT_CONTAINED.
+ boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
- return this;
+ int c;
+ int next = start;
+ int length = s.length();
+ int count = 0;
+ do {
+ c = Character.codePointAt(s, next);
+ if (spanContained != contains(c)) {
+ break;
+ }
+ ++count;
+ next += Character.charCount(c);
+ } while (next < length);
+ if (outCount != null) { outCount.value = count; }
+ return next;
}
/**
- * Parse a property pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param symbols TODO
- */
- private void applyPropertyPattern(RuleCharacterIterator chars,
- StringBuffer rebuiltPat, SymbolTable symbols) {
- String patStr = chars.lookahead();
- ParsePosition pos = new ParsePosition(0);
- applyPropertyPattern(patStr, pos, symbols);
- if (pos.getIndex() == 0) {
- syntaxError(chars, "Invalid property pattern");
+ * Span a string backwards (from the fromIndex) using this UnicodeSet.
+ * If the fromIndex is less than or equal to 0, spanBack will return 0.
+ * If fromIndex is greater than the string length, spanBack will start from the string length.
+ * <p>To replace, count elements, or delete spans, see ICU4J's {@code com.ibm.icu.text.UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param fromIndex The index (exclusive) from which the string is spanned backwards
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ * @stable ICU 4.4
+ */
+ public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
+ if (fromIndex <= 0) {
+ return 0;
+ }
+ if (fromIndex > s.length()) {
+ fromIndex = s.length();
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for spanBack().
+ return bmpSet.spanBack(s, fromIndex, spanCondition);
+ }
+ if (stringSpan != null) {
+ return stringSpan.spanBack(s, fromIndex, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = (spanCondition == SpanCondition.NOT_CONTAINED)
+ ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ if (strSpan.needsStringSpanUTF16()) {
+ return strSpan.spanBack(s, fromIndex, spanCondition);
}
- chars.jumpahead(pos.getIndex());
- rebuiltPat.append(patStr, 0, pos.getIndex());
}
- //----------------------------------------------------------------
- // Case folding API
- //----------------------------------------------------------------
+ // Reduce the span condition to a boolean: true for CONTAINED and SIMPLE, false for NOT_CONTAINED.
+ boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
+
+ int c;
+ int prev = fromIndex;
+ do {
+ c = Character.codePointBefore(s, prev);
+ if (spanContained != contains(c)) {
+ break;
+ }
+ prev -= Character.charCount(c);
+ } while (prev > 0);
+ return prev;
+ }
/**
- * Bitmask for constructor and applyPattern() indicating that
- * white space should be ignored. If set, ignore characters for
- * which UCharacterProperty.isRuleWhiteSpace() returns true,
- * unless they are quoted or escaped. This may be ORed together
- * with other selectors.
- * @stable ICU 3.8
+ * Clone a thawed version of this class, according to the Freezable interface.
+ * @return the clone, not frozen
+ * @stable ICU 4.4
*/
- public static final int IGNORE_SPACE = 1;
+ public UnicodeSet cloneAsThawed() {
+ UnicodeSet result = new UnicodeSet(this);
+ assert !result.isFrozen();
+ return result;
+ }
-}
+ // internal function
+ private void checkFrozen() {
+ if (isFrozen()) {
+ throw new UnsupportedOperationException("Attempt to modify frozen object");
+ }
+ }
+
+ /**
+ * Argument values for whether span() and similar functions continue while the current character is contained vs.
+ * not contained in the set.
+ * <p>
+ * The functionality is straightforward for sets with only single code points, without strings (which is the common
+ * case):
+ * <ul>
+ * <li>CONTAINED and SIMPLE work the same.
+ * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
+ * <li>span() and spanBack() partition any string the
+ * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
+ * <li>Using a
+ * complemented (inverted) set and the opposite span conditions yields the same results.
+ * </ul>
+ * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
+ * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
+ * strings:
+ * <ul>
+ * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
+ * Therefore, complementing both the set and the span conditions may yield different results.
+ * <li>When starting spans
+ * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
+ * because a set string may start before the later position.
+ * <li>span(SIMPLE) may be shorter than
+ * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
+ * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
+ * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
+ * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
+ * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
+ * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
+ * </ul>
+ * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
+ * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
+ * be used.
+ * <p>
+ * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
+ * boundaries, never in the middle of a surrogate pair.
+ *
+ * @stable ICU 4.4
+ */
+ public enum SpanCondition {
+ /**
+ * Continues a span() while there is no set element at the current position.
+ * Increments by one code point at a time.
+ * Stops before the first set element (character or string).
+ * (For code points only, this is like while contains(current)==false).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of
+ * characters that are not in the set, and none of its strings overlap with the span.
+ *
+ * @stable ICU 4.4
+ */
+ NOT_CONTAINED,
+
+ /**
+ * Spans the longest substring that is a concatenation of set elements (characters or strings).
+ * (For characters only, this is like while contains(current)==true).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ * <p>
+ * If a set contains strings, then the span will be the longest substring for which there
+ * exists at least one non-overlapping concatenation of set elements (characters or strings).
+ * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
+ * (Java/ICU/Perl regex stops at the first match of an OR.)
+ *
+ * @stable ICU 4.4
+ */
+ CONTAINED,
+
+ /**
+ * Continues a span() while there is a set element at the current position.
+ * Increments by the longest matching element at each position.
+ * (For characters only, this is like while contains(current)==true).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ * <p>
+ * If a set only contains single characters, then this is the same as CONTAINED.
+ * <p>
+ * If a set contains strings, then the span will be the longest substring with a match at each position with the
+ * longest single set element (character or string).
+ * <p>
+ * Use this span condition together with other longest-match algorithms, such as ICU converters
+ * (ucnv_getUnicodeSet()).
+ *
+ * @stable ICU 4.4
+ */
+ SIMPLE,
+ }
+}
< prev index next >