--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java 2015-07-13 16:11:57.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java 2015-07-13 16:11:57.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,29 +22,31 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2015, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ - package sun.text.normalizer; +import java.io.IOException; import java.text.ParsePosition; -import java.util.Iterator; +import java.util.ArrayList; import java.util.TreeSet; /** - * A mutable set of Unicode characters and multicharacter strings. Objects of this class - * represent character classes used in regular expressions. - * A character specifies a subset of Unicode code points. Legal - * code points are U+0000 to U+10FFFF, inclusive. + * A mutable set of Unicode characters and multicharacter strings. + * Objects of this class represent character classes used + * in regular expressions. 
A character specifies a subset of Unicode + * code points. Legal code points are U+0000 to U+10FFFF, inclusive. + * + * Note: method freeze() will not only make the set immutable, but + * also makes important methods much higher performance: + * contains(c), containsNone(...), span(...), spanBack(...) etc. + * After the object is frozen, any subsequent call that wants to change + * the object will throw UnsupportedOperationException. * *
The UnicodeSet class is not designed to be subclassed. * @@ -118,7 +120,7 @@ * * * Any character may be preceded by a backslash in order to remove any special - * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are + * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are * ignored, unless they are escaped. * *
Property patterns specify a set of characters having a certain @@ -267,18 +269,24 @@ * * * - *
To iterate over contents of UnicodeSet, use UnicodeSetIterator class. + *
To iterate over contents of UnicodeSet, the following are available: + *
To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
* @stable ICU 2.0
- * @see UnicodeSetIterator
*/
-@SuppressWarnings("deprecation")
-public class UnicodeSet implements UnicodeMatcher {
+class UnicodeSet {
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
- // 110000 for codepoints
+ // 110000 for codepoints
/**
* Minimum value that can be stored in a UnicodeSet.
@@ -299,7 +307,7 @@
// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
// The field is not private so that UnicodeSetIterator can get access
- TreeSet To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param spanCondition The span condition
+ * @return the length of the span
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, SpanCondition spanCondition) {
+ return span(s, 0, spanCondition); // same as the three-argument span() with a start index of 0
+ }
+
+ /**
+ * Span a string using this UnicodeSet.
+ * If the start index is less than 0, span will start from 0.
+ * If the start index is greater than the string length, span returns the string length.
+ * To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param start The start index at which the span begins
+ * @param spanCondition The span condition
+ * @return the string index which ends the span (i.e. exclusive)
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, int start, SpanCondition spanCondition) {
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for span().
+ return bmpSet.span(s, start, spanCondition, null);
+ }
+ if (stringSpan != null) {
+ return stringSpan.span(s, start, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param outCount An output-only object (must not be null) for returning the count.
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
+ if (outCount == null) {
+ throw new IllegalArgumentException("outCount must not be null");
+ }
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (stringSpan != null) {
+ // We might also have bmpSet != null,
+ // but fully-contained strings are relevant for counting elements.
+ return stringSpan.spanAndCount(s, start, spanCondition, outCount);
+ } else if (bmpSet != null) {
+ return bmpSet.span(s, start, spanCondition, outCount);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ which |= UnicodeSetStringSpan.WITH_COUNT;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param fromIndex The index of the char (exclusive) from which the string should be spanned backwards
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ * @stable ICU 4.4
+ */
+ public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
+ if (fromIndex <= 0) {
+ return 0;
+ }
+ if (fromIndex > s.length()) {
+ fromIndex = s.length();
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for spanBack().
+ return bmpSet.spanBack(s, fromIndex, spanCondition);
+ }
+ if (stringSpan != null) {
+ return stringSpan.spanBack(s, fromIndex, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = (spanCondition == SpanCondition.NOT_CONTAINED)
+ ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList
+ * The functionality is straightforward for sets with only single code points, without strings (which is the common
+ * case):
+ *
+ * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
+ * boundaries, never in the middle of a surrogate pair.
+ *
+ * @stable ICU 4.4
*/
- public static final int IGNORE_SPACE = 1;
+ public enum SpanCondition {
+ /**
+ * Continues a span() while there is no set element at the current position.
+ * Increments by one code point at a time.
+ * Stops before the first set element (character or string).
+ * (For code points only, this is like while contains(current)==false).
+ *
+ * When span() returns, the substring between where it started and the position it returned consists only of
+ * characters that are not in the set, and none of its strings overlap with the span.
+ *
+ * @stable ICU 4.4
+ */
+ NOT_CONTAINED,
-}
+ /**
+ * Spans the longest substring that is a concatenation of set elements (characters or strings).
+ * (For characters only, this is like while contains(current)==true).
+ *
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ *
+ * If a set contains strings, then the span will be the longest substring for which there
+ * exists at least one non-overlapping concatenation of set elements (characters or strings).
+ * This is equivalent to a POSIX regular expression for
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ *
+ * If a set only contains single characters, then this is the same as CONTAINED.
+ *
+ * If a set contains strings, then the span will be the longest substring with a match at each position with the
+ * longest single set element (character or string).
+ *
+ * Use this span condition together with other longest-match algorithms, such as ICU converters
+ * (ucnv_getUnicodeSet()).
+ *
+ * @stable ICU 4.4
+ */
+ SIMPLE,
+ }
+
+}
end >
+ * start
then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
@@ -359,7 +371,7 @@
*/
public UnicodeSet(String pattern) {
this();
- applyPattern(pattern, null, null, IGNORE_SPACE);
+ applyPattern(pattern, null);
}
/**
@@ -368,172 +380,29 @@
* copied to this object
* @stable ICU 2.0
*/
- @SuppressWarnings("unchecked") // Casting result of clone of a collection
public UnicodeSet set(UnicodeSet other) {
+ checkFrozen();
list = other.list.clone();
len = other.len;
- pat = other.pat;
- strings = (TreeSet)other.strings.clone();
+ strings = new TreeSettoPattern()
representation of a
- * string to the given StringBuffer
.
- */
- private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
- for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
- _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
- }
- }
-
- /**
- * Append the toPattern()
representation of a
- * character to the given StringBuffer
.
- */
- private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // Use hex escape notation (complement(MIN_VALUE, MAX_VALUE)
.
- * @stable ICU 2.0
- */
- public UnicodeSet complement() {
- if (list[0] == LOW) {
- System.arraycopy(list, 1, list, 0, len-1);
- --len;
- } else {
- ensureCapacity(len+1);
- System.arraycopy(list, 0, list, 1, len);
- list[0] = LOW;
- ++len;
- }
- pat = null;
return this;
}
@@ -743,6 +595,12 @@
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
+ if (bmpSet != null) {
+ return bmpSet.contains(c);
+ }
+ if (stringSpan != null) {
+ return stringSpan.contains(c);
+ }
/*
// Set i to the index of the start item greater than ch
@@ -751,7 +609,7 @@
while (true) {
if (c < list[++i]) break;
}
- */
+ */
int i = findCodePoint(c);
@@ -790,7 +648,7 @@
// invariant: c < list[hi]
for (;;) {
int i = (lo + hi) >>> 1;
- if (i == lo) return hi;
+ if (i == lo) return hi;
if (c < list[i]) {
hi = i;
} else {
@@ -800,22 +658,6 @@
}
/**
- * Adds all of the elements in the specified set to this set if
- * they're not already present. This operation effectively
- * modifies this set so that its value is the union of the two
- * sets. The behavior of this operation is unspecified if the specified
- * collection is modified while the operation is in progress.
- *
- * @param c set whose elements are to be added to this set.
- * @stable ICU 2.0
- */
- public UnicodeSet addAll(UnicodeSet c) {
- add(c.list, c.len, 0);
- strings.addAll(c.strings);
- return this;
- }
-
- /**
* Retains only the elements in this set that are contained in the
* specified set. In other words, removes from this set all of
* its elements that are not contained in the specified set. This
@@ -826,36 +668,21 @@
* @stable ICU 2.0
*/
public UnicodeSet retainAll(UnicodeSet c) {
+ checkFrozen(); // added by this patch; presumably rejects modification of a frozen set — confirm against checkFrozen()
retain(c.list, c.len, 0); // intersect the code point ranges with those of c
strings.retainAll(c.strings); // keep only the multi-character strings that c also contains
return this;
}
/**
- * Removes from this set all of its elements that are contained in the
- * specified set. This operation effectively modifies this
- * set so that its value is the asymmetric set difference of
- * the two sets.
- *
- * @param c set that defines which elements will be removed from
- * this set.
- * @stable ICU 2.0
- */
- public UnicodeSet removeAll(UnicodeSet c) {
- retain(c.list, c.len, 2);
- strings.removeAll(c.strings);
- return this;
- }
-
- /**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @stable ICU 2.0
*/
public UnicodeSet clear() {
+ checkFrozen(); // added by this patch; presumably rejects modification of a frozen set — confirm against checkFrozen()
list[0] = HIGH; // the list now holds only the HIGH terminator, i.e. no ranges
len = 1;
- pat = null;
strings.clear(); // the multi-character strings are removed as well
return this;
}
@@ -923,405 +750,18 @@
* of pattern
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
- UnicodeSet applyPattern(String pattern,
- ParsePosition pos,
- SymbolTable symbols,
- int options) {
-
- // Need to build the pattern in a temporary string because
- // _applyPattern calls add() etc., which set pat to empty.
- boolean parsePositionWasNull = pos == null;
- if (parsePositionWasNull) {
- pos = new ParsePosition(0);
- }
-
- StringBuffer rebuiltPat = new StringBuffer();
- RuleCharacterIterator chars =
- new RuleCharacterIterator(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, options);
- if (chars.inVariable()) {
- syntaxError(chars, "Extra chars in variable value");
- }
- pat = rebuiltPat.toString();
- if (parsePositionWasNull) {
- int i = pos.getIndex();
-
- // Skip over trailing whitespace
- if ((options & IGNORE_SPACE) != 0) {
- i = Utility.skipWhitespace(pattern, i);
- }
-
- if (i != pattern.length()) {
- throw new IllegalArgumentException("Parse of \"" + pattern +
- "\" failed at " + i);
- }
- }
- return this;
- }
-
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- */
- void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
- StringBuffer rebuiltPat, int options) {
- // Syntax characters: [ ] ^ - & { }
-
- // Recognized special forms for chars, sets: c-c s-s s&s
-
- int opts = RuleCharacterIterator.PARSE_VARIABLES |
- RuleCharacterIterator.PARSE_ESCAPES;
- if ((options & IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator.SKIP_WHITESPACE;
- }
-
- StringBuffer patBuf = new StringBuffer(), buf = null;
- boolean usePat = false;
- UnicodeSet scratch = null;
- Object backup = null;
-
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int lastItem = 0, lastChar = 0, mode = 0;
- char op = 0;
-
- boolean invert = false;
-
- clear();
-
- while (mode != 2 && !chars.atEnd()) {
- if (false) {
- // Debugging assertion
- if (!((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == '-')) ||
- (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
- throw new IllegalArgumentException();
- }
- }
-
- int c = 0;
- boolean literal = false;
- UnicodeSet nested = null;
-
- // -------- Check for property pattern
-
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
-
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
-
- else {
- // Prepare to backup if necessary
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
-
- if (c == '[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patBuf.append('[');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '^' && !literal) {
- invert = true;
- patBuf.append('^');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == '-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != null) {
- UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
- if (m != null) {
- try {
- nested = (UnicodeSet) m;
- setMode = 3;
- } catch (ClassCastException e) {
- syntaxError(chars, "Syntax error");
- }
- }
- }
- }
-
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
-
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- syntaxError(chars, "Char expected after operator");
- }
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastItem = op = 0;
- }
-
- if (op == '-' || op == '&') {
- patBuf.append(op);
- }
-
- if (nested == null) {
- if (scratch == null) scratch = new UnicodeSet();
- nested = scratch;
- }
- switch (setMode) {
- case 1:
- nested.applyPattern(chars, symbols, patBuf, options);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested.applyPropertyPattern(chars, patBuf, symbols);
- break;
- case 3: // `nested' already parsed
- nested._toPattern(patBuf, false);
- break;
- }
-
- usePat = true;
-
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- set(nested);
- mode = 2;
- break;
- }
-
- switch (op) {
- case '-':
- removeAll(nested);
- break;
- case '&':
- retainAll(nested);
- break;
- case 0:
- addAll(nested);
- break;
- }
-
- op = 0;
- lastItem = 2;
-
- continue;
- }
-
- if (mode == 0) {
- syntaxError(chars, "Missing '['");
- }
-
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
-
- if (!literal) {
- switch (c) {
- case ']':
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == '-') {
- add_unchecked(op, op);
- patBuf.append(op);
- } else if (op == '&') {
- syntaxError(chars, "Trailing '&'");
- }
- patBuf.append(']');
- mode = 2;
- continue;
- case '-':
- if (op == 0) {
- if (lastItem != 0) {
- op = (char) c;
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add_unchecked(c, c);
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == ']' && !literal) {
- patBuf.append("-]");
- mode = 2;
- continue;
- }
- }
- }
- syntaxError(chars, "'-' not after char or set");
- break;
- case '&':
- if (lastItem == 2 && op == 0) {
- op = (char) c;
- continue;
- }
- syntaxError(chars, "'&' not after set");
- break;
- case '^':
- syntaxError(chars, "'^' not after '['");
- break;
- case '{':
- if (op != 0) {
- syntaxError(chars, "Missing operand after operator");
- }
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- lastItem = 0;
- if (buf == null) {
- buf = new StringBuffer();
- } else {
- buf.setLength(0);
- }
- boolean ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '}' && !literal) {
- ok = true;
- break;
- }
- UTF16.append(buf, c);
- }
- if (buf.length() < 1 || !ok) {
- syntaxError(chars, "Invalid multicharacter string");
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf.toString());
- patBuf.append('{');
- _appendToPat(patBuf, buf.toString(), false);
- patBuf.append('}');
- continue;
- case SymbolTable.SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
- boolean anchor = (c == ']' && !literal);
- if (symbols == null && !anchor) {
- c = SymbolTable.SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- add_unchecked(UnicodeMatcher.ETHER);
- usePat = true;
- patBuf.append(SymbolTable.SYMBOL_REF).append(']');
- mode = 2;
- continue;
- }
- syntaxError(chars, "Unquoted '$'");
- break;
- default:
- break;
- }
- }
-
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
-
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == '-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- syntaxError(chars, "Invalid range");
- }
- add_unchecked(lastChar, c);
- _appendToPat(patBuf, lastChar, false);
- patBuf.append(op);
- _appendToPat(patBuf, c, false);
- lastItem = op = 0;
- } else {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- syntaxError(chars, "Set expected after operator");
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
-
- if (mode != 2) {
- syntaxError(chars, "Missing ']'");
- }
-
- chars.skipIgnored(opts);
-
- if (invert) {
- complement();
- }
-
- // Use the rebuilt pattern (pat) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patBuf.toString());
+ private UnicodeSet applyPattern(String pattern,
+ ParsePosition pos) { // NOTE(review): pos is never read by the body added in this patch
+ if ("[:age=3.2:]".equals(pattern)) { // the only pattern this trimmed-down implementation accepts
+ checkFrozen();
+ VersionInfo version = VersionInfo.getInstance("3.2");
+ applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); // fill this set using the age filter for version 3.2
} else {
- _generatePattern(rebuiltPat, false, true);
+ throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
+ + pattern + ")"); // any other pattern is rejected rather than parsed
}
- }
- private static void syntaxError(RuleCharacterIterator chars, String msg) {
- throw new IllegalArgumentException("Error: " + msg + " at \"" +
- Utility.escape(chars.toString()) +
- '"');
+ return this;
}
//----------------------------------------------------------------
@@ -1397,7 +837,6 @@
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1414,88 +853,87 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; take lower if unequal
- if (a < b) { // take a
- // Back up over overlapping ranges in buffer[]
- if (k > 0 && a <= buffer[k-1]) {
- // Pick latter end value in buffer[] vs. list[]
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
- }
- i++; // Common if/else code factored out
- polarity ^= 1;
- } else if (b < a) { // take b
- if (k > 0 && b <= buffer[k-1]) {
- b = max(other[j], buffer[--k]);
- } else {
- buffer[k++] = b;
- b = other[j];
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; take lower if unequal
+ if (a < b) { // take a
+ // Back up over overlapping ranges in buffer[]
+ if (k > 0 && a <= buffer[k-1]) {
+ // Pick latter end value in buffer[] vs. list[]
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++; // Common if/else code factored out
+ polarity ^= 1;
+ } else if (b < a) { // take b
+ if (k > 0 && b <= buffer[k-1]) {
+ b = max(other[j], buffer[--k]);
+ } else {
+ buffer[k++] = b;
+ b = other[j];
+ }
+ j++;
+ polarity ^= 2;
+ } else { // a == b, take a, drop b
+ if (a == HIGH) break main;
+ // This is symmetrical; it doesn't matter if
+ // we backtrack with a or b. - liu
+ if (k > 0 && a <= buffer[k-1]) {
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++;
+ polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
}
- j++;
- polarity ^= 2;
- } else { // a == b, take a, drop b
- if (a == HIGH) break main;
- // This is symmetrical; it doesn't matter if
- // we backtrack with a or b. - liu
- if (k > 0 && a <= buffer[k-1]) {
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
+ break;
+ case 3: // both second; take higher if unequal, and drop other
+ if (b <= a) { // take a
+ if (a == HIGH) break main;
buffer[k++] = a;
- a = list[i];
+ } else { // take b
+ if (b == HIGH) break main;
+ buffer[k++] = b;
}
- i++;
- polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take higher if unequal, and drop other
- if (b <= a) { // take a
- if (a == HIGH) break main;
- buffer[k++] = a;
- } else { // take b
- if (b == HIGH) break main;
- buffer[k++] = b;
- }
- a = list[i++]; polarity ^= 1; // factored common code
- b = other[j++]; polarity ^= 2;
- break;
- case 1: // a second, b first; if b < a, overlap
- if (a < b) { // no overlap, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
+ a = list[i++]; polarity ^= 1; // factored common code
b = other[j++]; polarity ^= 2;
+ break;
+ case 1: // a second, b first; if b < a, overlap
+ if (a < b) { // no overlap, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1512,61 +950,60 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; drop the smaller
- if (a < b) { // drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take lower if unequal
- if (a < b) { // take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 1: // a second, b first;
- if (a < b) { // NO OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, drop b
- b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; drop the smaller
+ if (a < b) { // drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 3: // both second; take lower if unequal
+ if (a < b) { // take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 1: // a second, b first;
+ if (a < b) { // NO OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, drop b
+ b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1582,58 +1019,46 @@
boolean contains(int codePoint);
}
- // VersionInfo for unassigned characters
- static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+ private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
private static class VersionFilter implements Filter {
VersionInfo version; // inclusive upper bound on the character age accepted by contains()
-
VersionFilter(VersionInfo version) { this.version = version; }
-
public boolean contains(int ch) {
VersionInfo v = UCharacter.getAge(ch);
// Reference comparison ok; VersionInfo caches and reuses
// unique objects.
return v != NO_VERSION &&
- v.compareTo(version) <= 0;
+ v.compareTo(version) <= 0; // accepted when the age is not NO_VERSION and is not greater than version
}
}
private static synchronized UnicodeSet getInclusions(int src) {
- if (INCLUSIONS == null) {
- INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+ if (src != UCharacterProperty.SRC_PROPSVEC) { // after this patch SRC_PROPSVEC is the only accepted source
+ throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
}
- if(INCLUSIONS[src] == null) {
+
+ if (INCLUSION == null) { // build the single shared set on first use; later calls return the cached instance
UnicodeSet incl = new UnicodeSet();
- switch(src) {
- case UCharacterProperty.SRC_PROPSVEC:
- UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
- break;
- default:
- throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
- }
- INCLUSIONS[src] = incl;
+ UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
+ INCLUSION = incl;
}
- return INCLUSIONS[src];
+ return INCLUSION;
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
private UnicodeSet applyFilter(Filter filter, int src) {
- // Walk through all Unicode characters, noting the start
+ // Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contains(c) is
// true. Add each range to a set.
//
- // To improve performance, use the INCLUSIONS set, which
+ // To improve performance, use an inclusions set which
// encodes information about character ranges that are known
- // to have identical properties, such as the CJK Ideographs
- // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
- // except the first characters of such ranges.
- //
- // TODO Where possible, instead of scanning over code points,
- // use internal property data to initialize UnicodeSets for
- // those properties. Scanning code points is slow.
+ // to have identical properties.
+ // getInclusions(src) contains exactly the first characters of
+ // same-value ranges for the given properties "source".
clear();
@@ -1668,204 +1093,315 @@
}
/**
- * Remove leading and trailing rule white space and compress
- * internal rule white space to a single space character.
+ * Is this frozen, according to the Freezable interface?
*
- * @see UCharacterProperty#isRuleWhiteSpace
+ * @return whether this set is frozen
+ * @stable ICU 3.8
*/
- private static String mungeCharName(String source) {
- StringBuffer buf = new StringBuffer();
- for (int i=0; i
+ *
+ * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
+ * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
+ * strings:
+ *
+ *
+ * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
+ * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
+ * be used.
+ * (OR of each set element)*
.
+ * (Java/ICU/Perl regex stops at the first match of an OR.)
+ *
+ * @stable ICU 4.4
+ */
+ CONTAINED,
+ /**
+ * Continues a span() while there is a set element at the current position.
+ * Increments by the longest matching element at each position.
+ * (For characters only, this is like while contains(current)==true).
+ *