< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 20,52 **** * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* ! * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * ! * * ! * The original version of this source code and documentation is copyrighted * ! * and owned by IBM, These materials are provided under terms of a License * ! * Agreement between IBM and Sun. This technology is protected by multiple * ! * US and International patents. This notice and attribution to IBM may not * ! * to removed. * ******************************************************************************* */ - package sun.text.normalizer; import java.text.ParsePosition; ! import java.util.Iterator; import java.util.TreeSet; /** ! * A mutable set of Unicode characters and multicharacter strings. Objects of this class ! * represent <em>character classes</em> used in regular expressions. ! * A character specifies a subset of Unicode code points. Legal ! * code points are U+0000 to U+10FFFF, inclusive. * * <p>The UnicodeSet class is not designed to be subclassed. * * <p><code>UnicodeSet</code> supports two APIs. The first is the * <em>operand</em> API that allows the caller to modify the value of --- 20,54 ---- * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* ! * Copyright (C) 1996-2015, International Business Machines Corporation and ! * others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; + import java.io.IOException; import java.text.ParsePosition; ! import java.util.ArrayList; import java.util.TreeSet; /** ! * A mutable set of Unicode characters and multicharacter strings. ! * Objects of this class represent <em>character classes</em> used ! * in regular expressions. A character specifies a subset of Unicode ! * code points. Legal code points are U+0000 to U+10FFFF, inclusive. ! * ! * Note: method freeze() will not only make the set immutable, but ! * also makes important methods much higher performance: ! * contains(c), containsNone(...), span(...), spanBack(...) etc. ! * After the object is frozen, any subsequent call that wants to change ! * the object will throw UnsupportedOperationException. * * <p>The UnicodeSet class is not designed to be subclassed. * * <p><code>UnicodeSet</code> supports two APIs. The first is the * <em>operand</em> API that allows the caller to modify the value of
*** 116,126 **** * </tr> * </table> * </blockquote> * * Any character may be preceded by a backslash in order to remove any special ! * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are * ignored, unless they are escaped. * * <p>Property patterns specify a set of characters having a certain * property as defined by the Unicode standard. Both the POSIX-like * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a --- 118,128 ---- * </tr> * </table> * </blockquote> * * Any character may be preceded by a backslash in order to remove any special ! * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are * ignored, unless they are escaped. * * <p>Property patterns specify a set of characters having a certain * property as defined by the Unicode standard. Both the POSIX-like * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
*** 265,282 **** * </table> * </td> * </tr> * </table> * </blockquote> ! * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class. * * @author Alan Liu * @stable ICU 2.0 - * @see UnicodeSetIterator */ ! @SuppressWarnings("deprecation") ! public class UnicodeSet implements UnicodeMatcher { private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. // 110000 for codepoints --- 267,290 ---- * </table> * </td> * </tr> * </table> * </blockquote> ! * <p>To iterate over contents of UnicodeSet, the following are available: ! * <ul><li>{@link #ranges()} to iterate through the ranges</li> ! * <li>{@link #strings()} to iterate through the strings</li> ! * <li>{@link #iterator()} to iterate through the entire contents in a single loop. ! * That method is, however, not particularly efficient, since it "boxes" each code point into a String. ! * </ul> ! * All of the above can be used in <b>for</b> loops. ! * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops. ! * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * * @author Alan Liu * @stable ICU 2.0 */ ! class UnicodeSet { private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. // 110000 for codepoints
*** 297,346 **** private int[] rangeList; // internal buffer private int[] buffer; // internal buffer // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! // is not private so that UnicodeSetIterator can get access ! TreeSet<String> strings = new TreeSet<>(); /** * The pattern representation of this set. This may not be the * most economical pattern. It is the pattern supplied to * applyPattern(), with variables substituted and whitespace * removed. For sets constructed without applyPattern(), or * modified using the non-pattern API, this string will be null, * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - private String pat = null; private static final int START_EXTRA = 16; // initial storage. Must be >= 0 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 ! /** ! * A set of all characters _except_ the second through last characters of ! * certain ranges. These ranges are ranges of characters whose ! * properties are all exactly alike, e.g. CJK Ideographs from ! * U+4E00 to U+9FA5. ! */ ! private static UnicodeSet INCLUSIONS[] = null; //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- /** * Constructs an empty set. * @stable ICU 2.0 */ ! public UnicodeSet() { list = new int[1 + START_EXTRA]; list[len++] = HIGH; } /** ! * Constructs a set containing the given range. ! * If {@code end > start} then an empty set is created. * * @param start first character, inclusive, of range * @param end last character, inclusive, of range * @stable ICU 2.0 */ --- 305,358 ---- private int[] rangeList; // internal buffer private int[] buffer; // internal buffer // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! // is not private so that UnicodeSetIterator can get access ! TreeSet<String> strings = new TreeSet<String>(); /** * The pattern representation of this set. This may not be the * most economical pattern. It is the pattern supplied to * applyPattern(), with variables substituted and whitespace * removed. For sets constructed without applyPattern(), or * modified using the non-pattern API, this string will be null, * indicating that toPattern() must generate a pattern * representation from the inversion list. */ private static final int START_EXTRA = 16; // initial storage. Must be >= 0 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 ! private static UnicodeSet INCLUSION = null; ! ! private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. ! private volatile UnicodeSetStringSpan stringSpan; //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- /** * Constructs an empty set. * @stable ICU 2.0 */ ! private UnicodeSet() { list = new int[1 + START_EXTRA]; list[len++] = HIGH; } /** ! * Constructs a copy of an existing set. ! * @stable ICU 2.0 ! */ ! private UnicodeSet(UnicodeSet other) { ! set(other); ! } ! ! /** ! * Constructs a set containing the given range. If <code>end > ! * start</code> then an empty set is created. * * @param start first character, inclusive, of range * @param end last character, inclusive, of range * @stable ICU 2.0 */
*** 357,541 **** * a syntax error. * @stable ICU 2.0 */ public UnicodeSet(String pattern) { this(); ! applyPattern(pattern, null, null, IGNORE_SPACE); } /** * Make this object represent the same set as <code>other</code>. * @param other a <code>UnicodeSet</code> whose value will be * copied to this object * @stable ICU 2.0 */ - @SuppressWarnings("unchecked") // Casting result of clone of a collection public UnicodeSet set(UnicodeSet other) { list = other.list.clone(); len = other.len; ! pat = other.pat; ! strings = (TreeSet)other.strings.clone(); return this; } /** ! * Modifies this set to represent the set specified by the given pattern. ! * See the class description for the syntax of the pattern language. ! * Whitespace is ignored. ! * @param pattern a string specifying what characters are in the set ! * @exception java.lang.IllegalArgumentException if the pattern ! * contains a syntax error. * @stable ICU 2.0 */ ! public final UnicodeSet applyPattern(String pattern) { ! return applyPattern(pattern, null, null, IGNORE_SPACE); ! } ! ! /** ! * Append the <code>toPattern()</code> representation of a ! * string to the given <code>StringBuffer</code>. ! */ ! private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { ! for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) { ! _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable); ! } ! } ! ! /** ! * Append the <code>toPattern()</code> representation of a ! * character to the given <code>StringBuffer</code>. ! */ ! private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { ! if (escapeUnprintable && Utility.isUnprintable(c)) { ! // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything ! // unprintable ! if (Utility.escapeUnprintable(buf, c)) { ! return; ! } ! } ! // Okay to let ':' pass through ! switch (c) { ! case '[': // SET_OPEN: ! case ']': // SET_CLOSE: ! case '-': // HYPHEN: ! case '^': // COMPLEMENT: ! case '&': // INTERSECTION: ! case '\\': //BACKSLASH: ! case '{': ! case '}': ! case '$': ! case ':': ! buf.append('\\'); ! break; ! default: ! // Escape whitespace ! if (UCharacterProperty.isRuleWhiteSpace(c)) { ! buf.append('\\'); ! } ! break; ! } ! UTF16.append(buf, c); ! } ! ! /** ! * Append a string representation of this set to result. This will be ! * a cleaned version of the string passed to applyPattern(), if there ! * is one. Otherwise it will be generated. ! */ ! private StringBuffer _toPattern(StringBuffer result, ! boolean escapeUnprintable) { ! if (pat != null) { ! int i; ! int backslashCount = 0; ! for (i=0; i<pat.length(); ) { ! int c = UTF16.charAt(pat, i); ! i += UTF16.getCharCount(c); ! if (escapeUnprintable && Utility.isUnprintable(c)) { ! // If the unprintable character is preceded by an odd ! // number of backslashes, then it has been escaped. ! // Before unescaping it, we delete the final ! // backslash. ! if ((backslashCount % 2) == 1) { ! result.setLength(result.length() - 1); ! } ! Utility.escapeUnprintable(result, c); ! backslashCount = 0; ! } else { ! UTF16.append(result, c); ! if (c == '\\') { ! ++backslashCount; ! } else { ! backslashCount = 0; ! } ! } ! } ! return result; ! } ! ! return _generatePattern(result, escapeUnprintable, true); ! } ! ! /** ! * Generate and append a string representation of this set to result. ! * This does not use this.pat, the cleaned up copy of the string ! * passed to applyPattern(). ! * @param includeStrings if false, doesn't include the strings. ! * @stable ICU 3.8 ! */ ! public StringBuffer _generatePattern(StringBuffer result, ! boolean escapeUnprintable, boolean includeStrings) { ! result.append('['); ! int count = getRangeCount(); - - // If the set contains at least 2 intervals and includes both - // MIN_VALUE and MAX_VALUE, then the inverse representation will - // be more economical. - if (count > 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - - // Emit the inverse - result.append('^'); - - for (int i = 1; i < count; ++i) { - int start = getRangeEnd(i-1)+1; - int end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append('-'); - } - _appendToPat(result, end, escapeUnprintable); - } - } - } - - // Default; emit the ranges as pairs - else { for (int i = 0; i < count; ++i) { ! int start = getRangeStart(i); ! int end = getRangeEnd(i); ! _appendToPat(result, start, escapeUnprintable); ! if (start != end) { ! if ((start+1) != end) { ! result.append('-'); ! } ! _appendToPat(result, end, escapeUnprintable); ! } } ! } ! ! if (includeStrings && strings.size() > 0) { ! Iterator<String> it = strings.iterator(); ! while (it.hasNext()) { ! result.append('{'); ! _appendToPat(result, it.next(), escapeUnprintable); ! result.append('}'); ! } ! } ! return result.append(']'); } // for internal use, after checkFrozen has been called private UnicodeSet add_unchecked(int start, int end) { if (start < MIN_VALUE || start > MAX_VALUE) { --- 369,410 ---- * a syntax error. * @stable ICU 2.0 */ public UnicodeSet(String pattern) { this(); ! applyPattern(pattern, null); } /** * Make this object represent the same set as <code>other</code>. * @param other a <code>UnicodeSet</code> whose value will be * copied to this object * @stable ICU 2.0 */ public UnicodeSet set(UnicodeSet other) { + checkFrozen(); list = other.list.clone(); len = other.len; ! strings = new TreeSet<String>(other.strings); return this; } /** ! * Returns the number of elements in this set (its cardinality) ! * Note than the elements of a set may include both individual ! * codepoints and strings. ! * ! * @return the number of elements in this set (its cardinality). * @stable ICU 2.0 */ ! public int size() { ! int n = 0; int count = getRangeCount(); for (int i = 0; i < count; ++i) { ! n += getRangeEnd(i) - getRangeStart(i) + 1; } ! return n + strings.size(); } // for internal use, after checkFrozen has been called private UnicodeSet add_unchecked(int start, int end) { if (start < MIN_VALUE || start > MAX_VALUE) {
*** 557,566 **** --- 426,436 ---- * present. If this set already contains the specified character, * the call leaves this set unchanged. * @stable ICU 2.0 */ public final UnicodeSet add(int c) { + checkFrozen(); return add_unchecked(c); } // for internal use only, after checkFrozen has been called private final UnicodeSet add_unchecked(int c) {
*** 641,651 **** list[i] = c; list[i+1] = c+1; len += 2; } - pat = null; return this; } /** * Adds the specified multicharacter to this set if it is not already --- 511,520 ----
*** 655,681 **** * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> * @param s the source string * @return this object, for chaining * @stable ICU 2.0 */ ! public final UnicodeSet add(String s) { int cp = getSingleCP(s); if (cp < 0) { ! strings.add(s); ! pat = null; } else { add_unchecked(cp, cp); } return this; } /** * @return a code point IF the string consists of a single one. * otherwise returns -1. ! * @param string to test */ ! private static int getSingleCP(String s) { if (s.length() < 1) { throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); } if (s.length() > 2) return -1; if (s.length() == 1) return s.charAt(0); --- 524,552 ---- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> * @param s the source string * @return this object, for chaining * @stable ICU 2.0 */ ! public final UnicodeSet add(CharSequence s) { ! checkFrozen(); int cp = getSingleCP(s); if (cp < 0) { ! strings.add(s.toString()); } else { add_unchecked(cp, cp); } return this; } /** + * Utility for getting code point from single code point CharSequence. + * See the public UTF16.getSingleCodePoint() * @return a code point IF the string consists of a single one. * otherwise returns -1. ! * @param s to test */ ! private static int getSingleCP(CharSequence s) { if (s.length() < 1) { throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); } if (s.length() > 2) return -1; if (s.length() == 1) return s.charAt(0);
*** 699,737 **** * @param end last character, inclusive, of range to be removed * from this set. * @stable ICU 2.0 */ public UnicodeSet complement(int start, int end) { if (start < MIN_VALUE || start > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); } if (end < MIN_VALUE || end > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); } if (start <= end) { xor(range(start, end), 2, 0); } - pat = null; - return this; - } - - /** - * This is equivalent to - * <code>complement(MIN_VALUE, MAX_VALUE)</code>. - * @stable ICU 2.0 - */ - public UnicodeSet complement() { - if (list[0] == LOW) { - System.arraycopy(list, 1, list, 0, len-1); - --len; - } else { - ensureCapacity(len+1); - System.arraycopy(list, 0, list, 1, len); - list[0] = LOW; - ++len; - } - pat = null; return this; } /** * Returns true if this set contains the given character. --- 570,589 ---- * @param end last character, inclusive, of range to be removed * from this set. * @stable ICU 2.0 */ public UnicodeSet complement(int start, int end) { + checkFrozen(); if (start < MIN_VALUE || start > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); } if (end < MIN_VALUE || end > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); } if (start <= end) { xor(range(start, end), 2, 0); } return this; } /** * Returns true if this set contains the given character.
*** 741,750 **** --- 593,608 ---- */ public boolean contains(int c) { if (c < MIN_VALUE || c > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); } + if (bmpSet != null) { + return bmpSet.contains(c); + } + if (stringSpan != null) { + return stringSpan.contains(c); + } /* // Set i to the index of the start item greater than ch // We know we will terminate without length test! int i = -1;
*** 798,863 **** } } } /** - * Adds all of the elements in the specified set to this set if - * they're not already present. This operation effectively - * modifies this set so that its value is the <i>union</i> of the two - * sets. The behavior of this operation is unspecified if the specified - * collection is modified while the operation is in progress. - * - * @param c set whose elements are to be added to this set. - * @stable ICU 2.0 - */ - public UnicodeSet addAll(UnicodeSet c) { - add(c.list, c.len, 0); - strings.addAll(c.strings); - return this; - } - - /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the <i>intersection</i> of the two sets. * * @param c set that defines which elements this set will retain. * @stable ICU 2.0 */ public UnicodeSet retainAll(UnicodeSet c) { retain(c.list, c.len, 0); strings.retainAll(c.strings); return this; } /** - * Removes from this set all of its elements that are contained in the - * specified set. This operation effectively modifies this - * set so that its value is the <i>asymmetric set difference</i> of - * the two sets. - * - * @param c set that defines which elements will be removed from - * this set. - * @stable ICU 2.0 - */ - public UnicodeSet removeAll(UnicodeSet c) { - retain(c.list, c.len, 2); - strings.removeAll(c.strings); - return this; - } - - /** * Removes all of the elements from this set. This set will be * empty after this call returns. * @stable ICU 2.0 */ public UnicodeSet clear() { list[0] = HIGH; len = 1; - pat = null; strings.clear(); return this; } /** --- 656,690 ---- } } } /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the <i>intersection</i> of the two sets. * * @param c set that defines which elements this set will retain. * @stable ICU 2.0 */ public UnicodeSet retainAll(UnicodeSet c) { + checkFrozen(); retain(c.list, c.len, 0); strings.retainAll(c.strings); return this; } /** * Removes all of the elements from this set. This set will be * empty after this call returns. * @stable ICU 2.0 */ public UnicodeSet clear() { + checkFrozen(); list[0] = HIGH; len = 1; strings.clear(); return this; } /**
*** 921,1329 **** * is the last character of the pattern string. * @return an inversion list for the parsed substring * of <code>pattern</code> * @exception java.lang.IllegalArgumentException if the parse fails. */ ! UnicodeSet applyPattern(String pattern, ! ParsePosition pos, ! SymbolTable symbols, ! int options) { ! ! // Need to build the pattern in a temporary string because ! // _applyPattern calls add() etc., which set pat to empty. ! boolean parsePositionWasNull = pos == null; ! if (parsePositionWasNull) { ! pos = new ParsePosition(0); ! } ! ! StringBuffer rebuiltPat = new StringBuffer(); ! RuleCharacterIterator chars = ! new RuleCharacterIterator(pattern, symbols, pos); ! applyPattern(chars, symbols, rebuiltPat, options); ! if (chars.inVariable()) { ! syntaxError(chars, "Extra chars in variable value"); ! } ! pat = rebuiltPat.toString(); ! if (parsePositionWasNull) { ! int i = pos.getIndex(); ! ! // Skip over trailing whitespace ! if ((options & IGNORE_SPACE) != 0) { ! i = Utility.skipWhitespace(pattern, i); ! } ! ! if (i != pattern.length()) { ! throw new IllegalArgumentException("Parse of \"" + pattern + ! "\" failed at " + i); ! } ! } ! return this; ! } ! ! /** ! * Parse the pattern from the given RuleCharacterIterator. The ! * iterator is advanced over the parsed pattern. ! * @param chars iterator over the pattern characters. Upon return ! * it will be advanced to the first character after the parsed ! * pattern, or the end of the iteration if all characters are ! * parsed. ! * @param symbols symbol table to use to parse and dereference ! * variables, or null if none. ! * @param rebuiltPat the pattern that was parsed, rebuilt or ! * copied from the input pattern, as appropriate. ! * @param options a bit mask of zero or more of the following: ! * IGNORE_SPACE, CASE. ! */ ! void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, ! StringBuffer rebuiltPat, int options) { ! // Syntax characters: [ ] ^ - & { } ! ! // Recognized special forms for chars, sets: c-c s-s s&s ! ! int opts = RuleCharacterIterator.PARSE_VARIABLES | ! RuleCharacterIterator.PARSE_ESCAPES; ! if ((options & IGNORE_SPACE) != 0) { ! opts |= RuleCharacterIterator.SKIP_WHITESPACE; ! } ! ! StringBuffer patBuf = new StringBuffer(), buf = null; ! boolean usePat = false; ! UnicodeSet scratch = null; ! Object backup = null; ! ! // mode: 0=before [, 1=between [...], 2=after ] ! // lastItem: 0=none, 1=char, 2=set ! int lastItem = 0, lastChar = 0, mode = 0; ! char op = 0; ! ! boolean invert = false; ! ! clear(); ! ! while (mode != 2 && !chars.atEnd()) { ! if (false) { ! // Debugging assertion ! if (!((lastItem == 0 && op == 0) || ! (lastItem == 1 && (op == 0 || op == '-')) || ! (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { ! throw new IllegalArgumentException(); ! } ! } ! ! int c = 0; ! boolean literal = false; ! UnicodeSet nested = null; ! ! // -------- Check for property pattern ! ! // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed ! int setMode = 0; ! if (resemblesPropertyPattern(chars, opts)) { ! setMode = 2; ! } ! ! // -------- Parse '[' of opening delimiter OR nested set. ! // If there is a nested set, use `setMode' to define how ! // the set should be parsed. If the '[' is part of the ! // opening delimiter for this pattern, parse special ! // strings "[", "[^", "[-", and "[^-". Check for stand-in ! // characters representing a nested set in the symbol ! // table. ! ! else { ! // Prepare to backup if necessary ! backup = chars.getPos(backup); ! c = chars.next(opts); ! literal = chars.isEscaped(); ! ! if (c == '[' && !literal) { ! if (mode == 1) { ! chars.setPos(backup); // backup ! setMode = 1; ! } else { ! // Handle opening '[' delimiter ! mode = 1; ! patBuf.append('['); ! backup = chars.getPos(backup); // prepare to backup ! c = chars.next(opts); ! literal = chars.isEscaped(); ! if (c == '^' && !literal) { ! invert = true; ! patBuf.append('^'); ! backup = chars.getPos(backup); // prepare to backup ! c = chars.next(opts); ! literal = chars.isEscaped(); ! } ! // Fall through to handle special leading '-'; ! // otherwise restart loop for nested [], \p{}, etc. ! if (c == '-') { ! literal = true; ! // Fall through to handle literal '-' below ! } else { ! chars.setPos(backup); // backup ! continue; ! } ! } ! } else if (symbols != null) { ! UnicodeMatcher m = symbols.lookupMatcher(c); // may be null ! if (m != null) { ! try { ! nested = (UnicodeSet) m; ! setMode = 3; ! } catch (ClassCastException e) { ! syntaxError(chars, "Syntax error"); ! } ! } ! } ! } ! ! // -------- Handle a nested set. This either is inline in ! // the pattern or represented by a stand-in that has ! // previously been parsed and was looked up in the symbol ! // table. ! ! if (setMode != 0) { ! if (lastItem == 1) { ! if (op != 0) { ! syntaxError(chars, "Char expected after operator"); ! } ! add_unchecked(lastChar, lastChar); ! _appendToPat(patBuf, lastChar, false); ! lastItem = op = 0; ! } ! ! if (op == '-' || op == '&') { ! patBuf.append(op); ! } ! ! if (nested == null) { ! if (scratch == null) scratch = new UnicodeSet(); ! nested = scratch; ! } ! switch (setMode) { ! case 1: ! nested.applyPattern(chars, symbols, patBuf, options); ! break; ! case 2: ! chars.skipIgnored(opts); ! nested.applyPropertyPattern(chars, patBuf, symbols); ! break; ! case 3: // `nested' already parsed ! nested._toPattern(patBuf, false); ! break; ! } ! ! usePat = true; ! ! if (mode == 0) { ! // Entire pattern is a category; leave parse loop ! set(nested); ! mode = 2; ! break; ! } ! ! switch (op) { ! case '-': ! removeAll(nested); ! break; ! case '&': ! retainAll(nested); ! break; ! case 0: ! addAll(nested); ! break; ! } ! ! op = 0; ! lastItem = 2; ! ! continue; ! } ! ! if (mode == 0) { ! syntaxError(chars, "Missing '['"); ! } ! ! // -------- Parse special (syntax) characters. If the ! // current character is not special, or if it is escaped, ! // then fall through and handle it below. ! ! if (!literal) { ! switch (c) { ! case ']': ! if (lastItem == 1) { ! add_unchecked(lastChar, lastChar); ! _appendToPat(patBuf, lastChar, false); ! } ! // Treat final trailing '-' as a literal ! if (op == '-') { ! add_unchecked(op, op); ! patBuf.append(op); ! } else if (op == '&') { ! syntaxError(chars, "Trailing '&'"); ! } ! patBuf.append(']'); ! mode = 2; ! continue; ! case '-': ! if (op == 0) { ! if (lastItem != 0) { ! op = (char) c; ! continue; ! } else { ! // Treat final trailing '-' as a literal ! add_unchecked(c, c); ! c = chars.next(opts); ! literal = chars.isEscaped(); ! if (c == ']' && !literal) { ! patBuf.append("-]"); ! mode = 2; ! continue; ! } ! } ! } ! syntaxError(chars, "'-' not after char or set"); ! break; ! case '&': ! if (lastItem == 2 && op == 0) { ! op = (char) c; ! continue; ! } ! syntaxError(chars, "'&' not after set"); ! break; ! case '^': ! syntaxError(chars, "'^' not after '['"); ! break; ! case '{': ! if (op != 0) { ! syntaxError(chars, "Missing operand after operator"); ! } ! if (lastItem == 1) { ! add_unchecked(lastChar, lastChar); ! _appendToPat(patBuf, lastChar, false); ! } ! lastItem = 0; ! if (buf == null) { ! buf = new StringBuffer(); ! } else { ! buf.setLength(0); ! } ! boolean ok = false; ! while (!chars.atEnd()) { ! c = chars.next(opts); ! literal = chars.isEscaped(); ! if (c == '}' && !literal) { ! ok = true; ! break; ! } ! UTF16.append(buf, c); ! } ! if (buf.length() < 1 || !ok) { ! syntaxError(chars, "Invalid multicharacter string"); ! } ! // We have new string. Add it to set and continue; ! // we don't need to drop through to the further ! // processing ! add(buf.toString()); ! patBuf.append('{'); ! _appendToPat(patBuf, buf.toString(), false); ! patBuf.append('}'); ! continue; ! case SymbolTable.SYMBOL_REF: ! // symbols nosymbols ! // [a-$] error error (ambiguous) ! // [a$] anchor anchor ! // [a-$x] var "x"* literal '$' ! // [a-$.] error literal '$' ! // *We won't get here in the case of var "x" ! backup = chars.getPos(backup); ! c = chars.next(opts); ! literal = chars.isEscaped(); ! boolean anchor = (c == ']' && !literal); ! if (symbols == null && !anchor) { ! c = SymbolTable.SYMBOL_REF; ! chars.setPos(backup); ! break; // literal '$' ! } ! if (anchor && op == 0) { ! if (lastItem == 1) { ! add_unchecked(lastChar, lastChar); ! _appendToPat(patBuf, lastChar, false); ! } ! add_unchecked(UnicodeMatcher.ETHER); ! usePat = true; ! patBuf.append(SymbolTable.SYMBOL_REF).append(']'); ! mode = 2; ! continue; ! } ! syntaxError(chars, "Unquoted '$'"); ! break; ! default: ! break; ! } ! } ! ! // -------- Parse literal characters. This includes both ! // escaped chars ("\u4E01") and non-syntax characters ! // ("a"). ! ! switch (lastItem) { ! case 0: ! lastItem = 1; ! lastChar = c; ! break; ! case 1: ! if (op == '-') { ! if (lastChar >= c) { ! // Don't allow redundant (a-a) or empty (b-a) ranges; ! // these are most likely typos. ! syntaxError(chars, "Invalid range"); ! } ! add_unchecked(lastChar, c); ! _appendToPat(patBuf, lastChar, false); ! patBuf.append(op); ! _appendToPat(patBuf, c, false); ! lastItem = op = 0; ! } else { ! add_unchecked(lastChar, lastChar); ! _appendToPat(patBuf, lastChar, false); ! lastChar = c; ! } ! break; ! case 2: ! if (op != 0) { ! syntaxError(chars, "Set expected after operator"); ! } ! lastChar = c; ! lastItem = 1; ! break; ! } ! } ! ! if (mode != 2) { ! syntaxError(chars, "Missing ']'"); ! } ! ! chars.skipIgnored(opts); ! ! if (invert) { ! complement(); ! } ! ! // Use the rebuilt pattern (pat) only if necessary. Prefer the ! // generated pattern. ! if (usePat) { ! rebuiltPat.append(patBuf.toString()); } else { ! _generatePattern(rebuiltPat, false, true); ! } } ! private static void syntaxError(RuleCharacterIterator chars, String msg) { ! throw new IllegalArgumentException("Error: " + msg + " at \"" + ! Utility.escape(chars.toString()) + ! '"'); } //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- --- 748,769 ---- * is the last character of the pattern string. * @return an inversion list for the parsed substring * of <code>pattern</code> * @exception java.lang.IllegalArgumentException if the parse fails. */ ! private UnicodeSet applyPattern(String pattern, ! ParsePosition pos) { ! if ("[:age=3.2:]".equals(pattern)) { ! checkFrozen(); ! VersionInfo version = VersionInfo.getInstance("3.2"); ! applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); } else { ! throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern " ! + pattern + ")"); } ! return this; } //---------------------------------------------------------------- // Implementation: Utility methods //----------------------------------------------------------------
*** 1395,1405 **** } // swap list and buffer int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } // polarity = 0 is normal: x union y // polarity = 2: x union ~y --- 835,844 ----
*** 1493,1503 **** len = k; // swap list and buffer int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } // polarity = 0 is normal: x intersect y // polarity = 2: x intersect ~y == set-minus --- 932,941 ----
*** 1564,1574 **** len = k; // swap list and buffer int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } private static final int max(int a, int b) { return (a > b) ? a : b; --- 1002,1011 ----
*** 1580,1641 **** private static interface Filter { boolean contains(int codePoint); } ! // VersionInfo for unassigned characters ! static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); private static class VersionFilter implements Filter { VersionInfo version; - VersionFilter(VersionInfo version) { this.version = version; } - public boolean contains(int ch) { VersionInfo v = UCharacter.getAge(ch); // Reference comparison ok; VersionInfo caches and reuses // unique objects. return v != NO_VERSION && v.compareTo(version) <= 0; } } private static synchronized UnicodeSet getInclusions(int src) { ! if (INCLUSIONS == null) { ! INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT]; ! } ! if(INCLUSIONS[src] == null) { ! UnicodeSet incl = new UnicodeSet(); ! switch(src) { ! case UCharacterProperty.SRC_PROPSVEC: ! UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl); ! break; ! default: throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); } ! INCLUSIONS[src] = incl; } ! return INCLUSIONS[src]; } /** * Generic filter-based scanning code for UCD property UnicodeSets. */ private UnicodeSet applyFilter(Filter filter, int src) { ! // Walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is // true. Add each range to a set. // ! // To improve performance, use the INCLUSIONS set, which // encodes information about character ranges that are known ! // to have identical properties, such as the CJK Ideographs ! // from U+4E00 to U+9FA5. INCLUSIONS contains all characters ! // except the first characters of such ranges. ! // ! // TODO Where possible, instead of scanning over code points, ! // use internal property data to initialize UnicodeSets for ! // those properties. Scanning code points is slow. clear(); int startHasProperty = -1; UnicodeSet inclusions = getInclusions(src); --- 1017,1066 ---- private static interface Filter { boolean contains(int codePoint); } ! private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); private static class VersionFilter implements Filter { VersionInfo version; VersionFilter(VersionInfo version) { this.version = version; } public boolean contains(int ch) { VersionInfo v = UCharacter.getAge(ch); // Reference comparison ok; VersionInfo caches and reuses // unique objects. return v != NO_VERSION && v.compareTo(version) <= 0; } } private static synchronized UnicodeSet getInclusions(int src) { ! if (src != UCharacterProperty.SRC_PROPSVEC) { throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); } ! ! if (INCLUSION == null) { ! UnicodeSet incl = new UnicodeSet(); ! UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); ! INCLUSION = incl; } ! return INCLUSION; } /** * Generic filter-based scanning code for UCD property UnicodeSets. */ private UnicodeSet applyFilter(Filter filter, int src) { ! // Logically, walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is // true. Add each range to a set. // ! // To improve performance, use an inclusions set which // encodes information about character ranges that are known ! // to have identical properties. ! // getInclusions(src) contains exactly the first characters of ! // same-value ranges for the given properties "source". clear(); int startHasProperty = -1; UnicodeSet inclusions = getInclusions(src);
*** 1666,1871 **** return this; } /** ! * Remove leading and trailing rule white space and compress ! * internal rule white space to a single space character. * ! * @see UCharacterProperty#isRuleWhiteSpace */ ! private static String mungeCharName(String source) { ! StringBuffer buf = new StringBuffer(); ! for (int i=0; i<source.length(); ) { ! int ch = UTF16.charAt(source, i); ! i += UTF16.getCharCount(ch); ! if (UCharacterProperty.isRuleWhiteSpace(ch)) { ! if (buf.length() == 0 || ! buf.charAt(buf.length() - 1) == ' ') { ! continue; ! } ! ch = ' '; // convert to ' ' ! } ! UTF16.append(buf, ch); ! } ! if (buf.length() != 0 && ! buf.charAt(buf.length() - 1) == ' ') { ! buf.setLength(buf.length() - 1); ! } ! return buf.toString(); } /** ! * Modifies this set to contain those code points which have the ! * given value for the given property. Prior contents of this ! * set are lost. ! * @param propertyAlias the property alias ! * @param valueAlias the value alias ! * @param symbols if not null, then symbols are first called to see if a property ! * is available. If true, then everything else is skipped. ! * @return this set ! * @stable ICU 3.2 ! */ ! public UnicodeSet applyPropertyAlias(String propertyAlias, ! String valueAlias, SymbolTable symbols) { ! if (valueAlias.length() > 0) { ! if (propertyAlias.equals("Age")) { ! // Must munge name, since ! // VersionInfo.getInstance() does not do ! // 'loose' matching. ! VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); ! applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); ! return this; ! } } - throw new IllegalArgumentException("Unsupported property: " + propertyAlias); } ! /** ! * Return true if the given iterator appears to point at a ! * property pattern. Regardless of the result, return with the ! * iterator unchanged. ! * @param chars iterator over the pattern characters. Upon return ! * it will be unchanged. ! * @param iterOpts RuleCharacterIterator options ! */ ! private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, ! int iterOpts) { ! boolean result = false; ! iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; ! Object pos = chars.getPos(null); ! int c = chars.next(iterOpts); ! if (c == '[' || c == '\\') { ! int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); ! result = (c == '[') ? (d == ':') : ! (d == 'N' || d == 'p' || d == 'P'); } ! chars.setPos(pos); ! return result; } /** ! * Parse the given property pattern at the given parse position. ! * @param symbols TODO */ ! private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { ! int pos = ppos.getIndex(); ! ! // On entry, ppos should point to one of the following locations: ! // Minimum length is 5 characters, e.g. \p{L} ! if ((pos+5) > pattern.length()) { ! return null; ! } ! ! boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} ! boolean isName = false; // true for \N{pat}, o/w false ! boolean invert = false; ! ! // Look for an opening [:, [:^, \p, or \P ! if (pattern.regionMatches(pos, "[:", 0, 2)) { ! posix = true; ! pos = Utility.skipWhitespace(pattern, pos+2); ! if (pos < pattern.length() && pattern.charAt(pos) == '^') { ! ++pos; ! invert = true; ! } ! } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || ! pattern.regionMatches(pos, "\\N", 0, 2)) { ! char c = pattern.charAt(pos+1); ! invert = (c == 'P'); ! isName = (c == 'N'); ! pos = Utility.skipWhitespace(pattern, pos+2); ! if (pos == pattern.length() || pattern.charAt(pos++) != '{') { ! // Syntax error; "\p" or "\P" not followed by "{" ! return null; } ! } else { ! // Open delimiter not seen ! return null; } - - // Look for the matching close delimiter, either :] or } - int close = pattern.indexOf(posix ? ":]" : "}", pos); - if (close < 0) { - // Syntax error; close delimiter missing - return null; - } - - // Look for an '=' sign. If this is present, we will parse a - // medium \p{gc=Cf} or long \p{GeneralCategory=Format} - // pattern. - int equals = pattern.indexOf('=', pos); - String propName, valueName; - if (equals >= 0 && equals < close && !isName) { - // Equals seen; parse medium/long pattern - propName = pattern.substring(pos, equals); - valueName = pattern.substring(equals+1, close); } ! else { ! // Handle case where no '=' is seen, and \N{} ! propName = pattern.substring(pos, close); ! valueName = ""; ! // Handle \N{name} ! if (isName) { ! // This is a little inefficient since it means we have to ! // parse "na" back to UProperty.NAME even though we already ! // know it's UProperty.NAME. If we refactor the API to ! // support args of (int, String) then we can remove ! // "na" and make this a little more efficient. ! valueName = propName; ! propName = "na"; } } ! applyPropertyAlias(propName, valueName, symbols); ! ! if (invert) { ! complement(); } ! // Move to the limit position after the close delimiter ! ppos.setIndex(close + (posix ? 2 : 1)); ! return this; } /** ! * Parse a property pattern. ! * @param chars iterator over the pattern characters. Upon return ! * it will be advanced to the first character after the parsed ! * pattern, or the end of the iteration if all characters are ! * parsed. ! * @param rebuiltPat the pattern that was parsed, rebuilt or ! * copied from the input pattern, as appropriate. ! * @param symbols TODO ! */ ! private void applyPropertyPattern(RuleCharacterIterator chars, ! StringBuffer rebuiltPat, SymbolTable symbols) { ! String patStr = chars.lookahead(); ! ParsePosition pos = new ParsePosition(0); ! applyPropertyPattern(patStr, pos, symbols); ! if (pos.getIndex() == 0) { ! syntaxError(chars, "Invalid property pattern"); } - chars.jumpahead(pos.getIndex()); - rebuiltPat.append(patStr, 0, pos.getIndex()); } ! //---------------------------------------------------------------- ! // Case folding API ! //---------------------------------------------------------------- /** ! * Bitmask for constructor and applyPattern() indicating that ! * white space should be ignored. If set, ignore characters for ! * which UCharacterProperty.isRuleWhiteSpace() returns true, ! * unless they are quoted or escaped. This may be ORed together ! * with other selectors. ! * @stable ICU 3.8 */ ! public static final int IGNORE_SPACE = 1; ! } --- 1091,1407 ---- return this; } /** ! * Is this frozen, according to the Freezable interface? * ! * @return value ! * @stable ICU 3.8 */ ! public boolean isFrozen() { ! return (bmpSet != null || stringSpan != null); } /** ! * Freeze this class, according to the Freezable interface. ! * ! * @return this ! * @stable ICU 4.4 ! */ ! public UnicodeSet freeze() { ! if (!isFrozen()) { ! // Do most of what compact() does before freezing because ! // compact() will not work when the set is frozen. ! // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). ! ! // Delete buffer first to defragment memory less. ! buffer = null; ! if (list.length > (len + GROW_EXTRA)) { ! // Make the capacity equal to len or 1. ! // We don't want to realloc of 0 size. ! int capacity = (len == 0) ? 1 : len; ! int[] oldList = list; ! list = new int[capacity]; ! for (int i = capacity; i-- > 0;) { ! list[i] = oldList[i]; } } ! // Optimize contains() and span() and similar functions. ! if (!strings.isEmpty()) { ! stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL); ! } ! if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { ! // Optimize for code point spans. ! // There are no strings, or ! // all strings are irrelevant for span() etc. because ! // all of each string's code points are contained in this set. ! // However, fully contained strings are relevant for spanAndCount(), ! // so we create both objects. ! bmpSet = new BMPSet(list, len); } ! } ! return this; } /** ! * Span a string using this UnicodeSet. ! * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. ! * @param s The string to be spanned ! * @param spanCondition The span condition ! * @return the length of the span ! * @stable ICU 4.4 */ ! public int span(CharSequence s, SpanCondition spanCondition) { ! return span(s, 0, spanCondition); ! } ! /** ! * Span a string using this UnicodeSet. ! * If the start index is less than 0, span will start from 0. ! * If the start index is greater than the string length, span returns the string length. ! * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. ! * @param s The string to be spanned ! * @param start The start index that the span begins ! * @param spanCondition The span condition ! * @return the string index which ends the span (i.e. exclusive) ! * @stable ICU 4.4 ! */ ! public int span(CharSequence s, int start, SpanCondition spanCondition) { ! int end = s.length(); ! if (start < 0) { ! start = 0; ! } else if (start >= end) { ! return end; } ! if (bmpSet != null) { ! // Frozen set without strings, or no string is relevant for span(). ! return bmpSet.span(s, start, spanCondition, null); ! } ! if (stringSpan != null) { ! return stringSpan.span(s, start, spanCondition); ! } else if (!strings.isEmpty()) { ! int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED ! : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; ! UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); ! if (strSpan.needsStringSpanUTF16()) { ! return strSpan.span(s, start, spanCondition); } } ! return spanCodePointsAndCount(s, start, spanCondition, null); ! } ! /** ! * Same as span() but also counts the smallest number of set elements on any path across the span. ! * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. ! * @param outCount An output-only object (must not be null) for returning the count. ! * @return the limit (exclusive end) of the span ! */ ! public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { ! if (outCount == null) { ! throw new IllegalArgumentException("outCount must not be null"); } + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (stringSpan != null) { + // We might also have bmpSet != null, + // but fully-contained strings are relevant for counting elements. + return stringSpan.spanAndCount(s, start, spanCondition, outCount); + } else if (bmpSet != null) { + return bmpSet.span(s, start, spanCondition, outCount); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + which |= UnicodeSetStringSpan.WITH_COUNT; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); + return strSpan.spanAndCount(s, start, spanCondition, outCount); } ! return spanCodePointsAndCount(s, start, spanCondition, outCount); } ! private int spanCodePointsAndCount(CharSequence s, int start, ! SpanCondition spanCondition, OutputInt outCount) { ! // Pin to 0/1 values. ! boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); ! int c; ! int next = start; ! int length = s.length(); ! int count = 0; ! do { ! c = Character.codePointAt(s, next); ! if (spanContained != contains(c)) { ! break; ! } ! ++count; ! next += Character.charCount(c); ! } while (next < length); ! if (outCount != null) { outCount.value = count; } ! return next; } /** ! * Span a string backwards (from the fromIndex) using this UnicodeSet. ! * If the fromIndex is less than 0, spanBack will return 0. ! * If fromIndex is greater than the string length, spanBack will start from the string length. ! * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. ! * @param s The string to be spanned ! * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards ! * @param spanCondition The span condition ! * @return The string index which starts the span (i.e. inclusive). ! * @stable ICU 4.4 ! */ ! public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { ! if (fromIndex <= 0) { ! return 0; ! } ! if (fromIndex > s.length()) { ! fromIndex = s.length(); ! } ! if (bmpSet != null) { ! // Frozen set without strings, or no string is relevant for spanBack(). ! return bmpSet.spanBack(s, fromIndex, spanCondition); ! } ! if (stringSpan != null) { ! return stringSpan.spanBack(s, fromIndex, spanCondition); ! } else if (!strings.isEmpty()) { ! int which = (spanCondition == SpanCondition.NOT_CONTAINED) ! ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED ! : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; ! UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); ! if (strSpan.needsStringSpanUTF16()) { ! return strSpan.spanBack(s, fromIndex, spanCondition); } } ! // Pin to 0/1 values. ! boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); ! ! int c; ! int prev = fromIndex; ! do { ! c = Character.codePointBefore(s, prev); ! if (spanContained != contains(c)) { ! break; ! } ! prev -= Character.charCount(c); ! } while (prev > 0); ! return prev; ! } /** ! * Clone a thawed version of this class, according to the Freezable interface. ! * @return the clone, not frozen ! * @stable ICU 4.4 */ ! public UnicodeSet cloneAsThawed() { ! UnicodeSet result = new UnicodeSet(this); ! assert !result.isFrozen(); ! return result; ! } ! // internal function ! private void checkFrozen() { ! if (isFrozen()) { ! throw new UnsupportedOperationException("Attempt to modify frozen object"); ! } ! } ! ! /** ! * Argument values for whether span() and similar functions continue while the current character is contained vs. ! * not contained in the set. ! * <p> ! * The functionality is straightforward for sets with only single code points, without strings (which is the common ! * case): ! * <ul> ! * <li>CONTAINED and SIMPLE work the same. ! * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. ! * <li>span() and spanBack() partition any string the ! * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). ! * <li>Using a ! * complemented (inverted) set and the opposite span conditions yields the same results. ! * </ul> ! * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in ! * the set (for example, whether they overlap with each other) and the string that is processed. For a set with ! * strings: ! * <ul> ! * <li>The complement of the set contains the opposite set of code points, but the same set of strings. ! * Therefore, complementing both the set and the span conditions may yield different results. ! * <li>When starting spans ! * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different ! * because a set string may start before the later position. ! * <li>span(SIMPLE) may be shorter than ! * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which ! * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", ! * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). ! * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, ! * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield ! * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. ! * </ul> ! * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then ! * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could ! * be used. ! * <p> ! * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point ! * boundaries, never in the middle of a surrogate pair. ! * ! * @stable ICU 4.4 ! */ ! public enum SpanCondition { ! /** ! * Continues a span() while there is no set element at the current position. ! * Increments by one code point at a time. ! * Stops before the first set element (character or string). ! * (For code points only, this is like while contains(current)==false). ! * <p> ! * When span() returns, the substring between where it started and the position it returned consists only of ! * characters that are not in the set, and none of its strings overlap with the span. ! * ! * @stable ICU 4.4 ! */ ! NOT_CONTAINED, ! ! /** ! * Spans the longest substring that is a concatenation of set elements (characters or strings). ! * (For characters only, this is like while contains(current)==true). ! * <p> ! * When span() returns, the substring between where it started and the position it returned consists only of set ! * elements (characters or strings) that are in the set. ! * <p> ! * If a set contains strings, then the span will be the longest substring for which there ! * exists at least one non-overlapping concatenation of set elements (characters or strings). ! * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. ! * (Java/ICU/Perl regex stops at the first match of an OR.) ! * ! * @stable ICU 4.4 ! */ ! CONTAINED, ! ! /** ! * Continues a span() while there is a set element at the current position. ! * Increments by the longest matching element at each position. ! * (For characters only, this is like while contains(current)==true). ! * <p> ! * When span() returns, the substring between where it started and the position it returned consists only of set ! * elements (characters or strings) that are in the set. ! * <p> ! * If a set only contains single characters, then this is the same as CONTAINED. ! * <p> ! * If a set contains strings, then the span will be the longest substring with a match at each position with the ! * longest single set element (character or string). ! * <p> ! * Use this span condition together with other longest-match algorithms, such as ICU converters ! * (ucnv_getUnicodeSet()). ! * ! * @stable ICU 4.4 ! */ ! SIMPLE, ! } + }
< prev index next >