1 /*
   2  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * Copyright (C) 1996-2015, International Business Machines Corporation and
  29  * others. All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 package jdk.internal.icu.text;
  33 
  34 import java.text.ParsePosition;
  35 import java.util.ArrayList;
  36 import java.util.TreeSet;
  37 
  38 import jdk.internal.icu.impl.BMPSet;
  39 import jdk.internal.icu.impl.UCharacterProperty;
  40 import jdk.internal.icu.impl.UnicodeSetStringSpan;
  41 import jdk.internal.icu.impl.Utility;
  42 import jdk.internal.icu.lang.UCharacter;
  43 import jdk.internal.icu.util.OutputInt;
  44 import jdk.internal.icu.util.VersionInfo;
  45 
  46 /**
  47  * A mutable set of Unicode characters and multicharacter strings.
  48  * Objects of this class represent <em>character classes</em> used
  49  * in regular expressions. A character specifies a subset of Unicode
  50  * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
  51  *
  52  * Note: method freeze() will not only make the set immutable, but
  53  * also makes important methods much higher performance:
  54  * contains(c), containsNone(...), span(...), spanBack(...) etc.
  55  * After the object is frozen, any subsequent call that wants to change
  56  * the object will throw UnsupportedOperationException.
  57  *
  58  * <p>The UnicodeSet class is not designed to be subclassed.
  59  *
  60  * <p><code>UnicodeSet</code> supports two APIs. The first is the
  61  * <em>operand</em> API that allows the caller to modify the value of
  62  * a <code>UnicodeSet</code> object. It conforms to Java 2's
  63  * <code>java.util.Set</code> interface, although
  64  * <code>UnicodeSet</code> does not actually implement that
  65  * interface. All methods of <code>Set</code> are supported, with the
  66  * modification that they take a character range or single character
  67  * instead of an <code>Object</code>, and they take a
  68  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
  69  * operand API may be thought of in terms of boolean logic: a boolean
  70  * OR is implemented by <code>add</code>, a boolean AND is implemented
  71  * by <code>retain</code>, a boolean XOR is implemented by
  72  * <code>complement</code> taking an argument, and a boolean NOT is
  73  * implemented by <code>complement</code> with no argument.  In terms
  74  * of traditional set theory function names, <code>add</code> is a
  75  * union, <code>retain</code> is an intersection, <code>remove</code>
  76  * is an asymmetric difference, and <code>complement</code> with no
  77  * argument is a set complement with respect to the superset range
  78  * <code>MIN_VALUE-MAX_VALUE</code>
  79  *
  80  * <p>The second API is the
  81  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
  82  * <code>java.text.Format</code>-derived classes.  Unlike the
  83  * methods that add characters, add categories, and control the logic
  84  * of the set, the method <code>applyPattern()</code> sets all
  85  * attributes of a <code>UnicodeSet</code> at once, based on a
  86  * string pattern.
  87  *
  88  * <p><b>Pattern syntax</b></p>
  89  *
  90  * Patterns are accepted by the constructors and the
  91  * <code>applyPattern()</code> methods and returned by the
  92  * <code>toPattern()</code> method.  These patterns follow a syntax
  93  * similar to that employed by version 8 regular expression character
  94  * classes.  Here are some simple examples:
  95  *
  96  * <blockquote>
  97  *   <table>
  98  *     <tr align="top">
  99  *       <td nowrap valign="top" align="left"><code>[]</code></td>
 100  *       <td valign="top">No characters</td>
 101  *     </tr><tr align="top">
 102  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
 103  *       <td valign="top">The character 'a'</td>
 104  *     </tr><tr align="top">
 105  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
 106  *       <td valign="top">The characters 'a' and 'e'</td>
 107  *     </tr>
 108  *     <tr>
 109  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
 110  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
 111  *       point order</td>
 112  *     </tr>
 113  *     <tr>
 114  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
 115  *       <td valign="top">The character U+4E01</td>
 116  *     </tr>
 117  *     <tr>
 118  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
 119  *       <td valign="top">The character 'a' and the multicharacter strings "ab" and
 120  *       "ac"</td>
 121  *     </tr>
 122  *     <tr>
 123  *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
 124  *       <td valign="top">All characters in the general category Uppercase Letter</td>
 125  *     </tr>
 126  *   </table>
 127  * </blockquote>
 128  *
 129  * Any character may be preceded by a backslash in order to remove any special
 130  * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
 131  * ignored, unless they are escaped.
 132  *
 133  * <p>Property patterns specify a set of characters having a certain
 134  * property as defined by the Unicode standard.  Both the POSIX-like
 135  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
 136  * complete list of supported property patterns, see the User's Guide
 137  * for UnicodeSet at
 138  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
 139  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
 140  * Actual determination of property data is defined by the underlying
 141  * Unicode database as implemented by UCharacter.
 142  *
 143  * <p>Patterns specify individual characters, ranges of characters, and
 144  * Unicode property sets.  When elements are concatenated, they
 145  * specify their union.  To complement a set, place a '^' immediately
 146  * after the opening '['.  Property patterns are inverted by modifying
 147  * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
 148  * '^' has no special meaning.
 149  *
 150  * <p>Ranges are indicated by placing two a '-' between two
 151  * characters, as in "a-z".  This specifies the range of all
 152  * characters from the left to the right, in Unicode order.  If the
 153  * left character is greater than or equal to the
 154  * right character it is a syntax error.  If a '-' occurs as the first
 155  * character after the opening '[' or '[^', or if it occurs as the
 156  * last character before the closing ']', then it is taken as a
 157  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 158  * set of three characters, 'a', 'b', and '-'.
 159  *
 160  * <p>Sets may be intersected using the {@literal '&'} operator or the asymmetric
 161  * set difference may be taken using the '-' operator, for example,
 162  * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters
 163  * with values less than 4096.  Operators ({@literal '&'} and '|') have equal
 164  * precedence and bind left-to-right.  Thus
 165  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 166  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 167  * difference; intersection is commutative.
 168  *
 169  * <table>
 170  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
 171  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
 172  * through 'z' and all letters in between, in Unicode order
 173  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 174  * all characters but 'a' through 'z',
 175  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 176  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 177  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 178  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
 179  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 180  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 181  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 182  * <em>pat2</em>
 183  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
 184  * <td>The set of characters having the specified
 185  * Unicode property; in
 186  * this case, Unicode uppercase letters
 187  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
 188  * <td>The set of characters <em>not</em> having the given
 189  * Unicode property
 190  * </table>
 191  *
 192  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
 193  *
 194  * <p><b>Formal syntax</b></p>
 195  *
 196  * <blockquote>
 197  *   <table>
 198  *     <tr align="top">
 199  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
 200  *       <td valign="top"><code>('[' '^'? item* ']') |
 201  *       property</code></td>
 202  *     </tr>
 203  *     <tr align="top">
 204  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
 205  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
 206  *       </code></td>
 207  *     </tr>
 208  *     <tr align="top">
 209  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
 210  *       <td valign="top"><code>pattern | pattern-expr pattern |
 211  *       pattern-expr op pattern<br>
 212  *       </code></td>
 213  *     </tr>
 214  *     <tr align="top">
 215  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
 216  *       <td valign="top"><code>'&amp;' | '-'<br>
 217  *       </code></td>
 218  *     </tr>
 219  *     <tr align="top">
 220  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
 221  *       <td valign="top"><code>'[' | ']' | '-'<br>
 222  *       </code></td>
 223  *     </tr>
 224  *     <tr align="top">
 225  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
 226  *       <td valign="top"><em>any character that is not</em><code> special<br>
 227  *       | ('\\' </code><em>any character</em><code>)<br>
 228  *       | ('\u' hex hex hex hex)<br>
 229  *       </code></td>
 230  *     </tr>
 231  *     <tr align="top">
 232  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
 233  *       <td valign="top"><em>any character for which
 234  *       </em><code>Character.digit(c, 16)</code><em>
 235  *       returns a non-negative result</em></td>
 236  *     </tr>
 237  *     <tr>
 238  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
 239  *       <td valign="top"><em>a Unicode property set pattern</em></td>
 240  *     </tr>
 241  *   </table>
 242  *   <br>
 243  *   <table border="1">
 244  *     <tr>
 245  *       <td>Legend: <table>
 246  *         <tr>
 247  *           <td nowrap valign="top"><code>a := b</code></td>
 248  *           <td width="20" valign="top">&nbsp; </td>
 249  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
 250  *         </tr>
 251  *         <tr>
 252  *           <td nowrap valign="top"><code>a?</code></td>
 253  *           <td valign="top"></td>
 254  *           <td valign="top">zero or one instance of <code>a</code><br>
 255  *           </td>
 256  *         </tr>
 257  *         <tr>
 258  *           <td nowrap valign="top"><code>a*</code></td>
 259  *           <td valign="top"></td>
 260  *           <td valign="top">one or more instances of <code>a</code><br>
 261  *           </td>
 262  *         </tr>
 263  *         <tr>
 264  *           <td nowrap valign="top"><code>a | b</code></td>
 265  *           <td valign="top"></td>
 266  *           <td valign="top">either <code>a</code> or <code>b</code><br>
 267  *           </td>
 268  *         </tr>
 269  *         <tr>
 270  *           <td nowrap valign="top"><code>'a'</code></td>
 271  *           <td valign="top"></td>
 272  *           <td valign="top">the literal string between the quotes </td>
 273  *         </tr>
 274  *       </table>
 275  *       </td>
 276  *     </tr>
 277  *   </table>
 278  * </blockquote>
 279  * <p>To iterate over contents of UnicodeSet, the following are available:
 280  * <ul><li>{@link #ranges()} to iterate through the ranges</li>
 281  * <li>{@link #strings()} to iterate through the strings</li>
 282  * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
 283  * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
 284  * </ul>
 285  * All of the above can be used in <b>for</b> loops.
 286  * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
 287  * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 288  *
 289  * @author Alan Liu
 290  * @stable ICU 2.0
 291  */
 292 public class UnicodeSet {
 293 
 294     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
 295     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
 296     // 110000 for codepoints
 297 
 298     /**
 299      * Minimum value that can be stored in a UnicodeSet.
 300      * @stable ICU 2.0
 301      */
 302     public static final int MIN_VALUE = LOW;
 303 
 304     /**
 305      * Maximum value that can be stored in a UnicodeSet.
 306      * @stable ICU 2.0
 307      */
 308     public static final int MAX_VALUE = HIGH - 1;
 309 
 310     private int len;      // length used; list may be longer to minimize reallocs
 311     private int[] list;   // MUST be terminated with HIGH
 312     private int[] rangeList; // internal buffer
 313     private int[] buffer; // internal buffer
 314 
 315     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
 316     // is not private so that UnicodeSetIterator can get access
 317     TreeSet<String> strings = new TreeSet<String>();
 318 
 319     /**
 320      * The pattern representation of this set.  This may not be the
 321      * most economical pattern.  It is the pattern supplied to
 322      * applyPattern(), with variables substituted and whitespace
 323      * removed.  For sets constructed without applyPattern(), or
 324      * modified using the non-pattern API, this string will be null,
 325      * indicating that toPattern() must generate a pattern
 326      * representation from the inversion list.
 327      */
 328 
 329     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
 330     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
 331 
 332     private static UnicodeSet INCLUSION = null;
 333 
 334     private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
 335     private volatile UnicodeSetStringSpan stringSpan;
 336 
 337     //----------------------------------------------------------------
 338     // Public API
 339     //----------------------------------------------------------------
 340 
 341     /**
 342      * Constructs an empty set.
 343      * @stable ICU 2.0
 344      */
 345     private UnicodeSet() {
 346         list = new int[1 + START_EXTRA];
 347         list[len++] = HIGH;
 348     }
 349 
 350     /**
 351      * Constructs a copy of an existing set.
 352      * @stable ICU 2.0
 353      */
 354     private UnicodeSet(UnicodeSet other) {
 355         set(other);
 356     }
 357 
 358     /**
 359      * Constructs a set containing the given range. If <code>end >
 360      * start</code> then an empty set is created.
 361      *
 362      * @param start first character, inclusive, of range
 363      * @param end last character, inclusive, of range
 364      * @stable ICU 2.0
 365      */
 366     public UnicodeSet(int start, int end) {
 367         this();
 368         complement(start, end);
 369     }
 370 
 371     /**
 372      * Constructs a set from the given pattern.  See the class description
 373      * for the syntax of the pattern language.  Whitespace is ignored.
 374      * @param pattern a string specifying what characters are in the set
 375      * @exception java.lang.IllegalArgumentException if the pattern contains
 376      * a syntax error.
 377      * @stable ICU 2.0
 378      */
 379     public UnicodeSet(String pattern) {
 380         this();
 381         applyPattern(pattern, null);
 382     }
 383 
 384     /**
 385      * Make this object represent the same set as <code>other</code>.
 386      * @param other a <code>UnicodeSet</code> whose value will be
 387      * copied to this object
 388      * @stable ICU 2.0
 389      */
 390     public UnicodeSet set(UnicodeSet other) {
 391         checkFrozen();
 392         list = other.list.clone();
 393         len = other.len;
 394         strings = new TreeSet<String>(other.strings);
 395         return this;
 396     }
 397 
 398     /**
 399      * Returns the number of elements in this set (its cardinality)
 400      * Note than the elements of a set may include both individual
 401      * codepoints and strings.
 402      *
 403      * @return the number of elements in this set (its cardinality).
 404      * @stable ICU 2.0
 405      */
 406     public int size() {
 407         int n = 0;
 408         int count = getRangeCount();
 409         for (int i = 0; i < count; ++i) {
 410             n += getRangeEnd(i) - getRangeStart(i) + 1;
 411         }
 412         return n + strings.size();
 413     }
 414 
 415     // for internal use, after checkFrozen has been called
 416     private UnicodeSet add_unchecked(int start, int end) {
 417         if (start < MIN_VALUE || start > MAX_VALUE) {
 418             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 419         }
 420         if (end < MIN_VALUE || end > MAX_VALUE) {
 421             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 422         }
 423         if (start < end) {
 424             add(range(start, end), 2, 0);
 425         } else if (start == end) {
 426             add(start);
 427         }
 428         return this;
 429     }
 430 
 431     /**
 432      * Adds the specified character to this set if it is not already
 433      * present.  If this set already contains the specified character,
 434      * the call leaves this set unchanged.
 435      * @stable ICU 2.0
 436      */
 437     public final UnicodeSet add(int c) {
 438         checkFrozen();
 439         return add_unchecked(c);
 440     }
 441 
 442     // for internal use only, after checkFrozen has been called
 443     private final UnicodeSet add_unchecked(int c) {
 444         if (c < MIN_VALUE || c > MAX_VALUE) {
 445             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
 446         }
 447 
 448         // find smallest i such that c < list[i]
 449         // if odd, then it is IN the set
 450         // if even, then it is OUT of the set
 451         int i = findCodePoint(c);
 452 
 453         // already in set?
 454         if ((i & 1) != 0) return this;
 455 
 456         // HIGH is 0x110000
 457         // assert(list[len-1] == HIGH);
 458 
 459         // empty = [HIGH]
 460         // [start_0, limit_0, start_1, limit_1, HIGH]
 461 
 462         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 463         //                             ^
 464         //                             list[i]
 465 
 466         // i == 0 means c is before the first range
 467 
 468         if (c == list[i]-1) {
 469             // c is before start of next range
 470             list[i] = c;
 471             // if we touched the HIGH mark, then add a new one
 472             if (c == MAX_VALUE) {
 473                 ensureCapacity(len+1);
 474                 list[len++] = HIGH;
 475             }
 476             if (i > 0 && c == list[i-1]) {
 477                 // collapse adjacent ranges
 478 
 479                 // [..., start_k-1, c, c, limit_k, ..., HIGH]
 480                 //                     ^
 481                 //                     list[i]
 482                 System.arraycopy(list, i+1, list, i-1, len-i-1);
 483                 len -= 2;
 484             }
 485         }
 486 
 487         else if (i > 0 && c == list[i-1]) {
 488             // c is after end of prior range
 489             list[i-1]++;
 490             // no need to chcek for collapse here
 491         }
 492 
 493         else {
 494             // At this point we know the new char is not adjacent to
 495             // any existing ranges, and it is not 10FFFF.
 496 
 497 
 498             // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 499             //                             ^
 500             //                             list[i]
 501 
 502             // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 503             //                             ^
 504             //                             list[i]
 505 
 506             // Don't use ensureCapacity() to save on copying.
 507             // NOTE: This has no measurable impact on performance,
 508             // but it might help in some usage patterns.
 509             if (len+2 > list.length) {
 510                 int[] temp = new int[len + 2 + GROW_EXTRA];
 511                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
 512                 System.arraycopy(list, i, temp, i+2, len-i);
 513                 list = temp;
 514             } else {
 515                 System.arraycopy(list, i, list, i+2, len-i);
 516             }
 517 
 518             list[i] = c;
 519             list[i+1] = c+1;
 520             len += 2;
 521         }
 522 
 523         return this;
 524     }
 525 
 526     /**
 527      * Adds the specified multicharacter to this set if it is not already
 528      * present.  If this set already contains the multicharacter,
 529      * the call leaves this set unchanged.
 530      * Thus {@code "ch" => {"ch"}}
 531      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 532      * @param s the source string
 533      * @return this object, for chaining
 534      * @stable ICU 2.0
 535      */
 536     public final UnicodeSet add(CharSequence s) {
 537         checkFrozen();
 538         int cp = getSingleCP(s);
 539         if (cp < 0) {
 540             strings.add(s.toString());
 541         } else {
 542             add_unchecked(cp, cp);
 543         }
 544         return this;
 545     }
 546 
 547     /**
 548      * Utility for getting code point from single code point CharSequence.
 549      * See the public UTF16.getSingleCodePoint()
 550      * @return a code point IF the string consists of a single one.
 551      * otherwise returns -1.
 552      * @param s to test
 553      */
 554     private static int getSingleCP(CharSequence s) {
 555         if (s.length() < 1) {
 556             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 557         }
 558         if (s.length() > 2) return -1;
 559         if (s.length() == 1) return s.charAt(0);
 560 
 561         // at this point, len = 2
 562         int cp = UTF16.charAt(s, 0);
 563         if (cp > 0xFFFF) { // is surrogate pair
 564             return cp;
 565         }
 566         return -1;
 567     }
 568 
 569     /**
 570      * Complements the specified range in this set.  Any character in
 571      * the range will be removed if it is in this set, or will be
 572      * added if it is not in this set.  If {@code end > start}
 573      * then an empty range is complemented, leaving the set unchanged.
 574      *
 575      * @param start first character, inclusive, of range to be removed
 576      * from this set.
 577      * @param end last character, inclusive, of range to be removed
 578      * from this set.
 579      * @stable ICU 2.0
 580      */
 581     public UnicodeSet complement(int start, int end) {
 582         checkFrozen();
 583         if (start < MIN_VALUE || start > MAX_VALUE) {
 584             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 585         }
 586         if (end < MIN_VALUE || end > MAX_VALUE) {
 587             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 588         }
 589         if (start <= end) {
 590             xor(range(start, end), 2, 0);
 591         }
 592         return this;
 593     }
 594 
 595     /**
 596      * Returns true if this set contains the given character.
 597      * @param c character to be checked for containment
 598      * @return true if the test condition is met
 599      * @stable ICU 2.0
 600      */
 601     public boolean contains(int c) {
 602         if (c < MIN_VALUE || c > MAX_VALUE) {
 603             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
 604         }
 605         if (bmpSet != null) {
 606             return bmpSet.contains(c);
 607         }
 608         if (stringSpan != null) {
 609             return stringSpan.contains(c);
 610         }
 611 
 612         /*
 613         // Set i to the index of the start item greater than ch
 614         // We know we will terminate without length test!
 615         int i = -1;
 616         while (true) {
 617             if (c < list[++i]) break;
 618         }
 619          */
 620 
 621         int i = findCodePoint(c);
 622 
 623         return ((i & 1) != 0); // return true if odd
 624     }
 625 
 626     /**
 627      * Returns the smallest value i such that c < list[i].  Caller
 628      * must ensure that c is a legal value or this method will enter
 629      * an infinite loop.  This method performs a binary search.
 630      * @param c a character in the range MIN_VALUE..MAX_VALUE
 631      * inclusive
 632      * @return the smallest integer i in the range 0..len-1,
 633      * inclusive, such that c < list[i]
 634      */
 635     private final int findCodePoint(int c) {
 636         /* Examples:
 637                                            findCodePoint(c)
 638            set              list[]         c=0 1 3 4 7 8
 639            ===              ==============   ===========
 640            []               [110000]         0 0 0 0 0 0
 641            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 642            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 643            [:all:]          [0, 110000]      1 1 1 1 1 1
 644          */
 645 
 646         // Return the smallest i such that c < list[i].  Assume
 647         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 648         if (c < list[0]) return 0;
 649         // High runner test.  c is often after the last range, so an
 650         // initial check for this condition pays off.
 651         if (len >= 2 && c >= list[len-2]) return len-1;
 652         int lo = 0;
 653         int hi = len - 1;
 654         // invariant: c >= list[lo]
 655         // invariant: c < list[hi]
 656         for (;;) {
 657             int i = (lo + hi) >>> 1;
 658         if (i == lo) return hi;
 659             if (c < list[i]) {
 660                 hi = i;
 661             } else {
 662                 lo = i;
 663             }
 664         }
 665     }
 666 
 667     /**
 668      * Retains only the elements in this set that are contained in the
 669      * specified set.  In other words, removes from this set all of
 670      * its elements that are not contained in the specified set.  This
 671      * operation effectively modifies this set so that its value is
 672      * the <i>intersection</i> of the two sets.
 673      *
 674      * @param c set that defines which elements this set will retain.
 675      * @stable ICU 2.0
 676      */
 677     public UnicodeSet retainAll(UnicodeSet c) {
 678         checkFrozen();
 679         retain(c.list, c.len, 0);
 680         strings.retainAll(c.strings);
 681         return this;
 682     }
 683 
 684     /**
 685      * Removes all of the elements from this set.  This set will be
 686      * empty after this call returns.
 687      * @stable ICU 2.0
 688      */
 689     public UnicodeSet clear() {
 690         checkFrozen();
 691         list[0] = HIGH;
 692         len = 1;
 693         strings.clear();
 694         return this;
 695     }
 696 
 697     /**
 698      * Iteration method that returns the number of ranges contained in
 699      * this set.
 700      * @see #getRangeStart
 701      * @see #getRangeEnd
 702      * @stable ICU 2.0
 703      */
 704     public int getRangeCount() {
 705         return len/2;
 706     }
 707 
 708     /**
 709      * Iteration method that returns the first character in the
 710      * specified range of this set.
 711      * @exception ArrayIndexOutOfBoundsException if index is outside
 712      * the range <code>0..getRangeCount()-1</code>
 713      * @see #getRangeCount
 714      * @see #getRangeEnd
 715      * @stable ICU 2.0
 716      */
 717     public int getRangeStart(int index) {
 718         return list[index*2];
 719     }
 720 
 721     /**
 722      * Iteration method that returns the last character in the
 723      * specified range of this set.
 724      * @exception ArrayIndexOutOfBoundsException if index is outside
 725      * the range <code>0..getRangeCount()-1</code>
 726      * @see #getRangeStart
 727      * @see #getRangeEnd
 728      * @stable ICU 2.0
 729      */
 730     public int getRangeEnd(int index) {
 731         return (list[index*2 + 1] - 1);
 732     }
 733 
 734     //----------------------------------------------------------------
 735     // Implementation: Pattern parsing
 736     //----------------------------------------------------------------
 737 
 738     /**
 739      * Parses the given pattern, starting at the given position.  The character
 740      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
 741      * Parsing continues until the corresponding closing ']'.  If a syntax error
 742      * is encountered between the opening and closing brace, the parse fails.
 743      * Upon return from a successful parse, the ParsePosition is updated to
 744      * point to the character following the closing ']', and an inversion
 745      * list for the parsed pattern is returned.  This method
 746      * calls itself recursively to parse embedded subpatterns.
 747      *
 748      * @param pattern the string containing the pattern to be parsed.  The
 749      * portion of the string from pos.getIndex(), which must be a '[', to the
 750      * corresponding closing ']', is parsed.
 751      * @param pos upon entry, the position at which to being parsing.  The
 752      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
 753      * from a successful parse, pos.getIndex() is either the character after the
 754      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
 755      * is the last character of the pattern string.
 756      * @return an inversion list for the parsed substring
 757      * of <code>pattern</code>
 758      * @exception java.lang.IllegalArgumentException if the parse fails.
 759      */
 760     private UnicodeSet applyPattern(String pattern,
 761             ParsePosition pos) {
 762         if ("[:age=3.2:]".equals(pattern)) {
 763             checkFrozen();
 764             VersionInfo version = VersionInfo.getInstance("3.2");
 765             applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
 766         } else {
 767             throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
 768                           + pattern + ")");
 769         }
 770 
 771         return this;
 772     }
 773 
 774     //----------------------------------------------------------------
 775     // Implementation: Utility methods
 776     //----------------------------------------------------------------
 777 
 778     private void ensureCapacity(int newLen) {
 779         if (newLen <= list.length) return;
 780         int[] temp = new int[newLen + GROW_EXTRA];
 781         System.arraycopy(list, 0, temp, 0, len);
 782         list = temp;
 783     }
 784 
 785     private void ensureBufferCapacity(int newLen) {
 786         if (buffer != null && newLen <= buffer.length) return;
 787         buffer = new int[newLen + GROW_EXTRA];
 788     }
 789 
 790     /**
 791      * Assumes start <= end.
 792      */
 793     private int[] range(int start, int end) {
 794         if (rangeList == null) {
 795             rangeList = new int[] { start, end+1, HIGH };
 796         } else {
 797             rangeList[0] = start;
 798             rangeList[1] = end+1;
 799         }
 800         return rangeList;
 801     }
 802 
 803     //----------------------------------------------------------------
 804     // Implementation: Fundamental operations
 805     //----------------------------------------------------------------
 806 
 807     // polarity = 0, 3 is normal: x xor y
 808     // polarity = 1, 2: x xor ~y == x === y
 809 
 810     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
 811         ensureBufferCapacity(len + otherLen);
 812         int i = 0, j = 0, k = 0;
 813         int a = list[i++];
 814         int b;
 815         if (polarity == 1 || polarity == 2) {
 816             b = LOW;
 817             if (other[j] == LOW) { // skip base if already LOW
 818                 ++j;
 819                 b = other[j];
 820             }
 821         } else {
 822             b = other[j++];
 823         }
 824         // simplest of all the routines
 825         // sort the values, discarding identicals!
 826         while (true) {
 827             if (a < b) {
 828                 buffer[k++] = a;
 829                 a = list[i++];
 830             } else if (b < a) {
 831                 buffer[k++] = b;
 832                 b = other[j++];
 833             } else if (a != HIGH) { // at this point, a == b
 834                 // discard both values!
 835                 a = list[i++];
 836                 b = other[j++];
 837             } else { // DONE!
 838                 buffer[k++] = HIGH;
 839                 len = k;
 840                 break;
 841             }
 842         }
 843         // swap list and buffer
 844         int[] temp = list;
 845         list = buffer;
 846         buffer = temp;
 847         return this;
 848     }
 849 
 850     // polarity = 0 is normal: x union y
 851     // polarity = 2: x union ~y
 852     // polarity = 1: ~x union y
 853     // polarity = 3: ~x union ~y
 854 
 855     private UnicodeSet add(int[] other, int otherLen, int polarity) {
 856         ensureBufferCapacity(len + otherLen);
 857         int i = 0, j = 0, k = 0;
 858         int a = list[i++];
 859         int b = other[j++];
 860         // change from xor is that we have to check overlapping pairs
 861         // polarity bit 1 means a is second, bit 2 means b is.
 862         main:
 863             while (true) {
 864                 switch (polarity) {
 865                 case 0: // both first; take lower if unequal
 866                     if (a < b) { // take a
 867                         // Back up over overlapping ranges in buffer[]
 868                         if (k > 0 && a <= buffer[k-1]) {
 869                             // Pick latter end value in buffer[] vs. list[]
 870                             a = max(list[i], buffer[--k]);
 871                         } else {
 872                             // No overlap
 873                             buffer[k++] = a;
 874                             a = list[i];
 875                         }
 876                         i++; // Common if/else code factored out
 877                         polarity ^= 1;
 878                     } else if (b < a) { // take b
 879                         if (k > 0 && b <= buffer[k-1]) {
 880                             b = max(other[j], buffer[--k]);
 881                         } else {
 882                             buffer[k++] = b;
 883                             b = other[j];
 884                         }
 885                         j++;
 886                         polarity ^= 2;
 887                     } else { // a == b, take a, drop b
 888                         if (a == HIGH) break main;
 889                         // This is symmetrical; it doesn't matter if
 890                         // we backtrack with a or b. - liu
 891                         if (k > 0 && a <= buffer[k-1]) {
 892                             a = max(list[i], buffer[--k]);
 893                         } else {
 894                             // No overlap
 895                             buffer[k++] = a;
 896                             a = list[i];
 897                         }
 898                         i++;
 899                         polarity ^= 1;
 900                         b = other[j++]; polarity ^= 2;
 901                     }
 902                     break;
 903                 case 3: // both second; take higher if unequal, and drop other
 904                     if (b <= a) { // take a
 905                         if (a == HIGH) break main;
 906                         buffer[k++] = a;
 907                     } else { // take b
 908                         if (b == HIGH) break main;
 909                         buffer[k++] = b;
 910                     }
 911                     a = list[i++]; polarity ^= 1;   // factored common code
 912                     b = other[j++]; polarity ^= 2;
 913                     break;
 914                 case 1: // a second, b first; if b < a, overlap
 915                     if (a < b) { // no overlap, take a
 916                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
 917                     } else if (b < a) { // OVERLAP, drop b
 918                         b = other[j++]; polarity ^= 2;
 919                     } else { // a == b, drop both!
 920                         if (a == HIGH) break main;
 921                         a = list[i++]; polarity ^= 1;
 922                         b = other[j++]; polarity ^= 2;
 923                     }
 924                     break;
 925                 case 2: // a first, b second; if a < b, overlap
 926                     if (b < a) { // no overlap, take b
 927                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
 928                     } else  if (a < b) { // OVERLAP, drop a
 929                         a = list[i++]; polarity ^= 1;
 930                     } else { // a == b, drop both!
 931                         if (a == HIGH) break main;
 932                         a = list[i++]; polarity ^= 1;
 933                         b = other[j++]; polarity ^= 2;
 934                     }
 935                     break;
 936                 }
 937             }
 938         buffer[k++] = HIGH;    // terminate
 939         len = k;
 940         // swap list and buffer
 941         int[] temp = list;
 942         list = buffer;
 943         buffer = temp;
 944         return this;
 945     }
 946 
 947     // polarity = 0 is normal: x intersect y
 948     // polarity = 2: x intersect ~y == set-minus
 949     // polarity = 1: ~x intersect y
 950     // polarity = 3: ~x intersect ~y
 951 
 952     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
 953         ensureBufferCapacity(len + otherLen);
 954         int i = 0, j = 0, k = 0;
 955         int a = list[i++];
 956         int b = other[j++];
 957         // change from xor is that we have to check overlapping pairs
 958         // polarity bit 1 means a is second, bit 2 means b is.
 959         main:
 960             while (true) {
 961                 switch (polarity) {
 962                 case 0: // both first; drop the smaller
 963                     if (a < b) { // drop a
 964                         a = list[i++]; polarity ^= 1;
 965                     } else if (b < a) { // drop b
 966                         b = other[j++]; polarity ^= 2;
 967                     } else { // a == b, take one, drop other
 968                         if (a == HIGH) break main;
 969                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
 970                         b = other[j++]; polarity ^= 2;
 971                     }
 972                     break;
 973                 case 3: // both second; take lower if unequal
 974                     if (a < b) { // take a
 975                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
 976                     } else if (b < a) { // take b
 977                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
 978                     } else { // a == b, take one, drop other
 979                         if (a == HIGH) break main;
 980                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
 981                         b = other[j++]; polarity ^= 2;
 982                     }
 983                     break;
 984                 case 1: // a second, b first;
 985                     if (a < b) { // NO OVERLAP, drop a
 986                         a = list[i++]; polarity ^= 1;
 987                     } else if (b < a) { // OVERLAP, take b
 988                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
 989                     } else { // a == b, drop both!
 990                         if (a == HIGH) break main;
 991                         a = list[i++]; polarity ^= 1;
 992                         b = other[j++]; polarity ^= 2;
 993                     }
 994                     break;
 995                 case 2: // a first, b second; if a < b, overlap
 996                     if (b < a) { // no overlap, drop b
 997                         b = other[j++]; polarity ^= 2;
 998                     } else  if (a < b) { // OVERLAP, take a
 999                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
1000                     } else { // a == b, drop both!
1001                         if (a == HIGH) break main;
1002                         a = list[i++]; polarity ^= 1;
1003                         b = other[j++]; polarity ^= 2;
1004                     }
1005                     break;
1006                 }
1007             }
1008         buffer[k++] = HIGH;    // terminate
1009         len = k;
1010         // swap list and buffer
1011         int[] temp = list;
1012         list = buffer;
1013         buffer = temp;
1014         return this;
1015     }
1016 
1017     private static final int max(int a, int b) {
1018         return (a > b) ? a : b;
1019     }
1020 
1021     //----------------------------------------------------------------
1022     // Generic filter-based scanning code
1023     //----------------------------------------------------------------
1024 
1025     private static interface Filter {
1026         boolean contains(int codePoint);
1027     }
1028 
1029     private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
1030 
1031     private static class VersionFilter implements Filter {
1032         VersionInfo version;
1033         VersionFilter(VersionInfo version) { this.version = version; }
1034         public boolean contains(int ch) {
1035             VersionInfo v = UCharacter.getAge(ch);
1036             // Reference comparison ok; VersionInfo caches and reuses
1037             // unique objects.
1038             return v != NO_VERSION &&
1039                     v.compareTo(version) <= 0;
1040         }
1041     }
1042 
1043     private static synchronized UnicodeSet getInclusions(int src) {
1044         if (src != UCharacterProperty.SRC_PROPSVEC) {
1045             throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
1046         }
1047 
1048         if (INCLUSION == null) {
1049             UnicodeSet incl = new UnicodeSet();
1050             UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
1051             INCLUSION = incl;
1052         }
1053         return INCLUSION;
1054     }
1055 
1056     /**
1057      * Generic filter-based scanning code for UCD property UnicodeSets.
1058      */
1059     private UnicodeSet applyFilter(Filter filter, int src) {
1060         // Logically, walk through all Unicode characters, noting the start
1061         // and end of each range for which filter.contain(c) is
1062         // true.  Add each range to a set.
1063         //
1064         // To improve performance, use an inclusions set which
1065         // encodes information about character ranges that are known
1066         // to have identical properties.
1067         // getInclusions(src) contains exactly the first characters of
1068         // same-value ranges for the given properties "source".
1069 
1070         clear();
1071 
1072         int startHasProperty = -1;
1073         UnicodeSet inclusions = getInclusions(src);
1074         int limitRange = inclusions.getRangeCount();
1075 
1076         for (int j=0; j<limitRange; ++j) {
1077             // get current range
1078             int start = inclusions.getRangeStart(j);
1079             int end = inclusions.getRangeEnd(j);
1080 
1081             // for all the code points in the range, process
1082             for (int ch = start; ch <= end; ++ch) {
1083                 // only add to the unicodeset on inflection points --
1084                 // where the hasProperty value changes to false
1085                 if (filter.contains(ch)) {
1086                     if (startHasProperty < 0) {
1087                         startHasProperty = ch;
1088                     }
1089                 } else if (startHasProperty >= 0) {
1090                     add_unchecked(startHasProperty, ch-1);
1091                     startHasProperty = -1;
1092                 }
1093             }
1094         }
1095         if (startHasProperty >= 0) {
1096             add_unchecked(startHasProperty, 0x10FFFF);
1097         }
1098 
1099         return this;
1100     }
1101 
1102     /**
1103      * Is this frozen, according to the Freezable interface?
1104      *
1105      * @return value
1106      * @stable ICU 3.8
1107      */
1108     public boolean isFrozen() {
1109         return (bmpSet != null || stringSpan != null);
1110     }
1111 
1112     /**
1113      * Freeze this class, according to the Freezable interface.
1114      *
1115      * @return this
1116      * @stable ICU 4.4
1117      */
1118     public UnicodeSet freeze() {
1119         if (!isFrozen()) {
1120             // Do most of what compact() does before freezing because
1121             // compact() will not work when the set is frozen.
1122             // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
1123 
1124             // Delete buffer first to defragment memory less.
1125             buffer = null;
1126             if (list.length > (len + GROW_EXTRA)) {
1127                 // Make the capacity equal to len or 1.
1128                 // We don't want to realloc of 0 size.
1129                 int capacity = (len == 0) ? 1 : len;
1130                 int[] oldList = list;
1131                 list = new int[capacity];
1132                 for (int i = capacity; i-- > 0;) {
1133                     list[i] = oldList[i];
1134                 }
1135             }
1136 
1137             // Optimize contains() and span() and similar functions.
1138             if (!strings.isEmpty()) {
1139                 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
1140             }
1141             if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
1142                 // Optimize for code point spans.
1143                 // There are no strings, or
1144                 // all strings are irrelevant for span() etc. because
1145                 // all of each string's code points are contained in this set.
1146                 // However, fully contained strings are relevant for spanAndCount(),
1147                 // so we create both objects.
1148                 bmpSet = new BMPSet(list, len);
1149             }
1150         }
1151         return this;
1152     }
1153 
1154     /**
1155      * Span a string using this UnicodeSet.
1156      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1157      * @param s The string to be spanned
1158      * @param spanCondition The span condition
1159      * @return the length of the span
1160      * @stable ICU 4.4
1161      */
1162     public int span(CharSequence s, SpanCondition spanCondition) {
1163         return span(s, 0, spanCondition);
1164     }
1165 
1166     /**
1167      * Span a string using this UnicodeSet.
1168      *   If the start index is less than 0, span will start from 0.
1169      *   If the start index is greater than the string length, span returns the string length.
1170      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1171      * @param s The string to be spanned
1172      * @param start The start index that the span begins
1173      * @param spanCondition The span condition
1174      * @return the string index which ends the span (i.e. exclusive)
1175      * @stable ICU 4.4
1176      */
1177     public int span(CharSequence s, int start, SpanCondition spanCondition) {
1178         int end = s.length();
1179         if (start < 0) {
1180             start = 0;
1181         } else if (start >= end) {
1182             return end;
1183         }
1184         if (bmpSet != null) {
1185             // Frozen set without strings, or no string is relevant for span().
1186             return bmpSet.span(s, start, spanCondition, null);
1187         }
1188         if (stringSpan != null) {
1189             return stringSpan.span(s, start, spanCondition);
1190         } else if (!strings.isEmpty()) {
1191             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
1192                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
1193             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1194             if (strSpan.needsStringSpanUTF16()) {
1195                 return strSpan.span(s, start, spanCondition);
1196             }
1197         }
1198 
1199         return spanCodePointsAndCount(s, start, spanCondition, null);
1200     }
1201 
1202     /**
1203      * Same as span() but also counts the smallest number of set elements on any path across the span.
1204      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1205      * @param outCount An output-only object (must not be null) for returning the count.
1206      * @return the limit (exclusive end) of the span
1207      */
1208     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
1209         if (outCount == null) {
1210             throw new IllegalArgumentException("outCount must not be null");
1211         }
1212         int end = s.length();
1213         if (start < 0) {
1214             start = 0;
1215         } else if (start >= end) {
1216             return end;
1217         }
1218         if (stringSpan != null) {
1219             // We might also have bmpSet != null,
1220             // but fully-contained strings are relevant for counting elements.
1221             return stringSpan.spanAndCount(s, start, spanCondition, outCount);
1222         } else if (bmpSet != null) {
1223             return bmpSet.span(s, start, spanCondition, outCount);
1224         } else if (!strings.isEmpty()) {
1225             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
1226                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
1227             which |= UnicodeSetStringSpan.WITH_COUNT;
1228             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1229             return strSpan.spanAndCount(s, start, spanCondition, outCount);
1230         }
1231 
1232         return spanCodePointsAndCount(s, start, spanCondition, outCount);
1233     }
1234 
1235     private int spanCodePointsAndCount(CharSequence s, int start,
1236             SpanCondition spanCondition, OutputInt outCount) {
1237         // Pin to 0/1 values.
1238         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
1239 
1240         int c;
1241         int next = start;
1242         int length = s.length();
1243         int count = 0;
1244         do {
1245             c = Character.codePointAt(s, next);
1246             if (spanContained != contains(c)) {
1247                 break;
1248             }
1249             ++count;
1250             next += Character.charCount(c);
1251         } while (next < length);
1252         if (outCount != null) { outCount.value = count; }
1253         return next;
1254     }
1255 
1256     /**
1257      * Span a string backwards (from the fromIndex) using this UnicodeSet.
1258      * If the fromIndex is less than 0, spanBack will return 0.
1259      * If fromIndex is greater than the string length, spanBack will start from the string length.
1260      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1261      * @param s The string to be spanned
1262      * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
1263      * @param spanCondition The span condition
1264      * @return The string index which starts the span (i.e. inclusive).
1265      * @stable ICU 4.4
1266      */
1267     public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
1268         if (fromIndex <= 0) {
1269             return 0;
1270         }
1271         if (fromIndex > s.length()) {
1272             fromIndex = s.length();
1273         }
1274         if (bmpSet != null) {
1275             // Frozen set without strings, or no string is relevant for spanBack().
1276             return bmpSet.spanBack(s, fromIndex, spanCondition);
1277         }
1278         if (stringSpan != null) {
1279             return stringSpan.spanBack(s, fromIndex, spanCondition);
1280         } else if (!strings.isEmpty()) {
1281             int which = (spanCondition == SpanCondition.NOT_CONTAINED)
1282                     ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
1283                             : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
1284             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1285             if (strSpan.needsStringSpanUTF16()) {
1286                 return strSpan.spanBack(s, fromIndex, spanCondition);
1287             }
1288         }
1289 
1290         // Pin to 0/1 values.
1291         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
1292 
1293         int c;
1294         int prev = fromIndex;
1295         do {
1296             c = Character.codePointBefore(s, prev);
1297             if (spanContained != contains(c)) {
1298                 break;
1299             }
1300             prev -= Character.charCount(c);
1301         } while (prev > 0);
1302         return prev;
1303     }
1304 
1305     /**
1306      * Clone a thawed version of this class, according to the Freezable interface.
1307      * @return the clone, not frozen
1308      * @stable ICU 4.4
1309      */
1310     public UnicodeSet cloneAsThawed() {
1311         UnicodeSet result = new UnicodeSet(this);
1312         assert !result.isFrozen();
1313         return result;
1314     }
1315 
1316     // internal function
1317     private void checkFrozen() {
1318         if (isFrozen()) {
1319             throw new UnsupportedOperationException("Attempt to modify frozen object");
1320         }
1321     }
1322 
1323     /**
1324      * Argument values for whether span() and similar functions continue while the current character is contained vs.
1325      * not contained in the set.
1326      * <p>
1327      * The functionality is straightforward for sets with only single code points, without strings (which is the common
1328      * case):
1329      * <ul>
1330      * <li>CONTAINED and SIMPLE work the same.
1331      * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
1332      * <li>span() and spanBack() partition any string the
1333      * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
1334      * <li>Using a
1335      * complemented (inverted) set and the opposite span conditions yields the same results.
1336      * </ul>
1337      * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
1338      * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
1339      * strings:
1340      * <ul>
1341      * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
1342      * Therefore, complementing both the set and the span conditions may yield different results.
1343      * <li>When starting spans
1344      * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
1345      * because a set string may start before the later position.
1346      * <li>span(SIMPLE) may be shorter than
1347      * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
1348      * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
1349      * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
1350      * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
1351      * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
1352      * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
1353      * </ul>
1354      * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
1355      * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
1356      * be used.
1357      * <p>
1358      * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
1359      * boundaries, never in the middle of a surrogate pair.
1360      *
1361      * @stable ICU 4.4
1362      */
1363     public enum SpanCondition {
1364         /**
1365          * Continues a span() while there is no set element at the current position.
1366          * Increments by one code point at a time.
1367          * Stops before the first set element (character or string).
1368          * (For code points only, this is like while contains(current)==false).
1369          * <p>
1370          * When span() returns, the substring between where it started and the position it returned consists only of
1371          * characters that are not in the set, and none of its strings overlap with the span.
1372          *
1373          * @stable ICU 4.4
1374          */
1375         NOT_CONTAINED,
1376 
1377         /**
1378          * Spans the longest substring that is a concatenation of set elements (characters or strings).
1379          * (For characters only, this is like while contains(current)==true).
1380          * <p>
1381          * When span() returns, the substring between where it started and the position it returned consists only of set
1382          * elements (characters or strings) that are in the set.
1383          * <p>
1384          * If a set contains strings, then the span will be the longest substring for which there
1385          * exists at least one non-overlapping concatenation of set elements (characters or strings).
1386          * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
1387          * (Java/ICU/Perl regex stops at the first match of an OR.)
1388          *
1389          * @stable ICU 4.4
1390          */
1391         CONTAINED,
1392 
1393         /**
1394          * Continues a span() while there is a set element at the current position.
1395          * Increments by the longest matching element at each position.
1396          * (For characters only, this is like while contains(current)==true).
1397          * <p>
1398          * When span() returns, the substring between where it started and the position it returned consists only of set
1399          * elements (characters or strings) that are in the set.
1400          * <p>
1401          * If a set only contains single characters, then this is the same as CONTAINED.
1402          * <p>
1403          * If a set contains strings, then the span will be the longest substring with a match at each position with the
1404          * longest single set element (character or string).
1405          * <p>
1406          * Use this span condition together with other longest-match algorithms, such as ICU converters
1407          * (ucnv_getUnicodeSet()).
1408          *
1409          * @stable ICU 4.4
1410          */
1411         SIMPLE,
1412     }
1413 
1414 }