1 /*
   2  * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.text.ParsePosition;
  40 import java.util.Iterator;
  41 import java.util.TreeSet;
  42 
  43 /**
  44  * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
  45  * represent <em>character classes</em> used in regular expressions.
  46  * A character specifies a subset of Unicode code points.  Legal
  47  * code points are U+0000 to U+10FFFF, inclusive.
  48  *
  49  * <p>The UnicodeSet class is not designed to be subclassed.
  50  *
  51  * <p><code>UnicodeSet</code> supports two APIs. The first is the
  52  * <em>operand</em> API that allows the caller to modify the value of
  53  * a <code>UnicodeSet</code> object. It conforms to Java 2's
  54  * <code>java.util.Set</code> interface, although
  55  * <code>UnicodeSet</code> does not actually implement that
  56  * interface. All methods of <code>Set</code> are supported, with the
  57  * modification that they take a character range or single character
  58  * instead of an <code>Object</code>, and they take a
  59  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
  60  * operand API may be thought of in terms of boolean logic: a boolean
  61  * OR is implemented by <code>add</code>, a boolean AND is implemented
  62  * by <code>retain</code>, a boolean XOR is implemented by
  63  * <code>complement</code> taking an argument, and a boolean NOT is
  64  * implemented by <code>complement</code> with no argument.  In terms
  65  * of traditional set theory function names, <code>add</code> is a
  66  * union, <code>retain</code> is an intersection, <code>remove</code>
  67  * is an asymmetric difference, and <code>complement</code> with no
  68  * argument is a set complement with respect to the superset range
  69  * <code>MIN_VALUE-MAX_VALUE</code>
  70  *
  71  * <p>The second API is the
  72  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
  73  * <code>java.text.Format</code>-derived classes.  Unlike the
  74  * methods that add characters, add categories, and control the logic
  75  * of the set, the method <code>applyPattern()</code> sets all
  76  * attributes of a <code>UnicodeSet</code> at once, based on a
  77  * string pattern.
  78  *
  79  * <p><b>Pattern syntax</b></p>
  80  *
  81  * Patterns are accepted by the constructors and the
  82  * <code>applyPattern()</code> methods and returned by the
  83  * <code>toPattern()</code> method.  These patterns follow a syntax
  84  * similar to that employed by version 8 regular expression character
  85  * classes.  Here are some simple examples:
  86  *
  87  * <blockquote>
  88  *   <table>
  89  *     <tr align="top">
  90  *       <td nowrap valign="top" align="left"><code>[]</code></td>
  91  *       <td valign="top">No characters</td>
  92  *     </tr><tr align="top">
  93  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
  94  *       <td valign="top">The character 'a'</td>
  95  *     </tr><tr align="top">
  96  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
  97  *       <td valign="top">The characters 'a' and 'e'</td>
  98  *     </tr>
  99  *     <tr>
 100  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
 101  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
 102  *       point order</td>
 103  *     </tr>
 104  *     <tr>
 105  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
 106  *       <td valign="top">The character U+4E01</td>
 107  *     </tr>
 108  *     <tr>
 109  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
 110  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
 111  *       &quot;ac&quot;</td>
 112  *     </tr>
 113  *     <tr>
 114  *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
 115  *       <td valign="top">All characters in the general category Uppercase Letter</td>
 116  *     </tr>
 117  *   </table>
 118  * </blockquote>
 119  *
 120  * Any character may be preceded by a backslash in order to remove any special
 121  * meaning.  White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
 122  * ignored, unless they are escaped.
 123  *
 124  * <p>Property patterns specify a set of characters having a certain
 125  * property as defined by the Unicode standard.  Both the POSIX-like
 126  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
 127  * complete list of supported property patterns, see the User's Guide
 128  * for UnicodeSet at
 129  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
 130  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
 131  * Actual determination of property data is defined by the underlying
 132  * Unicode database as implemented by UCharacter.
 133  *
 134  * <p>Patterns specify individual characters, ranges of characters, and
 135  * Unicode property sets.  When elements are concatenated, they
 136  * specify their union.  To complement a set, place a '^' immediately
 137  * after the opening '['.  Property patterns are inverted by modifying
 138  * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
 139  * '^' has no special meaning.
 140  *
 141  * <p>Ranges are indicated by placing two a '-' between two
 142  * characters, as in "a-z".  This specifies the range of all
 143  * characters from the left to the right, in Unicode order.  If the
 144  * left character is greater than or equal to the
 145  * right character it is a syntax error.  If a '-' occurs as the first
 146  * character after the opening '[' or '[^', or if it occurs as the
 147  * last character before the closing ']', then it is taken as a
 148  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 149  * set of three characters, 'a', 'b', and '-'.
 150  *
 151  * <p>Sets may be intersected using the '&' operator or the asymmetric
 152  * set difference may be taken using the '-' operator, for example,
 153  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 154  * with values less than 4096.  Operators ('&' and '|') have equal
 155  * precedence and bind left-to-right.  Thus
 156  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 157  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 158  * difference; intersection is commutative.
 159  *
 160  * <table>
 161  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
 162  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
 163  * through 'z' and all letters in between, in Unicode order
 164  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 165  * all characters but 'a' through 'z',
 166  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 167  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 168  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 169  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
 170  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 171  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 172  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 173  * <em>pat2</em>
 174  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
 175  * <td>The set of characters having the specified
 176  * Unicode property; in
 177  * this case, Unicode uppercase letters
 178  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
 179  * <td>The set of characters <em>not</em> having the given
 180  * Unicode property
 181  * </table>
 182  *
 183  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
 184  *
 185  * <p><b>Formal syntax</b></p>
 186  *
 187  * <blockquote>
 188  *   <table>
 189  *     <tr align="top">
 190  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
 191  *       <td valign="top"><code>('[' '^'? item* ']') |
 192  *       property</code></td>
 193  *     </tr>
 194  *     <tr align="top">
 195  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
 196  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
 197  *       </code></td>
 198  *     </tr>
 199  *     <tr align="top">
 200  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
 201  *       <td valign="top"><code>pattern | pattern-expr pattern |
 202  *       pattern-expr op pattern<br>
 203  *       </code></td>
 204  *     </tr>
 205  *     <tr align="top">
 206  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
 207  *       <td valign="top"><code>'&amp;' | '-'<br>
 208  *       </code></td>
 209  *     </tr>
 210  *     <tr align="top">
 211  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
 212  *       <td valign="top"><code>'[' | ']' | '-'<br>
 213  *       </code></td>
 214  *     </tr>
 215  *     <tr align="top">
 216  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
 217  *       <td valign="top"><em>any character that is not</em><code> special<br>
 218  *       | ('\\' </code><em>any character</em><code>)<br>
 219  *       | ('\u' hex hex hex hex)<br>
 220  *       </code></td>
 221  *     </tr>
 222  *     <tr align="top">
 223  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
 224  *       <td valign="top"><em>any character for which
 225  *       </em><code>Character.digit(c, 16)</code><em>
 226  *       returns a non-negative result</em></td>
 227  *     </tr>
 228  *     <tr>
 229  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
 230  *       <td valign="top"><em>a Unicode property set pattern</td>
 231  *     </tr>
 232  *   </table>
 233  *   <br>
 234  *   <table border="1">
 235  *     <tr>
 236  *       <td>Legend: <table>
 237  *         <tr>
 238  *           <td nowrap valign="top"><code>a := b</code></td>
 239  *           <td width="20" valign="top">&nbsp; </td>
 240  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
 241  *         </tr>
 242  *         <tr>
 243  *           <td nowrap valign="top"><code>a?</code></td>
 244  *           <td valign="top"></td>
 245  *           <td valign="top">zero or one instance of <code>a</code><br>
 246  *           </td>
 247  *         </tr>
 248  *         <tr>
 249  *           <td nowrap valign="top"><code>a*</code></td>
 250  *           <td valign="top"></td>
 251  *           <td valign="top">one or more instances of <code>a</code><br>
 252  *           </td>
 253  *         </tr>
 254  *         <tr>
 255  *           <td nowrap valign="top"><code>a | b</code></td>
 256  *           <td valign="top"></td>
 257  *           <td valign="top">either <code>a</code> or <code>b</code><br>
 258  *           </td>
 259  *         </tr>
 260  *         <tr>
 261  *           <td nowrap valign="top"><code>'a'</code></td>
 262  *           <td valign="top"></td>
 263  *           <td valign="top">the literal string between the quotes </td>
 264  *         </tr>
 265  *       </table>
 266  *       </td>
 267  *     </tr>
 268  *   </table>
 269  * </blockquote>
 270  * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
 271  *
 272  * @author Alan Liu
 273  * @stable ICU 2.0
 274  * @see UnicodeSetIterator
 275  */
 276 public class UnicodeSet implements UnicodeMatcher {
 277 
 278     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
 279     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
 280                                              // 110000 for codepoints
 281 
 282     /**
 283      * Minimum value that can be stored in a UnicodeSet.
 284      * @stable ICU 2.0
 285      */
 286     public static final int MIN_VALUE = LOW;
 287 
 288     /**
 289      * Maximum value that can be stored in a UnicodeSet.
 290      * @stable ICU 2.0
 291      */
 292     public static final int MAX_VALUE = HIGH - 1;
 293 
 294     private int len;      // length used; list may be longer to minimize reallocs
 295     private int[] list;   // MUST be terminated with HIGH
 296     private int[] rangeList; // internal buffer
 297     private int[] buffer; // internal buffer
 298 
 299     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
 300     // is not private so that UnicodeSetIterator can get access
 301     TreeSet<String> strings = new TreeSet<>();
 302 
 303     /**
 304      * The pattern representation of this set.  This may not be the
 305      * most economical pattern.  It is the pattern supplied to
 306      * applyPattern(), with variables substituted and whitespace
 307      * removed.  For sets constructed without applyPattern(), or
 308      * modified using the non-pattern API, this string will be null,
 309      * indicating that toPattern() must generate a pattern
 310      * representation from the inversion list.
 311      */
 312     private String pat = null;
 313 
 314     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
 315     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
 316 
 317     /**
 318      * A set of all characters _except_ the second through last characters of
 319      * certain ranges.  These ranges are ranges of characters whose
 320      * properties are all exactly alike, e.g. CJK Ideographs from
 321      * U+4E00 to U+9FA5.
 322      */
 323     private static UnicodeSet INCLUSIONS[] = null;
 324 
 325     //----------------------------------------------------------------
 326     // Public API
 327     //----------------------------------------------------------------
 328 
 329     /**
 330      * Constructs an empty set.
 331      * @stable ICU 2.0
 332      */
 333     public UnicodeSet() {
 334         list = new int[1 + START_EXTRA];
 335         list[len++] = HIGH;
 336     }
 337 
 338     /**
 339      * Constructs a set containing the given range. If <code>end >
 340      * start</code> then an empty set is created.
 341      *
 342      * @param start first character, inclusive, of range
 343      * @param end last character, inclusive, of range
 344      * @stable ICU 2.0
 345      */
 346     public UnicodeSet(int start, int end) {
 347         this();
 348         complement(start, end);
 349     }
 350 
 351     /**
 352      * Constructs a set from the given pattern.  See the class description
 353      * for the syntax of the pattern language.  Whitespace is ignored.
 354      * @param pattern a string specifying what characters are in the set
 355      * @exception java.lang.IllegalArgumentException if the pattern contains
 356      * a syntax error.
 357      * @stable ICU 2.0
 358      */
 359     public UnicodeSet(String pattern) {
 360         this();
 361         applyPattern(pattern, null, null, IGNORE_SPACE);
 362     }
 363 
 364     /**
 365      * Make this object represent the same set as <code>other</code>.
 366      * @param other a <code>UnicodeSet</code> whose value will be
 367      * copied to this object
 368      * @stable ICU 2.0
 369      */
 370     @SuppressWarnings("unchecked") // Casting result of clone of a collection
 371     public UnicodeSet set(UnicodeSet other) {
 372         list = other.list.clone();
 373         len = other.len;
 374         pat = other.pat;
 375         strings = (TreeSet)other.strings.clone();
 376         return this;
 377     }
 378 
 379     /**
 380      * Modifies this set to represent the set specified by the given pattern.
 381      * See the class description for the syntax of the pattern language.
 382      * Whitespace is ignored.
 383      * @param pattern a string specifying what characters are in the set
 384      * @exception java.lang.IllegalArgumentException if the pattern
 385      * contains a syntax error.
 386      * @stable ICU 2.0
 387      */
 388     public final UnicodeSet applyPattern(String pattern) {
 389         return applyPattern(pattern, null, null, IGNORE_SPACE);
 390     }
 391 
 392     /**
 393      * Append the <code>toPattern()</code> representation of a
 394      * string to the given <code>StringBuffer</code>.
 395      */
 396     private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
 397         for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
 398             _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
 399         }
 400     }
 401 
 402     /**
 403      * Append the <code>toPattern()</code> representation of a
 404      * character to the given <code>StringBuffer</code>.
 405      */
 406     private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
 407         if (escapeUnprintable && Utility.isUnprintable(c)) {
 408             // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
 409             // unprintable
 410             if (Utility.escapeUnprintable(buf, c)) {
 411                 return;
 412             }
 413         }
 414         // Okay to let ':' pass through
 415         switch (c) {
 416         case '[': // SET_OPEN:
 417         case ']': // SET_CLOSE:
 418         case '-': // HYPHEN:
 419         case '^': // COMPLEMENT:
 420         case '&': // INTERSECTION:
 421         case '\\': //BACKSLASH:
 422         case '{':
 423         case '}':
 424         case '$':
 425         case ':':
 426             buf.append('\\');
 427             break;
 428         default:
 429             // Escape whitespace
 430             if (UCharacterProperty.isRuleWhiteSpace(c)) {
 431                 buf.append('\\');
 432             }
 433             break;
 434         }
 435         UTF16.append(buf, c);
 436     }
 437 
 438     /**
 439      * Append a string representation of this set to result.  This will be
 440      * a cleaned version of the string passed to applyPattern(), if there
 441      * is one.  Otherwise it will be generated.
 442      */
 443     private StringBuffer _toPattern(StringBuffer result,
 444                                     boolean escapeUnprintable) {
 445         if (pat != null) {
 446             int i;
 447             int backslashCount = 0;
 448             for (i=0; i<pat.length(); ) {
 449                 int c = UTF16.charAt(pat, i);
 450                 i += UTF16.getCharCount(c);
 451                 if (escapeUnprintable && Utility.isUnprintable(c)) {
 452                     // If the unprintable character is preceded by an odd
 453                     // number of backslashes, then it has been escaped.
 454                     // Before unescaping it, we delete the final
 455                     // backslash.
 456                     if ((backslashCount % 2) == 1) {
 457                         result.setLength(result.length() - 1);
 458                     }
 459                     Utility.escapeUnprintable(result, c);
 460                     backslashCount = 0;
 461                 } else {
 462                     UTF16.append(result, c);
 463                     if (c == '\\') {
 464                         ++backslashCount;
 465                     } else {
 466                         backslashCount = 0;
 467                     }
 468                 }
 469             }
 470             return result;
 471         }
 472 
 473         return _generatePattern(result, escapeUnprintable, true);
 474     }
 475 
 476     /**
 477      * Generate and append a string representation of this set to result.
 478      * This does not use this.pat, the cleaned up copy of the string
 479      * passed to applyPattern().
 480      * @param includeStrings if false, doesn't include the strings.
 481      * @stable ICU 3.8
 482      */
 483     public StringBuffer _generatePattern(StringBuffer result,
 484                                          boolean escapeUnprintable, boolean includeStrings) {
 485         result.append('[');
 486 
 487         int count = getRangeCount();
 488 
 489         // If the set contains at least 2 intervals and includes both
 490         // MIN_VALUE and MAX_VALUE, then the inverse representation will
 491         // be more economical.
 492         if (count > 1 &&
 493             getRangeStart(0) == MIN_VALUE &&
 494             getRangeEnd(count-1) == MAX_VALUE) {
 495 
 496             // Emit the inverse
 497             result.append('^');
 498 
 499             for (int i = 1; i < count; ++i) {
 500                 int start = getRangeEnd(i-1)+1;
 501                 int end = getRangeStart(i)-1;
 502                 _appendToPat(result, start, escapeUnprintable);
 503                 if (start != end) {
 504                     if ((start+1) != end) {
 505                         result.append('-');
 506                     }
 507                     _appendToPat(result, end, escapeUnprintable);
 508                 }
 509             }
 510         }
 511 
 512         // Default; emit the ranges as pairs
 513         else {
 514             for (int i = 0; i < count; ++i) {
 515                 int start = getRangeStart(i);
 516                 int end = getRangeEnd(i);
 517                 _appendToPat(result, start, escapeUnprintable);
 518                 if (start != end) {
 519                     if ((start+1) != end) {
 520                         result.append('-');
 521                     }
 522                     _appendToPat(result, end, escapeUnprintable);
 523                 }
 524             }
 525         }
 526 
 527         if (includeStrings && strings.size() > 0) {
 528             Iterator<String> it = strings.iterator();
 529             while (it.hasNext()) {
 530                 result.append('{');
 531                 _appendToPat(result, it.next(), escapeUnprintable);
 532                 result.append('}');
 533             }
 534         }
 535         return result.append(']');
 536     }
 537 
 538     // for internal use, after checkFrozen has been called
 539     private UnicodeSet add_unchecked(int start, int end) {
 540         if (start < MIN_VALUE || start > MAX_VALUE) {
 541             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 542         }
 543         if (end < MIN_VALUE || end > MAX_VALUE) {
 544             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 545         }
 546         if (start < end) {
 547             add(range(start, end), 2, 0);
 548         } else if (start == end) {
 549             add(start);
 550         }
 551         return this;
 552     }
 553 
 554     /**
 555      * Adds the specified character to this set if it is not already
 556      * present.  If this set already contains the specified character,
 557      * the call leaves this set unchanged.
 558      * @stable ICU 2.0
 559      */
 560     public final UnicodeSet add(int c) {
 561         return add_unchecked(c);
 562     }
 563 
 564     // for internal use only, after checkFrozen has been called
 565     private final UnicodeSet add_unchecked(int c) {
 566         if (c < MIN_VALUE || c > MAX_VALUE) {
 567             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
 568         }
 569 
 570         // find smallest i such that c < list[i]
 571         // if odd, then it is IN the set
 572         // if even, then it is OUT of the set
 573         int i = findCodePoint(c);
 574 
 575         // already in set?
 576         if ((i & 1) != 0) return this;
 577 
 578         // HIGH is 0x110000
 579         // assert(list[len-1] == HIGH);
 580 
 581         // empty = [HIGH]
 582         // [start_0, limit_0, start_1, limit_1, HIGH]
 583 
 584         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 585         //                             ^
 586         //                             list[i]
 587 
 588         // i == 0 means c is before the first range
 589 
 590         if (c == list[i]-1) {
 591             // c is before start of next range
 592             list[i] = c;
 593             // if we touched the HIGH mark, then add a new one
 594             if (c == MAX_VALUE) {
 595                 ensureCapacity(len+1);
 596                 list[len++] = HIGH;
 597             }
 598             if (i > 0 && c == list[i-1]) {
 599                 // collapse adjacent ranges
 600 
 601                 // [..., start_k-1, c, c, limit_k, ..., HIGH]
 602                 //                     ^
 603                 //                     list[i]
 604                 System.arraycopy(list, i+1, list, i-1, len-i-1);
 605                 len -= 2;
 606             }
 607         }
 608 
 609         else if (i > 0 && c == list[i-1]) {
 610             // c is after end of prior range
 611             list[i-1]++;
 612             // no need to chcek for collapse here
 613         }
 614 
 615         else {
 616             // At this point we know the new char is not adjacent to
 617             // any existing ranges, and it is not 10FFFF.
 618 
 619 
 620             // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
 621             //                             ^
 622             //                             list[i]
 623 
 624             // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
 625             //                             ^
 626             //                             list[i]
 627 
 628             // Don't use ensureCapacity() to save on copying.
 629             // NOTE: This has no measurable impact on performance,
 630             // but it might help in some usage patterns.
 631             if (len+2 > list.length) {
 632                 int[] temp = new int[len + 2 + GROW_EXTRA];
 633                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
 634                 System.arraycopy(list, i, temp, i+2, len-i);
 635                 list = temp;
 636             } else {
 637                 System.arraycopy(list, i, list, i+2, len-i);
 638             }
 639 
 640             list[i] = c;
 641             list[i+1] = c+1;
 642             len += 2;
 643         }
 644 
 645         pat = null;
 646         return this;
 647     }
 648 
 649     /**
 650      * Adds the specified multicharacter to this set if it is not already
 651      * present.  If this set already contains the multicharacter,
 652      * the call leaves this set unchanged.
 653      * Thus "ch" => {"ch"}
 654      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 655      * @param s the source string
 656      * @return this object, for chaining
 657      * @stable ICU 2.0
 658      */
 659     public final UnicodeSet add(String s) {
 660         int cp = getSingleCP(s);
 661         if (cp < 0) {
 662             strings.add(s);
 663             pat = null;
 664         } else {
 665             add_unchecked(cp, cp);
 666         }
 667         return this;
 668     }
 669 
 670     /**
 671      * @return a code point IF the string consists of a single one.
 672      * otherwise returns -1.
 673      * @param string to test
 674      */
 675     private static int getSingleCP(String s) {
 676         if (s.length() < 1) {
 677             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 678         }
 679         if (s.length() > 2) return -1;
 680         if (s.length() == 1) return s.charAt(0);
 681 
 682         // at this point, len = 2
 683         int cp = UTF16.charAt(s, 0);
 684         if (cp > 0xFFFF) { // is surrogate pair
 685             return cp;
 686         }
 687         return -1;
 688     }
 689 
 690     /**
 691      * Complements the specified range in this set.  Any character in
 692      * the range will be removed if it is in this set, or will be
 693      * added if it is not in this set.  If <code>end > start</code>
 694      * then an empty range is complemented, leaving the set unchanged.
 695      *
 696      * @param start first character, inclusive, of range to be removed
 697      * from this set.
 698      * @param end last character, inclusive, of range to be removed
 699      * from this set.
 700      * @stable ICU 2.0
 701      */
 702     public UnicodeSet complement(int start, int end) {
 703         if (start < MIN_VALUE || start > MAX_VALUE) {
 704             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 705         }
 706         if (end < MIN_VALUE || end > MAX_VALUE) {
 707             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 708         }
 709         if (start <= end) {
 710             xor(range(start, end), 2, 0);
 711         }
 712         pat = null;
 713         return this;
 714     }
 715 
 716     /**
 717      * This is equivalent to
 718      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
 719      * @stable ICU 2.0
 720      */
 721     public UnicodeSet complement() {
 722         if (list[0] == LOW) {
 723             System.arraycopy(list, 1, list, 0, len-1);
 724             --len;
 725         } else {
 726             ensureCapacity(len+1);
 727             System.arraycopy(list, 0, list, 1, len);
 728             list[0] = LOW;
 729             ++len;
 730         }
 731         pat = null;
 732         return this;
 733     }
 734 
 735     /**
 736      * Returns true if this set contains the given character.
 737      * @param c character to be checked for containment
 738      * @return true if the test condition is met
 739      * @stable ICU 2.0
 740      */
 741     public boolean contains(int c) {
 742         if (c < MIN_VALUE || c > MAX_VALUE) {
 743             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
 744         }
 745 
 746         /*
 747         // Set i to the index of the start item greater than ch
 748         // We know we will terminate without length test!
 749         int i = -1;
 750         while (true) {
 751             if (c < list[++i]) break;
 752         }
 753         */
 754 
 755         int i = findCodePoint(c);
 756 
 757         return ((i & 1) != 0); // return true if odd
 758     }
 759 
 760     /**
 761      * Returns the smallest value i such that c < list[i].  Caller
 762      * must ensure that c is a legal value or this method will enter
 763      * an infinite loop.  This method performs a binary search.
 764      * @param c a character in the range MIN_VALUE..MAX_VALUE
 765      * inclusive
 766      * @return the smallest integer i in the range 0..len-1,
 767      * inclusive, such that c < list[i]
 768      */
 769     private final int findCodePoint(int c) {
 770         /* Examples:
 771                                            findCodePoint(c)
 772            set              list[]         c=0 1 3 4 7 8
 773            ===              ==============   ===========
 774            []               [110000]         0 0 0 0 0 0
 775            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 776            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 777            [:all:]          [0, 110000]      1 1 1 1 1 1
 778          */
 779 
 780         // Return the smallest i such that c < list[i].  Assume
 781         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 782         if (c < list[0]) return 0;
 783         // High runner test.  c is often after the last range, so an
 784         // initial check for this condition pays off.
 785         if (len >= 2 && c >= list[len-2]) return len-1;
 786         int lo = 0;
 787         int hi = len - 1;
 788         // invariant: c >= list[lo]
 789         // invariant: c < list[hi]
 790         for (;;) {
 791             int i = (lo + hi) >>> 1;
 792             if (i == lo) return hi;
 793             if (c < list[i]) {
 794                 hi = i;
 795             } else {
 796                 lo = i;
 797             }
 798         }
 799     }
 800 
 801     /**
 802      * Adds all of the elements in the specified set to this set if
 803      * they're not already present.  This operation effectively
 804      * modifies this set so that its value is the <i>union</i> of the two
 805      * sets.  The behavior of this operation is unspecified if the specified
 806      * collection is modified while the operation is in progress.
 807      *
 808      * @param c set whose elements are to be added to this set.
 809      * @stable ICU 2.0
 810      */
 811     public UnicodeSet addAll(UnicodeSet c) {
 812         add(c.list, c.len, 0);
 813         strings.addAll(c.strings);
 814         return this;
 815     }
 816 
 817     /**
 818      * Retains only the elements in this set that are contained in the
 819      * specified set.  In other words, removes from this set all of
 820      * its elements that are not contained in the specified set.  This
 821      * operation effectively modifies this set so that its value is
 822      * the <i>intersection</i> of the two sets.
 823      *
 824      * @param c set that defines which elements this set will retain.
 825      * @stable ICU 2.0
 826      */
 827     public UnicodeSet retainAll(UnicodeSet c) {
 828         retain(c.list, c.len, 0);
 829         strings.retainAll(c.strings);
 830         return this;
 831     }
 832 
 833     /**
 834      * Removes from this set all of its elements that are contained in the
 835      * specified set.  This operation effectively modifies this
 836      * set so that its value is the <i>asymmetric set difference</i> of
 837      * the two sets.
 838      *
 839      * @param c set that defines which elements will be removed from
 840      *          this set.
 841      * @stable ICU 2.0
 842      */
 843     public UnicodeSet removeAll(UnicodeSet c) {
 844         retain(c.list, c.len, 2);
 845         strings.removeAll(c.strings);
 846         return this;
 847     }
 848 
 849     /**
 850      * Removes all of the elements from this set.  This set will be
 851      * empty after this call returns.
 852      * @stable ICU 2.0
 853      */
 854     public UnicodeSet clear() {
 855         list[0] = HIGH;
 856         len = 1;
 857         pat = null;
 858         strings.clear();
 859         return this;
 860     }
 861 
 862     /**
 863      * Iteration method that returns the number of ranges contained in
 864      * this set.
 865      * @see #getRangeStart
 866      * @see #getRangeEnd
 867      * @stable ICU 2.0
 868      */
 869     public int getRangeCount() {
 870         return len/2;
 871     }
 872 
 873     /**
 874      * Iteration method that returns the first character in the
 875      * specified range of this set.
 876      * @exception ArrayIndexOutOfBoundsException if index is outside
 877      * the range <code>0..getRangeCount()-1</code>
 878      * @see #getRangeCount
 879      * @see #getRangeEnd
 880      * @stable ICU 2.0
 881      */
 882     public int getRangeStart(int index) {
 883         return list[index*2];
 884     }
 885 
 886     /**
 887      * Iteration method that returns the last character in the
 888      * specified range of this set.
 889      * @exception ArrayIndexOutOfBoundsException if index is outside
 890      * the range <code>0..getRangeCount()-1</code>
 891      * @see #getRangeStart
 892      * @see #getRangeEnd
 893      * @stable ICU 2.0
 894      */
 895     public int getRangeEnd(int index) {
 896         return (list[index*2 + 1] - 1);
 897     }
 898 
 899     //----------------------------------------------------------------
 900     // Implementation: Pattern parsing
 901     //----------------------------------------------------------------
 902 
 903     /**
 904      * Parses the given pattern, starting at the given position.  The character
 905      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
 906      * Parsing continues until the corresponding closing ']'.  If a syntax error
 907      * is encountered between the opening and closing brace, the parse fails.
 908      * Upon return from a successful parse, the ParsePosition is updated to
 909      * point to the character following the closing ']', and an inversion
 910      * list for the parsed pattern is returned.  This method
 911      * calls itself recursively to parse embedded subpatterns.
 912      *
 913      * @param pattern the string containing the pattern to be parsed.  The
 914      * portion of the string from pos.getIndex(), which must be a '[', to the
 915      * corresponding closing ']', is parsed.
 916      * @param pos upon entry, the position at which to being parsing.  The
 917      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
 918      * from a successful parse, pos.getIndex() is either the character after the
 919      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
 920      * is the last character of the pattern string.
 921      * @return an inversion list for the parsed substring
 922      * of <code>pattern</code>
 923      * @exception java.lang.IllegalArgumentException if the parse fails.
 924      */
 925     UnicodeSet applyPattern(String pattern,
 926                       ParsePosition pos,
 927                       SymbolTable symbols,
 928                       int options) {
 929 
 930         // Need to build the pattern in a temporary string because
 931         // _applyPattern calls add() etc., which set pat to empty.
 932         boolean parsePositionWasNull = pos == null;
 933         if (parsePositionWasNull) {
 934             pos = new ParsePosition(0);
 935         }
 936 
 937         StringBuffer rebuiltPat = new StringBuffer();
 938         RuleCharacterIterator chars =
 939             new RuleCharacterIterator(pattern, symbols, pos);
 940         applyPattern(chars, symbols, rebuiltPat, options);
 941         if (chars.inVariable()) {
 942             syntaxError(chars, "Extra chars in variable value");
 943         }
 944         pat = rebuiltPat.toString();
 945         if (parsePositionWasNull) {
 946             int i = pos.getIndex();
 947 
 948             // Skip over trailing whitespace
 949             if ((options & IGNORE_SPACE) != 0) {
 950                 i = Utility.skipWhitespace(pattern, i);
 951             }
 952 
 953             if (i != pattern.length()) {
 954                 throw new IllegalArgumentException("Parse of \"" + pattern +
 955                                                    "\" failed at " + i);
 956             }
 957         }
 958         return this;
 959     }
 960 
 961     /**
 962      * Parse the pattern from the given RuleCharacterIterator.  The
 963      * iterator is advanced over the parsed pattern.
 964      * @param chars iterator over the pattern characters.  Upon return
 965      * it will be advanced to the first character after the parsed
 966      * pattern, or the end of the iteration if all characters are
 967      * parsed.
 968      * @param symbols symbol table to use to parse and dereference
 969      * variables, or null if none.
 970      * @param rebuiltPat the pattern that was parsed, rebuilt or
 971      * copied from the input pattern, as appropriate.
 972      * @param options a bit mask of zero or more of the following:
 973      * IGNORE_SPACE, CASE.
 974      */
 975     void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
 976                       StringBuffer rebuiltPat, int options) {
 977         // Syntax characters: [ ] ^ - & { }
 978 
 979         // Recognized special forms for chars, sets: c-c s-s s&s
 980 
 981         int opts = RuleCharacterIterator.PARSE_VARIABLES |
 982                    RuleCharacterIterator.PARSE_ESCAPES;
 983         if ((options & IGNORE_SPACE) != 0) {
 984             opts |= RuleCharacterIterator.SKIP_WHITESPACE;
 985         }
 986 
 987         StringBuffer patBuf = new StringBuffer(), buf = null;
 988         boolean usePat = false;
 989         UnicodeSet scratch = null;
 990         Object backup = null;
 991 
 992         // mode: 0=before [, 1=between [...], 2=after ]
 993         // lastItem: 0=none, 1=char, 2=set
 994         int lastItem = 0, lastChar = 0, mode = 0;
 995         char op = 0;
 996 
 997         boolean invert = false;
 998 
 999         clear();
1000 
1001         while (mode != 2 && !chars.atEnd()) {
1002             if (false) {
1003                 // Debugging assertion
1004                 if (!((lastItem == 0 && op == 0) ||
1005                       (lastItem == 1 && (op == 0 || op == '-')) ||
1006                       (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
1007                     throw new IllegalArgumentException();
1008                 }
1009             }
1010 
1011             int c = 0;
1012             boolean literal = false;
1013             UnicodeSet nested = null;
1014 
1015             // -------- Check for property pattern
1016 
1017             // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
1018             int setMode = 0;
1019             if (resemblesPropertyPattern(chars, opts)) {
1020                 setMode = 2;
1021             }
1022 
1023             // -------- Parse '[' of opening delimiter OR nested set.
1024             // If there is a nested set, use `setMode' to define how
1025             // the set should be parsed.  If the '[' is part of the
1026             // opening delimiter for this pattern, parse special
1027             // strings "[", "[^", "[-", and "[^-".  Check for stand-in
1028             // characters representing a nested set in the symbol
1029             // table.
1030 
1031             else {
1032                 // Prepare to backup if necessary
1033                 backup = chars.getPos(backup);
1034                 c = chars.next(opts);
1035                 literal = chars.isEscaped();
1036 
1037                 if (c == '[' && !literal) {
1038                     if (mode == 1) {
1039                         chars.setPos(backup); // backup
1040                         setMode = 1;
1041                     } else {
1042                         // Handle opening '[' delimiter
1043                         mode = 1;
1044                         patBuf.append('[');
1045                         backup = chars.getPos(backup); // prepare to backup
1046                         c = chars.next(opts);
1047                         literal = chars.isEscaped();
1048                         if (c == '^' && !literal) {
1049                             invert = true;
1050                             patBuf.append('^');
1051                             backup = chars.getPos(backup); // prepare to backup
1052                             c = chars.next(opts);
1053                             literal = chars.isEscaped();
1054                         }
1055                         // Fall through to handle special leading '-';
1056                         // otherwise restart loop for nested [], \p{}, etc.
1057                         if (c == '-') {
1058                             literal = true;
1059                             // Fall through to handle literal '-' below
1060                         } else {
1061                             chars.setPos(backup); // backup
1062                             continue;
1063                         }
1064                     }
1065                 } else if (symbols != null) {
1066                      UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
1067                      if (m != null) {
1068                          try {
1069                              nested = (UnicodeSet) m;
1070                              setMode = 3;
1071                          } catch (ClassCastException e) {
1072                              syntaxError(chars, "Syntax error");
1073                          }
1074                      }
1075                 }
1076             }
1077 
1078             // -------- Handle a nested set.  This either is inline in
1079             // the pattern or represented by a stand-in that has
1080             // previously been parsed and was looked up in the symbol
1081             // table.
1082 
1083             if (setMode != 0) {
1084                 if (lastItem == 1) {
1085                     if (op != 0) {
1086                         syntaxError(chars, "Char expected after operator");
1087                     }
1088                     add_unchecked(lastChar, lastChar);
1089                     _appendToPat(patBuf, lastChar, false);
1090                     lastItem = op = 0;
1091                 }
1092 
1093                 if (op == '-' || op == '&') {
1094                     patBuf.append(op);
1095                 }
1096 
1097                 if (nested == null) {
1098                     if (scratch == null) scratch = new UnicodeSet();
1099                     nested = scratch;
1100                 }
1101                 switch (setMode) {
1102                 case 1:
1103                     nested.applyPattern(chars, symbols, patBuf, options);
1104                     break;
1105                 case 2:
1106                     chars.skipIgnored(opts);
1107                     nested.applyPropertyPattern(chars, patBuf, symbols);
1108                     break;
1109                 case 3: // `nested' already parsed
1110                     nested._toPattern(patBuf, false);
1111                     break;
1112                 }
1113 
1114                 usePat = true;
1115 
1116                 if (mode == 0) {
1117                     // Entire pattern is a category; leave parse loop
1118                     set(nested);
1119                     mode = 2;
1120                     break;
1121                 }
1122 
1123                 switch (op) {
1124                 case '-':
1125                     removeAll(nested);
1126                     break;
1127                 case '&':
1128                     retainAll(nested);
1129                     break;
1130                 case 0:
1131                     addAll(nested);
1132                     break;
1133                 }
1134 
1135                 op = 0;
1136                 lastItem = 2;
1137 
1138                 continue;
1139             }
1140 
1141             if (mode == 0) {
1142                 syntaxError(chars, "Missing '['");
1143             }
1144 
1145             // -------- Parse special (syntax) characters.  If the
1146             // current character is not special, or if it is escaped,
1147             // then fall through and handle it below.
1148 
1149             if (!literal) {
1150                 switch (c) {
1151                 case ']':
1152                     if (lastItem == 1) {
1153                         add_unchecked(lastChar, lastChar);
1154                         _appendToPat(patBuf, lastChar, false);
1155                     }
1156                     // Treat final trailing '-' as a literal
1157                     if (op == '-') {
1158                         add_unchecked(op, op);
1159                         patBuf.append(op);
1160                     } else if (op == '&') {
1161                         syntaxError(chars, "Trailing '&'");
1162                     }
1163                     patBuf.append(']');
1164                     mode = 2;
1165                     continue;
1166                 case '-':
1167                     if (op == 0) {
1168                         if (lastItem != 0) {
1169                             op = (char) c;
1170                             continue;
1171                         } else {
1172                             // Treat final trailing '-' as a literal
1173                             add_unchecked(c, c);
1174                             c = chars.next(opts);
1175                             literal = chars.isEscaped();
1176                             if (c == ']' && !literal) {
1177                                 patBuf.append("-]");
1178                                 mode = 2;
1179                                 continue;
1180                             }
1181                         }
1182                     }
1183                     syntaxError(chars, "'-' not after char or set");
1184                     break;
1185                 case '&':
1186                     if (lastItem == 2 && op == 0) {
1187                         op = (char) c;
1188                         continue;
1189                     }
1190                     syntaxError(chars, "'&' not after set");
1191                     break;
1192                 case '^':
1193                     syntaxError(chars, "'^' not after '['");
1194                     break;
1195                 case '{':
1196                     if (op != 0) {
1197                         syntaxError(chars, "Missing operand after operator");
1198                     }
1199                     if (lastItem == 1) {
1200                         add_unchecked(lastChar, lastChar);
1201                         _appendToPat(patBuf, lastChar, false);
1202                     }
1203                     lastItem = 0;
1204                     if (buf == null) {
1205                         buf = new StringBuffer();
1206                     } else {
1207                         buf.setLength(0);
1208                     }
1209                     boolean ok = false;
1210                     while (!chars.atEnd()) {
1211                         c = chars.next(opts);
1212                         literal = chars.isEscaped();
1213                         if (c == '}' && !literal) {
1214                             ok = true;
1215                             break;
1216                         }
1217                         UTF16.append(buf, c);
1218                     }
1219                     if (buf.length() < 1 || !ok) {
1220                         syntaxError(chars, "Invalid multicharacter string");
1221                     }
1222                     // We have new string. Add it to set and continue;
1223                     // we don't need to drop through to the further
1224                     // processing
1225                     add(buf.toString());
1226                     patBuf.append('{');
1227                     _appendToPat(patBuf, buf.toString(), false);
1228                     patBuf.append('}');
1229                     continue;
1230                 case SymbolTable.SYMBOL_REF:
1231                     //         symbols  nosymbols
1232                     // [a-$]   error    error (ambiguous)
1233                     // [a$]    anchor   anchor
1234                     // [a-$x]  var "x"* literal '$'
1235                     // [a-$.]  error    literal '$'
1236                     // *We won't get here in the case of var "x"
1237                     backup = chars.getPos(backup);
1238                     c = chars.next(opts);
1239                     literal = chars.isEscaped();
1240                     boolean anchor = (c == ']' && !literal);
1241                     if (symbols == null && !anchor) {
1242                         c = SymbolTable.SYMBOL_REF;
1243                         chars.setPos(backup);
1244                         break; // literal '$'
1245                     }
1246                     if (anchor && op == 0) {
1247                         if (lastItem == 1) {
1248                             add_unchecked(lastChar, lastChar);
1249                             _appendToPat(patBuf, lastChar, false);
1250                         }
1251                         add_unchecked(UnicodeMatcher.ETHER);
1252                         usePat = true;
1253                         patBuf.append(SymbolTable.SYMBOL_REF).append(']');
1254                         mode = 2;
1255                         continue;
1256                     }
1257                     syntaxError(chars, "Unquoted '$'");
1258                     break;
1259                 default:
1260                     break;
1261                 }
1262             }
1263 
1264             // -------- Parse literal characters.  This includes both
1265             // escaped chars ("\u4E01") and non-syntax characters
1266             // ("a").
1267 
1268             switch (lastItem) {
1269             case 0:
1270                 lastItem = 1;
1271                 lastChar = c;
1272                 break;
1273             case 1:
1274                 if (op == '-') {
1275                     if (lastChar >= c) {
1276                         // Don't allow redundant (a-a) or empty (b-a) ranges;
1277                         // these are most likely typos.
1278                         syntaxError(chars, "Invalid range");
1279                     }
1280                     add_unchecked(lastChar, c);
1281                     _appendToPat(patBuf, lastChar, false);
1282                     patBuf.append(op);
1283                     _appendToPat(patBuf, c, false);
1284                     lastItem = op = 0;
1285                 } else {
1286                     add_unchecked(lastChar, lastChar);
1287                     _appendToPat(patBuf, lastChar, false);
1288                     lastChar = c;
1289                 }
1290                 break;
1291             case 2:
1292                 if (op != 0) {
1293                     syntaxError(chars, "Set expected after operator");
1294                 }
1295                 lastChar = c;
1296                 lastItem = 1;
1297                 break;
1298             }
1299         }
1300 
1301         if (mode != 2) {
1302             syntaxError(chars, "Missing ']'");
1303         }
1304 
1305         chars.skipIgnored(opts);
1306 
1307         if (invert) {
1308             complement();
1309         }
1310 
1311         // Use the rebuilt pattern (pat) only if necessary.  Prefer the
1312         // generated pattern.
1313         if (usePat) {
1314             rebuiltPat.append(patBuf.toString());
1315         } else {
1316             _generatePattern(rebuiltPat, false, true);
1317         }
1318     }
1319 
1320     private static void syntaxError(RuleCharacterIterator chars, String msg) {
1321         throw new IllegalArgumentException("Error: " + msg + " at \"" +
1322                                            Utility.escape(chars.toString()) +
1323                                            '"');
1324     }
1325 
1326     //----------------------------------------------------------------
1327     // Implementation: Utility methods
1328     //----------------------------------------------------------------
1329 
1330     private void ensureCapacity(int newLen) {
1331         if (newLen <= list.length) return;
1332         int[] temp = new int[newLen + GROW_EXTRA];
1333         System.arraycopy(list, 0, temp, 0, len);
1334         list = temp;
1335     }
1336 
1337     private void ensureBufferCapacity(int newLen) {
1338         if (buffer != null && newLen <= buffer.length) return;
1339         buffer = new int[newLen + GROW_EXTRA];
1340     }
1341 
1342     /**
1343      * Assumes start <= end.
1344      */
1345     private int[] range(int start, int end) {
1346         if (rangeList == null) {
1347             rangeList = new int[] { start, end+1, HIGH };
1348         } else {
1349             rangeList[0] = start;
1350             rangeList[1] = end+1;
1351         }
1352         return rangeList;
1353     }
1354 
1355     //----------------------------------------------------------------
1356     // Implementation: Fundamental operations
1357     //----------------------------------------------------------------
1358 
1359     // polarity = 0, 3 is normal: x xor y
1360     // polarity = 1, 2: x xor ~y == x === y
1361 
1362     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
1363         ensureBufferCapacity(len + otherLen);
1364         int i = 0, j = 0, k = 0;
1365         int a = list[i++];
1366         int b;
1367         if (polarity == 1 || polarity == 2) {
1368             b = LOW;
1369             if (other[j] == LOW) { // skip base if already LOW
1370                 ++j;
1371                 b = other[j];
1372             }
1373         } else {
1374             b = other[j++];
1375         }
1376         // simplest of all the routines
1377         // sort the values, discarding identicals!
1378         while (true) {
1379             if (a < b) {
1380                 buffer[k++] = a;
1381                 a = list[i++];
1382             } else if (b < a) {
1383                 buffer[k++] = b;
1384                 b = other[j++];
1385             } else if (a != HIGH) { // at this point, a == b
1386                 // discard both values!
1387                 a = list[i++];
1388                 b = other[j++];
1389             } else { // DONE!
1390                 buffer[k++] = HIGH;
1391                 len = k;
1392                 break;
1393             }
1394         }
1395         // swap list and buffer
1396         int[] temp = list;
1397         list = buffer;
1398         buffer = temp;
1399         pat = null;
1400         return this;
1401     }
1402 
1403     // polarity = 0 is normal: x union y
1404     // polarity = 2: x union ~y
1405     // polarity = 1: ~x union y
1406     // polarity = 3: ~x union ~y
1407 
1408     private UnicodeSet add(int[] other, int otherLen, int polarity) {
1409         ensureBufferCapacity(len + otherLen);
1410         int i = 0, j = 0, k = 0;
1411         int a = list[i++];
1412         int b = other[j++];
1413         // change from xor is that we have to check overlapping pairs
1414         // polarity bit 1 means a is second, bit 2 means b is.
1415         main:
1416         while (true) {
1417             switch (polarity) {
1418               case 0: // both first; take lower if unequal
1419                 if (a < b) { // take a
1420                     // Back up over overlapping ranges in buffer[]
1421                     if (k > 0 && a <= buffer[k-1]) {
1422                         // Pick latter end value in buffer[] vs. list[]
1423                         a = max(list[i], buffer[--k]);
1424                     } else {
1425                         // No overlap
1426                         buffer[k++] = a;
1427                         a = list[i];
1428                     }
1429                     i++; // Common if/else code factored out
1430                     polarity ^= 1;
1431                 } else if (b < a) { // take b
1432                     if (k > 0 && b <= buffer[k-1]) {
1433                         b = max(other[j], buffer[--k]);
1434                     } else {
1435                         buffer[k++] = b;
1436                         b = other[j];
1437                     }
1438                     j++;
1439                     polarity ^= 2;
1440                 } else { // a == b, take a, drop b
1441                     if (a == HIGH) break main;
1442                     // This is symmetrical; it doesn't matter if
1443                     // we backtrack with a or b. - liu
1444                     if (k > 0 && a <= buffer[k-1]) {
1445                         a = max(list[i], buffer[--k]);
1446                     } else {
1447                         // No overlap
1448                         buffer[k++] = a;
1449                         a = list[i];
1450                     }
1451                     i++;
1452                     polarity ^= 1;
1453                     b = other[j++]; polarity ^= 2;
1454                 }
1455                 break;
1456               case 3: // both second; take higher if unequal, and drop other
1457                 if (b <= a) { // take a
1458                     if (a == HIGH) break main;
1459                     buffer[k++] = a;
1460                 } else { // take b
1461                     if (b == HIGH) break main;
1462                     buffer[k++] = b;
1463                 }
1464                 a = list[i++]; polarity ^= 1;   // factored common code
1465                 b = other[j++]; polarity ^= 2;
1466                 break;
1467               case 1: // a second, b first; if b < a, overlap
1468                 if (a < b) { // no overlap, take a
1469                     buffer[k++] = a; a = list[i++]; polarity ^= 1;
1470                 } else if (b < a) { // OVERLAP, drop b
1471                     b = other[j++]; polarity ^= 2;
1472                 } else { // a == b, drop both!
1473                     if (a == HIGH) break main;
1474                     a = list[i++]; polarity ^= 1;
1475                     b = other[j++]; polarity ^= 2;
1476                 }
1477                 break;
1478               case 2: // a first, b second; if a < b, overlap
1479                 if (b < a) { // no overlap, take b
1480                     buffer[k++] = b; b = other[j++]; polarity ^= 2;
1481                 } else  if (a < b) { // OVERLAP, drop a
1482                     a = list[i++]; polarity ^= 1;
1483                 } else { // a == b, drop both!
1484                     if (a == HIGH) break main;
1485                     a = list[i++]; polarity ^= 1;
1486                     b = other[j++]; polarity ^= 2;
1487                 }
1488                 break;
1489             }
1490         }
1491         buffer[k++] = HIGH;    // terminate
1492         len = k;
1493         // swap list and buffer
1494         int[] temp = list;
1495         list = buffer;
1496         buffer = temp;
1497         pat = null;
1498         return this;
1499     }
1500 
1501     // polarity = 0 is normal: x intersect y
1502     // polarity = 2: x intersect ~y == set-minus
1503     // polarity = 1: ~x intersect y
1504     // polarity = 3: ~x intersect ~y
1505 
1506     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
1507         ensureBufferCapacity(len + otherLen);
1508         int i = 0, j = 0, k = 0;
1509         int a = list[i++];
1510         int b = other[j++];
1511         // change from xor is that we have to check overlapping pairs
1512         // polarity bit 1 means a is second, bit 2 means b is.
1513         main:
1514         while (true) {
1515             switch (polarity) {
1516               case 0: // both first; drop the smaller
1517                 if (a < b) { // drop a
1518                     a = list[i++]; polarity ^= 1;
1519                 } else if (b < a) { // drop b
1520                     b = other[j++]; polarity ^= 2;
1521                 } else { // a == b, take one, drop other
1522                     if (a == HIGH) break main;
1523                     buffer[k++] = a; a = list[i++]; polarity ^= 1;
1524                     b = other[j++]; polarity ^= 2;
1525                 }
1526                 break;
1527               case 3: // both second; take lower if unequal
1528                 if (a < b) { // take a
1529                     buffer[k++] = a; a = list[i++]; polarity ^= 1;
1530                 } else if (b < a) { // take b
1531                     buffer[k++] = b; b = other[j++]; polarity ^= 2;
1532                 } else { // a == b, take one, drop other
1533                     if (a == HIGH) break main;
1534                     buffer[k++] = a; a = list[i++]; polarity ^= 1;
1535                     b = other[j++]; polarity ^= 2;
1536                 }
1537                 break;
1538               case 1: // a second, b first;
1539                 if (a < b) { // NO OVERLAP, drop a
1540                     a = list[i++]; polarity ^= 1;
1541                 } else if (b < a) { // OVERLAP, take b
1542                     buffer[k++] = b; b = other[j++]; polarity ^= 2;
1543                 } else { // a == b, drop both!
1544                     if (a == HIGH) break main;
1545                     a = list[i++]; polarity ^= 1;
1546                     b = other[j++]; polarity ^= 2;
1547                 }
1548                 break;
1549               case 2: // a first, b second; if a < b, overlap
1550                 if (b < a) { // no overlap, drop b
1551                     b = other[j++]; polarity ^= 2;
1552                 } else  if (a < b) { // OVERLAP, take a
1553                     buffer[k++] = a; a = list[i++]; polarity ^= 1;
1554                 } else { // a == b, drop both!
1555                     if (a == HIGH) break main;
1556                     a = list[i++]; polarity ^= 1;
1557                     b = other[j++]; polarity ^= 2;
1558                 }
1559                 break;
1560             }
1561         }
1562         buffer[k++] = HIGH;    // terminate
1563         len = k;
1564         // swap list and buffer
1565         int[] temp = list;
1566         list = buffer;
1567         buffer = temp;
1568         pat = null;
1569         return this;
1570     }
1571 
1572     private static final int max(int a, int b) {
1573         return (a > b) ? a : b;
1574     }
1575 
1576     //----------------------------------------------------------------
1577     // Generic filter-based scanning code
1578     //----------------------------------------------------------------
1579 
1580     private static interface Filter {
1581         boolean contains(int codePoint);
1582     }
1583 
1584     // VersionInfo for unassigned characters
1585     static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
1586 
1587     private static class VersionFilter implements Filter {
1588         VersionInfo version;
1589 
1590         VersionFilter(VersionInfo version) { this.version = version; }
1591 
1592         public boolean contains(int ch) {
1593             VersionInfo v = UCharacter.getAge(ch);
1594             // Reference comparison ok; VersionInfo caches and reuses
1595             // unique objects.
1596             return v != NO_VERSION &&
1597                    v.compareTo(version) <= 0;
1598         }
1599     }
1600 
1601     private static synchronized UnicodeSet getInclusions(int src) {
1602         if (INCLUSIONS == null) {
1603             INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
1604         }
1605         if(INCLUSIONS[src] == null) {
1606             UnicodeSet incl = new UnicodeSet();
1607             switch(src) {
1608             case UCharacterProperty.SRC_PROPSVEC:
1609                 UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
1610                 break;
1611             default:
1612                 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
1613             }
1614             INCLUSIONS[src] = incl;
1615         }
1616         return INCLUSIONS[src];
1617     }
1618 
1619     /**
1620      * Generic filter-based scanning code for UCD property UnicodeSets.
1621      */
1622     private UnicodeSet applyFilter(Filter filter, int src) {
1623         // Walk through all Unicode characters, noting the start
1624         // and end of each range for which filter.contain(c) is
1625         // true.  Add each range to a set.
1626         //
1627         // To improve performance, use the INCLUSIONS set, which
1628         // encodes information about character ranges that are known
1629         // to have identical properties, such as the CJK Ideographs
1630         // from U+4E00 to U+9FA5.  INCLUSIONS contains all characters
1631         // except the first characters of such ranges.
1632         //
1633         // TODO Where possible, instead of scanning over code points,
1634         // use internal property data to initialize UnicodeSets for
1635         // those properties.  Scanning code points is slow.
1636 
1637         clear();
1638 
1639         int startHasProperty = -1;
1640         UnicodeSet inclusions = getInclusions(src);
1641         int limitRange = inclusions.getRangeCount();
1642 
1643         for (int j=0; j<limitRange; ++j) {
1644             // get current range
1645             int start = inclusions.getRangeStart(j);
1646             int end = inclusions.getRangeEnd(j);
1647 
1648             // for all the code points in the range, process
1649             for (int ch = start; ch <= end; ++ch) {
1650                 // only add to the unicodeset on inflection points --
1651                 // where the hasProperty value changes to false
1652                 if (filter.contains(ch)) {
1653                     if (startHasProperty < 0) {
1654                         startHasProperty = ch;
1655                     }
1656                 } else if (startHasProperty >= 0) {
1657                     add_unchecked(startHasProperty, ch-1);
1658                     startHasProperty = -1;
1659                 }
1660             }
1661         }
1662         if (startHasProperty >= 0) {
1663             add_unchecked(startHasProperty, 0x10FFFF);
1664         }
1665 
1666         return this;
1667     }
1668 
1669     /**
1670      * Remove leading and trailing rule white space and compress
1671      * internal rule white space to a single space character.
1672      *
1673      * @see UCharacterProperty#isRuleWhiteSpace
1674      */
1675     private static String mungeCharName(String source) {
1676         StringBuffer buf = new StringBuffer();
1677         for (int i=0; i<source.length(); ) {
1678             int ch = UTF16.charAt(source, i);
1679             i += UTF16.getCharCount(ch);
1680             if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1681                 if (buf.length() == 0 ||
1682                     buf.charAt(buf.length() - 1) == ' ') {
1683                     continue;
1684                 }
1685                 ch = ' '; // convert to ' '
1686             }
1687             UTF16.append(buf, ch);
1688         }
1689         if (buf.length() != 0 &&
1690             buf.charAt(buf.length() - 1) == ' ') {
1691             buf.setLength(buf.length() - 1);
1692         }
1693         return buf.toString();
1694     }
1695 
1696     /**
1697      * Modifies this set to contain those code points which have the
1698      * given value for the given property.  Prior contents of this
1699      * set are lost.
1700      * @param propertyAlias
1701      * @param valueAlias
1702      * @param symbols if not null, then symbols are first called to see if a property
1703      * is available. If true, then everything else is skipped.
1704      * @return this set
1705      * @stable ICU 3.2
1706      */
1707     public UnicodeSet applyPropertyAlias(String propertyAlias,
1708                                          String valueAlias, SymbolTable symbols) {
1709         if (valueAlias.length() > 0) {
1710             if (propertyAlias.equals("Age")) {
1711                 // Must munge name, since
1712                 // VersionInfo.getInstance() does not do
1713                 // 'loose' matching.
1714                 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1715                 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1716                 return this;
1717             }
1718         }
1719         throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
1720     }
1721 
1722     /**
1723      * Return true if the given iterator appears to point at a
1724      * property pattern.  Regardless of the result, return with the
1725      * iterator unchanged.
1726      * @param chars iterator over the pattern characters.  Upon return
1727      * it will be unchanged.
1728      * @param iterOpts RuleCharacterIterator options
1729      */
1730     private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
1731                                                     int iterOpts) {
1732         boolean result = false;
1733         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
1734         Object pos = chars.getPos(null);
1735         int c = chars.next(iterOpts);
1736         if (c == '[' || c == '\\') {
1737             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
1738             result = (c == '[') ? (d == ':') :
1739                      (d == 'N' || d == 'p' || d == 'P');
1740         }
1741         chars.setPos(pos);
1742         return result;
1743     }
1744 
1745     /**
1746      * Parse the given property pattern at the given parse position.
1747      * @param symbols TODO
1748      */
1749     private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
1750         int pos = ppos.getIndex();
1751 
1752         // On entry, ppos should point to one of the following locations:
1753 
1754         // Minimum length is 5 characters, e.g. \p{L}
1755         if ((pos+5) > pattern.length()) {
1756             return null;
1757         }
1758 
1759         boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1760         boolean isName = false; // true for \N{pat}, o/w false
1761         boolean invert = false;
1762 
1763         // Look for an opening [:, [:^, \p, or \P
1764         if (pattern.regionMatches(pos, "[:", 0, 2)) {
1765             posix = true;
1766             pos = Utility.skipWhitespace(pattern, pos+2);
1767             if (pos < pattern.length() && pattern.charAt(pos) == '^') {
1768                 ++pos;
1769                 invert = true;
1770             }
1771         } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
1772                    pattern.regionMatches(pos, "\\N", 0, 2)) {
1773             char c = pattern.charAt(pos+1);
1774             invert = (c == 'P');
1775             isName = (c == 'N');
1776             pos = Utility.skipWhitespace(pattern, pos+2);
1777             if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
1778                 // Syntax error; "\p" or "\P" not followed by "{"
1779                 return null;
1780             }
1781         } else {
1782             // Open delimiter not seen
1783             return null;
1784         }
1785 
1786         // Look for the matching close delimiter, either :] or }
1787         int close = pattern.indexOf(posix ? ":]" : "}", pos);
1788         if (close < 0) {
1789             // Syntax error; close delimiter missing
1790             return null;
1791         }
1792 
1793         // Look for an '=' sign.  If this is present, we will parse a
1794         // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1795         // pattern.
1796         int equals = pattern.indexOf('=', pos);
1797         String propName, valueName;
1798         if (equals >= 0 && equals < close && !isName) {
1799             // Equals seen; parse medium/long pattern
1800             propName = pattern.substring(pos, equals);
1801             valueName = pattern.substring(equals+1, close);
1802         }
1803 
1804         else {
1805             // Handle case where no '=' is seen, and \N{}
1806             propName = pattern.substring(pos, close);
1807             valueName = "";
1808 
1809             // Handle \N{name}
1810             if (isName) {
1811                 // This is a little inefficient since it means we have to
1812                 // parse "na" back to UProperty.NAME even though we already
1813                 // know it's UProperty.NAME.  If we refactor the API to
1814                 // support args of (int, String) then we can remove
1815                 // "na" and make this a little more efficient.
1816                 valueName = propName;
1817                 propName = "na";
1818             }
1819         }
1820 
1821         applyPropertyAlias(propName, valueName, symbols);
1822 
1823         if (invert) {
1824             complement();
1825         }
1826 
1827         // Move to the limit position after the close delimiter
1828         ppos.setIndex(close + (posix ? 2 : 1));
1829 
1830         return this;
1831     }
1832 
1833     /**
1834      * Parse a property pattern.
1835      * @param chars iterator over the pattern characters.  Upon return
1836      * it will be advanced to the first character after the parsed
1837      * pattern, or the end of the iteration if all characters are
1838      * parsed.
1839      * @param rebuiltPat the pattern that was parsed, rebuilt or
1840      * copied from the input pattern, as appropriate.
1841      * @param symbols TODO
1842      */
1843     private void applyPropertyPattern(RuleCharacterIterator chars,
1844                                       StringBuffer rebuiltPat, SymbolTable symbols) {
1845         String patStr = chars.lookahead();
1846         ParsePosition pos = new ParsePosition(0);
1847         applyPropertyPattern(patStr, pos, symbols);
1848         if (pos.getIndex() == 0) {
1849             syntaxError(chars, "Invalid property pattern");
1850         }
1851         chars.jumpahead(pos.getIndex());
1852         rebuiltPat.append(patStr.substring(0, pos.getIndex()));
1853     }
1854 
1855     //----------------------------------------------------------------
1856     // Case folding API
1857     //----------------------------------------------------------------
1858 
1859     /**
1860      * Bitmask for constructor and applyPattern() indicating that
1861      * white space should be ignored.  If set, ignore characters for
1862      * which UCharacterProperty.isRuleWhiteSpace() returns true,
1863      * unless they are quoted or escaped.  This may be ORed together
1864      * with other selectors.
1865      * @stable ICU 3.8
1866      */
1867     public static final int IGNORE_SPACE = 1;
1868 
1869 }
1870