1 /* 2 * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 import java.text.ParsePosition; 40 import java.util.Iterator; 41 import java.util.TreeSet; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. Objects of this class 45 * represent <em>character classes</em> used in regular expressions. 46 * A character specifies a subset of Unicode code points. Legal 47 * code points are U+0000 to U+10FFFF, inclusive. 48 * 49 * <p>The UnicodeSet class is not designed to be subclassed. 50 * 51 * <p><code>UnicodeSet</code> supports two APIs. The first is the 52 * <em>operand</em> API that allows the caller to modify the value of 53 * a <code>UnicodeSet</code> object. It conforms to Java 2's 54 * <code>java.util.Set</code> interface, although 55 * <code>UnicodeSet</code> does not actually implement that 56 * interface. All methods of <code>Set</code> are supported, with the 57 * modification that they take a character range or single character 58 * instead of an <code>Object</code>, and they take a 59 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 60 * operand API may be thought of in terms of boolean logic: a boolean 61 * OR is implemented by <code>add</code>, a boolean AND is implemented 62 * by <code>retain</code>, a boolean XOR is implemented by 63 * <code>complement</code> taking an argument, and a boolean NOT is 64 * implemented by <code>complement</code> with no argument. In terms 65 * of traditional set theory function names, <code>add</code> is a 66 * union, <code>retain</code> is an intersection, <code>remove</code> 67 * is an asymmetric difference, and <code>complement</code> with no 68 * argument is a set complement with respect to the superset range 69 * <code>MIN_VALUE-MAX_VALUE</code> 70 * 71 * <p>The second API is the 72 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 73 * <code>java.text.Format</code>-derived classes. Unlike the 74 * methods that add characters, add categories, and control the logic 75 * of the set, the method <code>applyPattern()</code> sets all 76 * attributes of a <code>UnicodeSet</code> at once, based on a 77 * string pattern. 78 * 79 * <p><b>Pattern syntax</b></p> 80 * 81 * Patterns are accepted by the constructors and the 82 * <code>applyPattern()</code> methods and returned by the 83 * <code>toPattern()</code> method. These patterns follow a syntax 84 * similar to that employed by version 8 regular expression character 85 * classes. Here are some simple examples: 86 * 87 * <blockquote> 88 * <table> 89 * <tr align="top"> 90 * <td nowrap valign="top" align="left"><code>[]</code></td> 91 * <td valign="top">No characters</td> 92 * </tr><tr align="top"> 93 * <td nowrap valign="top" align="left"><code>[a]</code></td> 94 * <td valign="top">The character 'a'</td> 95 * </tr><tr align="top"> 96 * <td nowrap valign="top" align="left"><code>[ae]</code></td> 97 * <td valign="top">The characters 'a' and 'e'</td> 98 * </tr> 99 * <tr> 100 * <td nowrap valign="top" align="left"><code>[a-e]</code></td> 101 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 102 * point order</td> 103 * </tr> 104 * <tr> 105 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td> 106 * <td valign="top">The character U+4E01</td> 107 * </tr> 108 * <tr> 109 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td> 110 * <td valign="top">The character 'a' and the multicharacter strings "ab" and 111 * "ac"</td> 112 * </tr> 113 * <tr> 114 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td> 115 * <td valign="top">All characters in the general category Uppercase Letter</td> 116 * </tr> 117 * </table> 118 * </blockquote> 119 * 120 * Any character may be preceded by a backslash in order to remove any special 121 * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are 122 * ignored, unless they are escaped. 123 * 124 * <p>Property patterns specify a set of characters having a certain 125 * property as defined by the Unicode standard. Both the POSIX-like 126 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 127 * complete list of supported property patterns, see the User's Guide 128 * for UnicodeSet at 129 * <a href="http://www.icu-project.org/userguide/unicodeSet.html"> 130 * http://www.icu-project.org/userguide/unicodeSet.html</a>. 131 * Actual determination of property data is defined by the underlying 132 * Unicode database as implemented by UCharacter. 133 * 134 * <p>Patterns specify individual characters, ranges of characters, and 135 * Unicode property sets. When elements are concatenated, they 136 * specify their union. To complement a set, place a '^' immediately 137 * after the opening '['. Property patterns are inverted by modifying 138 * their delimiters; "[:^foo]" and "\P{foo}". In any other location, 139 * '^' has no special meaning. 140 * 141 * <p>Ranges are indicated by placing two a '-' between two 142 * characters, as in "a-z". This specifies the range of all 143 * characters from the left to the right, in Unicode order. If the 144 * left character is greater than or equal to the 145 * right character it is a syntax error. If a '-' occurs as the first 146 * character after the opening '[' or '[^', or if it occurs as the 147 * last character before the closing ']', then it is taken as a 148 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 149 * set of three characters, 'a', 'b', and '-'. 150 * 151 * <p>Sets may be intersected using the '&' operator or the asymmetric 152 * set difference may be taken using the '-' operator, for example, 153 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 154 * with values less than 4096. Operators ('&' and '|') have equal 155 * precedence and bind left-to-right. Thus 156 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 157 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 158 * difference; intersection is commutative. 159 * 160 * <table> 161 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a' 162 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a' 163 * through 'z' and all letters in between, in Unicode order 164 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing 165 * all characters but 'a' through 'z', 166 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 167 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 168 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 169 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 170 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 171 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 172 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 173 * <em>pat2</em> 174 * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code> 175 * <td>The set of characters having the specified 176 * Unicode property; in 177 * this case, Unicode uppercase letters 178 * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code> 179 * <td>The set of characters <em>not</em> having the given 180 * Unicode property 181 * </table> 182 * 183 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> 184 * 185 * <p><b>Formal syntax</b></p> 186 * 187 * <blockquote> 188 * <table> 189 * <tr align="top"> 190 * <td nowrap valign="top" align="right"><code>pattern := </code></td> 191 * <td valign="top"><code>('[' '^'? item* ']') | 192 * property</code></td> 193 * </tr> 194 * <tr align="top"> 195 * <td nowrap valign="top" align="right"><code>item := </code></td> 196 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br> 197 * </code></td> 198 * </tr> 199 * <tr align="top"> 200 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td> 201 * <td valign="top"><code>pattern | pattern-expr pattern | 202 * pattern-expr op pattern<br> 203 * </code></td> 204 * </tr> 205 * <tr align="top"> 206 * <td nowrap valign="top" align="right"><code>op := </code></td> 207 * <td valign="top"><code>'&' | '-'<br> 208 * </code></td> 209 * </tr> 210 * <tr align="top"> 211 * <td nowrap valign="top" align="right"><code>special := </code></td> 212 * <td valign="top"><code>'[' | ']' | '-'<br> 213 * </code></td> 214 * </tr> 215 * <tr align="top"> 216 * <td nowrap valign="top" align="right"><code>char := </code></td> 217 * <td valign="top"><em>any character that is not</em><code> special<br> 218 * | ('\\' </code><em>any character</em><code>)<br> 219 * | ('\u' hex hex hex hex)<br> 220 * </code></td> 221 * </tr> 222 * <tr align="top"> 223 * <td nowrap valign="top" align="right"><code>hex := </code></td> 224 * <td valign="top"><em>any character for which 225 * </em><code>Character.digit(c, 16)</code><em> 226 * returns a non-negative result</em></td> 227 * </tr> 228 * <tr> 229 * <td nowrap valign="top" align="right"><code>property := </code></td> 230 * <td valign="top"><em>a Unicode property set pattern</td> 231 * </tr> 232 * </table> 233 * <br> 234 * <table border="1"> 235 * <tr> 236 * <td>Legend: <table> 237 * <tr> 238 * <td nowrap valign="top"><code>a := b</code></td> 239 * <td width="20" valign="top"> </td> 240 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td> 241 * </tr> 242 * <tr> 243 * <td nowrap valign="top"><code>a?</code></td> 244 * <td valign="top"></td> 245 * <td valign="top">zero or one instance of <code>a</code><br> 246 * </td> 247 * </tr> 248 * <tr> 249 * <td nowrap valign="top"><code>a*</code></td> 250 * <td valign="top"></td> 251 * <td valign="top">one or more instances of <code>a</code><br> 252 * </td> 253 * </tr> 254 * <tr> 255 * <td nowrap valign="top"><code>a | b</code></td> 256 * <td valign="top"></td> 257 * <td valign="top">either <code>a</code> or <code>b</code><br> 258 * </td> 259 * </tr> 260 * <tr> 261 * <td nowrap valign="top"><code>'a'</code></td> 262 * <td valign="top"></td> 263 * <td valign="top">the literal string between the quotes </td> 264 * </tr> 265 * </table> 266 * </td> 267 * </tr> 268 * </table> 269 * </blockquote> 270 * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class. 271 * 272 * @author Alan Liu 273 * @stable ICU 2.0 274 * @see UnicodeSetIterator 275 */ 276 public class UnicodeSet implements UnicodeMatcher { 277 278 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 279 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 280 // 110000 for codepoints 281 282 /** 283 * Minimum value that can be stored in a UnicodeSet. 284 * @stable ICU 2.0 285 */ 286 public static final int MIN_VALUE = LOW; 287 288 /** 289 * Maximum value that can be stored in a UnicodeSet. 290 * @stable ICU 2.0 291 */ 292 public static final int MAX_VALUE = HIGH - 1; 293 294 private int len; // length used; list may be longer to minimize reallocs 295 private int[] list; // MUST be terminated with HIGH 296 private int[] rangeList; // internal buffer 297 private int[] buffer; // internal buffer 298 299 // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! 300 // is not private so that UnicodeSetIterator can get access 301 TreeSet<String> strings = new TreeSet<>(); 302 303 /** 304 * The pattern representation of this set. This may not be the 305 * most economical pattern. It is the pattern supplied to 306 * applyPattern(), with variables substituted and whitespace 307 * removed. For sets constructed without applyPattern(), or 308 * modified using the non-pattern API, this string will be null, 309 * indicating that toPattern() must generate a pattern 310 * representation from the inversion list. 311 */ 312 private String pat = null; 313 314 private static final int START_EXTRA = 16; // initial storage. Must be >= 0 315 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 316 317 /** 318 * A set of all characters _except_ the second through last characters of 319 * certain ranges. These ranges are ranges of characters whose 320 * properties are all exactly alike, e.g. CJK Ideographs from 321 * U+4E00 to U+9FA5. 322 */ 323 private static UnicodeSet INCLUSIONS[] = null; 324 325 //---------------------------------------------------------------- 326 // Public API 327 //---------------------------------------------------------------- 328 329 /** 330 * Constructs an empty set. 331 * @stable ICU 2.0 332 */ 333 public UnicodeSet() { 334 list = new int[1 + START_EXTRA]; 335 list[len++] = HIGH; 336 } 337 338 /** 339 * Constructs a set containing the given range. If <code>end > 340 * start</code> then an empty set is created. 341 * 342 * @param start first character, inclusive, of range 343 * @param end last character, inclusive, of range 344 * @stable ICU 2.0 345 */ 346 public UnicodeSet(int start, int end) { 347 this(); 348 complement(start, end); 349 } 350 351 /** 352 * Constructs a set from the given pattern. See the class description 353 * for the syntax of the pattern language. Whitespace is ignored. 354 * @param pattern a string specifying what characters are in the set 355 * @exception java.lang.IllegalArgumentException if the pattern contains 356 * a syntax error. 357 * @stable ICU 2.0 358 */ 359 public UnicodeSet(String pattern) { 360 this(); 361 applyPattern(pattern, null, null, IGNORE_SPACE); 362 } 363 364 /** 365 * Make this object represent the same set as <code>other</code>. 366 * @param other a <code>UnicodeSet</code> whose value will be 367 * copied to this object 368 * @stable ICU 2.0 369 */ 370 @SuppressWarnings("unchecked") // Casting result of clone of a collection 371 public UnicodeSet set(UnicodeSet other) { 372 list = other.list.clone(); 373 len = other.len; 374 pat = other.pat; 375 strings = (TreeSet)other.strings.clone(); 376 return this; 377 } 378 379 /** 380 * Modifies this set to represent the set specified by the given pattern. 381 * See the class description for the syntax of the pattern language. 382 * Whitespace is ignored. 383 * @param pattern a string specifying what characters are in the set 384 * @exception java.lang.IllegalArgumentException if the pattern 385 * contains a syntax error. 386 * @stable ICU 2.0 387 */ 388 public final UnicodeSet applyPattern(String pattern) { 389 return applyPattern(pattern, null, null, IGNORE_SPACE); 390 } 391 392 /** 393 * Append the <code>toPattern()</code> representation of a 394 * string to the given <code>StringBuffer</code>. 395 */ 396 private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { 397 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) { 398 _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable); 399 } 400 } 401 402 /** 403 * Append the <code>toPattern()</code> representation of a 404 * character to the given <code>StringBuffer</code>. 405 */ 406 private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { 407 if (escapeUnprintable && Utility.isUnprintable(c)) { 408 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 409 // unprintable 410 if (Utility.escapeUnprintable(buf, c)) { 411 return; 412 } 413 } 414 // Okay to let ':' pass through 415 switch (c) { 416 case '[': // SET_OPEN: 417 case ']': // SET_CLOSE: 418 case '-': // HYPHEN: 419 case '^': // COMPLEMENT: 420 case '&': // INTERSECTION: 421 case '\\': //BACKSLASH: 422 case '{': 423 case '}': 424 case '$': 425 case ':': 426 buf.append('\\'); 427 break; 428 default: 429 // Escape whitespace 430 if (UCharacterProperty.isRuleWhiteSpace(c)) { 431 buf.append('\\'); 432 } 433 break; 434 } 435 UTF16.append(buf, c); 436 } 437 438 /** 439 * Append a string representation of this set to result. This will be 440 * a cleaned version of the string passed to applyPattern(), if there 441 * is one. Otherwise it will be generated. 442 */ 443 private StringBuffer _toPattern(StringBuffer result, 444 boolean escapeUnprintable) { 445 if (pat != null) { 446 int i; 447 int backslashCount = 0; 448 for (i=0; i<pat.length(); ) { 449 int c = UTF16.charAt(pat, i); 450 i += UTF16.getCharCount(c); 451 if (escapeUnprintable && Utility.isUnprintable(c)) { 452 // If the unprintable character is preceded by an odd 453 // number of backslashes, then it has been escaped. 454 // Before unescaping it, we delete the final 455 // backslash. 456 if ((backslashCount % 2) == 1) { 457 result.setLength(result.length() - 1); 458 } 459 Utility.escapeUnprintable(result, c); 460 backslashCount = 0; 461 } else { 462 UTF16.append(result, c); 463 if (c == '\\') { 464 ++backslashCount; 465 } else { 466 backslashCount = 0; 467 } 468 } 469 } 470 return result; 471 } 472 473 return _generatePattern(result, escapeUnprintable, true); 474 } 475 476 /** 477 * Generate and append a string representation of this set to result. 478 * This does not use this.pat, the cleaned up copy of the string 479 * passed to applyPattern(). 480 * @param includeStrings if false, doesn't include the strings. 481 * @stable ICU 3.8 482 */ 483 public StringBuffer _generatePattern(StringBuffer result, 484 boolean escapeUnprintable, boolean includeStrings) { 485 result.append('['); 486 487 int count = getRangeCount(); 488 489 // If the set contains at least 2 intervals and includes both 490 // MIN_VALUE and MAX_VALUE, then the inverse representation will 491 // be more economical. 492 if (count > 1 && 493 getRangeStart(0) == MIN_VALUE && 494 getRangeEnd(count-1) == MAX_VALUE) { 495 496 // Emit the inverse 497 result.append('^'); 498 499 for (int i = 1; i < count; ++i) { 500 int start = getRangeEnd(i-1)+1; 501 int end = getRangeStart(i)-1; 502 _appendToPat(result, start, escapeUnprintable); 503 if (start != end) { 504 if ((start+1) != end) { 505 result.append('-'); 506 } 507 _appendToPat(result, end, escapeUnprintable); 508 } 509 } 510 } 511 512 // Default; emit the ranges as pairs 513 else { 514 for (int i = 0; i < count; ++i) { 515 int start = getRangeStart(i); 516 int end = getRangeEnd(i); 517 _appendToPat(result, start, escapeUnprintable); 518 if (start != end) { 519 if ((start+1) != end) { 520 result.append('-'); 521 } 522 _appendToPat(result, end, escapeUnprintable); 523 } 524 } 525 } 526 527 if (includeStrings && strings.size() > 0) { 528 Iterator<String> it = strings.iterator(); 529 while (it.hasNext()) { 530 result.append('{'); 531 _appendToPat(result, it.next(), escapeUnprintable); 532 result.append('}'); 533 } 534 } 535 return result.append(']'); 536 } 537 538 // for internal use, after checkFrozen has been called 539 private UnicodeSet add_unchecked(int start, int end) { 540 if (start < MIN_VALUE || start > MAX_VALUE) { 541 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 542 } 543 if (end < MIN_VALUE || end > MAX_VALUE) { 544 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 545 } 546 if (start < end) { 547 add(range(start, end), 2, 0); 548 } else if (start == end) { 549 add(start); 550 } 551 return this; 552 } 553 554 /** 555 * Adds the specified character to this set if it is not already 556 * present. If this set already contains the specified character, 557 * the call leaves this set unchanged. 558 * @stable ICU 2.0 559 */ 560 public final UnicodeSet add(int c) { 561 return add_unchecked(c); 562 } 563 564 // for internal use only, after checkFrozen has been called 565 private final UnicodeSet add_unchecked(int c) { 566 if (c < MIN_VALUE || c > MAX_VALUE) { 567 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 568 } 569 570 // find smallest i such that c < list[i] 571 // if odd, then it is IN the set 572 // if even, then it is OUT of the set 573 int i = findCodePoint(c); 574 575 // already in set? 576 if ((i & 1) != 0) return this; 577 578 // HIGH is 0x110000 579 // assert(list[len-1] == HIGH); 580 581 // empty = [HIGH] 582 // [start_0, limit_0, start_1, limit_1, HIGH] 583 584 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 585 // ^ 586 // list[i] 587 588 // i == 0 means c is before the first range 589 590 if (c == list[i]-1) { 591 // c is before start of next range 592 list[i] = c; 593 // if we touched the HIGH mark, then add a new one 594 if (c == MAX_VALUE) { 595 ensureCapacity(len+1); 596 list[len++] = HIGH; 597 } 598 if (i > 0 && c == list[i-1]) { 599 // collapse adjacent ranges 600 601 // [..., start_k-1, c, c, limit_k, ..., HIGH] 602 // ^ 603 // list[i] 604 System.arraycopy(list, i+1, list, i-1, len-i-1); 605 len -= 2; 606 } 607 } 608 609 else if (i > 0 && c == list[i-1]) { 610 // c is after end of prior range 611 list[i-1]++; 612 // no need to chcek for collapse here 613 } 614 615 else { 616 // At this point we know the new char is not adjacent to 617 // any existing ranges, and it is not 10FFFF. 618 619 620 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 621 // ^ 622 // list[i] 623 624 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 625 // ^ 626 // list[i] 627 628 // Don't use ensureCapacity() to save on copying. 629 // NOTE: This has no measurable impact on performance, 630 // but it might help in some usage patterns. 631 if (len+2 > list.length) { 632 int[] temp = new int[len + 2 + GROW_EXTRA]; 633 if (i != 0) System.arraycopy(list, 0, temp, 0, i); 634 System.arraycopy(list, i, temp, i+2, len-i); 635 list = temp; 636 } else { 637 System.arraycopy(list, i, list, i+2, len-i); 638 } 639 640 list[i] = c; 641 list[i+1] = c+1; 642 len += 2; 643 } 644 645 pat = null; 646 return this; 647 } 648 649 /** 650 * Adds the specified multicharacter to this set if it is not already 651 * present. If this set already contains the multicharacter, 652 * the call leaves this set unchanged. 653 * Thus "ch" => {"ch"} 654 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 655 * @param s the source string 656 * @return this object, for chaining 657 * @stable ICU 2.0 658 */ 659 public final UnicodeSet add(String s) { 660 int cp = getSingleCP(s); 661 if (cp < 0) { 662 strings.add(s); 663 pat = null; 664 } else { 665 add_unchecked(cp, cp); 666 } 667 return this; 668 } 669 670 /** 671 * @return a code point IF the string consists of a single one. 672 * otherwise returns -1. 673 * @param string to test 674 */ 675 private static int getSingleCP(String s) { 676 if (s.length() < 1) { 677 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 678 } 679 if (s.length() > 2) return -1; 680 if (s.length() == 1) return s.charAt(0); 681 682 // at this point, len = 2 683 int cp = UTF16.charAt(s, 0); 684 if (cp > 0xFFFF) { // is surrogate pair 685 return cp; 686 } 687 return -1; 688 } 689 690 /** 691 * Complements the specified range in this set. Any character in 692 * the range will be removed if it is in this set, or will be 693 * added if it is not in this set. If <code>end > start</code> 694 * then an empty range is complemented, leaving the set unchanged. 695 * 696 * @param start first character, inclusive, of range to be removed 697 * from this set. 698 * @param end last character, inclusive, of range to be removed 699 * from this set. 700 * @stable ICU 2.0 701 */ 702 public UnicodeSet complement(int start, int end) { 703 if (start < MIN_VALUE || start > MAX_VALUE) { 704 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 705 } 706 if (end < MIN_VALUE || end > MAX_VALUE) { 707 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 708 } 709 if (start <= end) { 710 xor(range(start, end), 2, 0); 711 } 712 pat = null; 713 return this; 714 } 715 716 /** 717 * This is equivalent to 718 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 719 * @stable ICU 2.0 720 */ 721 public UnicodeSet complement() { 722 if (list[0] == LOW) { 723 System.arraycopy(list, 1, list, 0, len-1); 724 --len; 725 } else { 726 ensureCapacity(len+1); 727 System.arraycopy(list, 0, list, 1, len); 728 list[0] = LOW; 729 ++len; 730 } 731 pat = null; 732 return this; 733 } 734 735 /** 736 * Returns true if this set contains the given character. 737 * @param c character to be checked for containment 738 * @return true if the test condition is met 739 * @stable ICU 2.0 740 */ 741 public boolean contains(int c) { 742 if (c < MIN_VALUE || c > MAX_VALUE) { 743 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 744 } 745 746 /* 747 // Set i to the index of the start item greater than ch 748 // We know we will terminate without length test! 749 int i = -1; 750 while (true) { 751 if (c < list[++i]) break; 752 } 753 */ 754 755 int i = findCodePoint(c); 756 757 return ((i & 1) != 0); // return true if odd 758 } 759 760 /** 761 * Returns the smallest value i such that c < list[i]. Caller 762 * must ensure that c is a legal value or this method will enter 763 * an infinite loop. This method performs a binary search. 764 * @param c a character in the range MIN_VALUE..MAX_VALUE 765 * inclusive 766 * @return the smallest integer i in the range 0..len-1, 767 * inclusive, such that c < list[i] 768 */ 769 private final int findCodePoint(int c) { 770 /* Examples: 771 findCodePoint(c) 772 set list[] c=0 1 3 4 7 8 773 === ============== =========== 774 [] [110000] 0 0 0 0 0 0 775 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 776 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 777 [:all:] [0, 110000] 1 1 1 1 1 1 778 */ 779 780 // Return the smallest i such that c < list[i]. Assume 781 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 782 if (c < list[0]) return 0; 783 // High runner test. c is often after the last range, so an 784 // initial check for this condition pays off. 785 if (len >= 2 && c >= list[len-2]) return len-1; 786 int lo = 0; 787 int hi = len - 1; 788 // invariant: c >= list[lo] 789 // invariant: c < list[hi] 790 for (;;) { 791 int i = (lo + hi) >>> 1; 792 if (i == lo) return hi; 793 if (c < list[i]) { 794 hi = i; 795 } else { 796 lo = i; 797 } 798 } 799 } 800 801 /** 802 * Adds all of the elements in the specified set to this set if 803 * they're not already present. This operation effectively 804 * modifies this set so that its value is the <i>union</i> of the two 805 * sets. The behavior of this operation is unspecified if the specified 806 * collection is modified while the operation is in progress. 807 * 808 * @param c set whose elements are to be added to this set. 809 * @stable ICU 2.0 810 */ 811 public UnicodeSet addAll(UnicodeSet c) { 812 add(c.list, c.len, 0); 813 strings.addAll(c.strings); 814 return this; 815 } 816 817 /** 818 * Retains only the elements in this set that are contained in the 819 * specified set. In other words, removes from this set all of 820 * its elements that are not contained in the specified set. This 821 * operation effectively modifies this set so that its value is 822 * the <i>intersection</i> of the two sets. 823 * 824 * @param c set that defines which elements this set will retain. 825 * @stable ICU 2.0 826 */ 827 public UnicodeSet retainAll(UnicodeSet c) { 828 retain(c.list, c.len, 0); 829 strings.retainAll(c.strings); 830 return this; 831 } 832 833 /** 834 * Removes from this set all of its elements that are contained in the 835 * specified set. This operation effectively modifies this 836 * set so that its value is the <i>asymmetric set difference</i> of 837 * the two sets. 838 * 839 * @param c set that defines which elements will be removed from 840 * this set. 841 * @stable ICU 2.0 842 */ 843 public UnicodeSet removeAll(UnicodeSet c) { 844 retain(c.list, c.len, 2); 845 strings.removeAll(c.strings); 846 return this; 847 } 848 849 /** 850 * Removes all of the elements from this set. This set will be 851 * empty after this call returns. 852 * @stable ICU 2.0 853 */ 854 public UnicodeSet clear() { 855 list[0] = HIGH; 856 len = 1; 857 pat = null; 858 strings.clear(); 859 return this; 860 } 861 862 /** 863 * Iteration method that returns the number of ranges contained in 864 * this set. 865 * @see #getRangeStart 866 * @see #getRangeEnd 867 * @stable ICU 2.0 868 */ 869 public int getRangeCount() { 870 return len/2; 871 } 872 873 /** 874 * Iteration method that returns the first character in the 875 * specified range of this set. 876 * @exception ArrayIndexOutOfBoundsException if index is outside 877 * the range <code>0..getRangeCount()-1</code> 878 * @see #getRangeCount 879 * @see #getRangeEnd 880 * @stable ICU 2.0 881 */ 882 public int getRangeStart(int index) { 883 return list[index*2]; 884 } 885 886 /** 887 * Iteration method that returns the last character in the 888 * specified range of this set. 889 * @exception ArrayIndexOutOfBoundsException if index is outside 890 * the range <code>0..getRangeCount()-1</code> 891 * @see #getRangeStart 892 * @see #getRangeEnd 893 * @stable ICU 2.0 894 */ 895 public int getRangeEnd(int index) { 896 return (list[index*2 + 1] - 1); 897 } 898 899 //---------------------------------------------------------------- 900 // Implementation: Pattern parsing 901 //---------------------------------------------------------------- 902 903 /** 904 * Parses the given pattern, starting at the given position. The character 905 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 906 * Parsing continues until the corresponding closing ']'. If a syntax error 907 * is encountered between the opening and closing brace, the parse fails. 908 * Upon return from a successful parse, the ParsePosition is updated to 909 * point to the character following the closing ']', and an inversion 910 * list for the parsed pattern is returned. This method 911 * calls itself recursively to parse embedded subpatterns. 912 * 913 * @param pattern the string containing the pattern to be parsed. The 914 * portion of the string from pos.getIndex(), which must be a '[', to the 915 * corresponding closing ']', is parsed. 916 * @param pos upon entry, the position at which to being parsing. The 917 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 918 * from a successful parse, pos.getIndex() is either the character after the 919 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 920 * is the last character of the pattern string. 921 * @return an inversion list for the parsed substring 922 * of <code>pattern</code> 923 * @exception java.lang.IllegalArgumentException if the parse fails. 924 */ 925 UnicodeSet applyPattern(String pattern, 926 ParsePosition pos, 927 SymbolTable symbols, 928 int options) { 929 930 // Need to build the pattern in a temporary string because 931 // _applyPattern calls add() etc., which set pat to empty. 932 boolean parsePositionWasNull = pos == null; 933 if (parsePositionWasNull) { 934 pos = new ParsePosition(0); 935 } 936 937 StringBuffer rebuiltPat = new StringBuffer(); 938 RuleCharacterIterator chars = 939 new RuleCharacterIterator(pattern, symbols, pos); 940 applyPattern(chars, symbols, rebuiltPat, options); 941 if (chars.inVariable()) { 942 syntaxError(chars, "Extra chars in variable value"); 943 } 944 pat = rebuiltPat.toString(); 945 if (parsePositionWasNull) { 946 int i = pos.getIndex(); 947 948 // Skip over trailing whitespace 949 if ((options & IGNORE_SPACE) != 0) { 950 i = Utility.skipWhitespace(pattern, i); 951 } 952 953 if (i != pattern.length()) { 954 throw new IllegalArgumentException("Parse of \"" + pattern + 955 "\" failed at " + i); 956 } 957 } 958 return this; 959 } 960 961 /** 962 * Parse the pattern from the given RuleCharacterIterator. The 963 * iterator is advanced over the parsed pattern. 964 * @param chars iterator over the pattern characters. Upon return 965 * it will be advanced to the first character after the parsed 966 * pattern, or the end of the iteration if all characters are 967 * parsed. 968 * @param symbols symbol table to use to parse and dereference 969 * variables, or null if none. 970 * @param rebuiltPat the pattern that was parsed, rebuilt or 971 * copied from the input pattern, as appropriate. 972 * @param options a bit mask of zero or more of the following: 973 * IGNORE_SPACE, CASE. 974 */ 975 void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 976 StringBuffer rebuiltPat, int options) { 977 // Syntax characters: [ ] ^ - & { } 978 979 // Recognized special forms for chars, sets: c-c s-s s&s 980 981 int opts = RuleCharacterIterator.PARSE_VARIABLES | 982 RuleCharacterIterator.PARSE_ESCAPES; 983 if ((options & IGNORE_SPACE) != 0) { 984 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 985 } 986 987 StringBuffer patBuf = new StringBuffer(), buf = null; 988 boolean usePat = false; 989 UnicodeSet scratch = null; 990 Object backup = null; 991 992 // mode: 0=before [, 1=between [...], 2=after ] 993 // lastItem: 0=none, 1=char, 2=set 994 int lastItem = 0, lastChar = 0, mode = 0; 995 char op = 0; 996 997 boolean invert = false; 998 999 clear(); 1000 1001 while (mode != 2 && !chars.atEnd()) { 1002 if (false) { 1003 // Debugging assertion 1004 if (!((lastItem == 0 && op == 0) || 1005 (lastItem == 1 && (op == 0 || op == '-')) || 1006 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 1007 throw new IllegalArgumentException(); 1008 } 1009 } 1010 1011 int c = 0; 1012 boolean literal = false; 1013 UnicodeSet nested = null; 1014 1015 // -------- Check for property pattern 1016 1017 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 1018 int setMode = 0; 1019 if (resemblesPropertyPattern(chars, opts)) { 1020 setMode = 2; 1021 } 1022 1023 // -------- Parse '[' of opening delimiter OR nested set. 1024 // If there is a nested set, use `setMode' to define how 1025 // the set should be parsed. If the '[' is part of the 1026 // opening delimiter for this pattern, parse special 1027 // strings "[", "[^", "[-", and "[^-". Check for stand-in 1028 // characters representing a nested set in the symbol 1029 // table. 1030 1031 else { 1032 // Prepare to backup if necessary 1033 backup = chars.getPos(backup); 1034 c = chars.next(opts); 1035 literal = chars.isEscaped(); 1036 1037 if (c == '[' && !literal) { 1038 if (mode == 1) { 1039 chars.setPos(backup); // backup 1040 setMode = 1; 1041 } else { 1042 // Handle opening '[' delimiter 1043 mode = 1; 1044 patBuf.append('['); 1045 backup = chars.getPos(backup); // prepare to backup 1046 c = chars.next(opts); 1047 literal = chars.isEscaped(); 1048 if (c == '^' && !literal) { 1049 invert = true; 1050 patBuf.append('^'); 1051 backup = chars.getPos(backup); // prepare to backup 1052 c = chars.next(opts); 1053 literal = chars.isEscaped(); 1054 } 1055 // Fall through to handle special leading '-'; 1056 // otherwise restart loop for nested [], \p{}, etc. 1057 if (c == '-') { 1058 literal = true; 1059 // Fall through to handle literal '-' below 1060 } else { 1061 chars.setPos(backup); // backup 1062 continue; 1063 } 1064 } 1065 } else if (symbols != null) { 1066 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 1067 if (m != null) { 1068 try { 1069 nested = (UnicodeSet) m; 1070 setMode = 3; 1071 } catch (ClassCastException e) { 1072 syntaxError(chars, "Syntax error"); 1073 } 1074 } 1075 } 1076 } 1077 1078 // -------- Handle a nested set. This either is inline in 1079 // the pattern or represented by a stand-in that has 1080 // previously been parsed and was looked up in the symbol 1081 // table. 1082 1083 if (setMode != 0) { 1084 if (lastItem == 1) { 1085 if (op != 0) { 1086 syntaxError(chars, "Char expected after operator"); 1087 } 1088 add_unchecked(lastChar, lastChar); 1089 _appendToPat(patBuf, lastChar, false); 1090 lastItem = op = 0; 1091 } 1092 1093 if (op == '-' || op == '&') { 1094 patBuf.append(op); 1095 } 1096 1097 if (nested == null) { 1098 if (scratch == null) scratch = new UnicodeSet(); 1099 nested = scratch; 1100 } 1101 switch (setMode) { 1102 case 1: 1103 nested.applyPattern(chars, symbols, patBuf, options); 1104 break; 1105 case 2: 1106 chars.skipIgnored(opts); 1107 nested.applyPropertyPattern(chars, patBuf, symbols); 1108 break; 1109 case 3: // `nested' already parsed 1110 nested._toPattern(patBuf, false); 1111 break; 1112 } 1113 1114 usePat = true; 1115 1116 if (mode == 0) { 1117 // Entire pattern is a category; leave parse loop 1118 set(nested); 1119 mode = 2; 1120 break; 1121 } 1122 1123 switch (op) { 1124 case '-': 1125 removeAll(nested); 1126 break; 1127 case '&': 1128 retainAll(nested); 1129 break; 1130 case 0: 1131 addAll(nested); 1132 break; 1133 } 1134 1135 op = 0; 1136 lastItem = 2; 1137 1138 continue; 1139 } 1140 1141 if (mode == 0) { 1142 syntaxError(chars, "Missing '['"); 1143 } 1144 1145 // -------- Parse special (syntax) characters. If the 1146 // current character is not special, or if it is escaped, 1147 // then fall through and handle it below. 1148 1149 if (!literal) { 1150 switch (c) { 1151 case ']': 1152 if (lastItem == 1) { 1153 add_unchecked(lastChar, lastChar); 1154 _appendToPat(patBuf, lastChar, false); 1155 } 1156 // Treat final trailing '-' as a literal 1157 if (op == '-') { 1158 add_unchecked(op, op); 1159 patBuf.append(op); 1160 } else if (op == '&') { 1161 syntaxError(chars, "Trailing '&'"); 1162 } 1163 patBuf.append(']'); 1164 mode = 2; 1165 continue; 1166 case '-': 1167 if (op == 0) { 1168 if (lastItem != 0) { 1169 op = (char) c; 1170 continue; 1171 } else { 1172 // Treat final trailing '-' as a literal 1173 add_unchecked(c, c); 1174 c = chars.next(opts); 1175 literal = chars.isEscaped(); 1176 if (c == ']' && !literal) { 1177 patBuf.append("-]"); 1178 mode = 2; 1179 continue; 1180 } 1181 } 1182 } 1183 syntaxError(chars, "'-' not after char or set"); 1184 break; 1185 case '&': 1186 if (lastItem == 2 && op == 0) { 1187 op = (char) c; 1188 continue; 1189 } 1190 syntaxError(chars, "'&' not after set"); 1191 break; 1192 case '^': 1193 syntaxError(chars, "'^' not after '['"); 1194 break; 1195 case '{': 1196 if (op != 0) { 1197 syntaxError(chars, "Missing operand after operator"); 1198 } 1199 if (lastItem == 1) { 1200 add_unchecked(lastChar, lastChar); 1201 _appendToPat(patBuf, lastChar, false); 1202 } 1203 lastItem = 0; 1204 if (buf == null) { 1205 buf = new StringBuffer(); 1206 } else { 1207 buf.setLength(0); 1208 } 1209 boolean ok = false; 1210 while (!chars.atEnd()) { 1211 c = chars.next(opts); 1212 literal = chars.isEscaped(); 1213 if (c == '}' && !literal) { 1214 ok = true; 1215 break; 1216 } 1217 UTF16.append(buf, c); 1218 } 1219 if (buf.length() < 1 || !ok) { 1220 syntaxError(chars, "Invalid multicharacter string"); 1221 } 1222 // We have new string. Add it to set and continue; 1223 // we don't need to drop through to the further 1224 // processing 1225 add(buf.toString()); 1226 patBuf.append('{'); 1227 _appendToPat(patBuf, buf.toString(), false); 1228 patBuf.append('}'); 1229 continue; 1230 case SymbolTable.SYMBOL_REF: 1231 // symbols nosymbols 1232 // [a-$] error error (ambiguous) 1233 // [a$] anchor anchor 1234 // [a-$x] var "x"* literal '$' 1235 // [a-$.] error literal '$' 1236 // *We won't get here in the case of var "x" 1237 backup = chars.getPos(backup); 1238 c = chars.next(opts); 1239 literal = chars.isEscaped(); 1240 boolean anchor = (c == ']' && !literal); 1241 if (symbols == null && !anchor) { 1242 c = SymbolTable.SYMBOL_REF; 1243 chars.setPos(backup); 1244 break; // literal '$' 1245 } 1246 if (anchor && op == 0) { 1247 if (lastItem == 1) { 1248 add_unchecked(lastChar, lastChar); 1249 _appendToPat(patBuf, lastChar, false); 1250 } 1251 add_unchecked(UnicodeMatcher.ETHER); 1252 usePat = true; 1253 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 1254 mode = 2; 1255 continue; 1256 } 1257 syntaxError(chars, "Unquoted '$'"); 1258 break; 1259 default: 1260 break; 1261 } 1262 } 1263 1264 // -------- Parse literal characters. This includes both 1265 // escaped chars ("\u4E01") and non-syntax characters 1266 // ("a"). 1267 1268 switch (lastItem) { 1269 case 0: 1270 lastItem = 1; 1271 lastChar = c; 1272 break; 1273 case 1: 1274 if (op == '-') { 1275 if (lastChar >= c) { 1276 // Don't allow redundant (a-a) or empty (b-a) ranges; 1277 // these are most likely typos. 1278 syntaxError(chars, "Invalid range"); 1279 } 1280 add_unchecked(lastChar, c); 1281 _appendToPat(patBuf, lastChar, false); 1282 patBuf.append(op); 1283 _appendToPat(patBuf, c, false); 1284 lastItem = op = 0; 1285 } else { 1286 add_unchecked(lastChar, lastChar); 1287 _appendToPat(patBuf, lastChar, false); 1288 lastChar = c; 1289 } 1290 break; 1291 case 2: 1292 if (op != 0) { 1293 syntaxError(chars, "Set expected after operator"); 1294 } 1295 lastChar = c; 1296 lastItem = 1; 1297 break; 1298 } 1299 } 1300 1301 if (mode != 2) { 1302 syntaxError(chars, "Missing ']'"); 1303 } 1304 1305 chars.skipIgnored(opts); 1306 1307 if (invert) { 1308 complement(); 1309 } 1310 1311 // Use the rebuilt pattern (pat) only if necessary. Prefer the 1312 // generated pattern. 1313 if (usePat) { 1314 rebuiltPat.append(patBuf.toString()); 1315 } else { 1316 _generatePattern(rebuiltPat, false, true); 1317 } 1318 } 1319 1320 private static void syntaxError(RuleCharacterIterator chars, String msg) { 1321 throw new IllegalArgumentException("Error: " + msg + " at \"" + 1322 Utility.escape(chars.toString()) + 1323 '"'); 1324 } 1325 1326 //---------------------------------------------------------------- 1327 // Implementation: Utility methods 1328 //---------------------------------------------------------------- 1329 1330 private void ensureCapacity(int newLen) { 1331 if (newLen <= list.length) return; 1332 int[] temp = new int[newLen + GROW_EXTRA]; 1333 System.arraycopy(list, 0, temp, 0, len); 1334 list = temp; 1335 } 1336 1337 private void ensureBufferCapacity(int newLen) { 1338 if (buffer != null && newLen <= buffer.length) return; 1339 buffer = new int[newLen + GROW_EXTRA]; 1340 } 1341 1342 /** 1343 * Assumes start <= end. 1344 */ 1345 private int[] range(int start, int end) { 1346 if (rangeList == null) { 1347 rangeList = new int[] { start, end+1, HIGH }; 1348 } else { 1349 rangeList[0] = start; 1350 rangeList[1] = end+1; 1351 } 1352 return rangeList; 1353 } 1354 1355 //---------------------------------------------------------------- 1356 // Implementation: Fundamental operations 1357 //---------------------------------------------------------------- 1358 1359 // polarity = 0, 3 is normal: x xor y 1360 // polarity = 1, 2: x xor ~y == x === y 1361 1362 private UnicodeSet xor(int[] other, int otherLen, int polarity) { 1363 ensureBufferCapacity(len + otherLen); 1364 int i = 0, j = 0, k = 0; 1365 int a = list[i++]; 1366 int b; 1367 if (polarity == 1 || polarity == 2) { 1368 b = LOW; 1369 if (other[j] == LOW) { // skip base if already LOW 1370 ++j; 1371 b = other[j]; 1372 } 1373 } else { 1374 b = other[j++]; 1375 } 1376 // simplest of all the routines 1377 // sort the values, discarding identicals! 1378 while (true) { 1379 if (a < b) { 1380 buffer[k++] = a; 1381 a = list[i++]; 1382 } else if (b < a) { 1383 buffer[k++] = b; 1384 b = other[j++]; 1385 } else if (a != HIGH) { // at this point, a == b 1386 // discard both values! 1387 a = list[i++]; 1388 b = other[j++]; 1389 } else { // DONE! 1390 buffer[k++] = HIGH; 1391 len = k; 1392 break; 1393 } 1394 } 1395 // swap list and buffer 1396 int[] temp = list; 1397 list = buffer; 1398 buffer = temp; 1399 pat = null; 1400 return this; 1401 } 1402 1403 // polarity = 0 is normal: x union y 1404 // polarity = 2: x union ~y 1405 // polarity = 1: ~x union y 1406 // polarity = 3: ~x union ~y 1407 1408 private UnicodeSet add(int[] other, int otherLen, int polarity) { 1409 ensureBufferCapacity(len + otherLen); 1410 int i = 0, j = 0, k = 0; 1411 int a = list[i++]; 1412 int b = other[j++]; 1413 // change from xor is that we have to check overlapping pairs 1414 // polarity bit 1 means a is second, bit 2 means b is. 1415 main: 1416 while (true) { 1417 switch (polarity) { 1418 case 0: // both first; take lower if unequal 1419 if (a < b) { // take a 1420 // Back up over overlapping ranges in buffer[] 1421 if (k > 0 && a <= buffer[k-1]) { 1422 // Pick latter end value in buffer[] vs. list[] 1423 a = max(list[i], buffer[--k]); 1424 } else { 1425 // No overlap 1426 buffer[k++] = a; 1427 a = list[i]; 1428 } 1429 i++; // Common if/else code factored out 1430 polarity ^= 1; 1431 } else if (b < a) { // take b 1432 if (k > 0 && b <= buffer[k-1]) { 1433 b = max(other[j], buffer[--k]); 1434 } else { 1435 buffer[k++] = b; 1436 b = other[j]; 1437 } 1438 j++; 1439 polarity ^= 2; 1440 } else { // a == b, take a, drop b 1441 if (a == HIGH) break main; 1442 // This is symmetrical; it doesn't matter if 1443 // we backtrack with a or b. - liu 1444 if (k > 0 && a <= buffer[k-1]) { 1445 a = max(list[i], buffer[--k]); 1446 } else { 1447 // No overlap 1448 buffer[k++] = a; 1449 a = list[i]; 1450 } 1451 i++; 1452 polarity ^= 1; 1453 b = other[j++]; polarity ^= 2; 1454 } 1455 break; 1456 case 3: // both second; take higher if unequal, and drop other 1457 if (b <= a) { // take a 1458 if (a == HIGH) break main; 1459 buffer[k++] = a; 1460 } else { // take b 1461 if (b == HIGH) break main; 1462 buffer[k++] = b; 1463 } 1464 a = list[i++]; polarity ^= 1; // factored common code 1465 b = other[j++]; polarity ^= 2; 1466 break; 1467 case 1: // a second, b first; if b < a, overlap 1468 if (a < b) { // no overlap, take a 1469 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1470 } else if (b < a) { // OVERLAP, drop b 1471 b = other[j++]; polarity ^= 2; 1472 } else { // a == b, drop both! 1473 if (a == HIGH) break main; 1474 a = list[i++]; polarity ^= 1; 1475 b = other[j++]; polarity ^= 2; 1476 } 1477 break; 1478 case 2: // a first, b second; if a < b, overlap 1479 if (b < a) { // no overlap, take b 1480 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1481 } else if (a < b) { // OVERLAP, drop a 1482 a = list[i++]; polarity ^= 1; 1483 } else { // a == b, drop both! 1484 if (a == HIGH) break main; 1485 a = list[i++]; polarity ^= 1; 1486 b = other[j++]; polarity ^= 2; 1487 } 1488 break; 1489 } 1490 } 1491 buffer[k++] = HIGH; // terminate 1492 len = k; 1493 // swap list and buffer 1494 int[] temp = list; 1495 list = buffer; 1496 buffer = temp; 1497 pat = null; 1498 return this; 1499 } 1500 1501 // polarity = 0 is normal: x intersect y 1502 // polarity = 2: x intersect ~y == set-minus 1503 // polarity = 1: ~x intersect y 1504 // polarity = 3: ~x intersect ~y 1505 1506 private UnicodeSet retain(int[] other, int otherLen, int polarity) { 1507 ensureBufferCapacity(len + otherLen); 1508 int i = 0, j = 0, k = 0; 1509 int a = list[i++]; 1510 int b = other[j++]; 1511 // change from xor is that we have to check overlapping pairs 1512 // polarity bit 1 means a is second, bit 2 means b is. 1513 main: 1514 while (true) { 1515 switch (polarity) { 1516 case 0: // both first; drop the smaller 1517 if (a < b) { // drop a 1518 a = list[i++]; polarity ^= 1; 1519 } else if (b < a) { // drop b 1520 b = other[j++]; polarity ^= 2; 1521 } else { // a == b, take one, drop other 1522 if (a == HIGH) break main; 1523 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1524 b = other[j++]; polarity ^= 2; 1525 } 1526 break; 1527 case 3: // both second; take lower if unequal 1528 if (a < b) { // take a 1529 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1530 } else if (b < a) { // take b 1531 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1532 } else { // a == b, take one, drop other 1533 if (a == HIGH) break main; 1534 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1535 b = other[j++]; polarity ^= 2; 1536 } 1537 break; 1538 case 1: // a second, b first; 1539 if (a < b) { // NO OVERLAP, drop a 1540 a = list[i++]; polarity ^= 1; 1541 } else if (b < a) { // OVERLAP, take b 1542 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1543 } else { // a == b, drop both! 1544 if (a == HIGH) break main; 1545 a = list[i++]; polarity ^= 1; 1546 b = other[j++]; polarity ^= 2; 1547 } 1548 break; 1549 case 2: // a first, b second; if a < b, overlap 1550 if (b < a) { // no overlap, drop b 1551 b = other[j++]; polarity ^= 2; 1552 } else if (a < b) { // OVERLAP, take a 1553 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1554 } else { // a == b, drop both! 1555 if (a == HIGH) break main; 1556 a = list[i++]; polarity ^= 1; 1557 b = other[j++]; polarity ^= 2; 1558 } 1559 break; 1560 } 1561 } 1562 buffer[k++] = HIGH; // terminate 1563 len = k; 1564 // swap list and buffer 1565 int[] temp = list; 1566 list = buffer; 1567 buffer = temp; 1568 pat = null; 1569 return this; 1570 } 1571 1572 private static final int max(int a, int b) { 1573 return (a > b) ? a : b; 1574 } 1575 1576 //---------------------------------------------------------------- 1577 // Generic filter-based scanning code 1578 //---------------------------------------------------------------- 1579 1580 private static interface Filter { 1581 boolean contains(int codePoint); 1582 } 1583 1584 // VersionInfo for unassigned characters 1585 static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 1586 1587 private static class VersionFilter implements Filter { 1588 VersionInfo version; 1589 1590 VersionFilter(VersionInfo version) { this.version = version; } 1591 1592 public boolean contains(int ch) { 1593 VersionInfo v = UCharacter.getAge(ch); 1594 // Reference comparison ok; VersionInfo caches and reuses 1595 // unique objects. 1596 return v != NO_VERSION && 1597 v.compareTo(version) <= 0; 1598 } 1599 } 1600 1601 private static synchronized UnicodeSet getInclusions(int src) { 1602 if (INCLUSIONS == null) { 1603 INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT]; 1604 } 1605 if(INCLUSIONS[src] == null) { 1606 UnicodeSet incl = new UnicodeSet(); 1607 switch(src) { 1608 case UCharacterProperty.SRC_PROPSVEC: 1609 UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl); 1610 break; 1611 default: 1612 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); 1613 } 1614 INCLUSIONS[src] = incl; 1615 } 1616 return INCLUSIONS[src]; 1617 } 1618 1619 /** 1620 * Generic filter-based scanning code for UCD property UnicodeSets. 1621 */ 1622 private UnicodeSet applyFilter(Filter filter, int src) { 1623 // Walk through all Unicode characters, noting the start 1624 // and end of each range for which filter.contain(c) is 1625 // true. Add each range to a set. 1626 // 1627 // To improve performance, use the INCLUSIONS set, which 1628 // encodes information about character ranges that are known 1629 // to have identical properties, such as the CJK Ideographs 1630 // from U+4E00 to U+9FA5. INCLUSIONS contains all characters 1631 // except the first characters of such ranges. 1632 // 1633 // TODO Where possible, instead of scanning over code points, 1634 // use internal property data to initialize UnicodeSets for 1635 // those properties. Scanning code points is slow. 1636 1637 clear(); 1638 1639 int startHasProperty = -1; 1640 UnicodeSet inclusions = getInclusions(src); 1641 int limitRange = inclusions.getRangeCount(); 1642 1643 for (int j=0; j<limitRange; ++j) { 1644 // get current range 1645 int start = inclusions.getRangeStart(j); 1646 int end = inclusions.getRangeEnd(j); 1647 1648 // for all the code points in the range, process 1649 for (int ch = start; ch <= end; ++ch) { 1650 // only add to the unicodeset on inflection points -- 1651 // where the hasProperty value changes to false 1652 if (filter.contains(ch)) { 1653 if (startHasProperty < 0) { 1654 startHasProperty = ch; 1655 } 1656 } else if (startHasProperty >= 0) { 1657 add_unchecked(startHasProperty, ch-1); 1658 startHasProperty = -1; 1659 } 1660 } 1661 } 1662 if (startHasProperty >= 0) { 1663 add_unchecked(startHasProperty, 0x10FFFF); 1664 } 1665 1666 return this; 1667 } 1668 1669 /** 1670 * Remove leading and trailing rule white space and compress 1671 * internal rule white space to a single space character. 1672 * 1673 * @see UCharacterProperty#isRuleWhiteSpace 1674 */ 1675 private static String mungeCharName(String source) { 1676 StringBuffer buf = new StringBuffer(); 1677 for (int i=0; i<source.length(); ) { 1678 int ch = UTF16.charAt(source, i); 1679 i += UTF16.getCharCount(ch); 1680 if (UCharacterProperty.isRuleWhiteSpace(ch)) { 1681 if (buf.length() == 0 || 1682 buf.charAt(buf.length() - 1) == ' ') { 1683 continue; 1684 } 1685 ch = ' '; // convert to ' ' 1686 } 1687 UTF16.append(buf, ch); 1688 } 1689 if (buf.length() != 0 && 1690 buf.charAt(buf.length() - 1) == ' ') { 1691 buf.setLength(buf.length() - 1); 1692 } 1693 return buf.toString(); 1694 } 1695 1696 /** 1697 * Modifies this set to contain those code points which have the 1698 * given value for the given property. Prior contents of this 1699 * set are lost. 1700 * @param propertyAlias 1701 * @param valueAlias 1702 * @param symbols if not null, then symbols are first called to see if a property 1703 * is available. If true, then everything else is skipped. 1704 * @return this set 1705 * @stable ICU 3.2 1706 */ 1707 public UnicodeSet applyPropertyAlias(String propertyAlias, 1708 String valueAlias, SymbolTable symbols) { 1709 if (valueAlias.length() > 0) { 1710 if (propertyAlias.equals("Age")) { 1711 // Must munge name, since 1712 // VersionInfo.getInstance() does not do 1713 // 'loose' matching. 1714 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 1715 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); 1716 return this; 1717 } 1718 } 1719 throw new IllegalArgumentException("Unsupported property: " + propertyAlias); 1720 } 1721 1722 /** 1723 * Return true if the given iterator appears to point at a 1724 * property pattern. Regardless of the result, return with the 1725 * iterator unchanged. 1726 * @param chars iterator over the pattern characters. Upon return 1727 * it will be unchanged. 1728 * @param iterOpts RuleCharacterIterator options 1729 */ 1730 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 1731 int iterOpts) { 1732 boolean result = false; 1733 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 1734 Object pos = chars.getPos(null); 1735 int c = chars.next(iterOpts); 1736 if (c == '[' || c == '\\') { 1737 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 1738 result = (c == '[') ? (d == ':') : 1739 (d == 'N' || d == 'p' || d == 'P'); 1740 } 1741 chars.setPos(pos); 1742 return result; 1743 } 1744 1745 /** 1746 * Parse the given property pattern at the given parse position. 1747 * @param symbols TODO 1748 */ 1749 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 1750 int pos = ppos.getIndex(); 1751 1752 // On entry, ppos should point to one of the following locations: 1753 1754 // Minimum length is 5 characters, e.g. \p{L} 1755 if ((pos+5) > pattern.length()) { 1756 return null; 1757 } 1758 1759 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1760 boolean isName = false; // true for \N{pat}, o/w false 1761 boolean invert = false; 1762 1763 // Look for an opening [:, [:^, \p, or \P 1764 if (pattern.regionMatches(pos, "[:", 0, 2)) { 1765 posix = true; 1766 pos = Utility.skipWhitespace(pattern, pos+2); 1767 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 1768 ++pos; 1769 invert = true; 1770 } 1771 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 1772 pattern.regionMatches(pos, "\\N", 0, 2)) { 1773 char c = pattern.charAt(pos+1); 1774 invert = (c == 'P'); 1775 isName = (c == 'N'); 1776 pos = Utility.skipWhitespace(pattern, pos+2); 1777 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 1778 // Syntax error; "\p" or "\P" not followed by "{" 1779 return null; 1780 } 1781 } else { 1782 // Open delimiter not seen 1783 return null; 1784 } 1785 1786 // Look for the matching close delimiter, either :] or } 1787 int close = pattern.indexOf(posix ? ":]" : "}", pos); 1788 if (close < 0) { 1789 // Syntax error; close delimiter missing 1790 return null; 1791 } 1792 1793 // Look for an '=' sign. If this is present, we will parse a 1794 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1795 // pattern. 1796 int equals = pattern.indexOf('=', pos); 1797 String propName, valueName; 1798 if (equals >= 0 && equals < close && !isName) { 1799 // Equals seen; parse medium/long pattern 1800 propName = pattern.substring(pos, equals); 1801 valueName = pattern.substring(equals+1, close); 1802 } 1803 1804 else { 1805 // Handle case where no '=' is seen, and \N{} 1806 propName = pattern.substring(pos, close); 1807 valueName = ""; 1808 1809 // Handle \N{name} 1810 if (isName) { 1811 // This is a little inefficient since it means we have to 1812 // parse "na" back to UProperty.NAME even though we already 1813 // know it's UProperty.NAME. If we refactor the API to 1814 // support args of (int, String) then we can remove 1815 // "na" and make this a little more efficient. 1816 valueName = propName; 1817 propName = "na"; 1818 } 1819 } 1820 1821 applyPropertyAlias(propName, valueName, symbols); 1822 1823 if (invert) { 1824 complement(); 1825 } 1826 1827 // Move to the limit position after the close delimiter 1828 ppos.setIndex(close + (posix ? 2 : 1)); 1829 1830 return this; 1831 } 1832 1833 /** 1834 * Parse a property pattern. 1835 * @param chars iterator over the pattern characters. Upon return 1836 * it will be advanced to the first character after the parsed 1837 * pattern, or the end of the iteration if all characters are 1838 * parsed. 1839 * @param rebuiltPat the pattern that was parsed, rebuilt or 1840 * copied from the input pattern, as appropriate. 1841 * @param symbols TODO 1842 */ 1843 private void applyPropertyPattern(RuleCharacterIterator chars, 1844 StringBuffer rebuiltPat, SymbolTable symbols) { 1845 String patStr = chars.lookahead(); 1846 ParsePosition pos = new ParsePosition(0); 1847 applyPropertyPattern(patStr, pos, symbols); 1848 if (pos.getIndex() == 0) { 1849 syntaxError(chars, "Invalid property pattern"); 1850 } 1851 chars.jumpahead(pos.getIndex()); 1852 rebuiltPat.append(patStr.substring(0, pos.getIndex())); 1853 } 1854 1855 //---------------------------------------------------------------- 1856 // Case folding API 1857 //---------------------------------------------------------------- 1858 1859 /** 1860 * Bitmask for constructor and applyPattern() indicating that 1861 * white space should be ignored. If set, ignore characters for 1862 * which UCharacterProperty.isRuleWhiteSpace() returns true, 1863 * unless they are quoted or escaped. This may be ORed together 1864 * with other selectors. 1865 * @stable ICU 3.8 1866 */ 1867 public static final int IGNORE_SPACE = 1; 1868 1869 } 1870