1 /* 2 * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 import java.text.ParsePosition; 40 import java.util.Iterator; 41 import java.util.TreeSet; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. Objects of this class 45 * represent <em>character classes</em> used in regular expressions. 46 * A character specifies a subset of Unicode code points. Legal 47 * code points are U+0000 to U+10FFFF, inclusive. 48 * 49 * <p>The UnicodeSet class is not designed to be subclassed. 50 * 51 * <p><code>UnicodeSet</code> supports two APIs. The first is the 52 * <em>operand</em> API that allows the caller to modify the value of 53 * a <code>UnicodeSet</code> object. It conforms to Java 2's 54 * <code>java.util.Set</code> interface, although 55 * <code>UnicodeSet</code> does not actually implement that 56 * interface. All methods of <code>Set</code> are supported, with the 57 * modification that they take a character range or single character 58 * instead of an <code>Object</code>, and they take a 59 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 60 * operand API may be thought of in terms of boolean logic: a boolean 61 * OR is implemented by <code>add</code>, a boolean AND is implemented 62 * by <code>retain</code>, a boolean XOR is implemented by 63 * <code>complement</code> taking an argument, and a boolean NOT is 64 * implemented by <code>complement</code> with no argument. In terms 65 * of traditional set theory function names, <code>add</code> is a 66 * union, <code>retain</code> is an intersection, <code>remove</code> 67 * is an asymmetric difference, and <code>complement</code> with no 68 * argument is a set complement with respect to the superset range 69 * <code>MIN_VALUE-MAX_VALUE</code> 70 * 71 * <p>The second API is the 72 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 73 * <code>java.text.Format</code>-derived classes. Unlike the 74 * methods that add characters, add categories, and control the logic 75 * of the set, the method <code>applyPattern()</code> sets all 76 * attributes of a <code>UnicodeSet</code> at once, based on a 77 * string pattern. 78 * 79 * <p><b>Pattern syntax</b></p> 80 * 81 * Patterns are accepted by the constructors and the 82 * <code>applyPattern()</code> methods and returned by the 83 * <code>toPattern()</code> method. These patterns follow a syntax 84 * similar to that employed by version 8 regular expression character 85 * classes. Here are some simple examples: 86 * 87 * <blockquote> 88 * <table> 89 * <tr align="top"> 90 * <td nowrap valign="top" align="left"><code>[]</code></td> 91 * <td valign="top">No characters</td> 92 * </tr><tr align="top"> 93 * <td nowrap valign="top" align="left"><code>[a]</code></td> 94 * <td valign="top">The character 'a'</td> 95 * </tr><tr align="top"> 96 * <td nowrap valign="top" align="left"><code>[ae]</code></td> 97 * <td valign="top">The characters 'a' and 'e'</td> 98 * </tr> 99 * <tr> 100 * <td nowrap valign="top" align="left"><code>[a-e]</code></td> 101 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 102 * point order</td> 103 * </tr> 104 * <tr> 105 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td> 106 * <td valign="top">The character U+4E01</td> 107 * </tr> 108 * <tr> 109 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td> 110 * <td valign="top">The character 'a' and the multicharacter strings "ab" and 111 * "ac"</td> 112 * </tr> 113 * <tr> 114 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td> 115 * <td valign="top">All characters in the general category Uppercase Letter</td> 116 * </tr> 117 * </table> 118 * </blockquote> 119 * 120 * Any character may be preceded by a backslash in order to remove any special 121 * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are 122 * ignored, unless they are escaped. 123 * 124 * <p>Property patterns specify a set of characters having a certain 125 * property as defined by the Unicode standard. Both the POSIX-like 126 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 127 * complete list of supported property patterns, see the User's Guide 128 * for UnicodeSet at 129 * <a href="http://www.icu-project.org/userguide/unicodeSet.html"> 130 * http://www.icu-project.org/userguide/unicodeSet.html</a>. 131 * Actual determination of property data is defined by the underlying 132 * Unicode database as implemented by UCharacter. 133 * 134 * <p>Patterns specify individual characters, ranges of characters, and 135 * Unicode property sets. When elements are concatenated, they 136 * specify their union. To complement a set, place a '^' immediately 137 * after the opening '['. Property patterns are inverted by modifying 138 * their delimiters; "[:^foo]" and "\P{foo}". In any other location, 139 * '^' has no special meaning. 140 * 141 * <p>Ranges are indicated by placing two a '-' between two 142 * characters, as in "a-z". This specifies the range of all 143 * characters from the left to the right, in Unicode order. If the 144 * left character is greater than or equal to the 145 * right character it is a syntax error. If a '-' occurs as the first 146 * character after the opening '[' or '[^', or if it occurs as the 147 * last character before the closing ']', then it is taken as a 148 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 149 * set of three characters, 'a', 'b', and '-'. 150 * 151 * <p>Sets may be intersected using the {@literal '&'} operator or the asymmetric 152 * set difference may be taken using the '-' operator, for example, 153 * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters 154 * with values less than 4096. Operators ({@literal '&'} and '|') have equal 155 * precedence and bind left-to-right. Thus 156 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 157 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 158 * difference; intersection is commutative. 159 * 160 * <table> 161 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a' 162 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a' 163 * through 'z' and all letters in between, in Unicode order 164 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing 165 * all characters but 'a' through 'z', 166 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 167 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 168 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 169 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 170 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 171 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 172 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 173 * <em>pat2</em> 174 * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code> 175 * <td>The set of characters having the specified 176 * Unicode property; in 177 * this case, Unicode uppercase letters 178 * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code> 179 * <td>The set of characters <em>not</em> having the given 180 * Unicode property 181 * </table> 182 * 183 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> 184 * 185 * <p><b>Formal syntax</b></p> 186 * 187 * <blockquote> 188 * <table> 189 * <tr align="top"> 190 * <td nowrap valign="top" align="right"><code>pattern := </code></td> 191 * <td valign="top"><code>('[' '^'? item* ']') | 192 * property</code></td> 193 * </tr> 194 * <tr align="top"> 195 * <td nowrap valign="top" align="right"><code>item := </code></td> 196 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br> 197 * </code></td> 198 * </tr> 199 * <tr align="top"> 200 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td> 201 * <td valign="top"><code>pattern | pattern-expr pattern | 202 * pattern-expr op pattern<br> 203 * </code></td> 204 * </tr> 205 * <tr align="top"> 206 * <td nowrap valign="top" align="right"><code>op := </code></td> 207 * <td valign="top"><code>'&' | '-'<br> 208 * </code></td> 209 * </tr> 210 * <tr align="top"> 211 * <td nowrap valign="top" align="right"><code>special := </code></td> 212 * <td valign="top"><code>'[' | ']' | '-'<br> 213 * </code></td> 214 * </tr> 215 * <tr align="top"> 216 * <td nowrap valign="top" align="right"><code>char := </code></td> 217 * <td valign="top"><em>any character that is not</em><code> special<br> 218 * | ('\\' </code><em>any character</em><code>)<br> 219 * | ('\u' hex hex hex hex)<br> 220 * </code></td> 221 * </tr> 222 * <tr align="top"> 223 * <td nowrap valign="top" align="right"><code>hex := </code></td> 224 * <td valign="top"><em>any character for which 225 * </em><code>Character.digit(c, 16)</code><em> 226 * returns a non-negative result</em></td> 227 * </tr> 228 * <tr> 229 * <td nowrap valign="top" align="right"><code>property := </code></td> 230 * <td valign="top"><em>a Unicode property set pattern</em></td> 231 * </tr> 232 * </table> 233 * <br> 234 * <table border="1"> 235 * <tr> 236 * <td>Legend: <table> 237 * <tr> 238 * <td nowrap valign="top"><code>a := b</code></td> 239 * <td width="20" valign="top"> </td> 240 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td> 241 * </tr> 242 * <tr> 243 * <td nowrap valign="top"><code>a?</code></td> 244 * <td valign="top"></td> 245 * <td valign="top">zero or one instance of <code>a</code><br> 246 * </td> 247 * </tr> 248 * <tr> 249 * <td nowrap valign="top"><code>a*</code></td> 250 * <td valign="top"></td> 251 * <td valign="top">one or more instances of <code>a</code><br> 252 * </td> 253 * </tr> 254 * <tr> 255 * <td nowrap valign="top"><code>a | b</code></td> 256 * <td valign="top"></td> 257 * <td valign="top">either <code>a</code> or <code>b</code><br> 258 * </td> 259 * </tr> 260 * <tr> 261 * <td nowrap valign="top"><code>'a'</code></td> 262 * <td valign="top"></td> 263 * <td valign="top">the literal string between the quotes </td> 264 * </tr> 265 * </table> 266 * </td> 267 * </tr> 268 * </table> 269 * </blockquote> 270 * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class. 271 * 272 * @author Alan Liu 273 * @stable ICU 2.0 274 * @see UnicodeSetIterator 275 */ 276 @SuppressWarnings("deprecation") 277 public class UnicodeSet implements UnicodeMatcher { 278 279 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 280 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 281 // 110000 for codepoints 282 283 /** 284 * Minimum value that can be stored in a UnicodeSet. 285 * @stable ICU 2.0 286 */ 287 public static final int MIN_VALUE = LOW; 288 289 /** 290 * Maximum value that can be stored in a UnicodeSet. 291 * @stable ICU 2.0 292 */ 293 public static final int MAX_VALUE = HIGH - 1; 294 295 private int len; // length used; list may be longer to minimize reallocs 296 private int[] list; // MUST be terminated with HIGH 297 private int[] rangeList; // internal buffer 298 private int[] buffer; // internal buffer 299 300 // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! 301 // is not private so that UnicodeSetIterator can get access 302 TreeSet<String> strings = new TreeSet<>(); 303 304 /** 305 * The pattern representation of this set. This may not be the 306 * most economical pattern. It is the pattern supplied to 307 * applyPattern(), with variables substituted and whitespace 308 * removed. For sets constructed without applyPattern(), or 309 * modified using the non-pattern API, this string will be null, 310 * indicating that toPattern() must generate a pattern 311 * representation from the inversion list. 312 */ 313 private String pat = null; 314 315 private static final int START_EXTRA = 16; // initial storage. Must be >= 0 316 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 317 318 /** 319 * A set of all characters _except_ the second through last characters of 320 * certain ranges. These ranges are ranges of characters whose 321 * properties are all exactly alike, e.g. CJK Ideographs from 322 * U+4E00 to U+9FA5. 323 */ 324 private static UnicodeSet INCLUSIONS[] = null; 325 326 //---------------------------------------------------------------- 327 // Public API 328 //---------------------------------------------------------------- 329 330 /** 331 * Constructs an empty set. 332 * @stable ICU 2.0 333 */ 334 public UnicodeSet() { 335 list = new int[1 + START_EXTRA]; 336 list[len++] = HIGH; 337 } 338 339 /** 340 * Constructs a set containing the given range. 341 * If {@code end > start} then an empty set is created. 342 * 343 * @param start first character, inclusive, of range 344 * @param end last character, inclusive, of range 345 * @stable ICU 2.0 346 */ 347 public UnicodeSet(int start, int end) { 348 this(); 349 complement(start, end); 350 } 351 352 /** 353 * Constructs a set from the given pattern. See the class description 354 * for the syntax of the pattern language. Whitespace is ignored. 355 * @param pattern a string specifying what characters are in the set 356 * @exception java.lang.IllegalArgumentException if the pattern contains 357 * a syntax error. 358 * @stable ICU 2.0 359 */ 360 public UnicodeSet(String pattern) { 361 this(); 362 applyPattern(pattern, null, null, IGNORE_SPACE); 363 } 364 365 /** 366 * Make this object represent the same set as <code>other</code>. 367 * @param other a <code>UnicodeSet</code> whose value will be 368 * copied to this object 369 * @stable ICU 2.0 370 */ 371 @SuppressWarnings("unchecked") // Casting result of clone of a collection 372 public UnicodeSet set(UnicodeSet other) { 373 list = other.list.clone(); 374 len = other.len; 375 pat = other.pat; 376 strings = (TreeSet)other.strings.clone(); 377 return this; 378 } 379 380 /** 381 * Modifies this set to represent the set specified by the given pattern. 382 * See the class description for the syntax of the pattern language. 383 * Whitespace is ignored. 384 * @param pattern a string specifying what characters are in the set 385 * @exception java.lang.IllegalArgumentException if the pattern 386 * contains a syntax error. 387 * @stable ICU 2.0 388 */ 389 public final UnicodeSet applyPattern(String pattern) { 390 return applyPattern(pattern, null, null, IGNORE_SPACE); 391 } 392 393 /** 394 * Append the <code>toPattern()</code> representation of a 395 * string to the given <code>StringBuffer</code>. 396 */ 397 private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { 398 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) { 399 _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable); 400 } 401 } 402 403 /** 404 * Append the <code>toPattern()</code> representation of a 405 * character to the given <code>StringBuffer</code>. 406 */ 407 private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { 408 if (escapeUnprintable && Utility.isUnprintable(c)) { 409 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 410 // unprintable 411 if (Utility.escapeUnprintable(buf, c)) { 412 return; 413 } 414 } 415 // Okay to let ':' pass through 416 switch (c) { 417 case '[': // SET_OPEN: 418 case ']': // SET_CLOSE: 419 case '-': // HYPHEN: 420 case '^': // COMPLEMENT: 421 case '&': // INTERSECTION: 422 case '\\': //BACKSLASH: 423 case '{': 424 case '}': 425 case '$': 426 case ':': 427 buf.append('\\'); 428 break; 429 default: 430 // Escape whitespace 431 if (UCharacterProperty.isRuleWhiteSpace(c)) { 432 buf.append('\\'); 433 } 434 break; 435 } 436 UTF16.append(buf, c); 437 } 438 439 /** 440 * Append a string representation of this set to result. This will be 441 * a cleaned version of the string passed to applyPattern(), if there 442 * is one. Otherwise it will be generated. 443 */ 444 private StringBuffer _toPattern(StringBuffer result, 445 boolean escapeUnprintable) { 446 if (pat != null) { 447 int i; 448 int backslashCount = 0; 449 for (i=0; i<pat.length(); ) { 450 int c = UTF16.charAt(pat, i); 451 i += UTF16.getCharCount(c); 452 if (escapeUnprintable && Utility.isUnprintable(c)) { 453 // If the unprintable character is preceded by an odd 454 // number of backslashes, then it has been escaped. 455 // Before unescaping it, we delete the final 456 // backslash. 457 if ((backslashCount % 2) == 1) { 458 result.setLength(result.length() - 1); 459 } 460 Utility.escapeUnprintable(result, c); 461 backslashCount = 0; 462 } else { 463 UTF16.append(result, c); 464 if (c == '\\') { 465 ++backslashCount; 466 } else { 467 backslashCount = 0; 468 } 469 } 470 } 471 return result; 472 } 473 474 return _generatePattern(result, escapeUnprintable, true); 475 } 476 477 /** 478 * Generate and append a string representation of this set to result. 479 * This does not use this.pat, the cleaned up copy of the string 480 * passed to applyPattern(). 481 * @param includeStrings if false, doesn't include the strings. 482 * @stable ICU 3.8 483 */ 484 public StringBuffer _generatePattern(StringBuffer result, 485 boolean escapeUnprintable, boolean includeStrings) { 486 result.append('['); 487 488 int count = getRangeCount(); 489 490 // If the set contains at least 2 intervals and includes both 491 // MIN_VALUE and MAX_VALUE, then the inverse representation will 492 // be more economical. 493 if (count > 1 && 494 getRangeStart(0) == MIN_VALUE && 495 getRangeEnd(count-1) == MAX_VALUE) { 496 497 // Emit the inverse 498 result.append('^'); 499 500 for (int i = 1; i < count; ++i) { 501 int start = getRangeEnd(i-1)+1; 502 int end = getRangeStart(i)-1; 503 _appendToPat(result, start, escapeUnprintable); 504 if (start != end) { 505 if ((start+1) != end) { 506 result.append('-'); 507 } 508 _appendToPat(result, end, escapeUnprintable); 509 } 510 } 511 } 512 513 // Default; emit the ranges as pairs 514 else { 515 for (int i = 0; i < count; ++i) { 516 int start = getRangeStart(i); 517 int end = getRangeEnd(i); 518 _appendToPat(result, start, escapeUnprintable); 519 if (start != end) { 520 if ((start+1) != end) { 521 result.append('-'); 522 } 523 _appendToPat(result, end, escapeUnprintable); 524 } 525 } 526 } 527 528 if (includeStrings && strings.size() > 0) { 529 Iterator<String> it = strings.iterator(); 530 while (it.hasNext()) { 531 result.append('{'); 532 _appendToPat(result, it.next(), escapeUnprintable); 533 result.append('}'); 534 } 535 } 536 return result.append(']'); 537 } 538 539 // for internal use, after checkFrozen has been called 540 private UnicodeSet add_unchecked(int start, int end) { 541 if (start < MIN_VALUE || start > MAX_VALUE) { 542 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 543 } 544 if (end < MIN_VALUE || end > MAX_VALUE) { 545 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 546 } 547 if (start < end) { 548 add(range(start, end), 2, 0); 549 } else if (start == end) { 550 add(start); 551 } 552 return this; 553 } 554 555 /** 556 * Adds the specified character to this set if it is not already 557 * present. If this set already contains the specified character, 558 * the call leaves this set unchanged. 559 * @stable ICU 2.0 560 */ 561 public final UnicodeSet add(int c) { 562 return add_unchecked(c); 563 } 564 565 // for internal use only, after checkFrozen has been called 566 private final UnicodeSet add_unchecked(int c) { 567 if (c < MIN_VALUE || c > MAX_VALUE) { 568 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 569 } 570 571 // find smallest i such that c < list[i] 572 // if odd, then it is IN the set 573 // if even, then it is OUT of the set 574 int i = findCodePoint(c); 575 576 // already in set? 577 if ((i & 1) != 0) return this; 578 579 // HIGH is 0x110000 580 // assert(list[len-1] == HIGH); 581 582 // empty = [HIGH] 583 // [start_0, limit_0, start_1, limit_1, HIGH] 584 585 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 586 // ^ 587 // list[i] 588 589 // i == 0 means c is before the first range 590 591 if (c == list[i]-1) { 592 // c is before start of next range 593 list[i] = c; 594 // if we touched the HIGH mark, then add a new one 595 if (c == MAX_VALUE) { 596 ensureCapacity(len+1); 597 list[len++] = HIGH; 598 } 599 if (i > 0 && c == list[i-1]) { 600 // collapse adjacent ranges 601 602 // [..., start_k-1, c, c, limit_k, ..., HIGH] 603 // ^ 604 // list[i] 605 System.arraycopy(list, i+1, list, i-1, len-i-1); 606 len -= 2; 607 } 608 } 609 610 else if (i > 0 && c == list[i-1]) { 611 // c is after end of prior range 612 list[i-1]++; 613 // no need to chcek for collapse here 614 } 615 616 else { 617 // At this point we know the new char is not adjacent to 618 // any existing ranges, and it is not 10FFFF. 619 620 621 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 622 // ^ 623 // list[i] 624 625 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 626 // ^ 627 // list[i] 628 629 // Don't use ensureCapacity() to save on copying. 630 // NOTE: This has no measurable impact on performance, 631 // but it might help in some usage patterns. 632 if (len+2 > list.length) { 633 int[] temp = new int[len + 2 + GROW_EXTRA]; 634 if (i != 0) System.arraycopy(list, 0, temp, 0, i); 635 System.arraycopy(list, i, temp, i+2, len-i); 636 list = temp; 637 } else { 638 System.arraycopy(list, i, list, i+2, len-i); 639 } 640 641 list[i] = c; 642 list[i+1] = c+1; 643 len += 2; 644 } 645 646 pat = null; 647 return this; 648 } 649 650 /** 651 * Adds the specified multicharacter to this set if it is not already 652 * present. If this set already contains the multicharacter, 653 * the call leaves this set unchanged. 654 * Thus {@code "ch" => {"ch"}} 655 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 656 * @param s the source string 657 * @return this object, for chaining 658 * @stable ICU 2.0 659 */ 660 public final UnicodeSet add(String s) { 661 int cp = getSingleCP(s); 662 if (cp < 0) { 663 strings.add(s); 664 pat = null; 665 } else { 666 add_unchecked(cp, cp); 667 } 668 return this; 669 } 670 671 /** 672 * @return a code point IF the string consists of a single one. 673 * otherwise returns -1. 674 * @param string to test 675 */ 676 private static int getSingleCP(String s) { 677 if (s.length() < 1) { 678 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 679 } 680 if (s.length() > 2) return -1; 681 if (s.length() == 1) return s.charAt(0); 682 683 // at this point, len = 2 684 int cp = UTF16.charAt(s, 0); 685 if (cp > 0xFFFF) { // is surrogate pair 686 return cp; 687 } 688 return -1; 689 } 690 691 /** 692 * Complements the specified range in this set. Any character in 693 * the range will be removed if it is in this set, or will be 694 * added if it is not in this set. If {@code end > start} 695 * then an empty range is complemented, leaving the set unchanged. 696 * 697 * @param start first character, inclusive, of range to be removed 698 * from this set. 699 * @param end last character, inclusive, of range to be removed 700 * from this set. 701 * @stable ICU 2.0 702 */ 703 public UnicodeSet complement(int start, int end) { 704 if (start < MIN_VALUE || start > MAX_VALUE) { 705 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 706 } 707 if (end < MIN_VALUE || end > MAX_VALUE) { 708 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 709 } 710 if (start <= end) { 711 xor(range(start, end), 2, 0); 712 } 713 pat = null; 714 return this; 715 } 716 717 /** 718 * This is equivalent to 719 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 720 * @stable ICU 2.0 721 */ 722 public UnicodeSet complement() { 723 if (list[0] == LOW) { 724 System.arraycopy(list, 1, list, 0, len-1); 725 --len; 726 } else { 727 ensureCapacity(len+1); 728 System.arraycopy(list, 0, list, 1, len); 729 list[0] = LOW; 730 ++len; 731 } 732 pat = null; 733 return this; 734 } 735 736 /** 737 * Returns true if this set contains the given character. 738 * @param c character to be checked for containment 739 * @return true if the test condition is met 740 * @stable ICU 2.0 741 */ 742 public boolean contains(int c) { 743 if (c < MIN_VALUE || c > MAX_VALUE) { 744 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 745 } 746 747 /* 748 // Set i to the index of the start item greater than ch 749 // We know we will terminate without length test! 750 int i = -1; 751 while (true) { 752 if (c < list[++i]) break; 753 } 754 */ 755 756 int i = findCodePoint(c); 757 758 return ((i & 1) != 0); // return true if odd 759 } 760 761 /** 762 * Returns the smallest value i such that c < list[i]. Caller 763 * must ensure that c is a legal value or this method will enter 764 * an infinite loop. This method performs a binary search. 765 * @param c a character in the range MIN_VALUE..MAX_VALUE 766 * inclusive 767 * @return the smallest integer i in the range 0..len-1, 768 * inclusive, such that c < list[i] 769 */ 770 private final int findCodePoint(int c) { 771 /* Examples: 772 findCodePoint(c) 773 set list[] c=0 1 3 4 7 8 774 === ============== =========== 775 [] [110000] 0 0 0 0 0 0 776 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 777 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 778 [:all:] [0, 110000] 1 1 1 1 1 1 779 */ 780 781 // Return the smallest i such that c < list[i]. Assume 782 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 783 if (c < list[0]) return 0; 784 // High runner test. c is often after the last range, so an 785 // initial check for this condition pays off. 786 if (len >= 2 && c >= list[len-2]) return len-1; 787 int lo = 0; 788 int hi = len - 1; 789 // invariant: c >= list[lo] 790 // invariant: c < list[hi] 791 for (;;) { 792 int i = (lo + hi) >>> 1; 793 if (i == lo) return hi; 794 if (c < list[i]) { 795 hi = i; 796 } else { 797 lo = i; 798 } 799 } 800 } 801 802 /** 803 * Adds all of the elements in the specified set to this set if 804 * they're not already present. This operation effectively 805 * modifies this set so that its value is the <i>union</i> of the two 806 * sets. The behavior of this operation is unspecified if the specified 807 * collection is modified while the operation is in progress. 808 * 809 * @param c set whose elements are to be added to this set. 810 * @stable ICU 2.0 811 */ 812 public UnicodeSet addAll(UnicodeSet c) { 813 add(c.list, c.len, 0); 814 strings.addAll(c.strings); 815 return this; 816 } 817 818 /** 819 * Retains only the elements in this set that are contained in the 820 * specified set. In other words, removes from this set all of 821 * its elements that are not contained in the specified set. This 822 * operation effectively modifies this set so that its value is 823 * the <i>intersection</i> of the two sets. 824 * 825 * @param c set that defines which elements this set will retain. 826 * @stable ICU 2.0 827 */ 828 public UnicodeSet retainAll(UnicodeSet c) { 829 retain(c.list, c.len, 0); 830 strings.retainAll(c.strings); 831 return this; 832 } 833 834 /** 835 * Removes from this set all of its elements that are contained in the 836 * specified set. This operation effectively modifies this 837 * set so that its value is the <i>asymmetric set difference</i> of 838 * the two sets. 839 * 840 * @param c set that defines which elements will be removed from 841 * this set. 842 * @stable ICU 2.0 843 */ 844 public UnicodeSet removeAll(UnicodeSet c) { 845 retain(c.list, c.len, 2); 846 strings.removeAll(c.strings); 847 return this; 848 } 849 850 /** 851 * Removes all of the elements from this set. This set will be 852 * empty after this call returns. 853 * @stable ICU 2.0 854 */ 855 public UnicodeSet clear() { 856 list[0] = HIGH; 857 len = 1; 858 pat = null; 859 strings.clear(); 860 return this; 861 } 862 863 /** 864 * Iteration method that returns the number of ranges contained in 865 * this set. 866 * @see #getRangeStart 867 * @see #getRangeEnd 868 * @stable ICU 2.0 869 */ 870 public int getRangeCount() { 871 return len/2; 872 } 873 874 /** 875 * Iteration method that returns the first character in the 876 * specified range of this set. 877 * @exception ArrayIndexOutOfBoundsException if index is outside 878 * the range <code>0..getRangeCount()-1</code> 879 * @see #getRangeCount 880 * @see #getRangeEnd 881 * @stable ICU 2.0 882 */ 883 public int getRangeStart(int index) { 884 return list[index*2]; 885 } 886 887 /** 888 * Iteration method that returns the last character in the 889 * specified range of this set. 890 * @exception ArrayIndexOutOfBoundsException if index is outside 891 * the range <code>0..getRangeCount()-1</code> 892 * @see #getRangeStart 893 * @see #getRangeEnd 894 * @stable ICU 2.0 895 */ 896 public int getRangeEnd(int index) { 897 return (list[index*2 + 1] - 1); 898 } 899 900 //---------------------------------------------------------------- 901 // Implementation: Pattern parsing 902 //---------------------------------------------------------------- 903 904 /** 905 * Parses the given pattern, starting at the given position. The character 906 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 907 * Parsing continues until the corresponding closing ']'. If a syntax error 908 * is encountered between the opening and closing brace, the parse fails. 909 * Upon return from a successful parse, the ParsePosition is updated to 910 * point to the character following the closing ']', and an inversion 911 * list for the parsed pattern is returned. This method 912 * calls itself recursively to parse embedded subpatterns. 913 * 914 * @param pattern the string containing the pattern to be parsed. The 915 * portion of the string from pos.getIndex(), which must be a '[', to the 916 * corresponding closing ']', is parsed. 917 * @param pos upon entry, the position at which to being parsing. The 918 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 919 * from a successful parse, pos.getIndex() is either the character after the 920 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 921 * is the last character of the pattern string. 922 * @return an inversion list for the parsed substring 923 * of <code>pattern</code> 924 * @exception java.lang.IllegalArgumentException if the parse fails. 925 */ 926 UnicodeSet applyPattern(String pattern, 927 ParsePosition pos, 928 SymbolTable symbols, 929 int options) { 930 931 // Need to build the pattern in a temporary string because 932 // _applyPattern calls add() etc., which set pat to empty. 933 boolean parsePositionWasNull = pos == null; 934 if (parsePositionWasNull) { 935 pos = new ParsePosition(0); 936 } 937 938 StringBuffer rebuiltPat = new StringBuffer(); 939 RuleCharacterIterator chars = 940 new RuleCharacterIterator(pattern, symbols, pos); 941 applyPattern(chars, symbols, rebuiltPat, options); 942 if (chars.inVariable()) { 943 syntaxError(chars, "Extra chars in variable value"); 944 } 945 pat = rebuiltPat.toString(); 946 if (parsePositionWasNull) { 947 int i = pos.getIndex(); 948 949 // Skip over trailing whitespace 950 if ((options & IGNORE_SPACE) != 0) { 951 i = Utility.skipWhitespace(pattern, i); 952 } 953 954 if (i != pattern.length()) { 955 throw new IllegalArgumentException("Parse of \"" + pattern + 956 "\" failed at " + i); 957 } 958 } 959 return this; 960 } 961 962 /** 963 * Parse the pattern from the given RuleCharacterIterator. The 964 * iterator is advanced over the parsed pattern. 965 * @param chars iterator over the pattern characters. Upon return 966 * it will be advanced to the first character after the parsed 967 * pattern, or the end of the iteration if all characters are 968 * parsed. 969 * @param symbols symbol table to use to parse and dereference 970 * variables, or null if none. 971 * @param rebuiltPat the pattern that was parsed, rebuilt or 972 * copied from the input pattern, as appropriate. 973 * @param options a bit mask of zero or more of the following: 974 * IGNORE_SPACE, CASE. 975 */ 976 void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 977 StringBuffer rebuiltPat, int options) { 978 // Syntax characters: [ ] ^ - & { } 979 980 // Recognized special forms for chars, sets: c-c s-s s&s 981 982 int opts = RuleCharacterIterator.PARSE_VARIABLES | 983 RuleCharacterIterator.PARSE_ESCAPES; 984 if ((options & IGNORE_SPACE) != 0) { 985 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 986 } 987 988 StringBuffer patBuf = new StringBuffer(), buf = null; 989 boolean usePat = false; 990 UnicodeSet scratch = null; 991 Object backup = null; 992 993 // mode: 0=before [, 1=between [...], 2=after ] 994 // lastItem: 0=none, 1=char, 2=set 995 int lastItem = 0, lastChar = 0, mode = 0; 996 char op = 0; 997 998 boolean invert = false; 999 1000 clear(); 1001 1002 while (mode != 2 && !chars.atEnd()) { 1003 if (false) { 1004 // Debugging assertion 1005 if (!((lastItem == 0 && op == 0) || 1006 (lastItem == 1 && (op == 0 || op == '-')) || 1007 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 1008 throw new IllegalArgumentException(); 1009 } 1010 } 1011 1012 int c = 0; 1013 boolean literal = false; 1014 UnicodeSet nested = null; 1015 1016 // -------- Check for property pattern 1017 1018 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 1019 int setMode = 0; 1020 if (resemblesPropertyPattern(chars, opts)) { 1021 setMode = 2; 1022 } 1023 1024 // -------- Parse '[' of opening delimiter OR nested set. 1025 // If there is a nested set, use `setMode' to define how 1026 // the set should be parsed. If the '[' is part of the 1027 // opening delimiter for this pattern, parse special 1028 // strings "[", "[^", "[-", and "[^-". Check for stand-in 1029 // characters representing a nested set in the symbol 1030 // table. 1031 1032 else { 1033 // Prepare to backup if necessary 1034 backup = chars.getPos(backup); 1035 c = chars.next(opts); 1036 literal = chars.isEscaped(); 1037 1038 if (c == '[' && !literal) { 1039 if (mode == 1) { 1040 chars.setPos(backup); // backup 1041 setMode = 1; 1042 } else { 1043 // Handle opening '[' delimiter 1044 mode = 1; 1045 patBuf.append('['); 1046 backup = chars.getPos(backup); // prepare to backup 1047 c = chars.next(opts); 1048 literal = chars.isEscaped(); 1049 if (c == '^' && !literal) { 1050 invert = true; 1051 patBuf.append('^'); 1052 backup = chars.getPos(backup); // prepare to backup 1053 c = chars.next(opts); 1054 literal = chars.isEscaped(); 1055 } 1056 // Fall through to handle special leading '-'; 1057 // otherwise restart loop for nested [], \p{}, etc. 1058 if (c == '-') { 1059 literal = true; 1060 // Fall through to handle literal '-' below 1061 } else { 1062 chars.setPos(backup); // backup 1063 continue; 1064 } 1065 } 1066 } else if (symbols != null) { 1067 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 1068 if (m != null) { 1069 try { 1070 nested = (UnicodeSet) m; 1071 setMode = 3; 1072 } catch (ClassCastException e) { 1073 syntaxError(chars, "Syntax error"); 1074 } 1075 } 1076 } 1077 } 1078 1079 // -------- Handle a nested set. This either is inline in 1080 // the pattern or represented by a stand-in that has 1081 // previously been parsed and was looked up in the symbol 1082 // table. 1083 1084 if (setMode != 0) { 1085 if (lastItem == 1) { 1086 if (op != 0) { 1087 syntaxError(chars, "Char expected after operator"); 1088 } 1089 add_unchecked(lastChar, lastChar); 1090 _appendToPat(patBuf, lastChar, false); 1091 lastItem = op = 0; 1092 } 1093 1094 if (op == '-' || op == '&') { 1095 patBuf.append(op); 1096 } 1097 1098 if (nested == null) { 1099 if (scratch == null) scratch = new UnicodeSet(); 1100 nested = scratch; 1101 } 1102 switch (setMode) { 1103 case 1: 1104 nested.applyPattern(chars, symbols, patBuf, options); 1105 break; 1106 case 2: 1107 chars.skipIgnored(opts); 1108 nested.applyPropertyPattern(chars, patBuf, symbols); 1109 break; 1110 case 3: // `nested' already parsed 1111 nested._toPattern(patBuf, false); 1112 break; 1113 } 1114 1115 usePat = true; 1116 1117 if (mode == 0) { 1118 // Entire pattern is a category; leave parse loop 1119 set(nested); 1120 mode = 2; 1121 break; 1122 } 1123 1124 switch (op) { 1125 case '-': 1126 removeAll(nested); 1127 break; 1128 case '&': 1129 retainAll(nested); 1130 break; 1131 case 0: 1132 addAll(nested); 1133 break; 1134 } 1135 1136 op = 0; 1137 lastItem = 2; 1138 1139 continue; 1140 } 1141 1142 if (mode == 0) { 1143 syntaxError(chars, "Missing '['"); 1144 } 1145 1146 // -------- Parse special (syntax) characters. If the 1147 // current character is not special, or if it is escaped, 1148 // then fall through and handle it below. 1149 1150 if (!literal) { 1151 switch (c) { 1152 case ']': 1153 if (lastItem == 1) { 1154 add_unchecked(lastChar, lastChar); 1155 _appendToPat(patBuf, lastChar, false); 1156 } 1157 // Treat final trailing '-' as a literal 1158 if (op == '-') { 1159 add_unchecked(op, op); 1160 patBuf.append(op); 1161 } else if (op == '&') { 1162 syntaxError(chars, "Trailing '&'"); 1163 } 1164 patBuf.append(']'); 1165 mode = 2; 1166 continue; 1167 case '-': 1168 if (op == 0) { 1169 if (lastItem != 0) { 1170 op = (char) c; 1171 continue; 1172 } else { 1173 // Treat final trailing '-' as a literal 1174 add_unchecked(c, c); 1175 c = chars.next(opts); 1176 literal = chars.isEscaped(); 1177 if (c == ']' && !literal) { 1178 patBuf.append("-]"); 1179 mode = 2; 1180 continue; 1181 } 1182 } 1183 } 1184 syntaxError(chars, "'-' not after char or set"); 1185 break; 1186 case '&': 1187 if (lastItem == 2 && op == 0) { 1188 op = (char) c; 1189 continue; 1190 } 1191 syntaxError(chars, "'&' not after set"); 1192 break; 1193 case '^': 1194 syntaxError(chars, "'^' not after '['"); 1195 break; 1196 case '{': 1197 if (op != 0) { 1198 syntaxError(chars, "Missing operand after operator"); 1199 } 1200 if (lastItem == 1) { 1201 add_unchecked(lastChar, lastChar); 1202 _appendToPat(patBuf, lastChar, false); 1203 } 1204 lastItem = 0; 1205 if (buf == null) { 1206 buf = new StringBuffer(); 1207 } else { 1208 buf.setLength(0); 1209 } 1210 boolean ok = false; 1211 while (!chars.atEnd()) { 1212 c = chars.next(opts); 1213 literal = chars.isEscaped(); 1214 if (c == '}' && !literal) { 1215 ok = true; 1216 break; 1217 } 1218 UTF16.append(buf, c); 1219 } 1220 if (buf.length() < 1 || !ok) { 1221 syntaxError(chars, "Invalid multicharacter string"); 1222 } 1223 // We have new string. Add it to set and continue; 1224 // we don't need to drop through to the further 1225 // processing 1226 add(buf.toString()); 1227 patBuf.append('{'); 1228 _appendToPat(patBuf, buf.toString(), false); 1229 patBuf.append('}'); 1230 continue; 1231 case SymbolTable.SYMBOL_REF: 1232 // symbols nosymbols 1233 // [a-$] error error (ambiguous) 1234 // [a$] anchor anchor 1235 // [a-$x] var "x"* literal '$' 1236 // [a-$.] error literal '$' 1237 // *We won't get here in the case of var "x" 1238 backup = chars.getPos(backup); 1239 c = chars.next(opts); 1240 literal = chars.isEscaped(); 1241 boolean anchor = (c == ']' && !literal); 1242 if (symbols == null && !anchor) { 1243 c = SymbolTable.SYMBOL_REF; 1244 chars.setPos(backup); 1245 break; // literal '$' 1246 } 1247 if (anchor && op == 0) { 1248 if (lastItem == 1) { 1249 add_unchecked(lastChar, lastChar); 1250 _appendToPat(patBuf, lastChar, false); 1251 } 1252 add_unchecked(UnicodeMatcher.ETHER); 1253 usePat = true; 1254 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 1255 mode = 2; 1256 continue; 1257 } 1258 syntaxError(chars, "Unquoted '$'"); 1259 break; 1260 default: 1261 break; 1262 } 1263 } 1264 1265 // -------- Parse literal characters. This includes both 1266 // escaped chars ("\u4E01") and non-syntax characters 1267 // ("a"). 1268 1269 switch (lastItem) { 1270 case 0: 1271 lastItem = 1; 1272 lastChar = c; 1273 break; 1274 case 1: 1275 if (op == '-') { 1276 if (lastChar >= c) { 1277 // Don't allow redundant (a-a) or empty (b-a) ranges; 1278 // these are most likely typos. 1279 syntaxError(chars, "Invalid range"); 1280 } 1281 add_unchecked(lastChar, c); 1282 _appendToPat(patBuf, lastChar, false); 1283 patBuf.append(op); 1284 _appendToPat(patBuf, c, false); 1285 lastItem = op = 0; 1286 } else { 1287 add_unchecked(lastChar, lastChar); 1288 _appendToPat(patBuf, lastChar, false); 1289 lastChar = c; 1290 } 1291 break; 1292 case 2: 1293 if (op != 0) { 1294 syntaxError(chars, "Set expected after operator"); 1295 } 1296 lastChar = c; 1297 lastItem = 1; 1298 break; 1299 } 1300 } 1301 1302 if (mode != 2) { 1303 syntaxError(chars, "Missing ']'"); 1304 } 1305 1306 chars.skipIgnored(opts); 1307 1308 if (invert) { 1309 complement(); 1310 } 1311 1312 // Use the rebuilt pattern (pat) only if necessary. Prefer the 1313 // generated pattern. 1314 if (usePat) { 1315 rebuiltPat.append(patBuf.toString()); 1316 } else { 1317 _generatePattern(rebuiltPat, false, true); 1318 } 1319 } 1320 1321 private static void syntaxError(RuleCharacterIterator chars, String msg) { 1322 throw new IllegalArgumentException("Error: " + msg + " at \"" + 1323 Utility.escape(chars.toString()) + 1324 '"'); 1325 } 1326 1327 //---------------------------------------------------------------- 1328 // Implementation: Utility methods 1329 //---------------------------------------------------------------- 1330 1331 private void ensureCapacity(int newLen) { 1332 if (newLen <= list.length) return; 1333 int[] temp = new int[newLen + GROW_EXTRA]; 1334 System.arraycopy(list, 0, temp, 0, len); 1335 list = temp; 1336 } 1337 1338 private void ensureBufferCapacity(int newLen) { 1339 if (buffer != null && newLen <= buffer.length) return; 1340 buffer = new int[newLen + GROW_EXTRA]; 1341 } 1342 1343 /** 1344 * Assumes start <= end. 1345 */ 1346 private int[] range(int start, int end) { 1347 if (rangeList == null) { 1348 rangeList = new int[] { start, end+1, HIGH }; 1349 } else { 1350 rangeList[0] = start; 1351 rangeList[1] = end+1; 1352 } 1353 return rangeList; 1354 } 1355 1356 //---------------------------------------------------------------- 1357 // Implementation: Fundamental operations 1358 //---------------------------------------------------------------- 1359 1360 // polarity = 0, 3 is normal: x xor y 1361 // polarity = 1, 2: x xor ~y == x === y 1362 1363 private UnicodeSet xor(int[] other, int otherLen, int polarity) { 1364 ensureBufferCapacity(len + otherLen); 1365 int i = 0, j = 0, k = 0; 1366 int a = list[i++]; 1367 int b; 1368 if (polarity == 1 || polarity == 2) { 1369 b = LOW; 1370 if (other[j] == LOW) { // skip base if already LOW 1371 ++j; 1372 b = other[j]; 1373 } 1374 } else { 1375 b = other[j++]; 1376 } 1377 // simplest of all the routines 1378 // sort the values, discarding identicals! 1379 while (true) { 1380 if (a < b) { 1381 buffer[k++] = a; 1382 a = list[i++]; 1383 } else if (b < a) { 1384 buffer[k++] = b; 1385 b = other[j++]; 1386 } else if (a != HIGH) { // at this point, a == b 1387 // discard both values! 1388 a = list[i++]; 1389 b = other[j++]; 1390 } else { // DONE! 1391 buffer[k++] = HIGH; 1392 len = k; 1393 break; 1394 } 1395 } 1396 // swap list and buffer 1397 int[] temp = list; 1398 list = buffer; 1399 buffer = temp; 1400 pat = null; 1401 return this; 1402 } 1403 1404 // polarity = 0 is normal: x union y 1405 // polarity = 2: x union ~y 1406 // polarity = 1: ~x union y 1407 // polarity = 3: ~x union ~y 1408 1409 private UnicodeSet add(int[] other, int otherLen, int polarity) { 1410 ensureBufferCapacity(len + otherLen); 1411 int i = 0, j = 0, k = 0; 1412 int a = list[i++]; 1413 int b = other[j++]; 1414 // change from xor is that we have to check overlapping pairs 1415 // polarity bit 1 means a is second, bit 2 means b is. 1416 main: 1417 while (true) { 1418 switch (polarity) { 1419 case 0: // both first; take lower if unequal 1420 if (a < b) { // take a 1421 // Back up over overlapping ranges in buffer[] 1422 if (k > 0 && a <= buffer[k-1]) { 1423 // Pick latter end value in buffer[] vs. list[] 1424 a = max(list[i], buffer[--k]); 1425 } else { 1426 // No overlap 1427 buffer[k++] = a; 1428 a = list[i]; 1429 } 1430 i++; // Common if/else code factored out 1431 polarity ^= 1; 1432 } else if (b < a) { // take b 1433 if (k > 0 && b <= buffer[k-1]) { 1434 b = max(other[j], buffer[--k]); 1435 } else { 1436 buffer[k++] = b; 1437 b = other[j]; 1438 } 1439 j++; 1440 polarity ^= 2; 1441 } else { // a == b, take a, drop b 1442 if (a == HIGH) break main; 1443 // This is symmetrical; it doesn't matter if 1444 // we backtrack with a or b. - liu 1445 if (k > 0 && a <= buffer[k-1]) { 1446 a = max(list[i], buffer[--k]); 1447 } else { 1448 // No overlap 1449 buffer[k++] = a; 1450 a = list[i]; 1451 } 1452 i++; 1453 polarity ^= 1; 1454 b = other[j++]; polarity ^= 2; 1455 } 1456 break; 1457 case 3: // both second; take higher if unequal, and drop other 1458 if (b <= a) { // take a 1459 if (a == HIGH) break main; 1460 buffer[k++] = a; 1461 } else { // take b 1462 if (b == HIGH) break main; 1463 buffer[k++] = b; 1464 } 1465 a = list[i++]; polarity ^= 1; // factored common code 1466 b = other[j++]; polarity ^= 2; 1467 break; 1468 case 1: // a second, b first; if b < a, overlap 1469 if (a < b) { // no overlap, take a 1470 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1471 } else if (b < a) { // OVERLAP, drop b 1472 b = other[j++]; polarity ^= 2; 1473 } else { // a == b, drop both! 1474 if (a == HIGH) break main; 1475 a = list[i++]; polarity ^= 1; 1476 b = other[j++]; polarity ^= 2; 1477 } 1478 break; 1479 case 2: // a first, b second; if a < b, overlap 1480 if (b < a) { // no overlap, take b 1481 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1482 } else if (a < b) { // OVERLAP, drop a 1483 a = list[i++]; polarity ^= 1; 1484 } else { // a == b, drop both! 1485 if (a == HIGH) break main; 1486 a = list[i++]; polarity ^= 1; 1487 b = other[j++]; polarity ^= 2; 1488 } 1489 break; 1490 } 1491 } 1492 buffer[k++] = HIGH; // terminate 1493 len = k; 1494 // swap list and buffer 1495 int[] temp = list; 1496 list = buffer; 1497 buffer = temp; 1498 pat = null; 1499 return this; 1500 } 1501 1502 // polarity = 0 is normal: x intersect y 1503 // polarity = 2: x intersect ~y == set-minus 1504 // polarity = 1: ~x intersect y 1505 // polarity = 3: ~x intersect ~y 1506 1507 private UnicodeSet retain(int[] other, int otherLen, int polarity) { 1508 ensureBufferCapacity(len + otherLen); 1509 int i = 0, j = 0, k = 0; 1510 int a = list[i++]; 1511 int b = other[j++]; 1512 // change from xor is that we have to check overlapping pairs 1513 // polarity bit 1 means a is second, bit 2 means b is. 1514 main: 1515 while (true) { 1516 switch (polarity) { 1517 case 0: // both first; drop the smaller 1518 if (a < b) { // drop a 1519 a = list[i++]; polarity ^= 1; 1520 } else if (b < a) { // drop b 1521 b = other[j++]; polarity ^= 2; 1522 } else { // a == b, take one, drop other 1523 if (a == HIGH) break main; 1524 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1525 b = other[j++]; polarity ^= 2; 1526 } 1527 break; 1528 case 3: // both second; take lower if unequal 1529 if (a < b) { // take a 1530 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1531 } else if (b < a) { // take b 1532 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1533 } else { // a == b, take one, drop other 1534 if (a == HIGH) break main; 1535 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1536 b = other[j++]; polarity ^= 2; 1537 } 1538 break; 1539 case 1: // a second, b first; 1540 if (a < b) { // NO OVERLAP, drop a 1541 a = list[i++]; polarity ^= 1; 1542 } else if (b < a) { // OVERLAP, take b 1543 buffer[k++] = b; b = other[j++]; polarity ^= 2; 1544 } else { // a == b, drop both! 1545 if (a == HIGH) break main; 1546 a = list[i++]; polarity ^= 1; 1547 b = other[j++]; polarity ^= 2; 1548 } 1549 break; 1550 case 2: // a first, b second; if a < b, overlap 1551 if (b < a) { // no overlap, drop b 1552 b = other[j++]; polarity ^= 2; 1553 } else if (a < b) { // OVERLAP, take a 1554 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1555 } else { // a == b, drop both! 1556 if (a == HIGH) break main; 1557 a = list[i++]; polarity ^= 1; 1558 b = other[j++]; polarity ^= 2; 1559 } 1560 break; 1561 } 1562 } 1563 buffer[k++] = HIGH; // terminate 1564 len = k; 1565 // swap list and buffer 1566 int[] temp = list; 1567 list = buffer; 1568 buffer = temp; 1569 pat = null; 1570 return this; 1571 } 1572 1573 private static final int max(int a, int b) { 1574 return (a > b) ? a : b; 1575 } 1576 1577 //---------------------------------------------------------------- 1578 // Generic filter-based scanning code 1579 //---------------------------------------------------------------- 1580 1581 private static interface Filter { 1582 boolean contains(int codePoint); 1583 } 1584 1585 // VersionInfo for unassigned characters 1586 static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 1587 1588 private static class VersionFilter implements Filter { 1589 VersionInfo version; 1590 1591 VersionFilter(VersionInfo version) { this.version = version; } 1592 1593 public boolean contains(int ch) { 1594 VersionInfo v = UCharacter.getAge(ch); 1595 // Reference comparison ok; VersionInfo caches and reuses 1596 // unique objects. 1597 return v != NO_VERSION && 1598 v.compareTo(version) <= 0; 1599 } 1600 } 1601 1602 private static synchronized UnicodeSet getInclusions(int src) { 1603 if (INCLUSIONS == null) { 1604 INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT]; 1605 } 1606 if(INCLUSIONS[src] == null) { 1607 UnicodeSet incl = new UnicodeSet(); 1608 switch(src) { 1609 case UCharacterProperty.SRC_PROPSVEC: 1610 UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl); 1611 break; 1612 default: 1613 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); 1614 } 1615 INCLUSIONS[src] = incl; 1616 } 1617 return INCLUSIONS[src]; 1618 } 1619 1620 /** 1621 * Generic filter-based scanning code for UCD property UnicodeSets. 1622 */ 1623 private UnicodeSet applyFilter(Filter filter, int src) { 1624 // Walk through all Unicode characters, noting the start 1625 // and end of each range for which filter.contain(c) is 1626 // true. Add each range to a set. 1627 // 1628 // To improve performance, use the INCLUSIONS set, which 1629 // encodes information about character ranges that are known 1630 // to have identical properties, such as the CJK Ideographs 1631 // from U+4E00 to U+9FA5. INCLUSIONS contains all characters 1632 // except the first characters of such ranges. 1633 // 1634 // TODO Where possible, instead of scanning over code points, 1635 // use internal property data to initialize UnicodeSets for 1636 // those properties. Scanning code points is slow. 1637 1638 clear(); 1639 1640 int startHasProperty = -1; 1641 UnicodeSet inclusions = getInclusions(src); 1642 int limitRange = inclusions.getRangeCount(); 1643 1644 for (int j=0; j<limitRange; ++j) { 1645 // get current range 1646 int start = inclusions.getRangeStart(j); 1647 int end = inclusions.getRangeEnd(j); 1648 1649 // for all the code points in the range, process 1650 for (int ch = start; ch <= end; ++ch) { 1651 // only add to the unicodeset on inflection points -- 1652 // where the hasProperty value changes to false 1653 if (filter.contains(ch)) { 1654 if (startHasProperty < 0) { 1655 startHasProperty = ch; 1656 } 1657 } else if (startHasProperty >= 0) { 1658 add_unchecked(startHasProperty, ch-1); 1659 startHasProperty = -1; 1660 } 1661 } 1662 } 1663 if (startHasProperty >= 0) { 1664 add_unchecked(startHasProperty, 0x10FFFF); 1665 } 1666 1667 return this; 1668 } 1669 1670 /** 1671 * Remove leading and trailing rule white space and compress 1672 * internal rule white space to a single space character. 1673 * 1674 * @see UCharacterProperty#isRuleWhiteSpace 1675 */ 1676 private static String mungeCharName(String source) { 1677 StringBuffer buf = new StringBuffer(); 1678 for (int i=0; i<source.length(); ) { 1679 int ch = UTF16.charAt(source, i); 1680 i += UTF16.getCharCount(ch); 1681 if (UCharacterProperty.isRuleWhiteSpace(ch)) { 1682 if (buf.length() == 0 || 1683 buf.charAt(buf.length() - 1) == ' ') { 1684 continue; 1685 } 1686 ch = ' '; // convert to ' ' 1687 } 1688 UTF16.append(buf, ch); 1689 } 1690 if (buf.length() != 0 && 1691 buf.charAt(buf.length() - 1) == ' ') { 1692 buf.setLength(buf.length() - 1); 1693 } 1694 return buf.toString(); 1695 } 1696 1697 /** 1698 * Modifies this set to contain those code points which have the 1699 * given value for the given property. Prior contents of this 1700 * set are lost. 1701 * @param propertyAlias the property alias 1702 * @param valueAlias the value alias 1703 * @param symbols if not null, then symbols are first called to see if a property 1704 * is available. If true, then everything else is skipped. 1705 * @return this set 1706 * @stable ICU 3.2 1707 */ 1708 public UnicodeSet applyPropertyAlias(String propertyAlias, 1709 String valueAlias, SymbolTable symbols) { 1710 if (valueAlias.length() > 0) { 1711 if (propertyAlias.equals("Age")) { 1712 // Must munge name, since 1713 // VersionInfo.getInstance() does not do 1714 // 'loose' matching. 1715 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 1716 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); 1717 return this; 1718 } 1719 } 1720 throw new IllegalArgumentException("Unsupported property: " + propertyAlias); 1721 } 1722 1723 /** 1724 * Return true if the given iterator appears to point at a 1725 * property pattern. Regardless of the result, return with the 1726 * iterator unchanged. 1727 * @param chars iterator over the pattern characters. Upon return 1728 * it will be unchanged. 1729 * @param iterOpts RuleCharacterIterator options 1730 */ 1731 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 1732 int iterOpts) { 1733 boolean result = false; 1734 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 1735 Object pos = chars.getPos(null); 1736 int c = chars.next(iterOpts); 1737 if (c == '[' || c == '\\') { 1738 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 1739 result = (c == '[') ? (d == ':') : 1740 (d == 'N' || d == 'p' || d == 'P'); 1741 } 1742 chars.setPos(pos); 1743 return result; 1744 } 1745 1746 /** 1747 * Parse the given property pattern at the given parse position. 1748 * @param symbols TODO 1749 */ 1750 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 1751 int pos = ppos.getIndex(); 1752 1753 // On entry, ppos should point to one of the following locations: 1754 1755 // Minimum length is 5 characters, e.g. \p{L} 1756 if ((pos+5) > pattern.length()) { 1757 return null; 1758 } 1759 1760 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1761 boolean isName = false; // true for \N{pat}, o/w false 1762 boolean invert = false; 1763 1764 // Look for an opening [:, [:^, \p, or \P 1765 if (pattern.regionMatches(pos, "[:", 0, 2)) { 1766 posix = true; 1767 pos = Utility.skipWhitespace(pattern, pos+2); 1768 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 1769 ++pos; 1770 invert = true; 1771 } 1772 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 1773 pattern.regionMatches(pos, "\\N", 0, 2)) { 1774 char c = pattern.charAt(pos+1); 1775 invert = (c == 'P'); 1776 isName = (c == 'N'); 1777 pos = Utility.skipWhitespace(pattern, pos+2); 1778 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 1779 // Syntax error; "\p" or "\P" not followed by "{" 1780 return null; 1781 } 1782 } else { 1783 // Open delimiter not seen 1784 return null; 1785 } 1786 1787 // Look for the matching close delimiter, either :] or } 1788 int close = pattern.indexOf(posix ? ":]" : "}", pos); 1789 if (close < 0) { 1790 // Syntax error; close delimiter missing 1791 return null; 1792 } 1793 1794 // Look for an '=' sign. If this is present, we will parse a 1795 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1796 // pattern. 1797 int equals = pattern.indexOf('=', pos); 1798 String propName, valueName; 1799 if (equals >= 0 && equals < close && !isName) { 1800 // Equals seen; parse medium/long pattern 1801 propName = pattern.substring(pos, equals); 1802 valueName = pattern.substring(equals+1, close); 1803 } 1804 1805 else { 1806 // Handle case where no '=' is seen, and \N{} 1807 propName = pattern.substring(pos, close); 1808 valueName = ""; 1809 1810 // Handle \N{name} 1811 if (isName) { 1812 // This is a little inefficient since it means we have to 1813 // parse "na" back to UProperty.NAME even though we already 1814 // know it's UProperty.NAME. If we refactor the API to 1815 // support args of (int, String) then we can remove 1816 // "na" and make this a little more efficient. 1817 valueName = propName; 1818 propName = "na"; 1819 } 1820 } 1821 1822 applyPropertyAlias(propName, valueName, symbols); 1823 1824 if (invert) { 1825 complement(); 1826 } 1827 1828 // Move to the limit position after the close delimiter 1829 ppos.setIndex(close + (posix ? 2 : 1)); 1830 1831 return this; 1832 } 1833 1834 /** 1835 * Parse a property pattern. 1836 * @param chars iterator over the pattern characters. Upon return 1837 * it will be advanced to the first character after the parsed 1838 * pattern, or the end of the iteration if all characters are 1839 * parsed. 1840 * @param rebuiltPat the pattern that was parsed, rebuilt or 1841 * copied from the input pattern, as appropriate. 1842 * @param symbols TODO 1843 */ 1844 private void applyPropertyPattern(RuleCharacterIterator chars, 1845 StringBuffer rebuiltPat, SymbolTable symbols) { 1846 String patStr = chars.lookahead(); 1847 ParsePosition pos = new ParsePosition(0); 1848 applyPropertyPattern(patStr, pos, symbols); 1849 if (pos.getIndex() == 0) { 1850 syntaxError(chars, "Invalid property pattern"); 1851 } 1852 chars.jumpahead(pos.getIndex()); 1853 rebuiltPat.append(patStr.substring(0, pos.getIndex())); 1854 } 1855 1856 //---------------------------------------------------------------- 1857 // Case folding API 1858 //---------------------------------------------------------------- 1859 1860 /** 1861 * Bitmask for constructor and applyPattern() indicating that 1862 * white space should be ignored. If set, ignore characters for 1863 * which UCharacterProperty.isRuleWhiteSpace() returns true, 1864 * unless they are quoted or escaped. This may be ORed together 1865 * with other selectors. 1866 * @stable ICU 3.8 1867 */ 1868 public static final int IGNORE_SPACE = 1; 1869 1870 } 1871