1 /*
2 * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 /*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37 package sun.text.normalizer;
38
39 import java.text.ParsePosition;
40 import java.util.Iterator;
41 import java.util.TreeSet;
42
43 /**
44 * A mutable set of Unicode characters and multicharacter strings. Objects of this class
45 * represent <em>character classes</em> used in regular expressions.
46 * A character specifies a subset of Unicode code points. Legal
47 * code points are U+0000 to U+10FFFF, inclusive.
48 *
49 * <p>The UnicodeSet class is not designed to be subclassed.
50 *
51 * <p><code>UnicodeSet</code> supports two APIs. The first is the
52 * <em>operand</em> API that allows the caller to modify the value of
53 * a <code>UnicodeSet</code> object. It conforms to Java 2's
54 * <code>java.util.Set</code> interface, although
55 * <code>UnicodeSet</code> does not actually implement that
56 * interface. All methods of <code>Set</code> are supported, with the
57 * modification that they take a character range or single character
58 * instead of an <code>Object</code>, and they take a
59 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
60 * operand API may be thought of in terms of boolean logic: a boolean
61 * OR is implemented by <code>add</code>, a boolean AND is implemented
62 * by <code>retain</code>, a boolean XOR is implemented by
63 * <code>complement</code> taking an argument, and a boolean NOT is
64 * implemented by <code>complement</code> with no argument. In terms
65 * of traditional set theory function names, <code>add</code> is a
66 * union, <code>retain</code> is an intersection, <code>remove</code>
67 * is an asymmetric difference, and <code>complement</code> with no
101 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
102 * point order</td>
103 * </tr>
104 * <tr>
105 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
106 * <td valign="top">The character U+4E01</td>
107 * </tr>
108 * <tr>
109 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
110 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
111 * "ac"</td>
112 * </tr>
113 * <tr>
114 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
115 * <td valign="top">All characters in the general category Uppercase Letter</td>
116 * </tr>
117 * </table>
118 * </blockquote>
119 *
120 * Any character may be preceded by a backslash in order to remove any special
121 * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
122 * ignored, unless they are escaped.
123 *
124 * <p>Property patterns specify a set of characters having a certain
125 * property as defined by the Unicode standard. Both the POSIX-like
126 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
127 * complete list of supported property patterns, see the User's Guide
128 * for UnicodeSet at
129 * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
130 * http://www.icu-project.org/userguide/unicodeSet.html</a>.
131 * Actual determination of property data is defined by the underlying
132 * Unicode database as implemented by UCharacter.
133 *
134 * <p>Patterns specify individual characters, ranges of characters, and
135 * Unicode property sets. When elements are concatenated, they
136 * specify their union. To complement a set, place a '^' immediately
137 * after the opening '['. Property patterns are inverted by modifying
138 * their delimiters; "[:^foo]" and "\P{foo}". In any other location,
139 * '^' has no special meaning.
140 *
141 * <p>Ranges are indicated by placing two a '-' between two
250 * <td valign="top"></td>
251 * <td valign="top">one or more instances of <code>a</code><br>
252 * </td>
253 * </tr>
254 * <tr>
255 * <td nowrap valign="top"><code>a | b</code></td>
256 * <td valign="top"></td>
257 * <td valign="top">either <code>a</code> or <code>b</code><br>
258 * </td>
259 * </tr>
260 * <tr>
261 * <td nowrap valign="top"><code>'a'</code></td>
262 * <td valign="top"></td>
263 * <td valign="top">the literal string between the quotes </td>
264 * </tr>
265 * </table>
266 * </td>
267 * </tr>
268 * </table>
269 * </blockquote>
270 * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
271 *
272 * @author Alan Liu
273 * @stable ICU 2.0
274 * @see UnicodeSetIterator
275 */
276 @SuppressWarnings("deprecation")
277 public class UnicodeSet implements UnicodeMatcher {
278
279 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
280 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
281 // 110000 for codepoints
282
283 /**
284 * Minimum value that can be stored in a UnicodeSet.
285 * @stable ICU 2.0
286 */
287 public static final int MIN_VALUE = LOW;
288
289 /**
290 * Maximum value that can be stored in a UnicodeSet.
291 * @stable ICU 2.0
292 */
293 public static final int MAX_VALUE = HIGH - 1;
294
295 private int len; // length used; list may be longer to minimize reallocs
296 private int[] list; // MUST be terminated with HIGH
297 private int[] rangeList; // internal buffer
298 private int[] buffer; // internal buffer
299
300 // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
301 // is not private so that UnicodeSetIterator can get access
302 TreeSet<String> strings = new TreeSet<>();
303
304 /**
305 * The pattern representation of this set. This may not be the
306 * most economical pattern. It is the pattern supplied to
307 * applyPattern(), with variables substituted and whitespace
308 * removed. For sets constructed without applyPattern(), or
309 * modified using the non-pattern API, this string will be null,
310 * indicating that toPattern() must generate a pattern
311 * representation from the inversion list.
312 */
313 private String pat = null;
314
315 private static final int START_EXTRA = 16; // initial storage. Must be >= 0
316 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
317
318 /**
319 * A set of all characters _except_ the second through last characters of
320 * certain ranges. These ranges are ranges of characters whose
321 * properties are all exactly alike, e.g. CJK Ideographs from
322 * U+4E00 to U+9FA5.
323 */
324 private static UnicodeSet INCLUSIONS[] = null;
325
326 //----------------------------------------------------------------
327 // Public API
328 //----------------------------------------------------------------
329
330 /**
331 * Constructs an empty set.
332 * @stable ICU 2.0
333 */
334 public UnicodeSet() {
335 list = new int[1 + START_EXTRA];
336 list[len++] = HIGH;
337 }
338
339 /**
340 * Constructs a set containing the given range.
341 * If {@code end > start} then an empty set is created.
342 *
343 * @param start first character, inclusive, of range
344 * @param end last character, inclusive, of range
345 * @stable ICU 2.0
346 */
347 public UnicodeSet(int start, int end) {
348 this();
349 complement(start, end);
350 }
351
352 /**
353 * Constructs a set from the given pattern. See the class description
354 * for the syntax of the pattern language. Whitespace is ignored.
355 * @param pattern a string specifying what characters are in the set
356 * @exception java.lang.IllegalArgumentException if the pattern contains
357 * a syntax error.
358 * @stable ICU 2.0
359 */
360 public UnicodeSet(String pattern) {
361 this();
362 applyPattern(pattern, null, null, IGNORE_SPACE);
363 }
364
365 /**
366 * Make this object represent the same set as <code>other</code>.
367 * @param other a <code>UnicodeSet</code> whose value will be
368 * copied to this object
369 * @stable ICU 2.0
370 */
371 @SuppressWarnings("unchecked") // Casting result of clone of a collection
372 public UnicodeSet set(UnicodeSet other) {
373 list = other.list.clone();
374 len = other.len;
375 pat = other.pat;
376 strings = (TreeSet)other.strings.clone();
377 return this;
378 }
379
380 /**
381 * Modifies this set to represent the set specified by the given pattern.
382 * See the class description for the syntax of the pattern language.
383 * Whitespace is ignored.
384 * @param pattern a string specifying what characters are in the set
385 * @exception java.lang.IllegalArgumentException if the pattern
386 * contains a syntax error.
387 * @stable ICU 2.0
388 */
389 public final UnicodeSet applyPattern(String pattern) {
390 return applyPattern(pattern, null, null, IGNORE_SPACE);
391 }
392
393 /**
394 * Append the <code>toPattern()</code> representation of a
395 * string to the given <code>StringBuffer</code>.
396 */
397 private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
398 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
399 _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
400 }
401 }
402
403 /**
404 * Append the <code>toPattern()</code> representation of a
405 * character to the given <code>StringBuffer</code>.
406 */
407 private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
408 if (escapeUnprintable && Utility.isUnprintable(c)) {
409 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
410 // unprintable
411 if (Utility.escapeUnprintable(buf, c)) {
412 return;
413 }
414 }
415 // Okay to let ':' pass through
416 switch (c) {
417 case '[': // SET_OPEN:
418 case ']': // SET_CLOSE:
419 case '-': // HYPHEN:
420 case '^': // COMPLEMENT:
421 case '&': // INTERSECTION:
422 case '\\': //BACKSLASH:
423 case '{':
424 case '}':
425 case '$':
426 case ':':
427 buf.append('\\');
428 break;
429 default:
430 // Escape whitespace
431 if (UCharacterProperty.isRuleWhiteSpace(c)) {
432 buf.append('\\');
433 }
434 break;
435 }
436 UTF16.append(buf, c);
437 }
438
439 /**
440 * Append a string representation of this set to result. This will be
441 * a cleaned version of the string passed to applyPattern(), if there
442 * is one. Otherwise it will be generated.
443 */
444 private StringBuffer _toPattern(StringBuffer result,
445 boolean escapeUnprintable) {
446 if (pat != null) {
447 int i;
448 int backslashCount = 0;
449 for (i=0; i<pat.length(); ) {
450 int c = UTF16.charAt(pat, i);
451 i += UTF16.getCharCount(c);
452 if (escapeUnprintable && Utility.isUnprintable(c)) {
453 // If the unprintable character is preceded by an odd
454 // number of backslashes, then it has been escaped.
455 // Before unescaping it, we delete the final
456 // backslash.
457 if ((backslashCount % 2) == 1) {
458 result.setLength(result.length() - 1);
459 }
460 Utility.escapeUnprintable(result, c);
461 backslashCount = 0;
462 } else {
463 UTF16.append(result, c);
464 if (c == '\\') {
465 ++backslashCount;
466 } else {
467 backslashCount = 0;
468 }
469 }
470 }
471 return result;
472 }
473
474 return _generatePattern(result, escapeUnprintable, true);
475 }
476
477 /**
478 * Generate and append a string representation of this set to result.
479 * This does not use this.pat, the cleaned up copy of the string
480 * passed to applyPattern().
481 * @param includeStrings if false, doesn't include the strings.
482 * @stable ICU 3.8
483 */
484 public StringBuffer _generatePattern(StringBuffer result,
485 boolean escapeUnprintable, boolean includeStrings) {
486 result.append('[');
487
488 int count = getRangeCount();
489
490 // If the set contains at least 2 intervals and includes both
491 // MIN_VALUE and MAX_VALUE, then the inverse representation will
492 // be more economical.
493 if (count > 1 &&
494 getRangeStart(0) == MIN_VALUE &&
495 getRangeEnd(count-1) == MAX_VALUE) {
496
497 // Emit the inverse
498 result.append('^');
499
500 for (int i = 1; i < count; ++i) {
501 int start = getRangeEnd(i-1)+1;
502 int end = getRangeStart(i)-1;
503 _appendToPat(result, start, escapeUnprintable);
504 if (start != end) {
505 if ((start+1) != end) {
506 result.append('-');
507 }
508 _appendToPat(result, end, escapeUnprintable);
509 }
510 }
511 }
512
513 // Default; emit the ranges as pairs
514 else {
515 for (int i = 0; i < count; ++i) {
516 int start = getRangeStart(i);
517 int end = getRangeEnd(i);
518 _appendToPat(result, start, escapeUnprintable);
519 if (start != end) {
520 if ((start+1) != end) {
521 result.append('-');
522 }
523 _appendToPat(result, end, escapeUnprintable);
524 }
525 }
526 }
527
528 if (includeStrings && strings.size() > 0) {
529 Iterator<String> it = strings.iterator();
530 while (it.hasNext()) {
531 result.append('{');
532 _appendToPat(result, it.next(), escapeUnprintable);
533 result.append('}');
534 }
535 }
536 return result.append(']');
537 }
538
539 // for internal use, after checkFrozen has been called
540 private UnicodeSet add_unchecked(int start, int end) {
541 if (start < MIN_VALUE || start > MAX_VALUE) {
542 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
543 }
544 if (end < MIN_VALUE || end > MAX_VALUE) {
545 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
546 }
547 if (start < end) {
548 add(range(start, end), 2, 0);
549 } else if (start == end) {
550 add(start);
551 }
552 return this;
553 }
554
555 /**
556 * Adds the specified character to this set if it is not already
557 * present. If this set already contains the specified character,
558 * the call leaves this set unchanged.
559 * @stable ICU 2.0
560 */
561 public final UnicodeSet add(int c) {
562 return add_unchecked(c);
563 }
564
565 // for internal use only, after checkFrozen has been called
566 private final UnicodeSet add_unchecked(int c) {
567 if (c < MIN_VALUE || c > MAX_VALUE) {
568 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
569 }
570
571 // find smallest i such that c < list[i]
572 // if odd, then it is IN the set
573 // if even, then it is OUT of the set
574 int i = findCodePoint(c);
575
576 // already in set?
577 if ((i & 1) != 0) return this;
578
579 // HIGH is 0x110000
580 // assert(list[len-1] == HIGH);
581
626 // ^
627 // list[i]
628
629 // Don't use ensureCapacity() to save on copying.
630 // NOTE: This has no measurable impact on performance,
631 // but it might help in some usage patterns.
632 if (len+2 > list.length) {
633 int[] temp = new int[len + 2 + GROW_EXTRA];
634 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
635 System.arraycopy(list, i, temp, i+2, len-i);
636 list = temp;
637 } else {
638 System.arraycopy(list, i, list, i+2, len-i);
639 }
640
641 list[i] = c;
642 list[i+1] = c+1;
643 len += 2;
644 }
645
646 pat = null;
647 return this;
648 }
649
650 /**
651 * Adds the specified multicharacter to this set if it is not already
652 * present. If this set already contains the multicharacter,
653 * the call leaves this set unchanged.
654 * Thus {@code "ch" => {"ch"}}
655 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
656 * @param s the source string
657 * @return this object, for chaining
658 * @stable ICU 2.0
659 */
660 public final UnicodeSet add(String s) {
661 int cp = getSingleCP(s);
662 if (cp < 0) {
663 strings.add(s);
664 pat = null;
665 } else {
666 add_unchecked(cp, cp);
667 }
668 return this;
669 }
670
671 /**
672 * @return a code point IF the string consists of a single one.
673 * otherwise returns -1.
674 * @param string to test
675 */
676 private static int getSingleCP(String s) {
677 if (s.length() < 1) {
678 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
679 }
680 if (s.length() > 2) return -1;
681 if (s.length() == 1) return s.charAt(0);
682
683 // at this point, len = 2
684 int cp = UTF16.charAt(s, 0);
685 if (cp > 0xFFFF) { // is surrogate pair
686 return cp;
687 }
688 return -1;
689 }
690
691 /**
692 * Complements the specified range in this set. Any character in
693 * the range will be removed if it is in this set, or will be
694 * added if it is not in this set. If {@code end > start}
695 * then an empty range is complemented, leaving the set unchanged.
696 *
697 * @param start first character, inclusive, of range to be removed
698 * from this set.
699 * @param end last character, inclusive, of range to be removed
700 * from this set.
701 * @stable ICU 2.0
702 */
703 public UnicodeSet complement(int start, int end) {
704 if (start < MIN_VALUE || start > MAX_VALUE) {
705 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
706 }
707 if (end < MIN_VALUE || end > MAX_VALUE) {
708 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
709 }
710 if (start <= end) {
711 xor(range(start, end), 2, 0);
712 }
713 pat = null;
714 return this;
715 }
716
717 /**
718 * This is equivalent to
719 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
720 * @stable ICU 2.0
721 */
722 public UnicodeSet complement() {
723 if (list[0] == LOW) {
724 System.arraycopy(list, 1, list, 0, len-1);
725 --len;
726 } else {
727 ensureCapacity(len+1);
728 System.arraycopy(list, 0, list, 1, len);
729 list[0] = LOW;
730 ++len;
731 }
732 pat = null;
733 return this;
734 }
735
736 /**
737 * Returns true if this set contains the given character.
738 * @param c character to be checked for containment
739 * @return true if the test condition is met
740 * @stable ICU 2.0
741 */
742 public boolean contains(int c) {
743 if (c < MIN_VALUE || c > MAX_VALUE) {
744 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
745 }
746
747 /*
748 // Set i to the index of the start item greater than ch
749 // We know we will terminate without length test!
750 int i = -1;
751 while (true) {
752 if (c < list[++i]) break;
753 }
754 */
755
756 int i = findCodePoint(c);
757
758 return ((i & 1) != 0); // return true if odd
759 }
760
761 /**
762 * Returns the smallest value i such that c < list[i]. Caller
763 * must ensure that c is a legal value or this method will enter
764 * an infinite loop. This method performs a binary search.
765 * @param c a character in the range MIN_VALUE..MAX_VALUE
783 if (c < list[0]) return 0;
784 // High runner test. c is often after the last range, so an
785 // initial check for this condition pays off.
786 if (len >= 2 && c >= list[len-2]) return len-1;
787 int lo = 0;
788 int hi = len - 1;
789 // invariant: c >= list[lo]
790 // invariant: c < list[hi]
791 for (;;) {
792 int i = (lo + hi) >>> 1;
793 if (i == lo) return hi;
794 if (c < list[i]) {
795 hi = i;
796 } else {
797 lo = i;
798 }
799 }
800 }
801
802 /**
803 * Adds all of the elements in the specified set to this set if
804 * they're not already present. This operation effectively
805 * modifies this set so that its value is the <i>union</i> of the two
806 * sets. The behavior of this operation is unspecified if the specified
807 * collection is modified while the operation is in progress.
808 *
809 * @param c set whose elements are to be added to this set.
810 * @stable ICU 2.0
811 */
812 public UnicodeSet addAll(UnicodeSet c) {
813 add(c.list, c.len, 0);
814 strings.addAll(c.strings);
815 return this;
816 }
817
818 /**
819 * Retains only the elements in this set that are contained in the
820 * specified set. In other words, removes from this set all of
821 * its elements that are not contained in the specified set. This
822 * operation effectively modifies this set so that its value is
823 * the <i>intersection</i> of the two sets.
824 *
825 * @param c set that defines which elements this set will retain.
826 * @stable ICU 2.0
827 */
828 public UnicodeSet retainAll(UnicodeSet c) {
829 retain(c.list, c.len, 0);
830 strings.retainAll(c.strings);
831 return this;
832 }
833
834 /**
835 * Removes from this set all of its elements that are contained in the
836 * specified set. This operation effectively modifies this
837 * set so that its value is the <i>asymmetric set difference</i> of
838 * the two sets.
839 *
840 * @param c set that defines which elements will be removed from
841 * this set.
842 * @stable ICU 2.0
843 */
844 public UnicodeSet removeAll(UnicodeSet c) {
845 retain(c.list, c.len, 2);
846 strings.removeAll(c.strings);
847 return this;
848 }
849
850 /**
851 * Removes all of the elements from this set. This set will be
852 * empty after this call returns.
853 * @stable ICU 2.0
854 */
855 public UnicodeSet clear() {
856 list[0] = HIGH;
857 len = 1;
858 pat = null;
859 strings.clear();
860 return this;
861 }
862
863 /**
864 * Iteration method that returns the number of ranges contained in
865 * this set.
866 * @see #getRangeStart
867 * @see #getRangeEnd
868 * @stable ICU 2.0
869 */
870 public int getRangeCount() {
871 return len/2;
872 }
873
874 /**
875 * Iteration method that returns the first character in the
876 * specified range of this set.
877 * @exception ArrayIndexOutOfBoundsException if index is outside
878 * the range <code>0..getRangeCount()-1</code>
906 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
907 * Parsing continues until the corresponding closing ']'. If a syntax error
908 * is encountered between the opening and closing brace, the parse fails.
909 * Upon return from a successful parse, the ParsePosition is updated to
910 * point to the character following the closing ']', and an inversion
911 * list for the parsed pattern is returned. This method
912 * calls itself recursively to parse embedded subpatterns.
913 *
914 * @param pattern the string containing the pattern to be parsed. The
915 * portion of the string from pos.getIndex(), which must be a '[', to the
916 * corresponding closing ']', is parsed.
917 * @param pos upon entry, the position at which to being parsing. The
918 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
919 * from a successful parse, pos.getIndex() is either the character after the
920 * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
921 * is the last character of the pattern string.
922 * @return an inversion list for the parsed substring
923 * of <code>pattern</code>
924 * @exception java.lang.IllegalArgumentException if the parse fails.
925 */
926 UnicodeSet applyPattern(String pattern,
927 ParsePosition pos,
928 SymbolTable symbols,
929 int options) {
930
931 // Need to build the pattern in a temporary string because
932 // _applyPattern calls add() etc., which set pat to empty.
933 boolean parsePositionWasNull = pos == null;
934 if (parsePositionWasNull) {
935 pos = new ParsePosition(0);
936 }
937
938 StringBuffer rebuiltPat = new StringBuffer();
939 RuleCharacterIterator chars =
940 new RuleCharacterIterator(pattern, symbols, pos);
941 applyPattern(chars, symbols, rebuiltPat, options);
942 if (chars.inVariable()) {
943 syntaxError(chars, "Extra chars in variable value");
944 }
945 pat = rebuiltPat.toString();
946 if (parsePositionWasNull) {
947 int i = pos.getIndex();
948
949 // Skip over trailing whitespace
950 if ((options & IGNORE_SPACE) != 0) {
951 i = Utility.skipWhitespace(pattern, i);
952 }
953
954 if (i != pattern.length()) {
955 throw new IllegalArgumentException("Parse of \"" + pattern +
956 "\" failed at " + i);
957 }
958 }
959 return this;
960 }
961
962 /**
963 * Parse the pattern from the given RuleCharacterIterator. The
964 * iterator is advanced over the parsed pattern.
965 * @param chars iterator over the pattern characters. Upon return
966 * it will be advanced to the first character after the parsed
967 * pattern, or the end of the iteration if all characters are
968 * parsed.
969 * @param symbols symbol table to use to parse and dereference
970 * variables, or null if none.
971 * @param rebuiltPat the pattern that was parsed, rebuilt or
972 * copied from the input pattern, as appropriate.
973 * @param options a bit mask of zero or more of the following:
974 * IGNORE_SPACE, CASE.
975 */
976 void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
977 StringBuffer rebuiltPat, int options) {
978 // Syntax characters: [ ] ^ - & { }
979
980 // Recognized special forms for chars, sets: c-c s-s s&s
981
982 int opts = RuleCharacterIterator.PARSE_VARIABLES |
983 RuleCharacterIterator.PARSE_ESCAPES;
984 if ((options & IGNORE_SPACE) != 0) {
985 opts |= RuleCharacterIterator.SKIP_WHITESPACE;
986 }
987
988 StringBuffer patBuf = new StringBuffer(), buf = null;
989 boolean usePat = false;
990 UnicodeSet scratch = null;
991 Object backup = null;
992
993 // mode: 0=before [, 1=between [...], 2=after ]
994 // lastItem: 0=none, 1=char, 2=set
995 int lastItem = 0, lastChar = 0, mode = 0;
996 char op = 0;
997
998 boolean invert = false;
999
1000 clear();
1001
1002 while (mode != 2 && !chars.atEnd()) {
1003 if (false) {
1004 // Debugging assertion
1005 if (!((lastItem == 0 && op == 0) ||
1006 (lastItem == 1 && (op == 0 || op == '-')) ||
1007 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
1008 throw new IllegalArgumentException();
1009 }
1010 }
1011
1012 int c = 0;
1013 boolean literal = false;
1014 UnicodeSet nested = null;
1015
1016 // -------- Check for property pattern
1017
1018 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
1019 int setMode = 0;
1020 if (resemblesPropertyPattern(chars, opts)) {
1021 setMode = 2;
1022 }
1023
1024 // -------- Parse '[' of opening delimiter OR nested set.
1025 // If there is a nested set, use `setMode' to define how
1026 // the set should be parsed. If the '[' is part of the
1027 // opening delimiter for this pattern, parse special
1028 // strings "[", "[^", "[-", and "[^-". Check for stand-in
1029 // characters representing a nested set in the symbol
1030 // table.
1031
1032 else {
1033 // Prepare to backup if necessary
1034 backup = chars.getPos(backup);
1035 c = chars.next(opts);
1036 literal = chars.isEscaped();
1037
1038 if (c == '[' && !literal) {
1039 if (mode == 1) {
1040 chars.setPos(backup); // backup
1041 setMode = 1;
1042 } else {
1043 // Handle opening '[' delimiter
1044 mode = 1;
1045 patBuf.append('[');
1046 backup = chars.getPos(backup); // prepare to backup
1047 c = chars.next(opts);
1048 literal = chars.isEscaped();
1049 if (c == '^' && !literal) {
1050 invert = true;
1051 patBuf.append('^');
1052 backup = chars.getPos(backup); // prepare to backup
1053 c = chars.next(opts);
1054 literal = chars.isEscaped();
1055 }
1056 // Fall through to handle special leading '-';
1057 // otherwise restart loop for nested [], \p{}, etc.
1058 if (c == '-') {
1059 literal = true;
1060 // Fall through to handle literal '-' below
1061 } else {
1062 chars.setPos(backup); // backup
1063 continue;
1064 }
1065 }
1066 } else if (symbols != null) {
1067 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
1068 if (m != null) {
1069 try {
1070 nested = (UnicodeSet) m;
1071 setMode = 3;
1072 } catch (ClassCastException e) {
1073 syntaxError(chars, "Syntax error");
1074 }
1075 }
1076 }
1077 }
1078
1079 // -------- Handle a nested set. This either is inline in
1080 // the pattern or represented by a stand-in that has
1081 // previously been parsed and was looked up in the symbol
1082 // table.
1083
1084 if (setMode != 0) {
1085 if (lastItem == 1) {
1086 if (op != 0) {
1087 syntaxError(chars, "Char expected after operator");
1088 }
1089 add_unchecked(lastChar, lastChar);
1090 _appendToPat(patBuf, lastChar, false);
1091 lastItem = op = 0;
1092 }
1093
1094 if (op == '-' || op == '&') {
1095 patBuf.append(op);
1096 }
1097
1098 if (nested == null) {
1099 if (scratch == null) scratch = new UnicodeSet();
1100 nested = scratch;
1101 }
1102 switch (setMode) {
1103 case 1:
1104 nested.applyPattern(chars, symbols, patBuf, options);
1105 break;
1106 case 2:
1107 chars.skipIgnored(opts);
1108 nested.applyPropertyPattern(chars, patBuf, symbols);
1109 break;
1110 case 3: // `nested' already parsed
1111 nested._toPattern(patBuf, false);
1112 break;
1113 }
1114
1115 usePat = true;
1116
1117 if (mode == 0) {
1118 // Entire pattern is a category; leave parse loop
1119 set(nested);
1120 mode = 2;
1121 break;
1122 }
1123
1124 switch (op) {
1125 case '-':
1126 removeAll(nested);
1127 break;
1128 case '&':
1129 retainAll(nested);
1130 break;
1131 case 0:
1132 addAll(nested);
1133 break;
1134 }
1135
1136 op = 0;
1137 lastItem = 2;
1138
1139 continue;
1140 }
1141
1142 if (mode == 0) {
1143 syntaxError(chars, "Missing '['");
1144 }
1145
1146 // -------- Parse special (syntax) characters. If the
1147 // current character is not special, or if it is escaped,
1148 // then fall through and handle it below.
1149
1150 if (!literal) {
1151 switch (c) {
1152 case ']':
1153 if (lastItem == 1) {
1154 add_unchecked(lastChar, lastChar);
1155 _appendToPat(patBuf, lastChar, false);
1156 }
1157 // Treat final trailing '-' as a literal
1158 if (op == '-') {
1159 add_unchecked(op, op);
1160 patBuf.append(op);
1161 } else if (op == '&') {
1162 syntaxError(chars, "Trailing '&'");
1163 }
1164 patBuf.append(']');
1165 mode = 2;
1166 continue;
1167 case '-':
1168 if (op == 0) {
1169 if (lastItem != 0) {
1170 op = (char) c;
1171 continue;
1172 } else {
1173 // Treat final trailing '-' as a literal
1174 add_unchecked(c, c);
1175 c = chars.next(opts);
1176 literal = chars.isEscaped();
1177 if (c == ']' && !literal) {
1178 patBuf.append("-]");
1179 mode = 2;
1180 continue;
1181 }
1182 }
1183 }
1184 syntaxError(chars, "'-' not after char or set");
1185 break;
1186 case '&':
1187 if (lastItem == 2 && op == 0) {
1188 op = (char) c;
1189 continue;
1190 }
1191 syntaxError(chars, "'&' not after set");
1192 break;
1193 case '^':
1194 syntaxError(chars, "'^' not after '['");
1195 break;
1196 case '{':
1197 if (op != 0) {
1198 syntaxError(chars, "Missing operand after operator");
1199 }
1200 if (lastItem == 1) {
1201 add_unchecked(lastChar, lastChar);
1202 _appendToPat(patBuf, lastChar, false);
1203 }
1204 lastItem = 0;
1205 if (buf == null) {
1206 buf = new StringBuffer();
1207 } else {
1208 buf.setLength(0);
1209 }
1210 boolean ok = false;
1211 while (!chars.atEnd()) {
1212 c = chars.next(opts);
1213 literal = chars.isEscaped();
1214 if (c == '}' && !literal) {
1215 ok = true;
1216 break;
1217 }
1218 UTF16.append(buf, c);
1219 }
1220 if (buf.length() < 1 || !ok) {
1221 syntaxError(chars, "Invalid multicharacter string");
1222 }
1223 // We have new string. Add it to set and continue;
1224 // we don't need to drop through to the further
1225 // processing
1226 add(buf.toString());
1227 patBuf.append('{');
1228 _appendToPat(patBuf, buf.toString(), false);
1229 patBuf.append('}');
1230 continue;
1231 case SymbolTable.SYMBOL_REF:
1232 // symbols nosymbols
1233 // [a-$] error error (ambiguous)
1234 // [a$] anchor anchor
1235 // [a-$x] var "x"* literal '$'
1236 // [a-$.] error literal '$'
1237 // *We won't get here in the case of var "x"
1238 backup = chars.getPos(backup);
1239 c = chars.next(opts);
1240 literal = chars.isEscaped();
1241 boolean anchor = (c == ']' && !literal);
1242 if (symbols == null && !anchor) {
1243 c = SymbolTable.SYMBOL_REF;
1244 chars.setPos(backup);
1245 break; // literal '$'
1246 }
1247 if (anchor && op == 0) {
1248 if (lastItem == 1) {
1249 add_unchecked(lastChar, lastChar);
1250 _appendToPat(patBuf, lastChar, false);
1251 }
1252 add_unchecked(UnicodeMatcher.ETHER);
1253 usePat = true;
1254 patBuf.append(SymbolTable.SYMBOL_REF).append(']');
1255 mode = 2;
1256 continue;
1257 }
1258 syntaxError(chars, "Unquoted '$'");
1259 break;
1260 default:
1261 break;
1262 }
1263 }
1264
1265 // -------- Parse literal characters. This includes both
1266 // escaped chars ("\u4E01") and non-syntax characters
1267 // ("a").
1268
1269 switch (lastItem) {
1270 case 0:
1271 lastItem = 1;
1272 lastChar = c;
1273 break;
1274 case 1:
1275 if (op == '-') {
1276 if (lastChar >= c) {
1277 // Don't allow redundant (a-a) or empty (b-a) ranges;
1278 // these are most likely typos.
1279 syntaxError(chars, "Invalid range");
1280 }
1281 add_unchecked(lastChar, c);
1282 _appendToPat(patBuf, lastChar, false);
1283 patBuf.append(op);
1284 _appendToPat(patBuf, c, false);
1285 lastItem = op = 0;
1286 } else {
1287 add_unchecked(lastChar, lastChar);
1288 _appendToPat(patBuf, lastChar, false);
1289 lastChar = c;
1290 }
1291 break;
1292 case 2:
1293 if (op != 0) {
1294 syntaxError(chars, "Set expected after operator");
1295 }
1296 lastChar = c;
1297 lastItem = 1;
1298 break;
1299 }
1300 }
1301
1302 if (mode != 2) {
1303 syntaxError(chars, "Missing ']'");
1304 }
1305
1306 chars.skipIgnored(opts);
1307
1308 if (invert) {
1309 complement();
1310 }
1311
1312 // Use the rebuilt pattern (pat) only if necessary. Prefer the
1313 // generated pattern.
1314 if (usePat) {
1315 rebuiltPat.append(patBuf.toString());
1316 } else {
1317 _generatePattern(rebuiltPat, false, true);
1318 }
1319 }
1320
1321 private static void syntaxError(RuleCharacterIterator chars, String msg) {
1322 throw new IllegalArgumentException("Error: " + msg + " at \"" +
1323 Utility.escape(chars.toString()) +
1324 '"');
1325 }
1326
1327 //----------------------------------------------------------------
1328 // Implementation: Utility methods
1329 //----------------------------------------------------------------
1330
1331 private void ensureCapacity(int newLen) {
1332 if (newLen <= list.length) return;
1333 int[] temp = new int[newLen + GROW_EXTRA];
1334 System.arraycopy(list, 0, temp, 0, len);
1335 list = temp;
1336 }
1337
1338 private void ensureBufferCapacity(int newLen) {
1339 if (buffer != null && newLen <= buffer.length) return;
1340 buffer = new int[newLen + GROW_EXTRA];
1341 }
1342
1343 /**
1344 * Assumes start <= end.
1380 if (a < b) {
1381 buffer[k++] = a;
1382 a = list[i++];
1383 } else if (b < a) {
1384 buffer[k++] = b;
1385 b = other[j++];
1386 } else if (a != HIGH) { // at this point, a == b
1387 // discard both values!
1388 a = list[i++];
1389 b = other[j++];
1390 } else { // DONE!
1391 buffer[k++] = HIGH;
1392 len = k;
1393 break;
1394 }
1395 }
1396 // swap list and buffer
1397 int[] temp = list;
1398 list = buffer;
1399 buffer = temp;
1400 pat = null;
1401 return this;
1402 }
1403
1404 // polarity = 0 is normal: x union y
1405 // polarity = 2: x union ~y
1406 // polarity = 1: ~x union y
1407 // polarity = 3: ~x union ~y
1408
1409 private UnicodeSet add(int[] other, int otherLen, int polarity) {
1410 ensureBufferCapacity(len + otherLen);
1411 int i = 0, j = 0, k = 0;
1412 int a = list[i++];
1413 int b = other[j++];
1414 // change from xor is that we have to check overlapping pairs
1415 // polarity bit 1 means a is second, bit 2 means b is.
1416 main:
1417 while (true) {
1418 switch (polarity) {
1419 case 0: // both first; take lower if unequal
1420 if (a < b) { // take a
1478 break;
1479 case 2: // a first, b second; if a < b, overlap
1480 if (b < a) { // no overlap, take b
1481 buffer[k++] = b; b = other[j++]; polarity ^= 2;
1482 } else if (a < b) { // OVERLAP, drop a
1483 a = list[i++]; polarity ^= 1;
1484 } else { // a == b, drop both!
1485 if (a == HIGH) break main;
1486 a = list[i++]; polarity ^= 1;
1487 b = other[j++]; polarity ^= 2;
1488 }
1489 break;
1490 }
1491 }
1492 buffer[k++] = HIGH; // terminate
1493 len = k;
1494 // swap list and buffer
1495 int[] temp = list;
1496 list = buffer;
1497 buffer = temp;
1498 pat = null;
1499 return this;
1500 }
1501
1502 // polarity = 0 is normal: x intersect y
1503 // polarity = 2: x intersect ~y == set-minus
1504 // polarity = 1: ~x intersect y
1505 // polarity = 3: ~x intersect ~y
1506
1507 private UnicodeSet retain(int[] other, int otherLen, int polarity) {
1508 ensureBufferCapacity(len + otherLen);
1509 int i = 0, j = 0, k = 0;
1510 int a = list[i++];
1511 int b = other[j++];
1512 // change from xor is that we have to check overlapping pairs
1513 // polarity bit 1 means a is second, bit 2 means b is.
1514 main:
1515 while (true) {
1516 switch (polarity) {
1517 case 0: // both first; drop the smaller
1518 if (a < b) { // drop a
1549 break;
1550 case 2: // a first, b second; if a < b, overlap
1551 if (b < a) { // no overlap, drop b
1552 b = other[j++]; polarity ^= 2;
1553 } else if (a < b) { // OVERLAP, take a
1554 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1555 } else { // a == b, drop both!
1556 if (a == HIGH) break main;
1557 a = list[i++]; polarity ^= 1;
1558 b = other[j++]; polarity ^= 2;
1559 }
1560 break;
1561 }
1562 }
1563 buffer[k++] = HIGH; // terminate
1564 len = k;
1565 // swap list and buffer
1566 int[] temp = list;
1567 list = buffer;
1568 buffer = temp;
1569 pat = null;
1570 return this;
1571 }
1572
1573 private static final int max(int a, int b) {
1574 return (a > b) ? a : b;
1575 }
1576
1577 //----------------------------------------------------------------
1578 // Generic filter-based scanning code
1579 //----------------------------------------------------------------
1580
1581 private static interface Filter {
1582 boolean contains(int codePoint);
1583 }
1584
1585 // VersionInfo for unassigned characters
1586 static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
1587
1588 private static class VersionFilter implements Filter {
1589 VersionInfo version;
1590
1591 VersionFilter(VersionInfo version) { this.version = version; }
1592
1593 public boolean contains(int ch) {
1594 VersionInfo v = UCharacter.getAge(ch);
1595 // Reference comparison ok; VersionInfo caches and reuses
1596 // unique objects.
1597 return v != NO_VERSION &&
1598 v.compareTo(version) <= 0;
1599 }
1600 }
1601
1602 private static synchronized UnicodeSet getInclusions(int src) {
1603 if (INCLUSIONS == null) {
1604 INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
1605 }
1606 if(INCLUSIONS[src] == null) {
1607 UnicodeSet incl = new UnicodeSet();
1608 switch(src) {
1609 case UCharacterProperty.SRC_PROPSVEC:
1610 UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
1611 break;
1612 default:
1613 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
1614 }
1615 INCLUSIONS[src] = incl;
1616 }
1617 return INCLUSIONS[src];
1618 }
1619
1620 /**
1621 * Generic filter-based scanning code for UCD property UnicodeSets.
1622 */
1623 private UnicodeSet applyFilter(Filter filter, int src) {
1624 // Walk through all Unicode characters, noting the start
1625 // and end of each range for which filter.contain(c) is
1626 // true. Add each range to a set.
1627 //
1628 // To improve performance, use the INCLUSIONS set, which
1629 // encodes information about character ranges that are known
1630 // to have identical properties, such as the CJK Ideographs
1631 // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
1632 // except the first characters of such ranges.
1633 //
1634 // TODO Where possible, instead of scanning over code points,
1635 // use internal property data to initialize UnicodeSets for
1636 // those properties. Scanning code points is slow.
1637
1638 clear();
1639
1640 int startHasProperty = -1;
1641 UnicodeSet inclusions = getInclusions(src);
1642 int limitRange = inclusions.getRangeCount();
1643
1644 for (int j=0; j<limitRange; ++j) {
1645 // get current range
1646 int start = inclusions.getRangeStart(j);
1647 int end = inclusions.getRangeEnd(j);
1648
1649 // for all the code points in the range, process
1650 for (int ch = start; ch <= end; ++ch) {
1651 // only add to the unicodeset on inflection points --
1652 // where the hasProperty value changes to false
1653 if (filter.contains(ch)) {
1654 if (startHasProperty < 0) {
1655 startHasProperty = ch;
1656 }
1657 } else if (startHasProperty >= 0) {
1658 add_unchecked(startHasProperty, ch-1);
1659 startHasProperty = -1;
1660 }
1661 }
1662 }
1663 if (startHasProperty >= 0) {
1664 add_unchecked(startHasProperty, 0x10FFFF);
1665 }
1666
1667 return this;
1668 }
1669
1670 /**
1671 * Remove leading and trailing rule white space and compress
1672 * internal rule white space to a single space character.
1673 *
1674 * @see UCharacterProperty#isRuleWhiteSpace
1675 */
1676 private static String mungeCharName(String source) {
1677 StringBuffer buf = new StringBuffer();
1678 for (int i=0; i<source.length(); ) {
1679 int ch = UTF16.charAt(source, i);
1680 i += UTF16.getCharCount(ch);
1681 if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1682 if (buf.length() == 0 ||
1683 buf.charAt(buf.length() - 1) == ' ') {
1684 continue;
1685 }
1686 ch = ' '; // convert to ' '
1687 }
1688 UTF16.append(buf, ch);
1689 }
1690 if (buf.length() != 0 &&
1691 buf.charAt(buf.length() - 1) == ' ') {
1692 buf.setLength(buf.length() - 1);
1693 }
1694 return buf.toString();
1695 }
1696
1697 /**
1698 * Modifies this set to contain those code points which have the
1699 * given value for the given property. Prior contents of this
1700 * set are lost.
1701 * @param propertyAlias the property alias
1702 * @param valueAlias the value alias
1703 * @param symbols if not null, then symbols are first called to see if a property
1704 * is available. If true, then everything else is skipped.
1705 * @return this set
1706 * @stable ICU 3.2
1707 */
1708 public UnicodeSet applyPropertyAlias(String propertyAlias,
1709 String valueAlias, SymbolTable symbols) {
1710 if (valueAlias.length() > 0) {
1711 if (propertyAlias.equals("Age")) {
1712 // Must munge name, since
1713 // VersionInfo.getInstance() does not do
1714 // 'loose' matching.
1715 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1716 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1717 return this;
1718 }
1719 }
1720 throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
1721 }
1722
1723 /**
1724 * Return true if the given iterator appears to point at a
1725 * property pattern. Regardless of the result, return with the
1726 * iterator unchanged.
1727 * @param chars iterator over the pattern characters. Upon return
1728 * it will be unchanged.
1729 * @param iterOpts RuleCharacterIterator options
1730 */
1731 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
1732 int iterOpts) {
1733 boolean result = false;
1734 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
1735 Object pos = chars.getPos(null);
1736 int c = chars.next(iterOpts);
1737 if (c == '[' || c == '\\') {
1738 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
1739 result = (c == '[') ? (d == ':') :
1740 (d == 'N' || d == 'p' || d == 'P');
1741 }
1742 chars.setPos(pos);
1743 return result;
1744 }
1745
1746 /**
1747 * Parse the given property pattern at the given parse position.
1748 * @param symbols TODO
1749 */
1750 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
1751 int pos = ppos.getIndex();
1752
1753 // On entry, ppos should point to one of the following locations:
1754
1755 // Minimum length is 5 characters, e.g. \p{L}
1756 if ((pos+5) > pattern.length()) {
1757 return null;
1758 }
1759
1760 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1761 boolean isName = false; // true for \N{pat}, o/w false
1762 boolean invert = false;
1763
1764 // Look for an opening [:, [:^, \p, or \P
1765 if (pattern.regionMatches(pos, "[:", 0, 2)) {
1766 posix = true;
1767 pos = Utility.skipWhitespace(pattern, pos+2);
1768 if (pos < pattern.length() && pattern.charAt(pos) == '^') {
1769 ++pos;
1770 invert = true;
1771 }
1772 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
1773 pattern.regionMatches(pos, "\\N", 0, 2)) {
1774 char c = pattern.charAt(pos+1);
1775 invert = (c == 'P');
1776 isName = (c == 'N');
1777 pos = Utility.skipWhitespace(pattern, pos+2);
1778 if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
1779 // Syntax error; "\p" or "\P" not followed by "{"
1780 return null;
1781 }
1782 } else {
1783 // Open delimiter not seen
1784 return null;
1785 }
1786
1787 // Look for the matching close delimiter, either :] or }
1788 int close = pattern.indexOf(posix ? ":]" : "}", pos);
1789 if (close < 0) {
1790 // Syntax error; close delimiter missing
1791 return null;
1792 }
1793
1794 // Look for an '=' sign. If this is present, we will parse a
1795 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1796 // pattern.
1797 int equals = pattern.indexOf('=', pos);
1798 String propName, valueName;
1799 if (equals >= 0 && equals < close && !isName) {
1800 // Equals seen; parse medium/long pattern
1801 propName = pattern.substring(pos, equals);
1802 valueName = pattern.substring(equals+1, close);
1803 }
1804
1805 else {
1806 // Handle case where no '=' is seen, and \N{}
1807 propName = pattern.substring(pos, close);
1808 valueName = "";
1809
1810 // Handle \N{name}
1811 if (isName) {
1812 // This is a little inefficient since it means we have to
1813 // parse "na" back to UProperty.NAME even though we already
1814 // know it's UProperty.NAME. If we refactor the API to
1815 // support args of (int, String) then we can remove
1816 // "na" and make this a little more efficient.
1817 valueName = propName;
1818 propName = "na";
1819 }
1820 }
1821
1822 applyPropertyAlias(propName, valueName, symbols);
1823
1824 if (invert) {
1825 complement();
1826 }
1827
1828 // Move to the limit position after the close delimiter
1829 ppos.setIndex(close + (posix ? 2 : 1));
1830
1831 return this;
1832 }
1833
1834 /**
1835 * Parse a property pattern.
1836 * @param chars iterator over the pattern characters. Upon return
1837 * it will be advanced to the first character after the parsed
1838 * pattern, or the end of the iteration if all characters are
1839 * parsed.
1840 * @param rebuiltPat the pattern that was parsed, rebuilt or
1841 * copied from the input pattern, as appropriate.
1842 * @param symbols TODO
1843 */
1844 private void applyPropertyPattern(RuleCharacterIterator chars,
1845 StringBuffer rebuiltPat, SymbolTable symbols) {
1846 String patStr = chars.lookahead();
1847 ParsePosition pos = new ParsePosition(0);
1848 applyPropertyPattern(patStr, pos, symbols);
1849 if (pos.getIndex() == 0) {
1850 syntaxError(chars, "Invalid property pattern");
1851 }
1852 chars.jumpahead(pos.getIndex());
1853 rebuiltPat.append(patStr, 0, pos.getIndex());
1854 }
1855
1856 //----------------------------------------------------------------
1857 // Case folding API
1858 //----------------------------------------------------------------
1859
1860 /**
1861 * Bitmask for constructor and applyPattern() indicating that
1862 * white space should be ignored. If set, ignore characters for
1863 * which UCharacterProperty.isRuleWhiteSpace() returns true,
1864 * unless they are quoted or escaped. This may be ORed together
1865 * with other selectors.
1866 * @stable ICU 3.8
1867 */
1868 public static final int IGNORE_SPACE = 1;
1869
1870 }
1871
|
1 /*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 *******************************************************************************
28 * Copyright (C) 1996-2015, International Business Machines Corporation and
29 * others. All Rights Reserved.
30 *******************************************************************************
31 */
32 package sun.text.normalizer;
33
34 import java.io.IOException;
35 import java.text.ParsePosition;
36 import java.util.ArrayList;
37 import java.util.TreeSet;
38
39 /**
40 * A mutable set of Unicode characters and multicharacter strings.
41 * Objects of this class represent <em>character classes</em> used
42 * in regular expressions. A character specifies a subset of Unicode
43 * code points. Legal code points are U+0000 to U+10FFFF, inclusive.
44 *
45 * Note: method freeze() will not only make the set immutable, but
46 * also makes important methods much higher performance:
47 * contains(c), containsNone(...), span(...), spanBack(...) etc.
48 * After the object is frozen, any subsequent call that wants to change
49 * the object will throw UnsupportedOperationException.
50 *
51 * <p>The UnicodeSet class is not designed to be subclassed.
52 *
53 * <p><code>UnicodeSet</code> supports two APIs. The first is the
54 * <em>operand</em> API that allows the caller to modify the value of
55 * a <code>UnicodeSet</code> object. It conforms to Java 2's
56 * <code>java.util.Set</code> interface, although
57 * <code>UnicodeSet</code> does not actually implement that
58 * interface. All methods of <code>Set</code> are supported, with the
59 * modification that they take a character range or single character
60 * instead of an <code>Object</code>, and they take a
61 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
62 * operand API may be thought of in terms of boolean logic: a boolean
63 * OR is implemented by <code>add</code>, a boolean AND is implemented
64 * by <code>retain</code>, a boolean XOR is implemented by
65 * <code>complement</code> taking an argument, and a boolean NOT is
66 * implemented by <code>complement</code> with no argument. In terms
67 * of traditional set theory function names, <code>add</code> is a
68 * union, <code>retain</code> is an intersection, <code>remove</code>
69 * is an asymmetric difference, and <code>complement</code> with no
103 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
104 * point order</td>
105 * </tr>
106 * <tr>
107 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
108 * <td valign="top">The character U+4E01</td>
109 * </tr>
110 * <tr>
111 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
112 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
113 * "ac"</td>
114 * </tr>
115 * <tr>
116 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
117 * <td valign="top">All characters in the general category Uppercase Letter</td>
118 * </tr>
119 * </table>
120 * </blockquote>
121 *
122 * Any character may be preceded by a backslash in order to remove any special
123 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
124 * ignored, unless they are escaped.
125 *
126 * <p>Property patterns specify a set of characters having a certain
127 * property as defined by the Unicode standard. Both the POSIX-like
128 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
129 * complete list of supported property patterns, see the User's Guide
130 * for UnicodeSet at
131 * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
132 * http://www.icu-project.org/userguide/unicodeSet.html</a>.
133 * Actual determination of property data is defined by the underlying
134 * Unicode database as implemented by UCharacter.
135 *
136 * <p>Patterns specify individual characters, ranges of characters, and
137 * Unicode property sets. When elements are concatenated, they
138 * specify their union. To complement a set, place a '^' immediately
139 * after the opening '['. Property patterns are inverted by modifying
140 * their delimiters; "[:^foo]" and "\P{foo}". In any other location,
141 * '^' has no special meaning.
142 *
143 * <p>Ranges are indicated by placing two a '-' between two
252 * <td valign="top"></td>
253 * <td valign="top">one or more instances of <code>a</code><br>
254 * </td>
255 * </tr>
256 * <tr>
257 * <td nowrap valign="top"><code>a | b</code></td>
258 * <td valign="top"></td>
259 * <td valign="top">either <code>a</code> or <code>b</code><br>
260 * </td>
261 * </tr>
262 * <tr>
263 * <td nowrap valign="top"><code>'a'</code></td>
264 * <td valign="top"></td>
265 * <td valign="top">the literal string between the quotes </td>
266 * </tr>
267 * </table>
268 * </td>
269 * </tr>
270 * </table>
271 * </blockquote>
272 * <p>To iterate over contents of UnicodeSet, the following are available:
273 * <ul><li>{@link #ranges()} to iterate through the ranges</li>
274 * <li>{@link #strings()} to iterate through the strings</li>
275 * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
276 * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
277 * </ul>
278 * All of the above can be used in <b>for</b> loops.
279 * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
280 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
281 *
282 * @author Alan Liu
283 * @stable ICU 2.0
284 */
285 class UnicodeSet {
286
287 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
288 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
289 // 110000 for codepoints
290
291 /**
292 * Minimum value that can be stored in a UnicodeSet.
293 * @stable ICU 2.0
294 */
295 public static final int MIN_VALUE = LOW;
296
297 /**
298 * Maximum value that can be stored in a UnicodeSet.
299 * @stable ICU 2.0
300 */
301 public static final int MAX_VALUE = HIGH - 1;
302
303 private int len; // length used; list may be longer to minimize reallocs
304 private int[] list; // MUST be terminated with HIGH
305 private int[] rangeList; // internal buffer
306 private int[] buffer; // internal buffer
307
308 // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
309 // is not private so that UnicodeSetIterator can get access
310 TreeSet<String> strings = new TreeSet<String>();
311
312 /**
313 * The pattern representation of this set. This may not be the
314 * most economical pattern. It is the pattern supplied to
315 * applyPattern(), with variables substituted and whitespace
316 * removed. For sets constructed without applyPattern(), or
317 * modified using the non-pattern API, this string will be null,
318 * indicating that toPattern() must generate a pattern
319 * representation from the inversion list.
320 */
321
322 private static final int START_EXTRA = 16; // initial storage. Must be >= 0
323 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
324
325 private static UnicodeSet INCLUSION = null;
326
327 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
328 private volatile UnicodeSetStringSpan stringSpan;
329
330 //----------------------------------------------------------------
331 // Public API
332 //----------------------------------------------------------------
333
334 /**
335 * Constructs an empty set.
336 * @stable ICU 2.0
337 */
338 private UnicodeSet() {
339 list = new int[1 + START_EXTRA];
340 list[len++] = HIGH;
341 }
342
343 /**
344 * Constructs a copy of an existing set.
345 * @stable ICU 2.0
346 */
347 private UnicodeSet(UnicodeSet other) {
348 set(other);
349 }
350
351 /**
352 * Constructs a set containing the given range. If <code>end >
353 * start</code> then an empty set is created.
354 *
355 * @param start first character, inclusive, of range
356 * @param end last character, inclusive, of range
357 * @stable ICU 2.0
358 */
359 public UnicodeSet(int start, int end) {
360 this();
361 complement(start, end);
362 }
363
364 /**
365 * Constructs a set from the given pattern. See the class description
366 * for the syntax of the pattern language. Whitespace is ignored.
367 * @param pattern a string specifying what characters are in the set
368 * @exception java.lang.IllegalArgumentException if the pattern contains
369 * a syntax error.
370 * @stable ICU 2.0
371 */
372 public UnicodeSet(String pattern) {
373 this();
374 applyPattern(pattern, null);
375 }
376
377 /**
378 * Make this object represent the same set as <code>other</code>.
379 * @param other a <code>UnicodeSet</code> whose value will be
380 * copied to this object
381 * @stable ICU 2.0
382 */
383 public UnicodeSet set(UnicodeSet other) {
384 checkFrozen();
385 list = other.list.clone();
386 len = other.len;
387 strings = new TreeSet<String>(other.strings);
388 return this;
389 }
390
391 /**
392 * Returns the number of elements in this set (its cardinality)
393 * Note than the elements of a set may include both individual
394 * codepoints and strings.
395 *
396 * @return the number of elements in this set (its cardinality).
397 * @stable ICU 2.0
398 */
399 public int size() {
400 int n = 0;
401 int count = getRangeCount();
402 for (int i = 0; i < count; ++i) {
403 n += getRangeEnd(i) - getRangeStart(i) + 1;
404 }
405 return n + strings.size();
406 }
407
408 // for internal use, after checkFrozen has been called
409 private UnicodeSet add_unchecked(int start, int end) {
410 if (start < MIN_VALUE || start > MAX_VALUE) {
411 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
412 }
413 if (end < MIN_VALUE || end > MAX_VALUE) {
414 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
415 }
416 if (start < end) {
417 add(range(start, end), 2, 0);
418 } else if (start == end) {
419 add(start);
420 }
421 return this;
422 }
423
424 /**
425 * Adds the specified character to this set if it is not already
426 * present. If this set already contains the specified character,
427 * the call leaves this set unchanged.
428 * @stable ICU 2.0
429 */
430 public final UnicodeSet add(int c) {
431 checkFrozen();
432 return add_unchecked(c);
433 }
434
435 // for internal use only, after checkFrozen has been called
436 private final UnicodeSet add_unchecked(int c) {
437 if (c < MIN_VALUE || c > MAX_VALUE) {
438 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
439 }
440
441 // find smallest i such that c < list[i]
442 // if odd, then it is IN the set
443 // if even, then it is OUT of the set
444 int i = findCodePoint(c);
445
446 // already in set?
447 if ((i & 1) != 0) return this;
448
449 // HIGH is 0x110000
450 // assert(list[len-1] == HIGH);
451
496 // ^
497 // list[i]
498
499 // Don't use ensureCapacity() to save on copying.
500 // NOTE: This has no measurable impact on performance,
501 // but it might help in some usage patterns.
502 if (len+2 > list.length) {
503 int[] temp = new int[len + 2 + GROW_EXTRA];
504 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
505 System.arraycopy(list, i, temp, i+2, len-i);
506 list = temp;
507 } else {
508 System.arraycopy(list, i, list, i+2, len-i);
509 }
510
511 list[i] = c;
512 list[i+1] = c+1;
513 len += 2;
514 }
515
516 return this;
517 }
518
519 /**
520 * Adds the specified multicharacter to this set if it is not already
521 * present. If this set already contains the multicharacter,
522 * the call leaves this set unchanged.
523 * Thus {@code "ch" => {"ch"}}
524 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
525 * @param s the source string
526 * @return this object, for chaining
527 * @stable ICU 2.0
528 */
529 public final UnicodeSet add(CharSequence s) {
530 checkFrozen();
531 int cp = getSingleCP(s);
532 if (cp < 0) {
533 strings.add(s.toString());
534 } else {
535 add_unchecked(cp, cp);
536 }
537 return this;
538 }
539
540 /**
541 * Utility for getting code point from single code point CharSequence.
542 * See the public UTF16.getSingleCodePoint()
543 * @return a code point IF the string consists of a single one.
544 * otherwise returns -1.
545 * @param s to test
546 */
547 private static int getSingleCP(CharSequence s) {
548 if (s.length() < 1) {
549 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
550 }
551 if (s.length() > 2) return -1;
552 if (s.length() == 1) return s.charAt(0);
553
554 // at this point, len = 2
555 int cp = UTF16.charAt(s, 0);
556 if (cp > 0xFFFF) { // is surrogate pair
557 return cp;
558 }
559 return -1;
560 }
561
562 /**
563 * Complements the specified range in this set. Any character in
564 * the range will be removed if it is in this set, or will be
565 * added if it is not in this set. If {@code end > start}
566 * then an empty range is complemented, leaving the set unchanged.
567 *
568 * @param start first character, inclusive, of range to be removed
569 * from this set.
570 * @param end last character, inclusive, of range to be removed
571 * from this set.
572 * @stable ICU 2.0
573 */
574 public UnicodeSet complement(int start, int end) {
575 checkFrozen();
576 if (start < MIN_VALUE || start > MAX_VALUE) {
577 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
578 }
579 if (end < MIN_VALUE || end > MAX_VALUE) {
580 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
581 }
582 if (start <= end) {
583 xor(range(start, end), 2, 0);
584 }
585 return this;
586 }
587
588 /**
589 * Returns true if this set contains the given character.
590 * @param c character to be checked for containment
591 * @return true if the test condition is met
592 * @stable ICU 2.0
593 */
594 public boolean contains(int c) {
595 if (c < MIN_VALUE || c > MAX_VALUE) {
596 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
597 }
598 if (bmpSet != null) {
599 return bmpSet.contains(c);
600 }
601 if (stringSpan != null) {
602 return stringSpan.contains(c);
603 }
604
605 /*
606 // Set i to the index of the start item greater than ch
607 // We know we will terminate without length test!
608 int i = -1;
609 while (true) {
610 if (c < list[++i]) break;
611 }
612 */
613
614 int i = findCodePoint(c);
615
616 return ((i & 1) != 0); // return true if odd
617 }
618
619 /**
620 * Returns the smallest value i such that c < list[i]. Caller
621 * must ensure that c is a legal value or this method will enter
622 * an infinite loop. This method performs a binary search.
623 * @param c a character in the range MIN_VALUE..MAX_VALUE
641 if (c < list[0]) return 0;
642 // High runner test. c is often after the last range, so an
643 // initial check for this condition pays off.
644 if (len >= 2 && c >= list[len-2]) return len-1;
645 int lo = 0;
646 int hi = len - 1;
647 // invariant: c >= list[lo]
648 // invariant: c < list[hi]
649 for (;;) {
650 int i = (lo + hi) >>> 1;
651 if (i == lo) return hi;
652 if (c < list[i]) {
653 hi = i;
654 } else {
655 lo = i;
656 }
657 }
658 }
659
660 /**
661 * Retains only the elements in this set that are contained in the
662 * specified set. In other words, removes from this set all of
663 * its elements that are not contained in the specified set. This
664 * operation effectively modifies this set so that its value is
665 * the <i>intersection</i> of the two sets.
666 *
667 * @param c set that defines which elements this set will retain.
668 * @stable ICU 2.0
669 */
670 public UnicodeSet retainAll(UnicodeSet c) {
671 checkFrozen();
672 retain(c.list, c.len, 0);
673 strings.retainAll(c.strings);
674 return this;
675 }
676
677 /**
678 * Removes all of the elements from this set. This set will be
679 * empty after this call returns.
680 * @stable ICU 2.0
681 */
682 public UnicodeSet clear() {
683 checkFrozen();
684 list[0] = HIGH;
685 len = 1;
686 strings.clear();
687 return this;
688 }
689
690 /**
691 * Iteration method that returns the number of ranges contained in
692 * this set.
693 * @see #getRangeStart
694 * @see #getRangeEnd
695 * @stable ICU 2.0
696 */
697 public int getRangeCount() {
698 return len/2;
699 }
700
701 /**
702 * Iteration method that returns the first character in the
703 * specified range of this set.
704 * @exception ArrayIndexOutOfBoundsException if index is outside
705 * the range <code>0..getRangeCount()-1</code>
733 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
734 * Parsing continues until the corresponding closing ']'. If a syntax error
735 * is encountered between the opening and closing brace, the parse fails.
736 * Upon return from a successful parse, the ParsePosition is updated to
737 * point to the character following the closing ']', and an inversion
738 * list for the parsed pattern is returned. This method
739 * calls itself recursively to parse embedded subpatterns.
740 *
741 * @param pattern the string containing the pattern to be parsed. The
742 * portion of the string from pos.getIndex(), which must be a '[', to the
743 * corresponding closing ']', is parsed.
744 * @param pos upon entry, the position at which to being parsing. The
745 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
746 * from a successful parse, pos.getIndex() is either the character after the
747 * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
748 * is the last character of the pattern string.
749 * @return an inversion list for the parsed substring
750 * of <code>pattern</code>
751 * @exception java.lang.IllegalArgumentException if the parse fails.
752 */
753 private UnicodeSet applyPattern(String pattern,
754 ParsePosition pos) {
755 if ("[:age=3.2:]".equals(pattern)) {
756 checkFrozen();
757 VersionInfo version = VersionInfo.getInstance("3.2");
758 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
759 } else {
760 throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
761 + pattern + ")");
762 }
763
764 return this;
765 }
766
767 //----------------------------------------------------------------
768 // Implementation: Utility methods
769 //----------------------------------------------------------------
770
771 private void ensureCapacity(int newLen) {
772 if (newLen <= list.length) return;
773 int[] temp = new int[newLen + GROW_EXTRA];
774 System.arraycopy(list, 0, temp, 0, len);
775 list = temp;
776 }
777
778 private void ensureBufferCapacity(int newLen) {
779 if (buffer != null && newLen <= buffer.length) return;
780 buffer = new int[newLen + GROW_EXTRA];
781 }
782
783 /**
784 * Assumes start <= end.
820 if (a < b) {
821 buffer[k++] = a;
822 a = list[i++];
823 } else if (b < a) {
824 buffer[k++] = b;
825 b = other[j++];
826 } else if (a != HIGH) { // at this point, a == b
827 // discard both values!
828 a = list[i++];
829 b = other[j++];
830 } else { // DONE!
831 buffer[k++] = HIGH;
832 len = k;
833 break;
834 }
835 }
836 // swap list and buffer
837 int[] temp = list;
838 list = buffer;
839 buffer = temp;
840 return this;
841 }
842
843 // polarity = 0 is normal: x union y
844 // polarity = 2: x union ~y
845 // polarity = 1: ~x union y
846 // polarity = 3: ~x union ~y
847
848 private UnicodeSet add(int[] other, int otherLen, int polarity) {
849 ensureBufferCapacity(len + otherLen);
850 int i = 0, j = 0, k = 0;
851 int a = list[i++];
852 int b = other[j++];
853 // change from xor is that we have to check overlapping pairs
854 // polarity bit 1 means a is second, bit 2 means b is.
855 main:
856 while (true) {
857 switch (polarity) {
858 case 0: // both first; take lower if unequal
859 if (a < b) { // take a
917 break;
918 case 2: // a first, b second; if a < b, overlap
919 if (b < a) { // no overlap, take b
920 buffer[k++] = b; b = other[j++]; polarity ^= 2;
921 } else if (a < b) { // OVERLAP, drop a
922 a = list[i++]; polarity ^= 1;
923 } else { // a == b, drop both!
924 if (a == HIGH) break main;
925 a = list[i++]; polarity ^= 1;
926 b = other[j++]; polarity ^= 2;
927 }
928 break;
929 }
930 }
931 buffer[k++] = HIGH; // terminate
932 len = k;
933 // swap list and buffer
934 int[] temp = list;
935 list = buffer;
936 buffer = temp;
937 return this;
938 }
939
940 // polarity = 0 is normal: x intersect y
941 // polarity = 2: x intersect ~y == set-minus
942 // polarity = 1: ~x intersect y
943 // polarity = 3: ~x intersect ~y
944
945 private UnicodeSet retain(int[] other, int otherLen, int polarity) {
946 ensureBufferCapacity(len + otherLen);
947 int i = 0, j = 0, k = 0;
948 int a = list[i++];
949 int b = other[j++];
950 // change from xor is that we have to check overlapping pairs
951 // polarity bit 1 means a is second, bit 2 means b is.
952 main:
953 while (true) {
954 switch (polarity) {
955 case 0: // both first; drop the smaller
956 if (a < b) { // drop a
987 break;
988 case 2: // a first, b second; if a < b, overlap
989 if (b < a) { // no overlap, drop b
990 b = other[j++]; polarity ^= 2;
991 } else if (a < b) { // OVERLAP, take a
992 buffer[k++] = a; a = list[i++]; polarity ^= 1;
993 } else { // a == b, drop both!
994 if (a == HIGH) break main;
995 a = list[i++]; polarity ^= 1;
996 b = other[j++]; polarity ^= 2;
997 }
998 break;
999 }
1000 }
1001 buffer[k++] = HIGH; // terminate
1002 len = k;
1003 // swap list and buffer
1004 int[] temp = list;
1005 list = buffer;
1006 buffer = temp;
1007 return this;
1008 }
1009
1010 private static final int max(int a, int b) {
1011 return (a > b) ? a : b;
1012 }
1013
1014 //----------------------------------------------------------------
1015 // Generic filter-based scanning code
1016 //----------------------------------------------------------------
1017
1018 private static interface Filter {
1019 boolean contains(int codePoint);
1020 }
1021
1022 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
1023
1024 private static class VersionFilter implements Filter {
1025 VersionInfo version;
1026 VersionFilter(VersionInfo version) { this.version = version; }
1027 public boolean contains(int ch) {
1028 VersionInfo v = UCharacter.getAge(ch);
1029 // Reference comparison ok; VersionInfo caches and reuses
1030 // unique objects.
1031 return v != NO_VERSION &&
1032 v.compareTo(version) <= 0;
1033 }
1034 }
1035
1036 private static synchronized UnicodeSet getInclusions(int src) {
1037 if (src != UCharacterProperty.SRC_PROPSVEC) {
1038 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
1039 }
1040
1041 if (INCLUSION == null) {
1042 UnicodeSet incl = new UnicodeSet();
1043 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
1044 INCLUSION = incl;
1045 }
1046 return INCLUSION;
1047 }
1048
1049 /**
1050 * Generic filter-based scanning code for UCD property UnicodeSets.
1051 */
1052 private UnicodeSet applyFilter(Filter filter, int src) {
1053 // Logically, walk through all Unicode characters, noting the start
1054 // and end of each range for which filter.contain(c) is
1055 // true. Add each range to a set.
1056 //
1057 // To improve performance, use an inclusions set which
1058 // encodes information about character ranges that are known
1059 // to have identical properties.
1060 // getInclusions(src) contains exactly the first characters of
1061 // same-value ranges for the given properties "source".
1062
1063 clear();
1064
1065 int startHasProperty = -1;
1066 UnicodeSet inclusions = getInclusions(src);
1067 int limitRange = inclusions.getRangeCount();
1068
1069 for (int j=0; j<limitRange; ++j) {
1070 // get current range
1071 int start = inclusions.getRangeStart(j);
1072 int end = inclusions.getRangeEnd(j);
1073
1074 // for all the code points in the range, process
1075 for (int ch = start; ch <= end; ++ch) {
1076 // only add to the unicodeset on inflection points --
1077 // where the hasProperty value changes to false
1078 if (filter.contains(ch)) {
1079 if (startHasProperty < 0) {
1080 startHasProperty = ch;
1081 }
1082 } else if (startHasProperty >= 0) {
1083 add_unchecked(startHasProperty, ch-1);
1084 startHasProperty = -1;
1085 }
1086 }
1087 }
1088 if (startHasProperty >= 0) {
1089 add_unchecked(startHasProperty, 0x10FFFF);
1090 }
1091
1092 return this;
1093 }
1094
1095 /**
1096 * Is this frozen, according to the Freezable interface?
1097 *
1098 * @return value
1099 * @stable ICU 3.8
1100 */
1101 public boolean isFrozen() {
1102 return (bmpSet != null || stringSpan != null);
1103 }
1104
1105 /**
1106 * Freeze this class, according to the Freezable interface.
1107 *
1108 * @return this
1109 * @stable ICU 4.4
1110 */
1111 public UnicodeSet freeze() {
1112 if (!isFrozen()) {
1113 // Do most of what compact() does before freezing because
1114 // compact() will not work when the set is frozen.
1115 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
1116
1117 // Delete buffer first to defragment memory less.
1118 buffer = null;
1119 if (list.length > (len + GROW_EXTRA)) {
1120 // Make the capacity equal to len or 1.
1121 // We don't want to realloc of 0 size.
1122 int capacity = (len == 0) ? 1 : len;
1123 int[] oldList = list;
1124 list = new int[capacity];
1125 for (int i = capacity; i-- > 0;) {
1126 list[i] = oldList[i];
1127 }
1128 }
1129
1130 // Optimize contains() and span() and similar functions.
1131 if (!strings.isEmpty()) {
1132 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
1133 }
1134 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
1135 // Optimize for code point spans.
1136 // There are no strings, or
1137 // all strings are irrelevant for span() etc. because
1138 // all of each string's code points are contained in this set.
1139 // However, fully contained strings are relevant for spanAndCount(),
1140 // so we create both objects.
1141 bmpSet = new BMPSet(list, len);
1142 }
1143 }
1144 return this;
1145 }
1146
1147 /**
1148 * Span a string using this UnicodeSet.
1149 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1150 * @param s The string to be spanned
1151 * @param spanCondition The span condition
1152 * @return the length of the span
1153 * @stable ICU 4.4
1154 */
1155 public int span(CharSequence s, SpanCondition spanCondition) {
1156 return span(s, 0, spanCondition);
1157 }
1158
1159 /**
1160 * Span a string using this UnicodeSet.
1161 * If the start index is less than 0, span will start from 0.
1162 * If the start index is greater than the string length, span returns the string length.
1163 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1164 * @param s The string to be spanned
1165 * @param start The start index that the span begins
1166 * @param spanCondition The span condition
1167 * @return the string index which ends the span (i.e. exclusive)
1168 * @stable ICU 4.4
1169 */
1170 public int span(CharSequence s, int start, SpanCondition spanCondition) {
1171 int end = s.length();
1172 if (start < 0) {
1173 start = 0;
1174 } else if (start >= end) {
1175 return end;
1176 }
1177 if (bmpSet != null) {
1178 // Frozen set without strings, or no string is relevant for span().
1179 return bmpSet.span(s, start, spanCondition, null);
1180 }
1181 if (stringSpan != null) {
1182 return stringSpan.span(s, start, spanCondition);
1183 } else if (!strings.isEmpty()) {
1184 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
1185 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
1186 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1187 if (strSpan.needsStringSpanUTF16()) {
1188 return strSpan.span(s, start, spanCondition);
1189 }
1190 }
1191
1192 return spanCodePointsAndCount(s, start, spanCondition, null);
1193 }
1194
1195 /**
1196 * Same as span() but also counts the smallest number of set elements on any path across the span.
1197 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1198 * @param outCount An output-only object (must not be null) for returning the count.
1199 * @return the limit (exclusive end) of the span
1200 */
1201 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
1202 if (outCount == null) {
1203 throw new IllegalArgumentException("outCount must not be null");
1204 }
1205 int end = s.length();
1206 if (start < 0) {
1207 start = 0;
1208 } else if (start >= end) {
1209 return end;
1210 }
1211 if (stringSpan != null) {
1212 // We might also have bmpSet != null,
1213 // but fully-contained strings are relevant for counting elements.
1214 return stringSpan.spanAndCount(s, start, spanCondition, outCount);
1215 } else if (bmpSet != null) {
1216 return bmpSet.span(s, start, spanCondition, outCount);
1217 } else if (!strings.isEmpty()) {
1218 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
1219 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
1220 which |= UnicodeSetStringSpan.WITH_COUNT;
1221 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1222 return strSpan.spanAndCount(s, start, spanCondition, outCount);
1223 }
1224
1225 return spanCodePointsAndCount(s, start, spanCondition, outCount);
1226 }
1227
1228 private int spanCodePointsAndCount(CharSequence s, int start,
1229 SpanCondition spanCondition, OutputInt outCount) {
1230 // Pin to 0/1 values.
1231 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
1232
1233 int c;
1234 int next = start;
1235 int length = s.length();
1236 int count = 0;
1237 do {
1238 c = Character.codePointAt(s, next);
1239 if (spanContained != contains(c)) {
1240 break;
1241 }
1242 ++count;
1243 next += Character.charCount(c);
1244 } while (next < length);
1245 if (outCount != null) { outCount.value = count; }
1246 return next;
1247 }
1248
1249 /**
1250 * Span a string backwards (from the fromIndex) using this UnicodeSet.
1251 * If the fromIndex is less than 0, spanBack will return 0.
1252 * If fromIndex is greater than the string length, spanBack will start from the string length.
1253 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
1254 * @param s The string to be spanned
1255 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
1256 * @param spanCondition The span condition
1257 * @return The string index which starts the span (i.e. inclusive).
1258 * @stable ICU 4.4
1259 */
1260 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
1261 if (fromIndex <= 0) {
1262 return 0;
1263 }
1264 if (fromIndex > s.length()) {
1265 fromIndex = s.length();
1266 }
1267 if (bmpSet != null) {
1268 // Frozen set without strings, or no string is relevant for spanBack().
1269 return bmpSet.spanBack(s, fromIndex, spanCondition);
1270 }
1271 if (stringSpan != null) {
1272 return stringSpan.spanBack(s, fromIndex, spanCondition);
1273 } else if (!strings.isEmpty()) {
1274 int which = (spanCondition == SpanCondition.NOT_CONTAINED)
1275 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
1276 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
1277 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
1278 if (strSpan.needsStringSpanUTF16()) {
1279 return strSpan.spanBack(s, fromIndex, spanCondition);
1280 }
1281 }
1282
1283 // Pin to 0/1 values.
1284 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
1285
1286 int c;
1287 int prev = fromIndex;
1288 do {
1289 c = Character.codePointBefore(s, prev);
1290 if (spanContained != contains(c)) {
1291 break;
1292 }
1293 prev -= Character.charCount(c);
1294 } while (prev > 0);
1295 return prev;
1296 }
1297
1298 /**
1299 * Clone a thawed version of this class, according to the Freezable interface.
1300 * @return the clone, not frozen
1301 * @stable ICU 4.4
1302 */
1303 public UnicodeSet cloneAsThawed() {
1304 UnicodeSet result = new UnicodeSet(this);
1305 assert !result.isFrozen();
1306 return result;
1307 }
1308
1309 // internal function
1310 private void checkFrozen() {
1311 if (isFrozen()) {
1312 throw new UnsupportedOperationException("Attempt to modify frozen object");
1313 }
1314 }
1315
1316 /**
1317 * Argument values for whether span() and similar functions continue while the current character is contained vs.
1318 * not contained in the set.
1319 * <p>
1320 * The functionality is straightforward for sets with only single code points, without strings (which is the common
1321 * case):
1322 * <ul>
1323 * <li>CONTAINED and SIMPLE work the same.
1324 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
1325 * <li>span() and spanBack() partition any string the
1326 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
1327 * <li>Using a
1328 * complemented (inverted) set and the opposite span conditions yields the same results.
1329 * </ul>
1330 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
1331 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
1332 * strings:
1333 * <ul>
1334 * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
1335 * Therefore, complementing both the set and the span conditions may yield different results.
1336 * <li>When starting spans
1337 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
1338 * because a set string may start before the later position.
1339 * <li>span(SIMPLE) may be shorter than
1340 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
1341 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
1342 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
1343 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
1344 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
1345 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
1346 * </ul>
1347 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
1348 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
1349 * be used.
1350 * <p>
1351 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
1352 * boundaries, never in the middle of a surrogate pair.
1353 *
1354 * @stable ICU 4.4
1355 */
1356 public enum SpanCondition {
1357 /**
1358 * Continues a span() while there is no set element at the current position.
1359 * Increments by one code point at a time.
1360 * Stops before the first set element (character or string).
1361 * (For code points only, this is like while contains(current)==false).
1362 * <p>
1363 * When span() returns, the substring between where it started and the position it returned consists only of
1364 * characters that are not in the set, and none of its strings overlap with the span.
1365 *
1366 * @stable ICU 4.4
1367 */
1368 NOT_CONTAINED,
1369
1370 /**
1371 * Spans the longest substring that is a concatenation of set elements (characters or strings).
1372 * (For characters only, this is like while contains(current)==true).
1373 * <p>
1374 * When span() returns, the substring between where it started and the position it returned consists only of set
1375 * elements (characters or strings) that are in the set.
1376 * <p>
1377 * If a set contains strings, then the span will be the longest substring for which there
1378 * exists at least one non-overlapping concatenation of set elements (characters or strings).
1379 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
1380 * (Java/ICU/Perl regex stops at the first match of an OR.)
1381 *
1382 * @stable ICU 4.4
1383 */
1384 CONTAINED,
1385
1386 /**
1387 * Continues a span() while there is a set element at the current position.
1388 * Increments by the longest matching element at each position.
1389 * (For characters only, this is like while contains(current)==true).
1390 * <p>
1391 * When span() returns, the substring between where it started and the position it returned consists only of set
1392 * elements (characters or strings) that are in the set.
1393 * <p>
1394 * If a set only contains single characters, then this is the same as CONTAINED.
1395 * <p>
1396 * If a set contains strings, then the span will be the longest substring with a match at each position with the
1397 * longest single set element (character or string).
1398 * <p>
1399 * Use this span condition together with other longest-match algorithms, such as ICU converters
1400 * (ucnv_getUnicodeSet()).
1401 *
1402 * @stable ICU 4.4
1403 */
1404 SIMPLE,
1405 }
1406
1407 }
|