90 * <td nowrap valign="top" align="left"><code>[]</code></td>
91 * <td valign="top">No characters</td>
92 * </tr><tr align="top">
93 * <td nowrap valign="top" align="left"><code>[a]</code></td>
94 * <td valign="top">The character 'a'</td>
95 * </tr><tr align="top">
96 * <td nowrap valign="top" align="left"><code>[ae]</code></td>
97 * <td valign="top">The characters 'a' and 'e'</td>
98 * </tr>
99 * <tr>
100 * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
101 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
102 * point order</td>
103 * </tr>
104 * <tr>
105 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
106 * <td valign="top">The character U+4E01</td>
107 * </tr>
108 * <tr>
109 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
110 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
111 * "ac"</td>
112 * </tr>
113 * <tr>
114 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
115 * <td valign="top">All characters in the general category Uppercase Letter</td>
116 * </tr>
117 * </table>
118 * </blockquote>
119 *
120 * Any character may be preceded by a backslash in order to remove any special
121 * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
122 * ignored, unless they are escaped.
123 *
124 * <p>Property patterns specify a set of characters having a certain
125 * property as defined by the Unicode standard. Both the POSIX-like
126 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
127 * complete list of supported property patterns, see the User's Guide
128 * for UnicodeSet at
129 * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
130 * http://www.icu-project.org/userguide/unicodeSet.html</a>.
131 * Actual determination of property data is defined by the underlying
132 * Unicode database as implemented by UCharacter.
133 *
134 * <p>Patterns specify individual characters, ranges of characters, and
135 * Unicode property sets. When elements are concatenated, they
136 * specify their union. To complement a set, place a '^' immediately
137 * after the opening '['. Property patterns are inverted by modifying
138 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
139 * '^' has no special meaning.
140 *
141 * <p>Ranges are indicated by placing a '-' between two
142 * characters, as in "a-z". This specifies the range of all
143 * characters from the left to the right, in Unicode order. If the
144 * left character is greater than or equal to the
145 * right character it is a syntax error. If a '-' occurs as the first
146 * character after the opening '[' or '[^', or if it occurs as the
147 * last character before the closing ']', then it is taken as a
148 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
149 * set of three characters, 'a', 'b', and '-'.
150 *
151 * <p>Sets may be intersected using the '&' operator or the asymmetric
152 * set difference may be taken using the '-' operator, for example,
153 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
154 * with values less than 4096. Operators ('&' and '-') have equal
155 * precedence and bind left-to-right. Thus
156 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
157 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
158 * difference; intersection is commutative.
159 *
160 * <table>
161 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
162 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
163 * through 'z' and all letters in between, in Unicode order
164 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
165 * all characters but 'a' through 'z',
166 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
167 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
168 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
169 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
170 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
171 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
172 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
173 * <em>pat2</em>
174 * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
175 * <td>The set of characters having the specified
176 * Unicode property; in
177 * this case, Unicode uppercase letters
178 * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
179 * <td>The set of characters <em>not</em> having the given
180 * Unicode property
181 * </table>
182 *
183 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
184 *
185 * <p><b>Formal syntax</b></p>
186 *
187 * <blockquote>
188 * <table>
189 * <tr align="top">
210 * <tr align="top">
211 * <td nowrap valign="top" align="right"><code>special := </code></td>
212 * <td valign="top"><code>'[' | ']' | '-'<br>
213 * </code></td>
214 * </tr>
215 * <tr align="top">
216 * <td nowrap valign="top" align="right"><code>char := </code></td>
217 * <td valign="top"><em>any character that is not</em><code> special<br>
218 * | ('\\' </code><em>any character</em><code>)<br>
219 * | ('\\u' hex hex hex hex)<br>
220 * </code></td>
221 * </tr>
222 * <tr align="top">
223 * <td nowrap valign="top" align="right"><code>hex := </code></td>
224 * <td valign="top"><em>any character for which
225 * </em><code>Character.digit(c, 16)</code><em>
226 * returns a non-negative result</em></td>
227 * </tr>
228 * <tr>
229 * <td nowrap valign="top" align="right"><code>property := </code></td>
230 * <td valign="top"><em>a Unicode property set pattern</em></td>
231 * </tr>
232 * </table>
233 * <br>
234 * <table border="1">
235 * <tr>
236 * <td>Legend: <table>
237 * <tr>
238 * <td nowrap valign="top"><code>a := b</code></td>
239 * <td width="20" valign="top"> </td>
240 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
241 * </tr>
242 * <tr>
243 * <td nowrap valign="top"><code>a?</code></td>
244 * <td valign="top"></td>
245 * <td valign="top">zero or one instance of <code>a</code><br>
246 * </td>
247 * </tr>
248 * <tr>
249 * <td nowrap valign="top"><code>a*</code></td>
250 * <td valign="top"></td>
320 * certain ranges. These ranges are ranges of characters whose
321 * properties are all exactly alike, e.g. CJK Ideographs from
322 * U+4E00 to U+9FA5.
323 */
324 private static UnicodeSet INCLUSIONS[] = null;
325
326 //----------------------------------------------------------------
327 // Public API
328 //----------------------------------------------------------------
329
330 /**
331 * Constructs an empty set; the backing list holds only the HIGH entry.
332 * @stable ICU 2.0
333 */
334 public UnicodeSet() {
335 list = new int[1 + START_EXTRA]; // one slot for the entry below plus START_EXTRA spare slots
336 list[len++] = HIGH; // store HIGH as the only entry and advance len past it
337 }
338
339 /**
340 * Constructs a set containing the given range. If
341 * {@code start > end} then an empty set is created.
342 *
343 * @param start first character, inclusive, of range
344 * @param end last character, inclusive, of range
345 * @stable ICU 2.0
346 */
347 public UnicodeSet(int start, int end) {
348 this();
349 complement(start, end); // complementing a range of the empty set adds exactly that range
350 }
351
352 /**
353 * Constructs a set from the given pattern. See the class description
354 * for the syntax of the pattern language. Whitespace is ignored.
355 * @param pattern a string specifying what characters are in the set
356 * @exception java.lang.IllegalArgumentException if the pattern contains
357 * a syntax error.
358 * @stable ICU 2.0
359 */
360 public UnicodeSet(String pattern) {
361 this();
634 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
635 System.arraycopy(list, i, temp, i+2, len-i);
636 list = temp;
637 } else {
638 System.arraycopy(list, i, list, i+2, len-i);
639 }
640
641 list[i] = c;
642 list[i+1] = c+1;
643 len += 2;
644 }
645
646 pat = null;
647 return this;
648 }
649
650 /**
651 * Adds the specified multicharacter to this set if it is not already
652 * present. If this set already contains the multicharacter,
653 * the call leaves this set unchanged.
654 * Thus {@code "ch" => {"ch"}}. A string holding exactly one code point is added as that code point.
655 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet; an IllegalArgumentException is thrown.</b>
656 * @param s the source string
657 * @return this object, for chaining
658 * @stable ICU 2.0
659 */
660 public final UnicodeSet add(String s) {
661 int cp = getSingleCP(s); // negative when s is not exactly one code point
662 if (cp < 0) {
663 strings.add(s);
664 pat = null; // presumably drops a cached pattern string -- TODO confirm
665 } else {
666 add_unchecked(cp, cp);
667 }
668 return this;
669 }
670
671 /**
672 * @param s string to test
673 * @return the code point if the string consists of exactly one code point, otherwise -1
674 * @throws IllegalArgumentException if s is empty
675 */
676 private static int getSingleCP(String s) {
677 if (s.length() < 1) {
678 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
679 }
680 if (s.length() > 2) return -1; // more than two code units cannot be a single code point
681 if (s.length() == 1) return s.charAt(0); // NOTE(review): a lone surrogate is returned as-is
682
683 // at this point, s.length() == 2
684 int cp = UTF16.charAt(s, 0);
685 if (cp > 0xFFFF) { // is surrogate pair
686 return cp;
687 }
688 return -1;
689 }
690
691 /**
692 * Complements the specified range in this set. Any character in
693 * the range will be removed if it is in this set, or will be
694 * added if it is not in this set. If <code>start > end</code>
695 * then an empty range is complemented, leaving the set unchanged.
696 *
697 * @param start first character, inclusive, of range to be complemented
698 * in this set.
699 * @param end last character, inclusive, of range to be complemented
700 * in this set.
701 * @stable ICU 2.0
702 */
703 public UnicodeSet complement(int start, int end) {
704 if (start < MIN_VALUE || start > MAX_VALUE) {
705 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
706 }
707 if (end < MIN_VALUE || end > MAX_VALUE) {
708 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
709 }
710 if (start <= end) {
711 xor(range(start, end), 2, 0);
712 }
713 pat = null;
714 return this;
1681 if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1682 if (buf.length() == 0 ||
1683 buf.charAt(buf.length() - 1) == ' ') {
1684 continue;
1685 }
1686 ch = ' '; // convert to ' '
1687 }
1688 UTF16.append(buf, ch);
1689 }
1690 if (buf.length() != 0 &&
1691 buf.charAt(buf.length() - 1) == ' ') {
1692 buf.setLength(buf.length() - 1);
1693 }
1694 return buf.toString();
1695 }
1696
1697 /**
1698 * Modifies this set to contain those code points which have the
1699 * given value for the given property. Prior contents of this
1700 * set are lost.
1701 * @param propertyAlias the property name; only "Age" (exact match) is handled here
1702 * @param valueAlias the property value; for "Age", a version string. Must be non-empty
1703 * @param symbols if not null, then symbols are first called to see if a property
1704 * is available. If true, then everything else is skipped. NOTE(review): not referenced in this body.
1705 * @return this set
1706 * @stable ICU 3.2
1707 */
1708 public UnicodeSet applyPropertyAlias(String propertyAlias,
1709 String valueAlias, SymbolTable symbols) {
1710 if (valueAlias.length() > 0) {
1711 if (propertyAlias.equals("Age")) {
1712 // Must munge name, since
1713 // VersionInfo.getInstance() does not do
1714 // 'loose' matching.
1715 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1716 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1717 return this;
1718 }
1719 }
1720 throw new IllegalArgumentException("Unsupported property: " + propertyAlias); // any other property, or an empty value
1721 }
1722
|
90 * <td nowrap valign="top" align="left"><code>[]</code></td>
91 * <td valign="top">No characters</td>
92 * </tr><tr align="top">
93 * <td nowrap valign="top" align="left"><code>[a]</code></td>
94 * <td valign="top">The character 'a'</td>
95 * </tr><tr align="top">
96 * <td nowrap valign="top" align="left"><code>[ae]</code></td>
97 * <td valign="top">The characters 'a' and 'e'</td>
98 * </tr>
99 * <tr>
100 * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
101 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
102 * point order</td>
103 * </tr>
104 * <tr>
105 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
106 * <td valign="top">The character U+4E01</td>
107 * </tr>
108 * <tr>
109 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
110 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
111 * "ac"</td>
112 * </tr>
113 * <tr>
114 * <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
115 * <td valign="top">All characters in the general category Uppercase Letter</td>
116 * </tr>
117 * </table>
118 * </blockquote>
119 *
120 * Any character may be preceded by a backslash in order to remove any special
121 * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
122 * ignored, unless they are escaped.
123 *
124 * <p>Property patterns specify a set of characters having a certain
125 * property as defined by the Unicode standard. Both the POSIX-like
126 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
127 * complete list of supported property patterns, see the User's Guide
128 * for UnicodeSet at
129 * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
130 * http://www.icu-project.org/userguide/unicodeSet.html</a>.
131 * Actual determination of property data is defined by the underlying
132 * Unicode database as implemented by UCharacter.
133 *
134 * <p>Patterns specify individual characters, ranges of characters, and
135 * Unicode property sets. When elements are concatenated, they
136 * specify their union. To complement a set, place a '^' immediately
137 * after the opening '['. Property patterns are inverted by modifying
138 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
139 * '^' has no special meaning.
140 *
141 * <p>Ranges are indicated by placing a '-' between two
142 * characters, as in "a-z". This specifies the range of all
143 * characters from the left to the right, in Unicode order. If the
144 * left character is greater than or equal to the
145 * right character it is a syntax error. If a '-' occurs as the first
146 * character after the opening '[' or '[^', or if it occurs as the
147 * last character before the closing ']', then it is taken as a
148 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
149 * set of three characters, 'a', 'b', and '-'.
150 *
151 * <p>Sets may be intersected using the {@literal '&'} operator or the asymmetric
152 * set difference may be taken using the '-' operator, for example,
153 * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters
154 * with values less than 4096. Operators ({@literal '&'} and '-') have equal
155 * precedence and bind left-to-right. Thus
156 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
157 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
158 * difference; intersection is commutative.
159 *
160 * <table>
161 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
162 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
163 * through 'z' and all letters in between, in Unicode order
164 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
165 * all characters but 'a' through 'z',
166 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
167 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
168 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
169 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
170 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
171 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
172 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
173 * <em>pat2</em>
174 * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
175 * <td>The set of characters having the specified
176 * Unicode property; in
177 * this case, Unicode uppercase letters
178 * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
179 * <td>The set of characters <em>not</em> having the given
180 * Unicode property
181 * </table>
182 *
183 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
184 *
185 * <p><b>Formal syntax</b></p>
186 *
187 * <blockquote>
188 * <table>
189 * <tr align="top">
210 * <tr align="top">
211 * <td nowrap valign="top" align="right"><code>special := </code></td>
212 * <td valign="top"><code>'[' | ']' | '-'<br>
213 * </code></td>
214 * </tr>
215 * <tr align="top">
216 * <td nowrap valign="top" align="right"><code>char := </code></td>
217 * <td valign="top"><em>any character that is not</em><code> special<br>
218 * | ('\\' </code><em>any character</em><code>)<br>
219 * | ('\\u' hex hex hex hex)<br>
220 * </code></td>
221 * </tr>
222 * <tr align="top">
223 * <td nowrap valign="top" align="right"><code>hex := </code></td>
224 * <td valign="top"><em>any character for which
225 * </em><code>Character.digit(c, 16)</code><em>
226 * returns a non-negative result</em></td>
227 * </tr>
228 * <tr>
229 * <td nowrap valign="top" align="right"><code>property := </code></td>
230 * <td valign="top"><em>a Unicode property set pattern</em></td>
231 * </tr>
232 * </table>
233 * <br>
234 * <table border="1">
235 * <tr>
236 * <td>Legend: <table>
237 * <tr>
238 * <td nowrap valign="top"><code>a := b</code></td>
239 * <td width="20" valign="top"> </td>
240 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
241 * </tr>
242 * <tr>
243 * <td nowrap valign="top"><code>a?</code></td>
244 * <td valign="top"></td>
245 * <td valign="top">zero or one instance of <code>a</code><br>
246 * </td>
247 * </tr>
248 * <tr>
249 * <td nowrap valign="top"><code>a*</code></td>
250 * <td valign="top"></td>
320 * certain ranges. These ranges are ranges of characters whose
321 * properties are all exactly alike, e.g. CJK Ideographs from
322 * U+4E00 to U+9FA5.
323 */
324 private static UnicodeSet INCLUSIONS[] = null;
325
326 //----------------------------------------------------------------
327 // Public API
328 //----------------------------------------------------------------
329
330 /**
331 * Constructs an empty set; the backing list holds only the HIGH entry.
332 * @stable ICU 2.0
333 */
334 public UnicodeSet() {
335 list = new int[1 + START_EXTRA]; // one slot for the entry below plus START_EXTRA spare slots
336 list[len++] = HIGH; // store HIGH as the only entry and advance len past it
337 }
338
339 /**
340 * Constructs a set containing the given range.
341 * If {@code start > end} then an empty set is created.
342 *
343 * @param start first character, inclusive, of range
344 * @param end last character, inclusive, of range
345 * @stable ICU 2.0
346 */
347 public UnicodeSet(int start, int end) {
348 this();
349 complement(start, end); // complementing a range of the empty set adds exactly that range
350 }
351
352 /**
353 * Constructs a set from the given pattern. See the class description
354 * for the syntax of the pattern language. Whitespace is ignored.
355 * @param pattern a string specifying what characters are in the set
356 * @exception java.lang.IllegalArgumentException if the pattern contains
357 * a syntax error.
358 * @stable ICU 2.0
359 */
360 public UnicodeSet(String pattern) {
361 this();
634 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
635 System.arraycopy(list, i, temp, i+2, len-i);
636 list = temp;
637 } else {
638 System.arraycopy(list, i, list, i+2, len-i);
639 }
640
641 list[i] = c;
642 list[i+1] = c+1;
643 len += 2;
644 }
645
646 pat = null;
647 return this;
648 }
649
650 /**
651 * Adds the specified multicharacter to this set if it is not already
652 * present. If this set already contains the multicharacter,
653 * the call leaves this set unchanged.
654 * Thus {@code "ch" => {"ch"}}. A string holding exactly one code point is added as that code point.
655 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet; an IllegalArgumentException is thrown.</b>
656 * @param s the source string
657 * @return this object, for chaining
658 * @stable ICU 2.0
659 */
660 public final UnicodeSet add(String s) {
661 int cp = getSingleCP(s); // negative when s is not exactly one code point
662 if (cp < 0) {
663 strings.add(s);
664 pat = null; // presumably drops a cached pattern string -- TODO confirm
665 } else {
666 add_unchecked(cp, cp);
667 }
668 return this;
669 }
670
671 /**
672 * @param s string to test
673 * @return the code point if the string consists of exactly one code point, otherwise -1
674 * @throws IllegalArgumentException if s is empty
675 */
676 private static int getSingleCP(String s) {
677 if (s.length() < 1) {
678 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
679 }
680 if (s.length() > 2) return -1; // more than two code units cannot be a single code point
681 if (s.length() == 1) return s.charAt(0); // NOTE(review): a lone surrogate is returned as-is
682
683 // at this point, s.length() == 2
684 int cp = UTF16.charAt(s, 0);
685 if (cp > 0xFFFF) { // is surrogate pair
686 return cp;
687 }
688 return -1;
689 }
690
691 /**
692 * Complements the specified range in this set. Any character in
693 * the range will be removed if it is in this set, or will be
694 * added if it is not in this set. If {@code start > end}
695 * then an empty range is complemented, leaving the set unchanged.
696 *
697 * @param start first character, inclusive, of range to be complemented
698 * in this set.
699 * @param end last character, inclusive, of range to be complemented
700 * in this set.
701 * @stable ICU 2.0
702 */
703 public UnicodeSet complement(int start, int end) {
704 if (start < MIN_VALUE || start > MAX_VALUE) {
705 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
706 }
707 if (end < MIN_VALUE || end > MAX_VALUE) {
708 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
709 }
710 if (start <= end) {
711 xor(range(start, end), 2, 0);
712 }
713 pat = null;
714 return this;
1681 if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1682 if (buf.length() == 0 ||
1683 buf.charAt(buf.length() - 1) == ' ') {
1684 continue;
1685 }
1686 ch = ' '; // convert to ' '
1687 }
1688 UTF16.append(buf, ch);
1689 }
1690 if (buf.length() != 0 &&
1691 buf.charAt(buf.length() - 1) == ' ') {
1692 buf.setLength(buf.length() - 1);
1693 }
1694 return buf.toString();
1695 }
1696
1697 /**
1698 * Modifies this set to contain those code points which have the
1699 * given value for the given property. Prior contents of this
1700 * set are lost.
1701 * @param propertyAlias the property alias; only "Age" (exact match) is handled here
1702 * @param valueAlias the value alias; for "Age", a version string. Must be non-empty
1703 * @param symbols if not null, then symbols are first called to see if a property
1704 * is available. If true, then everything else is skipped. NOTE(review): not referenced in this body.
1705 * @return this set
1706 * @stable ICU 3.2
1707 */
1708 public UnicodeSet applyPropertyAlias(String propertyAlias,
1709 String valueAlias, SymbolTable symbols) {
1710 if (valueAlias.length() > 0) {
1711 if (propertyAlias.equals("Age")) {
1712 // Must munge name, since
1713 // VersionInfo.getInstance() does not do
1714 // 'loose' matching.
1715 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1716 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1717 return this;
1718 }
1719 }
1720 throw new IllegalArgumentException("Unsupported property: " + propertyAlias); // any other property, or an empty value
1721 }
1722
|