< prev index next >

src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java

Print this page




  90  *       <td nowrap valign="top" align="left"><code>[]</code></td>
  91  *       <td valign="top">No characters</td>
  92  *     </tr><tr align="top">
  93  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
  94  *       <td valign="top">The character 'a'</td>
  95  *     </tr><tr align="top">
  96  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
  97  *       <td valign="top">The characters 'a' and 'e'</td>
  98  *     </tr>
  99  *     <tr>
 100  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
 101  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
 102  *       point order</td>
 103  *     </tr>
 104  *     <tr>
 105  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
 106  *       <td valign="top">The character U+4E01</td>
 107  *     </tr>
 108  *     <tr>
 109  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
 110  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
 111  *       &quot;ac&quot;</td>
 112  *     </tr>
 113  *     <tr>
 114  *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
 115  *       <td valign="top">All characters in the general category Uppercase Letter</td>
 116  *     </tr>
 117  *   </table>
 118  * </blockquote>
 119  *
 120  * Any character may be preceded by a backslash in order to remove any special
 121  * meaning.  White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
 122  * ignored, unless they are escaped.
 123  *
 124  * <p>Property patterns specify a set of characters having a certain
 125  * property as defined by the Unicode standard.  Both the POSIX-like
 126  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
 127  * complete list of supported property patterns, see the User's Guide
 128  * for UnicodeSet at
 129  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
 130  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
 131  * Actual determination of property data is defined by the underlying
 132  * Unicode database as implemented by UCharacter.
 133  *
 134  * <p>Patterns specify individual characters, ranges of characters, and
 135  * Unicode property sets.  When elements are concatenated, they
 136  * specify their union.  To complement a set, place a '^' immediately
 137  * after the opening '['.  Property patterns are inverted by modifying
 138  * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
 139  * '^' has no special meaning.
 140  *
 141  * <p>Ranges are indicated by placing a '-' between two
 142  * characters, as in "a-z".  This specifies the range of all
 143  * characters from the left to the right, in Unicode order.  If the
 144  * left character is greater than or equal to the
 145  * right character it is a syntax error.  If a '-' occurs as the first
 146  * character after the opening '[' or '[^', or if it occurs as the
 147  * last character before the closing ']', then it is taken as a
 148  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 149  * set of three characters, 'a', 'b', and '-'.
 150  *
 151  * <p>Sets may be intersected using the '&' operator or the asymmetric
 152  * set difference may be taken using the '-' operator, for example,
 153  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 154  * with values less than 4096.  Operators ('&' and '-') have equal
 155  * precedence and bind left-to-right.  Thus
 156  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 157  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 158  * difference; intersection is commutative.
 159  *
 160  * <table>
 161  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
 162  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
 163  * through 'z' and all letters in between, in Unicode order
 164  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 165  * all characters but 'a' through 'z',
 166  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 167  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 168  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 169  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
 170  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 171  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 172  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 173  * <em>pat2</em>
 174  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
 175  * <td>The set of characters having the specified
 176  * Unicode property; in
 177  * this case, Unicode uppercase letters
 178  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
 179  * <td>The set of characters <em>not</em> having the given
 180  * Unicode property
 181  * </table>
 182  *
 183  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
 184  *
 185  * <p><b>Formal syntax</b></p>
 186  *
 187  * <blockquote>
 188  *   <table>
 189  *     <tr align="top">


 210  *     <tr align="top">
 211  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
 212  *       <td valign="top"><code>'[' | ']' | '-'<br>
 213  *       </code></td>
 214  *     </tr>
 215  *     <tr align="top">
 216  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
 217  *       <td valign="top"><em>any character that is not</em><code> special<br>
 218  *       | ('\\' </code><em>any character</em><code>)<br>
 219  *       | ('\u' hex hex hex hex)<br>
 220  *       </code></td>
 221  *     </tr>
 222  *     <tr align="top">
 223  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
 224  *       <td valign="top"><em>any character for which
 225  *       </em><code>Character.digit(c, 16)</code><em>
 226  *       returns a non-negative result</em></td>
 227  *     </tr>
 228  *     <tr>
 229  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
 230  *       <td valign="top"><em>a Unicode property set pattern</em></td>
 231  *     </tr>
 232  *   </table>
 233  *   <br>
 234  *   <table border="1">
 235  *     <tr>
 236  *       <td>Legend: <table>
 237  *         <tr>
 238  *           <td nowrap valign="top"><code>a := b</code></td>
 239  *           <td width="20" valign="top">&nbsp; </td>
 240  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
 241  *         </tr>
 242  *         <tr>
 243  *           <td nowrap valign="top"><code>a?</code></td>
 244  *           <td valign="top"></td>
 245  *           <td valign="top">zero or one instance of <code>a</code><br>
 246  *           </td>
 247  *         </tr>
 248  *         <tr>
 249  *           <td nowrap valign="top"><code>a*</code></td>
 250  *           <td valign="top"></td>


 320      * certain ranges.  These ranges are ranges of characters whose
 321      * properties are all exactly alike, e.g. CJK Ideographs from
 322      * U+4E00 to U+9FA5.
 323      */
 324     private static UnicodeSet INCLUSIONS[] = null;
 325 
 326     //----------------------------------------------------------------
 327     // Public API
 328     //----------------------------------------------------------------
 329 
 330     /**
 331      * Constructs an empty set: allocates the backing list and stores HIGH as its only entry.
 332      * @stable ICU 2.0
 333      */
 334     public UnicodeSet() {
 335         list = new int[1 + START_EXTRA]; // one used slot plus START_EXTRA spare slots
 336         list[len++] = HIGH; // NOTE(review): HIGH looks like the list terminator - confirm
 337     }
 338 
 339     /**
 340      * Constructs a set containing the given range. If <code>start &gt;
 341      * end</code> then an empty set is created.
 342      *
 343      * @param start first character, inclusive, of range
 344      * @param end last character, inclusive, of range
 345      * @stable ICU 2.0
 346      */
 347     public UnicodeSet(int start, int end) {
 348         this();
 349         complement(start, end); // complementing a range of an empty set adds exactly that range
 350     }
 351 
 352     /**
 353      * Constructs a set from the given pattern.  See the class description
 354      * for the syntax of the pattern language.  Whitespace is ignored.
 355      * @param pattern a string specifying what characters are in the set
 356      * @exception java.lang.IllegalArgumentException if the pattern contains
 357      * a syntax error.
 358      * @stable ICU 2.0
 359      */
 360     public UnicodeSet(String pattern) {
 361         this();


 634                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
 635                 System.arraycopy(list, i, temp, i+2, len-i);
 636                 list = temp;
 637             } else {
 638                 System.arraycopy(list, i, list, i+2, len-i);
 639             }
 640 
 641             list[i] = c;
 642             list[i+1] = c+1;
 643             len += 2;
 644         }
 645 
 646         pat = null;
 647         return this;
 648     }
 649 
 650     /**
 651      * Adds the specified multicharacter string to this set if it is not already
 652      * present.  If this set already contains the multicharacter,
 653      * the call leaves this set unchanged.
 654      * Thus "ch" =&gt; {"ch"}
 655      * <br><b>Warning: an empty string ("") is rejected with an IllegalArgumentException.</b>
 656      * @param s the source string; if it holds exactly one code point, that code point is added
 657      * @return this object, for chaining
 658      * @stable ICU 2.0
 659      */
 660     public final UnicodeSet add(String s) {
 661         int cp = getSingleCP(s); // -1 unless s is exactly one code point
 662         if (cp < 0) {
 663             strings.add(s); // more than one code point: keep it as a string element
 664             pat = null; // NOTE(review): presumably drops a cached pattern string - confirm
 665         } else {
 666             add_unchecked(cp, cp); // single code point: add it as a one-element range
 667         }
 668         return this;
 669     }
 670 
 671     /**
 672      * Returns the code point of <code>s</code> if it consists of exactly one code point, otherwise -1.
 673      * @param s the string to test; an empty string triggers an IllegalArgumentException
 674      * @return the single code point, or -1 if the string holds more than one
 675      */
 676     private static int getSingleCP(String s) {
 677         if (s.length() < 1) {
 678             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 679         }
 680         if (s.length() > 2) return -1; // three or more UTF-16 units cannot be one code point
 681         if (s.length() == 1) return s.charAt(0); // one UTF-16 unit is returned as-is
 682 
 683         // at this point, s.length() == 2
 684         int cp = UTF16.charAt(s, 0);
 685         if (cp > 0xFFFF) { // supplementary code point, i.e. the two chars form a surrogate pair
 686             return cp;
 687         }
 688         return -1; // two separate code points
 689     }
 690 
 691     /**
 692      * Complements the specified range in this set.  Any character in
 693      * the range will be removed if it is in this set, or will be
 694      * added if it is not in this set.  If <code>start &gt; end</code>
 695      * then an empty range is complemented, leaving the set unchanged.
 696      *
 697      * @param start first character, inclusive, of range to be removed
 698      * from this set.
 699      * @param end last character, inclusive, of range to be removed
 700      * from this set.
 701      * @stable ICU 2.0
 702      */
 703     public UnicodeSet complement(int start, int end) {
 704         if (start < MIN_VALUE || start > MAX_VALUE) {
 705             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 706         }
 707         if (end < MIN_VALUE || end > MAX_VALUE) {
 708             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 709         }
 710         if (start <= end) {
 711             xor(range(start, end), 2, 0);
 712         }
 713         pat = null;
 714         return this;


1681             if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1682                 if (buf.length() == 0 ||
1683                     buf.charAt(buf.length() - 1) == ' ') {
1684                     continue;
1685                 }
1686                 ch = ' '; // convert to ' '
1687             }
1688             UTF16.append(buf, ch);
1689         }
1690         if (buf.length() != 0 &&
1691             buf.charAt(buf.length() - 1) == ' ') {
1692             buf.setLength(buf.length() - 1);
1693         }
1694         return buf.toString();
1695     }
1696 
1697     /**
1698      * Modifies this set to contain those code points which have the
1699      * given value for the given property.  Prior contents of this
1700      * set are lost.
1701      * @param propertyAlias the property name; this implementation only handles "Age"
1702      * @param valueAlias the property value (a version for "Age"); an empty value is rejected
1703      * @param symbols not referenced by this implementation
1704      * (NOTE(review): the earlier text said it is consulted first; the body never reads it - confirm).
1705      * @return this set; otherwise an IllegalArgumentException is thrown
1706      * @stable ICU 3.2
1707      */
1708     public UnicodeSet applyPropertyAlias(String propertyAlias,
1709                                          String valueAlias, SymbolTable symbols) {
1710         if (valueAlias.length() > 0) {
1711             if (propertyAlias.equals("Age")) {
1712                 // Must munge the name first, since
1713                 // VersionInfo.getInstance() does not do
1714                 // 'loose' matching.
1715                 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1716                 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1717                 return this;
1718             }
1719         }
1720         throw new IllegalArgumentException("Unsupported property: " + propertyAlias); // empty value or not "Age"
1721     }
1722 




  90  *       <td nowrap valign="top" align="left"><code>[]</code></td>
  91  *       <td valign="top">No characters</td>
  92  *     </tr><tr align="top">
  93  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
  94  *       <td valign="top">The character 'a'</td>
  95  *     </tr><tr align="top">
  96  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
  97  *       <td valign="top">The characters 'a' and 'e'</td>
  98  *     </tr>
  99  *     <tr>
 100  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
 101  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
 102  *       point order</td>
 103  *     </tr>
 104  *     <tr>
 105  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
 106  *       <td valign="top">The character U+4E01</td>
 107  *     </tr>
 108  *     <tr>
 109  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
 110  *       <td valign="top">The character 'a' and the multicharacter strings "ab" and
 111  *       "ac"</td>
 112  *     </tr>
 113  *     <tr>
 114  *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
 115  *       <td valign="top">All characters in the general category Uppercase Letter</td>
 116  *     </tr>
 117  *   </table>
 118  * </blockquote>
 119  *
 120  * Any character may be preceded by a backslash in order to remove any special
 121  * meaning.  White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
 122  * ignored, unless they are escaped.
 123  *
 124  * <p>Property patterns specify a set of characters having a certain
 125  * property as defined by the Unicode standard.  Both the POSIX-like
 126  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
 127  * complete list of supported property patterns, see the User's Guide
 128  * for UnicodeSet at
 129  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
 130  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
 131  * Actual determination of property data is defined by the underlying
 132  * Unicode database as implemented by UCharacter.
 133  *
 134  * <p>Patterns specify individual characters, ranges of characters, and
 135  * Unicode property sets.  When elements are concatenated, they
 136  * specify their union.  To complement a set, place a '^' immediately
 137  * after the opening '['.  Property patterns are inverted by modifying
 138  * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
 139  * '^' has no special meaning.
 140  *
 141  * <p>Ranges are indicated by placing a '-' between two
 142  * characters, as in "a-z".  This specifies the range of all
 143  * characters from the left to the right, in Unicode order.  If the
 144  * left character is greater than or equal to the
 145  * right character it is a syntax error.  If a '-' occurs as the first
 146  * character after the opening '[' or '[^', or if it occurs as the
 147  * last character before the closing ']', then it is taken as a
 148  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 149  * set of three characters, 'a', 'b', and '-'.
 150  *
 151  * <p>Sets may be intersected using the {@literal '&'} operator or the asymmetric
 152  * set difference may be taken using the '-' operator, for example,
 153  * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters
 154  * with values less than 4096.  Operators ({@literal '&'} and '-') have equal
 155  * precedence and bind left-to-right.  Thus
 156  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 157  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 158  * difference; intersection is commutative.
 159  *
 160  * <table>
 161  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
 162  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
 163  * through 'z' and all letters in between, in Unicode order
 164  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 165  * all characters but 'a' through 'z',
 166  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 167  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 168  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 169  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
 170  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 171  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 172  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 173  * <em>pat2</em>
 174  * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
 175  * <td>The set of characters having the specified
 176  * Unicode property; in
 177  * this case, Unicode uppercase letters
 178  * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
 179  * <td>The set of characters <em>not</em> having the given
 180  * Unicode property
 181  * </table>
 182  *
 183  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
 184  *
 185  * <p><b>Formal syntax</b></p>
 186  *
 187  * <blockquote>
 188  *   <table>
 189  *     <tr align="top">


 210  *     <tr align="top">
 211  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
 212  *       <td valign="top"><code>'[' | ']' | '-'<br>
 213  *       </code></td>
 214  *     </tr>
 215  *     <tr align="top">
 216  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
 217  *       <td valign="top"><em>any character that is not</em><code> special<br>
 218  *       | ('\\' </code><em>any character</em><code>)<br>
 219  *       | ('\u' hex hex hex hex)<br>
 220  *       </code></td>
 221  *     </tr>
 222  *     <tr align="top">
 223  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
 224  *       <td valign="top"><em>any character for which
 225  *       </em><code>Character.digit(c, 16)</code><em>
 226  *       returns a non-negative result</em></td>
 227  *     </tr>
 228  *     <tr>
 229  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
 230  *       <td valign="top"><em>a Unicode property set pattern</em></td>
 231  *     </tr>
 232  *   </table>
 233  *   <br>
 234  *   <table border="1">
 235  *     <tr>
 236  *       <td>Legend: <table>
 237  *         <tr>
 238  *           <td nowrap valign="top"><code>a := b</code></td>
 239  *           <td width="20" valign="top">&nbsp; </td>
 240  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
 241  *         </tr>
 242  *         <tr>
 243  *           <td nowrap valign="top"><code>a?</code></td>
 244  *           <td valign="top"></td>
 245  *           <td valign="top">zero or one instance of <code>a</code><br>
 246  *           </td>
 247  *         </tr>
 248  *         <tr>
 249  *           <td nowrap valign="top"><code>a*</code></td>
 250  *           <td valign="top"></td>


 320      * certain ranges.  These ranges are ranges of characters whose
 321      * properties are all exactly alike, e.g. CJK Ideographs from
 322      * U+4E00 to U+9FA5.
 323      */
 324     private static UnicodeSet INCLUSIONS[] = null;
 325 
 326     //----------------------------------------------------------------
 327     // Public API
 328     //----------------------------------------------------------------
 329 
 330     /**
 331      * Constructs an empty set: allocates the backing list and stores HIGH as its only entry.
 332      * @stable ICU 2.0
 333      */
 334     public UnicodeSet() {
 335         list = new int[1 + START_EXTRA]; // one used slot plus START_EXTRA spare slots
 336         list[len++] = HIGH; // NOTE(review): HIGH looks like the list terminator - confirm
 337     }
 338 
 339     /**
 340      * Constructs a set containing the given range.
 341      * If {@code start > end} then an empty set is created.
 342      *
 343      * @param start first character, inclusive, of range
 344      * @param end last character, inclusive, of range
 345      * @stable ICU 2.0
 346      */
 347     public UnicodeSet(int start, int end) {
 348         this();
 349         complement(start, end); // complementing a range of an empty set adds exactly that range
 350     }
 351 
 352     /**
 353      * Constructs a set from the given pattern.  See the class description
 354      * for the syntax of the pattern language.  Whitespace is ignored.
 355      * @param pattern a string specifying what characters are in the set
 356      * @exception java.lang.IllegalArgumentException if the pattern contains
 357      * a syntax error.
 358      * @stable ICU 2.0
 359      */
 360     public UnicodeSet(String pattern) {
 361         this();


 634                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
 635                 System.arraycopy(list, i, temp, i+2, len-i);
 636                 list = temp;
 637             } else {
 638                 System.arraycopy(list, i, list, i+2, len-i);
 639             }
 640 
 641             list[i] = c;
 642             list[i+1] = c+1;
 643             len += 2;
 644         }
 645 
 646         pat = null;
 647         return this;
 648     }
 649 
 650     /**
 651      * Adds the specified multicharacter string to this set if it is not already
 652      * present.  If this set already contains the multicharacter,
 653      * the call leaves this set unchanged.
 654      * Thus {@code "ch" => {"ch"}}
 655      * <br><b>Warning: an empty string ("") is rejected with an IllegalArgumentException.</b>
 656      * @param s the source string; if it holds exactly one code point, that code point is added
 657      * @return this object, for chaining
 658      * @stable ICU 2.0
 659      */
 660     public final UnicodeSet add(String s) {
 661         int cp = getSingleCP(s); // -1 unless s is exactly one code point
 662         if (cp < 0) {
 663             strings.add(s); // more than one code point: keep it as a string element
 664             pat = null; // NOTE(review): presumably drops a cached pattern string - confirm
 665         } else {
 666             add_unchecked(cp, cp); // single code point: add it as a one-element range
 667         }
 668         return this;
 669     }
 670 
 671     /**
 672      * Returns the code point of {@code s} if it consists of exactly one code point, otherwise -1.
 673      * @param s the string to test; an empty string triggers an IllegalArgumentException
 674      * @return the single code point, or -1 if the string holds more than one
 675      */
 676     private static int getSingleCP(String s) {
 677         if (s.length() < 1) {
 678             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
 679         }
 680         if (s.length() > 2) return -1; // three or more UTF-16 units cannot be one code point
 681         if (s.length() == 1) return s.charAt(0); // one UTF-16 unit is returned as-is
 682 
 683         // at this point, s.length() == 2
 684         int cp = UTF16.charAt(s, 0);
 685         if (cp > 0xFFFF) { // supplementary code point, i.e. the two chars form a surrogate pair
 686             return cp;
 687         }
 688         return -1; // two separate code points
 689     }
 690 
 691     /**
 692      * Complements the specified range in this set.  Any character in
 693      * the range will be removed if it is in this set, or will be
 694      * added if it is not in this set.  If {@code start > end}
 695      * then an empty range is complemented, leaving the set unchanged.
 696      *
 697      * @param start first character, inclusive, of range to be removed
 698      * from this set.
 699      * @param end last character, inclusive, of range to be removed
 700      * from this set.
 701      * @stable ICU 2.0
 702      */
 703     public UnicodeSet complement(int start, int end) {
 704         if (start < MIN_VALUE || start > MAX_VALUE) {
 705             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
 706         }
 707         if (end < MIN_VALUE || end > MAX_VALUE) {
 708             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
 709         }
 710         if (start <= end) {
 711             xor(range(start, end), 2, 0);
 712         }
 713         pat = null;
 714         return this;


1681             if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1682                 if (buf.length() == 0 ||
1683                     buf.charAt(buf.length() - 1) == ' ') {
1684                     continue;
1685                 }
1686                 ch = ' '; // convert to ' '
1687             }
1688             UTF16.append(buf, ch);
1689         }
1690         if (buf.length() != 0 &&
1691             buf.charAt(buf.length() - 1) == ' ') {
1692             buf.setLength(buf.length() - 1);
1693         }
1694         return buf.toString();
1695     }
1696 
1697     /**
1698      * Modifies this set to contain those code points which have the
1699      * given value for the given property.  Prior contents of this
1700      * set are lost.
1701      * @param propertyAlias the property alias; this implementation only handles "Age"
1702      * @param valueAlias the value alias (a version for "Age"); an empty value is rejected
1703      * @param symbols not referenced by this implementation
1704      * (NOTE(review): the earlier text said it is consulted first; the body never reads it - confirm).
1705      * @return this set; otherwise an IllegalArgumentException is thrown
1706      * @stable ICU 3.2
1707      */
1708     public UnicodeSet applyPropertyAlias(String propertyAlias,
1709                                          String valueAlias, SymbolTable symbols) {
1710         if (valueAlias.length() > 0) {
1711             if (propertyAlias.equals("Age")) {
1712                 // Must munge the name first, since
1713                 // VersionInfo.getInstance() does not do
1714                 // 'loose' matching.
1715                 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1716                 applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1717                 return this;
1718             }
1719         }
1720         throw new IllegalArgumentException("Unsupported property: " + propertyAlias); // empty value or not "Age"
1721     }
1722 


< prev index next >