1 /*
   2  * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.util.HashMap;
  29 import java.util.Locale;
  30 import java.util.regex.Pattern.CharPredicate;
  31 import java.util.regex.Pattern.BmpCharPredicate;
  32 
  33 class CharPredicates {
  34 
  35     static final CharPredicate ALPHABETIC  = Character::isAlphabetic;
  36 
  37     // \p{gc=Decimal_Number}
  38     static final CharPredicate DIGIT       = Character::isDigit;
  39 
  40     static final CharPredicate LETTER      = Character::isLetter;
  41 
  42     static final CharPredicate IDEOGRAPHIC = Character::isIdeographic;
  43 
  44     static final CharPredicate LOWERCASE   = Character::isLowerCase;
  45 
  46     static final CharPredicate UPPERCASE   = Character::isUpperCase;
  47 
  48     static final CharPredicate TITLECASE   = Character::isTitleCase;
  49 
  50     // \p{Whitespace}
  51     static final CharPredicate WHITE_SPACE = ch ->
  52         ((((1 << Character.SPACE_SEPARATOR) |
  53            (1 << Character.LINE_SEPARATOR) |
  54            (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
  55         != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
  56 
  57     // \p{gc=Control}
  58     static final CharPredicate CONTROL     = ch ->
  59         Character.getType(ch) == Character.CONTROL;
  60 
  61     // \p{gc=Punctuation}
  62     static final CharPredicate PUNCTUATION = ch ->
  63         ((((1 << Character.CONNECTOR_PUNCTUATION) |
  64            (1 << Character.DASH_PUNCTUATION) |
  65            (1 << Character.START_PUNCTUATION) |
  66            (1 << Character.END_PUNCTUATION) |
  67            (1 << Character.OTHER_PUNCTUATION) |
  68            (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
  69            (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
  70         != 0;
  71 
  72     // \p{gc=Decimal_Number}
  73     // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
  74     static final CharPredicate HEX_DIGIT = DIGIT.union(
  75         ch -> (ch >= 0x0030 && ch <= 0x0039) ||
  76               (ch >= 0x0041 && ch <= 0x0046) ||
  77               (ch >= 0x0061 && ch <= 0x0066) ||
  78               (ch >= 0xFF10 && ch <= 0xFF19) ||
  79               (ch >= 0xFF21 && ch <= 0xFF26) ||
  80               (ch >= 0xFF41 && ch <= 0xFF46));
  81 
  82     static final CharPredicate ASSIGNED = ch ->
  83         Character.getType(ch) != Character.UNASSIGNED;
  84 
  85     // PropList.txt:Noncharacter_Code_Point
  86     static final CharPredicate NONCHARACTER_CODE_POINT = ch ->
  87         (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
  88 
  89     // \p{alpha}
  90     // \p{digit}
  91     static final CharPredicate ALNUM = ALPHABETIC.union(DIGIT);
  92 
  93     // \p{Whitespace} --
  94     // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
  95     //  \p{gc=Line_Separator}
  96     //  \p{gc=Paragraph_Separator}]
  97     static final CharPredicate BLANK = ch ->
  98         Character.getType(ch) == Character.SPACE_SEPARATOR ||
  99         ch == 0x9; // \N{HT}
 100 
 101     // [^
 102     //  \p{space}
 103     //  \p{gc=Control}
 104     //  \p{gc=Surrogate}
 105     //  \p{gc=Unassigned}]
 106     static final CharPredicate GRAPH = ch ->
 107         ((((1 << Character.SPACE_SEPARATOR) |
 108            (1 << Character.LINE_SEPARATOR) |
 109            (1 << Character.PARAGRAPH_SEPARATOR) |
 110            (1 << Character.CONTROL) |
 111            (1 << Character.SURROGATE) |
 112            (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
 113         == 0;
 114 
 115     // \p{graph}
 116     // \p{blank}
 117     // -- \p{cntrl}
 118     static final CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate());
 119 
 120     //  200C..200D    PropList.txt:Join_Control
 121     static final CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D;
 122 
 123     //  \p{alpha}
 124     //  \p{gc=Mark}
 125     //  \p{digit}
 126     //  \p{gc=Connector_Punctuation}
 127     //  \p{Join_Control}    200C..200D
 128     static final CharPredicate WORD =
 129         ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) |
 130                                   (1 << Character.ENCLOSING_MARK) |
 131                                   (1 << Character.COMBINING_SPACING_MARK) |
 132                                   (1 << Character.DECIMAL_DIGIT_NUMBER) |
 133                                   (1 << Character.CONNECTOR_PUNCTUATION))
 134                                  >> Character.getType(ch)) & 1) != 0,
 135                          JOIN_CONTROL);
 136 
 137     /////////////////////////////////////////////////////////////////////////////
 138 
 139     private static final HashMap<String, CharPredicate> posix = new HashMap<>(12);
 140     private static final HashMap<String, CharPredicate> uprops = new HashMap<>(18);
 141 
 142     private static void defPosix(String name, CharPredicate p) {
 143         posix.put(name, p);
 144     }
 145     private static void defUProp(String name, CharPredicate p) {
 146         uprops.put(name, p);
 147     }
 148 
 149     static {
 150         defPosix("ALPHA", ALPHABETIC);
 151         defPosix("LOWER", LOWERCASE);
 152         defPosix("UPPER", UPPERCASE);
 153         defPosix("SPACE", WHITE_SPACE);
 154         defPosix("PUNCT", PUNCTUATION);
 155         defPosix("XDIGIT",HEX_DIGIT);
 156         defPosix("ALNUM", ALNUM);
 157         defPosix("CNTRL", CONTROL);
 158         defPosix("DIGIT", DIGIT);
 159         defPosix("BLANK", BLANK);
 160         defPosix("GRAPH", GRAPH);
 161         defPosix("PRINT", PRINT);
 162 
 163         defUProp("ALPHABETIC", ALPHABETIC);
 164         defUProp("ASSIGNED", ASSIGNED);
 165         defUProp("CONTROL", CONTROL);
 166         defUProp("HEXDIGIT", HEX_DIGIT);
 167         defUProp("IDEOGRAPHIC", IDEOGRAPHIC);
 168         defUProp("JOINCONTROL", JOIN_CONTROL);
 169         defUProp("LETTER", LETTER);
 170         defUProp("LOWERCASE", LOWERCASE);
 171         defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
 172         defUProp("TITLECASE", TITLECASE);
 173         defUProp("PUNCTUATION", PUNCTUATION);
 174         defUProp("UPPERCASE", UPPERCASE);
 175         defUProp("WHITESPACE", WHITE_SPACE);
 176         defUProp("WORD", WORD);
 177         defUProp("WHITE_SPACE", WHITE_SPACE);
 178         defUProp("HEX_DIGIT", HEX_DIGIT);
 179         defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT);
 180         defUProp("JOIN_CONTROL", JOIN_CONTROL);
 181     }
 182 
 183     public static CharPredicate forUnicodeProperty(String propName) {
 184         propName = propName.toUpperCase(Locale.ROOT);
 185         CharPredicate p = uprops.get(propName);
 186         if (p != null)
 187             return p;
 188         return posix.get(propName);
 189     }
 190 
 191     public static CharPredicate forPOSIXName(String propName) {
 192         return posix.get(propName.toUpperCase(Locale.ENGLISH));
 193     }
 194 
 195     /////////////////////////////////////////////////////////////////////////////
 196 
 197     /**
 198      * Returns a predicate matching all characters belong to a named
 199      * UnicodeScript.
 200      */
 201     static CharPredicate forUnicodeScript(String name) {
 202         final Character.UnicodeScript script;
 203         try {
 204             script = Character.UnicodeScript.forName(name);
 205             return ch -> script == Character.UnicodeScript.of(ch);
 206         } catch (IllegalArgumentException iae) {}
 207         return null;
 208     }
 209 
 210     /**
 211      * Returns a predicate matching all characters in a UnicodeBlock.
 212      */
 213     static CharPredicate forUnicodeBlock(String name) {
 214         final Character.UnicodeBlock block;
 215         try {
 216             block = Character.UnicodeBlock.forName(name);
 217             return ch -> block == Character.UnicodeBlock.of(ch);
 218         } catch (IllegalArgumentException iae) {}
 219          return null;
 220     }
 221 
 222     /////////////////////////////////////////////////////////////////////////////
 223 
 224     // unicode categories, aliases, properties, java methods ...
 225 
 226     private static final HashMap<String, CharPredicate> props = new HashMap<>(128);
 227 
 228     /**
 229      * Returns a predicate matching all characters in a named property.
 230      */
 231     static CharPredicate forProperty(String name) {
 232         return props.get(name);
 233     }
 234 
 235     private static void defProp(String name, CharPredicate p) {
 236         props.put(name, p);
 237     }
 238 
 239     private static void defCategory(String name, final int typeMask) {
 240         CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0;
 241         props.put(name, p);
 242     }
 243 
 244     private static void defRange(String name, final int lower, final int upper) {
 245         BmpCharPredicate p = ch -> lower <= ch && ch <= upper;
 246         props.put(name, p);
 247     }
 248 
 249     private static void defCtype(String name, final int ctype) {
 250         BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype);
 251         // PrintPattern.pmap.put(p, name);
 252         props.put(name, p);
 253     }
 254 
 255     static {
 256         // Unicode character property aliases, defined in
 257         // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
 258         defCategory("Cn", 1<<Character.UNASSIGNED);
 259         defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
 260         defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
 261         defCategory("Lt", 1<<Character.TITLECASE_LETTER);
 262         defCategory("Lm", 1<<Character.MODIFIER_LETTER);
 263         defCategory("Lo", 1<<Character.OTHER_LETTER);
 264         defCategory("Mn", 1<<Character.NON_SPACING_MARK);
 265         defCategory("Me", 1<<Character.ENCLOSING_MARK);
 266         defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
 267         defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
 268         defCategory("Nl", 1<<Character.LETTER_NUMBER);
 269         defCategory("No", 1<<Character.OTHER_NUMBER);
 270         defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
 271         defCategory("Zl", 1<<Character.LINE_SEPARATOR);
 272         defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
 273         defCategory("Cc", 1<<Character.CONTROL);
 274         defCategory("Cf", 1<<Character.FORMAT);
 275         defCategory("Co", 1<<Character.PRIVATE_USE);
 276         defCategory("Cs", 1<<Character.SURROGATE);
 277         defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
 278         defCategory("Ps", 1<<Character.START_PUNCTUATION);
 279         defCategory("Pe", 1<<Character.END_PUNCTUATION);
 280         defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
 281         defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
 282         defCategory("Sm", 1<<Character.MATH_SYMBOL);
 283         defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
 284         defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
 285         defCategory("So", 1<<Character.OTHER_SYMBOL);
 286         defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
 287         defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
 288         defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
 289                           (1<<Character.LOWERCASE_LETTER) |
 290                           (1<<Character.TITLECASE_LETTER) |
 291                           (1<<Character.MODIFIER_LETTER)  |
 292                           (1<<Character.OTHER_LETTER)));
 293         defCategory("M", ((1<<Character.NON_SPACING_MARK) |
 294                           (1<<Character.ENCLOSING_MARK)   |
 295                           (1<<Character.COMBINING_SPACING_MARK)));
 296         defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
 297                           (1<<Character.LETTER_NUMBER)        |
 298                           (1<<Character.OTHER_NUMBER)));
 299         defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
 300                           (1<<Character.LINE_SEPARATOR)  |
 301                           (1<<Character.PARAGRAPH_SEPARATOR)));
 302         defCategory("C", ((1<<Character.CONTROL)     |
 303                           (1<<Character.FORMAT)      |
 304                           (1<<Character.PRIVATE_USE) |
 305                           (1<<Character.SURROGATE))); // Other
 306         defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
 307                           (1<<Character.START_PUNCTUATION)     |
 308                           (1<<Character.END_PUNCTUATION)       |
 309                           (1<<Character.CONNECTOR_PUNCTUATION) |
 310                           (1<<Character.OTHER_PUNCTUATION)     |
 311                           (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
 312                           (1<<Character.FINAL_QUOTE_PUNCTUATION)));
 313         defCategory("S", ((1<<Character.MATH_SYMBOL)     |
 314                           (1<<Character.CURRENCY_SYMBOL) |
 315                           (1<<Character.MODIFIER_SYMBOL) |
 316                           (1<<Character.OTHER_SYMBOL)));
 317         defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
 318                            (1<<Character.LOWERCASE_LETTER) |
 319                            (1<<Character.TITLECASE_LETTER)));
 320         defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
 321                            (1<<Character.LOWERCASE_LETTER) |
 322                            (1<<Character.TITLECASE_LETTER) |
 323                            (1<<Character.MODIFIER_LETTER)  |
 324                            (1<<Character.OTHER_LETTER)     |
 325                            (1<<Character.DECIMAL_DIGIT_NUMBER)));
 326         defRange("L1", 0x00, 0xFF); // Latin-1
 327         props.put("all", ch -> true);
 328 
 329         // Posix regular expression character classes, defined in
 330         // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
 331         defRange("ASCII", 0x00, 0x7F);   // ASCII
 332         defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
 333         defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
 334         defCtype("Blank", ASCII.BLANK);  // Space and tab characters
 335         defCtype("Cntrl", ASCII.CNTRL);  // Control characters
 336         defRange("Digit", '0', '9');     // Numeric characters
 337         defCtype("Graph", ASCII.GRAPH);  // printable and visible
 338         defRange("Lower", 'a', 'z');     // Lower-case alphabetic
 339         defRange("Print", 0x20, 0x7E);   // Printable characters
 340         defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
 341         defCtype("Space", ASCII.SPACE);  // Space characters
 342         defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
 343         defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
 344 
 345         // Java character properties, defined by methods in Character.java
 346         defProp("javaLowerCase", java.lang.Character::isLowerCase);
 347         defProp("javaUpperCase",  Character::isUpperCase);
 348         defProp("javaAlphabetic", java.lang.Character::isAlphabetic);
 349         defProp("javaIdeographic", java.lang.Character::isIdeographic);
 350         defProp("javaTitleCase", java.lang.Character::isTitleCase);
 351         defProp("javaDigit", java.lang.Character::isDigit);
 352         defProp("javaDefined", java.lang.Character::isDefined);
 353         defProp("javaLetter", java.lang.Character::isLetter);
 354         defProp("javaLetterOrDigit", java.lang.Character::isLetterOrDigit);
 355         defProp("javaJavaIdentifierStart", java.lang.Character::isJavaIdentifierStart);
 356         defProp("javaJavaIdentifierPart", java.lang.Character::isJavaIdentifierPart);
 357         defProp("javaUnicodeIdentifierStart", java.lang.Character::isUnicodeIdentifierStart);
 358         defProp("javaUnicodeIdentifierPart", java.lang.Character::isUnicodeIdentifierPart);
 359         defProp("javaIdentifierIgnorable", java.lang.Character::isIdentifierIgnorable);
 360         defProp("javaSpaceChar", java.lang.Character::isSpaceChar);
 361         defProp("javaWhitespace", java.lang.Character::isWhitespace);
 362         defProp("javaISOControl", java.lang.Character::isISOControl);
 363         defProp("javaMirrored", java.lang.Character::isMirrored);
 364     }
 365 
 366     /////////////////////////////////////////////////////////////////////////////
 367 
 368     /**
 369      * Posix ASCII variants, not in the lookup map
 370      */
 371     static final BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch);
 372     static final BmpCharPredicate ASCII_WORD  = ch -> ch < 128 && ASCII.isWord(ch);
 373     static final BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch);
 374 
 375 }