/*
 * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.util.regex;

import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Pattern.CharPredicate;
import java.util.regex.Pattern.BmpCharPredicate;

/**
 * Named {@link CharPredicate} constants together with the lookup tables that
 * map property, category, POSIX and {@code java.lang.Character}-method names
 * to those predicates.
 *
 * <p>Three independent tables are maintained:
 * <ul>
 *   <li>{@code uprops} - Unicode binary properties, keyed by upper-case name;
 *   <li>{@code posix}  - POSIX class names in their Unicode interpretation,
 *       keyed by upper-case name;
 *   <li>{@code props}  - general categories, ASCII-only POSIX classes and
 *       {@code javaXxx} properties, keyed by their exact (case-sensitive)
 *       name.
 * </ul>
 *
 * <p>All lookup methods return {@code null} when the name is not known.
 */
class CharPredicates {

    static final CharPredicate ALPHABETIC = Character::isAlphabetic;

    // \p{gc=Decimal_Number}
    static final CharPredicate DIGIT = Character::isDigit;

    static final CharPredicate LETTER = Character::isLetter;

    static final CharPredicate IDEOGRAPHIC = Character::isIdeographic;

    static final CharPredicate LOWERCASE = Character::isLowerCase;

    static final CharPredicate UPPERCASE = Character::isUpperCase;

    static final CharPredicate TITLECASE = Character::isTitleCase;

    // \p{Whitespace}
    // Any Zs/Zl/Zp character, plus the controls 0x9..0xd and 0x85.
    static final CharPredicate WHITE_SPACE = ch ->
        ((((1 << Character.SPACE_SEPARATOR) |
           (1 << Character.LINE_SEPARATOR) |
           (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
        != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);

    // \p{gc=Control}
    static final CharPredicate CONTROL = ch ->
        Character.getType(ch) == Character.CONTROL;

    // \p{gc=Punctuation}
    static final CharPredicate PUNCTUATION = ch ->
        ((((1 << Character.CONNECTOR_PUNCTUATION) |
           (1 << Character.DASH_PUNCTUATION) |
           (1 << Character.START_PUNCTUATION) |
           (1 << Character.END_PUNCTUATION) |
           (1 << Character.OTHER_PUNCTUATION) |
           (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
           (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
        != 0;

    // \p{gc=Decimal_Number}
    // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
    // The explicit ranges are 0-9, A-F, a-f and their fullwidth forms.
    static final CharPredicate HEX_DIGIT = DIGIT.union(
        ch -> (ch >= 0x0030 && ch <= 0x0039) ||
              (ch >= 0x0041 && ch <= 0x0046) ||
              (ch >= 0x0061 && ch <= 0x0066) ||
              (ch >= 0xFF10 && ch <= 0xFF19) ||
              (ch >= 0xFF21 && ch <= 0xFF26) ||
              (ch >= 0xFF41 && ch <= 0xFF46));

    static final CharPredicate ASSIGNED = ch ->
        Character.getType(ch) != Character.UNASSIGNED;

    // PropList.txt:Noncharacter_Code_Point
    // Low 16 bits equal to FFFE or FFFF, or anything in FDD0..FDEF.
    static final CharPredicate NONCHARACTER_CODE_POINT = ch ->
        (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);

    // \p{alpha}
    // \p{digit}
    static final CharPredicate ALNUM = ALPHABETIC.union(DIGIT);

    // \p{Whitespace} --
    // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
    //  \p{gc=Line_Separator}
    //  \p{gc=Paragraph_Separator}]
    static final CharPredicate BLANK = ch ->
        Character.getType(ch) == Character.SPACE_SEPARATOR ||
        ch == 0x9; // \N{HT}

    // [^
    //  \p{space}
    //  \p{gc=Control}
    //  \p{gc=Surrogate}
    //  \p{gc=Unassigned}]
    static final CharPredicate GRAPH = ch ->
        ((((1 << Character.SPACE_SEPARATOR) |
           (1 << Character.LINE_SEPARATOR) |
           (1 << Character.PARAGRAPH_SEPARATOR) |
           (1 << Character.CONTROL) |
           (1 << Character.SURROGATE) |
           (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
        == 0;

    // \p{graph}
    // \p{blank}
    // -- \p{cntrl}
    static final CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate());

    //  200C..200D    PropList.txt:Join_Control
    static final CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D;

    //  \p{alpha}
    //  \p{gc=Mark}
    //  \p{digit}
    //  \p{gc=Connector_Punctuation}
    //  \p{Join_Control}    200C..200D
    static final CharPredicate WORD =
        ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) |
                                  (1 << Character.ENCLOSING_MARK) |
                                  (1 << Character.COMBINING_SPACING_MARK) |
                                  (1 << Character.DECIMAL_DIGIT_NUMBER) |
                                  (1 << Character.CONNECTOR_PUNCTUATION))
                                 >> Character.getType(ch)) & 1) != 0,
                         JOIN_CONTROL);

    /////////////////////////////////////////////////////////////////////////////

    // Both tables are keyed by upper-case names; see the lookup methods below.
    private static final HashMap<String, CharPredicate> posix = new HashMap<>(12);
    private static final HashMap<String, CharPredicate> uprops = new HashMap<>(18);

    /** Registers {@code p} under {@code name} in the POSIX table. */
    private static void defPosix(String name, CharPredicate p) {
        posix.put(name, p);
    }

    /** Registers {@code p} under {@code name} in the Unicode property table. */
    private static void defUProp(String name, CharPredicate p) {
        uprops.put(name, p);
    }

    static {
        defPosix("ALPHA", ALPHABETIC);
        defPosix("LOWER", LOWERCASE);
        defPosix("UPPER", UPPERCASE);
        defPosix("SPACE", WHITE_SPACE);
        defPosix("PUNCT", PUNCTUATION);
        defPosix("XDIGIT", HEX_DIGIT);
        defPosix("ALNUM", ALNUM);
        defPosix("CNTRL", CONTROL);
        defPosix("DIGIT", DIGIT);
        defPosix("BLANK", BLANK);
        defPosix("GRAPH", GRAPH);
        defPosix("PRINT", PRINT);

        defUProp("ALPHABETIC", ALPHABETIC);
        defUProp("ASSIGNED", ASSIGNED);
        defUProp("CONTROL", CONTROL);
        defUProp("HEXDIGIT", HEX_DIGIT);
        defUProp("IDEOGRAPHIC", IDEOGRAPHIC);
        defUProp("JOINCONTROL", JOIN_CONTROL);
        defUProp("LETTER", LETTER);
        defUProp("LOWERCASE", LOWERCASE);
        defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
        defUProp("TITLECASE", TITLECASE);
        defUProp("PUNCTUATION", PUNCTUATION);
        defUProp("UPPERCASE", UPPERCASE);
        defUProp("WHITESPACE", WHITE_SPACE);
        defUProp("WORD", WORD);
        // Underscore-separated spellings of the multi-word names above.
        defUProp("WHITE_SPACE", WHITE_SPACE);
        defUProp("HEX_DIGIT", HEX_DIGIT);
        defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT);
        defUProp("JOIN_CONTROL", JOIN_CONTROL);
    }

    /**
     * Looks up a predicate by Unicode binary property name, ignoring case.
     * The Unicode property table is consulted first; if the name is not
     * found there, the POSIX table is tried.
     *
     * @param propName the property name; must not be {@code null}
     * @return the matching predicate, or {@code null} if the name is unknown
     *         in both tables
     */
    public static CharPredicate forUnicodeProperty(String propName) {
        propName = propName.toUpperCase(Locale.ROOT);
        CharPredicate p = uprops.get(propName);
        if (p != null) {
            return p;
        }
        return posix.get(propName);
    }

    /**
     * Looks up a predicate by POSIX class name, ignoring case.
     *
     * @param propName the POSIX class name; must not be {@code null}
     * @return the matching predicate, or {@code null} if the name is unknown
     */
    public static CharPredicate forPOSIXName(String propName) {
        // Locale.ROOT keeps the case mapping locale-neutral and consistent
        // with forUnicodeProperty().
        return posix.get(propName.toUpperCase(Locale.ROOT));
    }

    /////////////////////////////////////////////////////////////////////////////

    /**
     * Returns a predicate matching all characters belonging to a named
     * UnicodeScript.
     *
     * @param name the script name, as accepted by
     *        {@link Character.UnicodeScript#forName(String)}
     * @return the predicate, or {@code null} if {@code forName} rejects the
     *         name with an {@code IllegalArgumentException}
     */
    static CharPredicate forUnicodeScript(String name) {
        final Character.UnicodeScript script;
        try {
            script = Character.UnicodeScript.forName(name);
        } catch (IllegalArgumentException ignored) {
            // Unknown script name: deliberately reported as "no predicate".
            return null;
        }
        return ch -> script == Character.UnicodeScript.of(ch);
    }

    /**
     * Returns a predicate matching all characters in a UnicodeBlock.
     *
     * @param name the block name, as accepted by
     *        {@link Character.UnicodeBlock#forName(String)}
     * @return the predicate, or {@code null} if {@code forName} rejects the
     *         name with an {@code IllegalArgumentException}
     */
    static CharPredicate forUnicodeBlock(String name) {
        final Character.UnicodeBlock block;
        try {
            block = Character.UnicodeBlock.forName(name);
        } catch (IllegalArgumentException ignored) {
            // Unknown block name: deliberately reported as "no predicate".
            return null;
        }
        return ch -> block == Character.UnicodeBlock.of(ch);
    }

    /////////////////////////////////////////////////////////////////////////////

    // unicode categories, aliases, properties, java methods ...

    private static final HashMap<String, CharPredicate> props = new HashMap<>(128);

    /**
     * Returns a predicate matching all characters in a named property.
     * The lookup is case-sensitive.
     *
     * @param name the exact property name
     * @return the predicate, or {@code null} if the name is unknown
     */
    static CharPredicate forProperty(String name) {
        return props.get(name);
    }

    /** Registers an arbitrary predicate under {@code name}. */
    private static void defProp(String name, CharPredicate p) {
        props.put(name, p);
    }

    /**
     * Registers a general-category predicate.
     *
     * @param typeMask bit set in which bit {@code n} is set when characters
     *        whose {@link Character#getType(int)} is {@code n} should match
     */
    private static void defCategory(String name, final int typeMask) {
        CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0;
        props.put(name, p);
    }

    /** Registers a predicate matching the inclusive range {@code lower..upper}. */
    private static void defRange(String name, final int lower, final int upper) {
        BmpCharPredicate p = ch -> lower <= ch && ch <= upper;
        props.put(name, p);
    }

    /**
     * Registers a predicate matching characters below 128 for which
     * {@code ASCII.isType(ch, ctype)} holds.
     */
    private static void defCtype(String name, final int ctype) {
        BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype);
        props.put(name, p);
    }

    static {
        // Unicode character property aliases, defined in
        // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
        defCategory("Cn", 1<<Character.UNASSIGNED);
        defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
        defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
        defCategory("Lt", 1<<Character.TITLECASE_LETTER);
        defCategory("Lm", 1<<Character.MODIFIER_LETTER);
        defCategory("Lo", 1<<Character.OTHER_LETTER);
        defCategory("Mn", 1<<Character.NON_SPACING_MARK);
        defCategory("Me", 1<<Character.ENCLOSING_MARK);
        defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
        defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
        defCategory("Nl", 1<<Character.LETTER_NUMBER);
        defCategory("No", 1<<Character.OTHER_NUMBER);
        defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
        defCategory("Zl", 1<<Character.LINE_SEPARATOR);
        defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
        defCategory("Cc", 1<<Character.CONTROL);
        defCategory("Cf", 1<<Character.FORMAT);
        defCategory("Co", 1<<Character.PRIVATE_USE);
        defCategory("Cs", 1<<Character.SURROGATE);
        defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
        defCategory("Ps", 1<<Character.START_PUNCTUATION);
        defCategory("Pe", 1<<Character.END_PUNCTUATION);
        defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
        defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
        defCategory("Sm", 1<<Character.MATH_SYMBOL);
        defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
        defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
        defCategory("So", 1<<Character.OTHER_SYMBOL);
        defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
        defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
        defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
                          (1<<Character.LOWERCASE_LETTER) |
                          (1<<Character.TITLECASE_LETTER) |
                          (1<<Character.MODIFIER_LETTER)  |
                          (1<<Character.OTHER_LETTER)));
        defCategory("M", ((1<<Character.NON_SPACING_MARK) |
                          (1<<Character.ENCLOSING_MARK)   |
                          (1<<Character.COMBINING_SPACING_MARK)));
        defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
                          (1<<Character.LETTER_NUMBER)        |
                          (1<<Character.OTHER_NUMBER)));
        defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
                          (1<<Character.LINE_SEPARATOR)  |
                          (1<<Character.PARAGRAPH_SEPARATOR)));
        // Other = Cc | Cf | Co | Cs | Cn. Unassigned (Cn) is part of the
        // Unicode "C" grouping and must be included alongside the rest.
        defCategory("C", ((1<<Character.CONTROL)     |
                          (1<<Character.FORMAT)      |
                          (1<<Character.PRIVATE_USE) |
                          (1<<Character.SURROGATE)   |
                          (1<<Character.UNASSIGNED))); // Other
        defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
                          (1<<Character.START_PUNCTUATION)     |
                          (1<<Character.END_PUNCTUATION)       |
                          (1<<Character.CONNECTOR_PUNCTUATION) |
                          (1<<Character.OTHER_PUNCTUATION)     |
                          (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
                          (1<<Character.FINAL_QUOTE_PUNCTUATION)));
        defCategory("S", ((1<<Character.MATH_SYMBOL)     |
                          (1<<Character.CURRENCY_SYMBOL) |
                          (1<<Character.MODIFIER_SYMBOL) |
                          (1<<Character.OTHER_SYMBOL)));
        defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
                           (1<<Character.LOWERCASE_LETTER) |
                           (1<<Character.TITLECASE_LETTER)));
        defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
                           (1<<Character.LOWERCASE_LETTER) |
                           (1<<Character.TITLECASE_LETTER) |
                           (1<<Character.MODIFIER_LETTER)  |
                           (1<<Character.OTHER_LETTER)     |
                           (1<<Character.DECIMAL_DIGIT_NUMBER)));
        defRange("L1", 0x00, 0xFF);  // Latin-1
        props.put("all", ch -> true);

        // Posix regular expression character classes, defined in
        // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
        defRange("ASCII", 0x00, 0x7F);    // ASCII
        defCtype("Alnum", ASCII.ALNUM);   // Alphanumeric characters
        defCtype("Alpha", ASCII.ALPHA);   // Alphabetic characters
        defCtype("Blank", ASCII.BLANK);   // Space and tab characters
        defCtype("Cntrl", ASCII.CNTRL);   // Control characters
        defRange("Digit", '0', '9');      // Numeric characters
        defCtype("Graph", ASCII.GRAPH);   // printable and visible
        defRange("Lower", 'a', 'z');      // Lower-case alphabetic
        defRange("Print", 0x20, 0x7E);    // Printable characters
        defCtype("Punct", ASCII.PUNCT);   // Punctuation characters
        defCtype("Space", ASCII.SPACE);   // Space characters
        defRange("Upper", 'A', 'Z');      // Upper-case alphabetic
        defCtype("XDigit", ASCII.XDIGIT); // hexadecimal digits

        // Java character properties, defined by methods in Character.java
        defProp("javaLowerCase", Character::isLowerCase);
        defProp("javaUpperCase", Character::isUpperCase);
        defProp("javaAlphabetic", Character::isAlphabetic);
        defProp("javaIdeographic", Character::isIdeographic);
        defProp("javaTitleCase", Character::isTitleCase);
        defProp("javaDigit", Character::isDigit);
        defProp("javaDefined", Character::isDefined);
        defProp("javaLetter", Character::isLetter);
        defProp("javaLetterOrDigit", Character::isLetterOrDigit);
        defProp("javaJavaIdentifierStart", Character::isJavaIdentifierStart);
        defProp("javaJavaIdentifierPart", Character::isJavaIdentifierPart);
        defProp("javaUnicodeIdentifierStart", Character::isUnicodeIdentifierStart);
        defProp("javaUnicodeIdentifierPart", Character::isUnicodeIdentifierPart);
        defProp("javaIdentifierIgnorable", Character::isIdentifierIgnorable);
        defProp("javaSpaceChar", Character::isSpaceChar);
        defProp("javaWhitespace", Character::isWhitespace);
        defProp("javaISOControl", Character::isISOControl);
        defProp("javaMirrored", Character::isMirrored);
    }

    /////////////////////////////////////////////////////////////////////////////

    /**
     * Posix ASCII variants, not in the lookup map
     */
    static final BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch);
    static final BmpCharPredicate ASCII_WORD  = ch -> ch < 128 && ASCII.isWord(ch);
    static final BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch);

}