1 /* 2 * Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.util.HashMap; 29 import java.util.Locale; 30 import java.util.regex.Pattern.CharPredicate; 31 import java.util.regex.Pattern.BmpCharPredicate; 32 33 class CharPredicates { 34 35 static final CharPredicate ALPHABETIC() { 36 return Character::isAlphabetic; 37 } 38 39 // \p{gc=Decimal_Number} 40 static final CharPredicate DIGIT() { 41 return Character::isDigit; 42 } 43 44 static final CharPredicate LETTER() { 45 return Character::isLetter; 46 } 47 48 static final CharPredicate IDEOGRAPHIC() { 49 return Character::isIdeographic; 50 } 51 52 static final CharPredicate LOWERCASE() { 53 return Character::isLowerCase; 54 } 55 56 static final CharPredicate UPPERCASE() { 57 return Character::isUpperCase; 58 } 59 60 static final CharPredicate TITLECASE() { 61 return Character::isTitleCase; 62 } 63 64 // \p{Whitespace} 65 static final CharPredicate WHITE_SPACE() { 66 return ch -> 67 ((((1 << Character.SPACE_SEPARATOR) | 68 (1 << Character.LINE_SEPARATOR) | 69 (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) 70 != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); 71 } 72 73 // \p{gc=Control} 74 static final CharPredicate CONTROL() { 75 return ch -> Character.getType(ch) == Character.CONTROL; 76 } 77 78 // \p{gc=Punctuation} 79 static final CharPredicate PUNCTUATION() { 80 return ch -> 81 ((((1 << Character.CONNECTOR_PUNCTUATION) | 82 (1 << Character.DASH_PUNCTUATION) | 83 (1 << Character.START_PUNCTUATION) | 84 (1 << Character.END_PUNCTUATION) | 85 (1 << Character.OTHER_PUNCTUATION) | 86 (1 << Character.INITIAL_QUOTE_PUNCTUATION) | 87 (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) 88 != 0; 89 } 90 91 // \p{gc=Decimal_Number} 92 // \p{Hex_Digit} -> PropList.txt: Hex_Digit 93 static final CharPredicate HEX_DIGIT() { 94 return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) || 95 (ch >= 0x0041 && ch <= 0x0046) || 96 (ch >= 0x0061 && ch <= 0x0066) || 97 (ch >= 0xFF10 && ch <= 0xFF19) || 98 (ch >= 0xFF21 && ch <= 0xFF26) || 99 (ch >= 0xFF41 && ch <= 0xFF46)); 100 } 101 102 static final CharPredicate ASSIGNED() { 103 return ch -> Character.getType(ch) != Character.UNASSIGNED; 104 } 105 106 // PropList.txt:Noncharacter_Code_Point 107 static final CharPredicate NONCHARACTER_CODE_POINT() { 108 return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); 109 } 110 111 // \p{alpha} 112 // \p{digit} 113 static final CharPredicate ALNUM() { 114 return ALPHABETIC().union(DIGIT()); 115 } 116 117 // \p{Whitespace} -- 118 // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 119 // \p{gc=Line_Separator} 120 // \p{gc=Paragraph_Separator}] 121 static final CharPredicate BLANK() { 122 return ch -> 123 Character.getType(ch) == Character.SPACE_SEPARATOR || 124 ch == 0x9; // \N{HT} 125 } 126 127 // [^ 128 // \p{space} 129 // \p{gc=Control} 130 // \p{gc=Surrogate} 131 // \p{gc=Unassigned}] 132 static final CharPredicate GRAPH() { 133 return ch -> 134 ((((1 << Character.SPACE_SEPARATOR) | 135 (1 << Character.LINE_SEPARATOR) | 136 (1 << Character.PARAGRAPH_SEPARATOR) | 137 (1 << Character.CONTROL) | 138 (1 << Character.SURROGATE) | 139 (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) 140 == 0; 141 } 142 143 // \p{graph} 144 // \p{blank} 145 // -- \p{cntrl} 146 static final CharPredicate PRINT() { 147 return GRAPH().union(BLANK()).and(CONTROL().negate()); 148 } 149 150 // 200C..200D PropList.txt:Join_Control 151 static final CharPredicate JOIN_CONTROL() { 152 return ch -> ch == 0x200C || ch == 0x200D; 153 } 154 155 // \p{alpha} 156 // \p{gc=Mark} 157 // \p{digit} 158 // \p{gc=Connector_Punctuation} 159 // \p{Join_Control} 200C..200D 160 static final CharPredicate WORD() { 161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) | 162 (1 << Character.ENCLOSING_MARK) | 163 (1 << Character.COMBINING_SPACING_MARK) | 164 (1 << Character.DECIMAL_DIGIT_NUMBER) | 165 (1 << Character.CONNECTOR_PUNCTUATION)) 166 >> Character.getType(ch)) & 1) != 0, 167 JOIN_CONTROL()); 168 } 169 170 ///////////////////////////////////////////////////////////////////////////// 171 172 private static CharPredicate getPosixPredicate(String name, boolean caseIns) { 173 switch (name) { 174 case "ALPHA": return ALPHABETIC(); 175 case "LOWER": return caseIns 176 ? LOWERCASE().union(UPPERCASE(), TITLECASE()) 177 : LOWERCASE(); 178 case "UPPER": return caseIns 179 ? UPPERCASE().union(LOWERCASE(), TITLECASE()) 180 : UPPERCASE(); 181 case "SPACE": return WHITE_SPACE(); 182 case "PUNCT": return PUNCTUATION(); 183 case "XDIGIT": return HEX_DIGIT(); 184 case "ALNUM": return ALNUM(); 185 case "CNTRL": return CONTROL(); 186 case "DIGIT": return DIGIT(); 187 case "BLANK": return BLANK(); 188 case "GRAPH": return GRAPH(); 189 case "PRINT": return PRINT(); 190 default: return null; 191 } 192 } 193 194 private static CharPredicate getUnicodePredicate(String name, boolean caseIns) { 195 switch (name) { 196 case "ALPHABETIC": return ALPHABETIC(); 197 case "ASSIGNED": return ASSIGNED(); 198 case "CONTROL": return CONTROL(); 199 case "HEXDIGIT": 200 case "HEX_DIGIT": return HEX_DIGIT(); 201 case "IDEOGRAPHIC": return IDEOGRAPHIC(); 202 case "JOINCONTROL": 203 case "JOIN_CONTROL": return JOIN_CONTROL(); 204 case "LETTER": return LETTER(); 205 case "LOWERCASE": return caseIns 206 ? LOWERCASE().union(UPPERCASE(), TITLECASE()) 207 : LOWERCASE(); 208 case "NONCHARACTERCODEPOINT": 209 case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT(); 210 case "TITLECASE": return caseIns 211 ? TITLECASE().union(LOWERCASE(), UPPERCASE()) 212 : TITLECASE(); 213 case "PUNCTUATION": return PUNCTUATION(); 214 case "UPPERCASE": return caseIns 215 ? UPPERCASE().union(LOWERCASE(), TITLECASE()) 216 : UPPERCASE(); 217 case "WHITESPACE": 218 case "WHITE_SPACE": return WHITE_SPACE(); 219 case "WORD": return WORD(); 220 default: return null; 221 } 222 } 223 224 public static CharPredicate forUnicodeProperty(String propName, boolean caseIns) { 225 propName = propName.toUpperCase(Locale.ROOT); 226 CharPredicate p = getUnicodePredicate(propName, caseIns); 227 if (p != null) 228 return p; 229 return getPosixPredicate(propName, caseIns); 230 } 231 232 public static CharPredicate forPOSIXName(String propName, boolean caseIns) { 233 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH), caseIns); 234 } 235 236 ///////////////////////////////////////////////////////////////////////////// 237 238 /** 239 * Returns a predicate matching all characters belong to a named 240 * UnicodeScript. 241 */ 242 static CharPredicate forUnicodeScript(String name) { 243 final Character.UnicodeScript script; 244 try { 245 script = Character.UnicodeScript.forName(name); 246 return ch -> script == Character.UnicodeScript.of(ch); 247 } catch (IllegalArgumentException iae) {} 248 return null; 249 } 250 251 /** 252 * Returns a predicate matching all characters in a UnicodeBlock. 253 */ 254 static CharPredicate forUnicodeBlock(String name) { 255 final Character.UnicodeBlock block; 256 try { 257 block = Character.UnicodeBlock.forName(name); 258 return ch -> block == Character.UnicodeBlock.of(ch); 259 } catch (IllegalArgumentException iae) {} 260 return null; 261 } 262 263 ///////////////////////////////////////////////////////////////////////////// 264 265 // unicode categories, aliases, properties, java methods ... 266 267 static CharPredicate forProperty(String name, boolean caseIns) { 268 // Unicode character property aliases, defined in 269 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt 270 switch (name) { 271 case "Cn": return category(1<<Character.UNASSIGNED); 272 case "Lu": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) | 273 (1<<Character.UPPERCASE_LETTER) | 274 (1<<Character.TITLECASE_LETTER) 275 : (1<<Character.UPPERCASE_LETTER)); 276 case "Ll": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) | 277 (1<<Character.UPPERCASE_LETTER) | 278 (1<<Character.TITLECASE_LETTER) 279 : (1<<Character.LOWERCASE_LETTER)); 280 case "Lt": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) | 281 (1<<Character.UPPERCASE_LETTER) | 282 (1<<Character.TITLECASE_LETTER) 283 : (1<<Character.TITLECASE_LETTER)); 284 case "Lm": return category(1<<Character.MODIFIER_LETTER); 285 case "Lo": return category(1<<Character.OTHER_LETTER); 286 case "Mn": return category(1<<Character.NON_SPACING_MARK); 287 case "Me": return category(1<<Character.ENCLOSING_MARK); 288 case "Mc": return category(1<<Character.COMBINING_SPACING_MARK); 289 case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER); 290 case "Nl": return category(1<<Character.LETTER_NUMBER); 291 case "No": return category(1<<Character.OTHER_NUMBER); 292 case "Zs": return category(1<<Character.SPACE_SEPARATOR); 293 case "Zl": return category(1<<Character.LINE_SEPARATOR); 294 case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR); 295 case "Cc": return category(1<<Character.CONTROL); 296 case "Cf": return category(1<<Character.FORMAT); 297 case "Co": return category(1<<Character.PRIVATE_USE); 298 case "Cs": return category(1<<Character.SURROGATE); 299 case "Pd": return category(1<<Character.DASH_PUNCTUATION); 300 case "Ps": return category(1<<Character.START_PUNCTUATION); 301 case "Pe": return category(1<<Character.END_PUNCTUATION); 302 case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION); 303 case "Po": return category(1<<Character.OTHER_PUNCTUATION); 304 case "Sm": return category(1<<Character.MATH_SYMBOL); 305 case "Sc": return category(1<<Character.CURRENCY_SYMBOL); 306 case "Sk": return category(1<<Character.MODIFIER_SYMBOL); 307 case "So": return category(1<<Character.OTHER_SYMBOL); 308 case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION); 309 case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION); 310 case "L": return category(((1<<Character.UPPERCASE_LETTER) | 311 (1<<Character.LOWERCASE_LETTER) | 312 (1<<Character.TITLECASE_LETTER) | 313 (1<<Character.MODIFIER_LETTER) | 314 (1<<Character.OTHER_LETTER))); 315 case "M": return category(((1<<Character.NON_SPACING_MARK) | 316 (1<<Character.ENCLOSING_MARK) | 317 (1<<Character.COMBINING_SPACING_MARK))); 318 case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) | 319 (1<<Character.LETTER_NUMBER) | 320 (1<<Character.OTHER_NUMBER))); 321 case "Z": return category(((1<<Character.SPACE_SEPARATOR) | 322 (1<<Character.LINE_SEPARATOR) | 323 (1<<Character.PARAGRAPH_SEPARATOR))); 324 case "C": return category(((1<<Character.CONTROL) | 325 (1<<Character.FORMAT) | 326 (1<<Character.PRIVATE_USE) | 327 (1<<Character.SURROGATE) | 328 (1<<Character.UNASSIGNED))); // Other 329 case "P": return category(((1<<Character.DASH_PUNCTUATION) | 330 (1<<Character.START_PUNCTUATION) | 331 (1<<Character.END_PUNCTUATION) | 332 (1<<Character.CONNECTOR_PUNCTUATION) | 333 (1<<Character.OTHER_PUNCTUATION) | 334 (1<<Character.INITIAL_QUOTE_PUNCTUATION) | 335 (1<<Character.FINAL_QUOTE_PUNCTUATION))); 336 case "S": return category(((1<<Character.MATH_SYMBOL) | 337 (1<<Character.CURRENCY_SYMBOL) | 338 (1<<Character.MODIFIER_SYMBOL) | 339 (1<<Character.OTHER_SYMBOL))); 340 case "LC": return category(((1<<Character.UPPERCASE_LETTER) | 341 (1<<Character.LOWERCASE_LETTER) | 342 (1<<Character.TITLECASE_LETTER))); 343 case "LD": return category(((1<<Character.UPPERCASE_LETTER) | 344 (1<<Character.LOWERCASE_LETTER) | 345 (1<<Character.TITLECASE_LETTER) | 346 (1<<Character.MODIFIER_LETTER) | 347 (1<<Character.OTHER_LETTER) | 348 (1<<Character.DECIMAL_DIGIT_NUMBER))); 349 case "L1": return range(0x00, 0xFF); // Latin-1 350 case "all": return Pattern.ALL(); 351 // Posix regular expression character classes, defined in 352 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html 353 case "ASCII": return range(0x00, 0x7F); // ASCII 354 case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters 355 case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters 356 case "Blank": return ctype(ASCII.BLANK); // Space and tab characters 357 case "Cntrl": return ctype(ASCII.CNTRL); // Control characters 358 case "Digit": return range('0', '9'); // Numeric characters 359 case "Graph": return ctype(ASCII.GRAPH); // printable and visible 360 case "Lower": return caseIns ? ctype(ASCII.ALPHA) 361 : range('a', 'z'); // Lower-case alphabetic 362 case "Print": return range(0x20, 0x7E); // Printable characters 363 case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters 364 case "Space": return ctype(ASCII.SPACE); // Space characters 365 case "Upper": return caseIns ? ctype(ASCII.ALPHA) 366 : range('A', 'Z'); // Upper-case alphabetic 367 case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits 368 369 // Java character properties, defined by methods in Character.java 370 case "javaLowerCase": return caseIns ? c -> Character.isLowerCase(c) || 371 Character.isUpperCase(c) || 372 Character.isTitleCase(c) 373 : Character::isLowerCase; 374 case "javaUpperCase": return caseIns ? c -> Character.isUpperCase(c) || 375 Character.isLowerCase(c) || 376 Character.isTitleCase(c) 377 : Character::isUpperCase; 378 case "javaAlphabetic": return Character::isAlphabetic; 379 case "javaIdeographic": return Character::isIdeographic; 380 case "javaTitleCase": return caseIns ? c -> Character.isTitleCase(c) || 381 Character.isLowerCase(c) || 382 Character.isUpperCase(c) 383 : Character::isTitleCase; 384 case "javaDigit": return Character::isDigit; 385 case "javaDefined": return Character::isDefined; 386 case "javaLetter": return Character::isLetter; 387 case "javaLetterOrDigit": return Character::isLetterOrDigit; 388 case "javaJavaIdentifierStart": return Character::isJavaIdentifierStart; 389 case "javaJavaIdentifierPart": return Character::isJavaIdentifierPart; 390 case "javaUnicodeIdentifierStart": return Character::isUnicodeIdentifierStart; 391 case "javaUnicodeIdentifierPart": return Character::isUnicodeIdentifierPart; 392 case "javaIdentifierIgnorable": return Character::isIdentifierIgnorable; 393 case "javaSpaceChar": return Character::isSpaceChar; 394 case "javaWhitespace": return Character::isWhitespace; 395 case "javaISOControl": return Character::isISOControl; 396 case "javaMirrored": return Character::isMirrored; 397 default: return null; 398 } 399 } 400 401 private static CharPredicate category(final int typeMask) { 402 return ch -> (typeMask & (1 << Character.getType(ch))) != 0; 403 } 404 405 private static CharPredicate range(final int lower, final int upper) { 406 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper; 407 } 408 409 private static CharPredicate ctype(final int ctype) { 410 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype); 411 } 412 413 ///////////////////////////////////////////////////////////////////////////// 414 415 /** 416 * Posix ASCII variants, not in the lookup map 417 */ 418 static final BmpCharPredicate ASCII_DIGIT() { 419 return ch -> ch < 128 && ASCII.isDigit(ch); 420 } 421 static final BmpCharPredicate ASCII_WORD() { 422 return ch -> ch < 128 && ASCII.isWord(ch); 423 } 424 static final BmpCharPredicate ASCII_SPACE() { 425 return ch -> ch < 128 && ASCII.isSpace(ch); 426 } 427 428 }