1 /*
2 * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
152 return ch -> ch == 0x200C || ch == 0x200D;
153 }
154
155 // \p{alpha}
156 // \p{gc=Mark}
157 // \p{digit}
158 // \p{gc=Connector_Punctuation}
159 // \p{Join_Control} 200C..200D
160 static final CharPredicate WORD() {
161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
162 (1 << Character.ENCLOSING_MARK) |
163 (1 << Character.COMBINING_SPACING_MARK) |
164 (1 << Character.DECIMAL_DIGIT_NUMBER) |
165 (1 << Character.CONNECTOR_PUNCTUATION))
166 >> Character.getType(ch)) & 1) != 0,
167 JOIN_CONTROL());
168 }
169
170 /////////////////////////////////////////////////////////////////////////////
171
172 private static CharPredicate getPosixPredicate(String name) {
173 switch (name) {
174 case "ALPHA": return ALPHABETIC();
175 case "LOWER": return LOWERCASE();
176 case "UPPER": return UPPERCASE();
177 case "SPACE": return WHITE_SPACE();
178 case "PUNCT": return PUNCTUATION();
179 case "XDIGIT": return HEX_DIGIT();
180 case "ALNUM": return ALNUM();
181 case "CNTRL": return CONTROL();
182 case "DIGIT": return DIGIT();
183 case "BLANK": return BLANK();
184 case "GRAPH": return GRAPH();
185 case "PRINT": return PRINT();
186 default: return null;
187 }
188 }
189
190 private static CharPredicate getUnicodePredicate(String name) {
191 switch (name) {
192 case "ALPHABETIC": return ALPHABETIC();
193 case "ASSIGNED": return ASSIGNED();
194 case "CONTROL": return CONTROL();
195 case "HEXDIGIT": return HEX_DIGIT();
196 case "IDEOGRAPHIC": return IDEOGRAPHIC();
197 case "JOINCONTROL": return JOIN_CONTROL();
198 case "LETTER": return LETTER();
199 case "LOWERCASE": return LOWERCASE();
200 case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT();
201 case "TITLECASE": return TITLECASE();
202 case "PUNCTUATION": return PUNCTUATION();
203 case "UPPERCASE": return UPPERCASE();
204 case "WHITESPACE": return WHITE_SPACE();
205 case "WORD": return WORD();
206 case "WHITE_SPACE": return WHITE_SPACE();
207 case "HEX_DIGIT": return HEX_DIGIT();
208 case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
209 case "JOIN_CONTROL": return JOIN_CONTROL();
210 default: return null;
211 }
212 }
213
214 public static CharPredicate forUnicodeProperty(String propName) {
215 propName = propName.toUpperCase(Locale.ROOT);
216 CharPredicate p = getUnicodePredicate(propName);
217 if (p != null)
218 return p;
219 return getPosixPredicate(propName);
220 }
221
222 public static CharPredicate forPOSIXName(String propName) {
223 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH));
224 }
225
226 /////////////////////////////////////////////////////////////////////////////
227
228 /**
229 * Returns a predicate matching all characters belong to a named
230 * UnicodeScript.
231 */
232 static CharPredicate forUnicodeScript(String name) {
233 final Character.UnicodeScript script;
234 try {
235 script = Character.UnicodeScript.forName(name);
236 return ch -> script == Character.UnicodeScript.of(ch);
237 } catch (IllegalArgumentException iae) {}
238 return null;
239 }
240
241 /**
242 * Returns a predicate matching all characters in a UnicodeBlock.
243 */
244 static CharPredicate forUnicodeBlock(String name) {
245 final Character.UnicodeBlock block;
246 try {
247 block = Character.UnicodeBlock.forName(name);
248 return ch -> block == Character.UnicodeBlock.of(ch);
249 } catch (IllegalArgumentException iae) {}
250 return null;
251 }
252
253 /////////////////////////////////////////////////////////////////////////////
254
255 // unicode categories, aliases, properties, java methods ...
256
257 static CharPredicate forProperty(String name) {
258 // Unicode character property aliases, defined in
259 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
260 switch (name) {
261 case "Cn": return category(1<<Character.UNASSIGNED);
262 case "Lu": return category(1<<Character.UPPERCASE_LETTER);
263 case "Ll": return category(1<<Character.LOWERCASE_LETTER);
264 case "Lt": return category(1<<Character.TITLECASE_LETTER);
265 case "Lm": return category(1<<Character.MODIFIER_LETTER);
266 case "Lo": return category(1<<Character.OTHER_LETTER);
267 case "Mn": return category(1<<Character.NON_SPACING_MARK);
268 case "Me": return category(1<<Character.ENCLOSING_MARK);
269 case "Mc": return category(1<<Character.COMBINING_SPACING_MARK);
270 case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER);
271 case "Nl": return category(1<<Character.LETTER_NUMBER);
272 case "No": return category(1<<Character.OTHER_NUMBER);
273 case "Zs": return category(1<<Character.SPACE_SEPARATOR);
274 case "Zl": return category(1<<Character.LINE_SEPARATOR);
275 case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR);
276 case "Cc": return category(1<<Character.CONTROL);
277 case "Cf": return category(1<<Character.FORMAT);
278 case "Co": return category(1<<Character.PRIVATE_USE);
279 case "Cs": return category(1<<Character.SURROGATE);
280 case "Pd": return category(1<<Character.DASH_PUNCTUATION);
281 case "Ps": return category(1<<Character.START_PUNCTUATION);
282 case "Pe": return category(1<<Character.END_PUNCTUATION);
283 case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION);
284 case "Po": return category(1<<Character.OTHER_PUNCTUATION);
321 case "LC": return category(((1<<Character.UPPERCASE_LETTER) |
322 (1<<Character.LOWERCASE_LETTER) |
323 (1<<Character.TITLECASE_LETTER)));
324 case "LD": return category(((1<<Character.UPPERCASE_LETTER) |
325 (1<<Character.LOWERCASE_LETTER) |
326 (1<<Character.TITLECASE_LETTER) |
327 (1<<Character.MODIFIER_LETTER) |
328 (1<<Character.OTHER_LETTER) |
329 (1<<Character.DECIMAL_DIGIT_NUMBER)));
330 case "L1": return range(0x00, 0xFF); // Latin-1
331 case "all": return Pattern.ALL();
332 // Posix regular expression character classes, defined in
333 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
334 case "ASCII": return range(0x00, 0x7F); // ASCII
335 case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters
336 case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters
337 case "Blank": return ctype(ASCII.BLANK); // Space and tab characters
338 case "Cntrl": return ctype(ASCII.CNTRL); // Control characters
339 case "Digit": return range('0', '9'); // Numeric characters
340 case "Graph": return ctype(ASCII.GRAPH); // printable and visible
341 case "Lower": return range('a', 'z'); // Lower-case alphabetic
342 case "Print": return range(0x20, 0x7E); // Printable characters
343 case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters
344 case "Space": return ctype(ASCII.SPACE); // Space characters
345 case "Upper": return range('A', 'Z'); // Upper-case alphabetic
346 case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits
347
348 // Java character properties, defined by methods in Character.java
349 case "javaLowerCase": return java.lang.Character::isLowerCase;
350 case "javaUpperCase": return Character::isUpperCase;
351 case "javaAlphabetic": return java.lang.Character::isAlphabetic;
352 case "javaIdeographic": return java.lang.Character::isIdeographic;
353 case "javaTitleCase": return java.lang.Character::isTitleCase;
354 case "javaDigit": return java.lang.Character::isDigit;
355 case "javaDefined": return java.lang.Character::isDefined;
356 case "javaLetter": return java.lang.Character::isLetter;
357 case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit;
358 case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart;
359 case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart;
360 case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart;
361 case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart;
362 case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable;
363 case "javaSpaceChar": return java.lang.Character::isSpaceChar;
364 case "javaWhitespace": return java.lang.Character::isWhitespace;
365 case "javaISOControl": return java.lang.Character::isISOControl;
366 case "javaMirrored": return java.lang.Character::isMirrored;
367 default: return null;
368 }
369 }
370
371 private static CharPredicate category(final int typeMask) {
372 return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
373 }
374
375 private static CharPredicate range(final int lower, final int upper) {
376 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
377 }
378
379 private static CharPredicate ctype(final int ctype) {
380 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
381 }
382
383 /////////////////////////////////////////////////////////////////////////////
384
385 /**
386 * Posix ASCII variants, not in the lookup map
|
1 /*
2 * Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
152 return ch -> ch == 0x200C || ch == 0x200D;
153 }
154
155 // \p{alpha}
156 // \p{gc=Mark}
157 // \p{digit}
158 // \p{gc=Connector_Punctuation}
159 // \p{Join_Control} 200C..200D
160 static final CharPredicate WORD() {
161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
162 (1 << Character.ENCLOSING_MARK) |
163 (1 << Character.COMBINING_SPACING_MARK) |
164 (1 << Character.DECIMAL_DIGIT_NUMBER) |
165 (1 << Character.CONNECTOR_PUNCTUATION))
166 >> Character.getType(ch)) & 1) != 0,
167 JOIN_CONTROL());
168 }
169
170 /////////////////////////////////////////////////////////////////////////////
171
172 private static CharPredicate getPosixPredicate(String name, boolean caseIns) {
173 switch (name) {
174 case "ALPHA": return ALPHABETIC();
175 case "LOWER": return caseIns
176 ? LOWERCASE().union(UPPERCASE(), TITLECASE())
177 : LOWERCASE();
178 case "UPPER": return caseIns
179 ? UPPERCASE().union(LOWERCASE(), TITLECASE())
180 : UPPERCASE();
181 case "SPACE": return WHITE_SPACE();
182 case "PUNCT": return PUNCTUATION();
183 case "XDIGIT": return HEX_DIGIT();
184 case "ALNUM": return ALNUM();
185 case "CNTRL": return CONTROL();
186 case "DIGIT": return DIGIT();
187 case "BLANK": return BLANK();
188 case "GRAPH": return GRAPH();
189 case "PRINT": return PRINT();
190 default: return null;
191 }
192 }
193
194 private static CharPredicate getUnicodePredicate(String name, boolean caseIns) {
195 switch (name) {
196 case "ALPHABETIC": return ALPHABETIC();
197 case "ASSIGNED": return ASSIGNED();
198 case "CONTROL": return CONTROL();
199 case "HEXDIGIT":
200 case "HEX_DIGIT": return HEX_DIGIT();
201 case "IDEOGRAPHIC": return IDEOGRAPHIC();
202 case "JOINCONTROL":
203 case "JOIN_CONTROL": return JOIN_CONTROL();
204 case "LETTER": return LETTER();
205 case "LOWERCASE": return caseIns
206 ? LOWERCASE().union(UPPERCASE(), TITLECASE())
207 : LOWERCASE();
208 case "NONCHARACTERCODEPOINT":
209 case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
210 case "TITLECASE": return caseIns
211 ? TITLECASE().union(LOWERCASE(), UPPERCASE())
212 : TITLECASE();
213 case "PUNCTUATION": return PUNCTUATION();
214 case "UPPERCASE": return caseIns
215 ? UPPERCASE().union(LOWERCASE(), TITLECASE())
216 : UPPERCASE();
217 case "WHITESPACE":
218 case "WHITE_SPACE": return WHITE_SPACE();
219 case "WORD": return WORD();
220 default: return null;
221 }
222 }
223
224 public static CharPredicate forUnicodeProperty(String propName, boolean caseIns) {
225 propName = propName.toUpperCase(Locale.ROOT);
226 CharPredicate p = getUnicodePredicate(propName, caseIns);
227 if (p != null)
228 return p;
229 return getPosixPredicate(propName, caseIns);
230 }
231
232 public static CharPredicate forPOSIXName(String propName, boolean caseIns) {
233 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH), caseIns);
234 }
235
236 /////////////////////////////////////////////////////////////////////////////
237
238 /**
239 * Returns a predicate matching all characters belong to a named
240 * UnicodeScript.
241 */
242 static CharPredicate forUnicodeScript(String name) {
243 final Character.UnicodeScript script;
244 try {
245 script = Character.UnicodeScript.forName(name);
246 return ch -> script == Character.UnicodeScript.of(ch);
247 } catch (IllegalArgumentException iae) {}
248 return null;
249 }
250
251 /**
252 * Returns a predicate matching all characters in a UnicodeBlock.
253 */
254 static CharPredicate forUnicodeBlock(String name) {
255 final Character.UnicodeBlock block;
256 try {
257 block = Character.UnicodeBlock.forName(name);
258 return ch -> block == Character.UnicodeBlock.of(ch);
259 } catch (IllegalArgumentException iae) {}
260 return null;
261 }
262
263 /////////////////////////////////////////////////////////////////////////////
264
265 // unicode categories, aliases, properties, java methods ...
266
267 static CharPredicate forProperty(String name, boolean caseIns) {
268 // Unicode character property aliases, defined in
269 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
270 switch (name) {
271 case "Cn": return category(1<<Character.UNASSIGNED);
272 case "Lu": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
273 (1<<Character.UPPERCASE_LETTER) |
274 (1<<Character.TITLECASE_LETTER)
275 : (1<<Character.UPPERCASE_LETTER));
276 case "Ll": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
277 (1<<Character.UPPERCASE_LETTER) |
278 (1<<Character.TITLECASE_LETTER)
279 : (1<<Character.LOWERCASE_LETTER));
280 case "Lt": return category(caseIns ? (1<<Character.LOWERCASE_LETTER) |
281 (1<<Character.UPPERCASE_LETTER) |
282 (1<<Character.TITLECASE_LETTER)
283 : (1<<Character.TITLECASE_LETTER));
284 case "Lm": return category(1<<Character.MODIFIER_LETTER);
285 case "Lo": return category(1<<Character.OTHER_LETTER);
286 case "Mn": return category(1<<Character.NON_SPACING_MARK);
287 case "Me": return category(1<<Character.ENCLOSING_MARK);
288 case "Mc": return category(1<<Character.COMBINING_SPACING_MARK);
289 case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER);
290 case "Nl": return category(1<<Character.LETTER_NUMBER);
291 case "No": return category(1<<Character.OTHER_NUMBER);
292 case "Zs": return category(1<<Character.SPACE_SEPARATOR);
293 case "Zl": return category(1<<Character.LINE_SEPARATOR);
294 case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR);
295 case "Cc": return category(1<<Character.CONTROL);
296 case "Cf": return category(1<<Character.FORMAT);
297 case "Co": return category(1<<Character.PRIVATE_USE);
298 case "Cs": return category(1<<Character.SURROGATE);
299 case "Pd": return category(1<<Character.DASH_PUNCTUATION);
300 case "Ps": return category(1<<Character.START_PUNCTUATION);
301 case "Pe": return category(1<<Character.END_PUNCTUATION);
302 case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION);
303 case "Po": return category(1<<Character.OTHER_PUNCTUATION);
340 case "LC": return category(((1<<Character.UPPERCASE_LETTER) |
341 (1<<Character.LOWERCASE_LETTER) |
342 (1<<Character.TITLECASE_LETTER)));
343 case "LD": return category(((1<<Character.UPPERCASE_LETTER) |
344 (1<<Character.LOWERCASE_LETTER) |
345 (1<<Character.TITLECASE_LETTER) |
346 (1<<Character.MODIFIER_LETTER) |
347 (1<<Character.OTHER_LETTER) |
348 (1<<Character.DECIMAL_DIGIT_NUMBER)));
349 case "L1": return range(0x00, 0xFF); // Latin-1
350 case "all": return Pattern.ALL();
351 // Posix regular expression character classes, defined in
352 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
353 case "ASCII": return range(0x00, 0x7F); // ASCII
354 case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters
355 case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters
356 case "Blank": return ctype(ASCII.BLANK); // Space and tab characters
357 case "Cntrl": return ctype(ASCII.CNTRL); // Control characters
358 case "Digit": return range('0', '9'); // Numeric characters
359 case "Graph": return ctype(ASCII.GRAPH); // printable and visible
360 case "Lower": return caseIns ? ctype(ASCII.ALPHA)
361 : range('a', 'z'); // Lower-case alphabetic
362 case "Print": return range(0x20, 0x7E); // Printable characters
363 case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters
364 case "Space": return ctype(ASCII.SPACE); // Space characters
365 case "Upper": return caseIns ? ctype(ASCII.ALPHA)
366 : range('A', 'Z'); // Upper-case alphabetic
367 case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits
368
369 // Java character properties, defined by methods in Character.java
370 case "javaLowerCase": return caseIns ? c -> Character.isLowerCase(c) ||
371 Character.isUpperCase(c) ||
372 Character.isTitleCase(c)
373 : Character::isLowerCase;
374 case "javaUpperCase": return caseIns ? c -> Character.isUpperCase(c) ||
375 Character.isLowerCase(c) ||
376 Character.isTitleCase(c)
377 : Character::isUpperCase;
378 case "javaAlphabetic": return Character::isAlphabetic;
379 case "javaIdeographic": return Character::isIdeographic;
380 case "javaTitleCase": return caseIns ? c -> Character.isTitleCase(c) ||
381 Character.isLowerCase(c) ||
382 Character.isUpperCase(c)
383 : Character::isTitleCase;
384 case "javaDigit": return Character::isDigit;
385 case "javaDefined": return Character::isDefined;
386 case "javaLetter": return Character::isLetter;
387 case "javaLetterOrDigit": return Character::isLetterOrDigit;
388 case "javaJavaIdentifierStart": return Character::isJavaIdentifierStart;
389 case "javaJavaIdentifierPart": return Character::isJavaIdentifierPart;
390 case "javaUnicodeIdentifierStart": return Character::isUnicodeIdentifierStart;
391 case "javaUnicodeIdentifierPart": return Character::isUnicodeIdentifierPart;
392 case "javaIdentifierIgnorable": return Character::isIdentifierIgnorable;
393 case "javaSpaceChar": return Character::isSpaceChar;
394 case "javaWhitespace": return Character::isWhitespace;
395 case "javaISOControl": return Character::isISOControl;
396 case "javaMirrored": return Character::isMirrored;
397 default: return null;
398 }
399 }
400
401 private static CharPredicate category(final int typeMask) {
402 return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
403 }
404
405 private static CharPredicate range(final int lower, final int upper) {
406 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
407 }
408
409 private static CharPredicate ctype(final int ctype) {
410 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
411 }
412
413 /////////////////////////////////////////////////////////////////////////////
414
415 /**
416 * Posix ASCII variants, not in the lookup map
|