src/java.base/share/classes/java/util/regex/CharPredicates.java
Print this page
*** 1,7 ****
/*
! * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
--- 1,7 ----
/*
! * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
*** 25,246 ****
package java.util.regex;
import java.util.HashMap;
import java.util.Locale;
! enum UnicodeProp {
! ALPHABETIC {
! public boolean is(int ch) {
! return Character.isAlphabetic(ch);
! }
! },
! LETTER {
! public boolean is(int ch) {
! return Character.isLetter(ch);
! }
! },
! IDEOGRAPHIC {
! public boolean is(int ch) {
! return Character.isIdeographic(ch);
! }
! },
! LOWERCASE {
! public boolean is(int ch) {
! return Character.isLowerCase(ch);
! }
! },
! UPPERCASE {
! public boolean is(int ch) {
! return Character.isUpperCase(ch);
! }
! },
! TITLECASE {
! public boolean is(int ch) {
! return Character.isTitleCase(ch);
! }
! },
- WHITE_SPACE {
// \p{Whitespace}
! public boolean is(int ch) {
! return ((((1 << Character.SPACE_SEPARATOR) |
(1 << Character.LINE_SEPARATOR) |
(1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
!= 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
- }
- },
- CONTROL {
// \p{gc=Control}
! public boolean is(int ch) {
! return Character.getType(ch) == Character.CONTROL;
! }
! },
- PUNCTUATION {
// \p{gc=Punctuation}
! public boolean is(int ch) {
! return ((((1 << Character.CONNECTOR_PUNCTUATION) |
(1 << Character.DASH_PUNCTUATION) |
(1 << Character.START_PUNCTUATION) |
(1 << Character.END_PUNCTUATION) |
(1 << Character.OTHER_PUNCTUATION) |
(1 << Character.INITIAL_QUOTE_PUNCTUATION) |
(1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
!= 0;
- }
- },
- HEX_DIGIT {
// \p{gc=Decimal_Number}
// \p{Hex_Digit} -> PropList.txt: Hex_Digit
! public boolean is(int ch) {
! return DIGIT.is(ch) ||
! (ch >= 0x0030 && ch <= 0x0039) ||
(ch >= 0x0041 && ch <= 0x0046) ||
(ch >= 0x0061 && ch <= 0x0066) ||
(ch >= 0xFF10 && ch <= 0xFF19) ||
(ch >= 0xFF21 && ch <= 0xFF26) ||
! (ch >= 0xFF41 && ch <= 0xFF46);
! }
! },
! ASSIGNED {
! public boolean is(int ch) {
! return Character.getType(ch) != Character.UNASSIGNED;
! }
! },
- NONCHARACTER_CODE_POINT {
// PropList.txt:Noncharacter_Code_Point
! public boolean is(int ch) {
! return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
! }
! },
!
! DIGIT {
! // \p{gc=Decimal_Number}
! public boolean is(int ch) {
! return Character.isDigit(ch);
! }
! },
- ALNUM {
// \p{alpha}
// \p{digit}
! public boolean is(int ch) {
! return ALPHABETIC.is(ch) || DIGIT.is(ch);
! }
! },
- BLANK {
// \p{Whitespace} --
// [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
// \p{gc=Line_Separator}
// \p{gc=Paragraph_Separator}]
! public boolean is(int ch) {
! return Character.getType(ch) == Character.SPACE_SEPARATOR ||
ch == 0x9; // \N{HT}
- }
- },
- GRAPH {
// [^
// \p{space}
// \p{gc=Control}
// \p{gc=Surrogate}
// \p{gc=Unassigned}]
! public boolean is(int ch) {
! return ((((1 << Character.SPACE_SEPARATOR) |
(1 << Character.LINE_SEPARATOR) |
(1 << Character.PARAGRAPH_SEPARATOR) |
(1 << Character.CONTROL) |
(1 << Character.SURROGATE) |
(1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
== 0;
- }
- },
- PRINT {
// \p{graph}
// \p{blank}
// -- \p{cntrl}
! public boolean is(int ch) {
! return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
! }
! },
- WORD {
// \p{alpha}
// \p{gc=Mark}
// \p{digit}
// \p{gc=Connector_Punctuation}
// \p{Join_Control} 200C..200D
!
! public boolean is(int ch) {
! return ALPHABETIC.is(ch) ||
! ((((1 << Character.NON_SPACING_MARK) |
(1 << Character.ENCLOSING_MARK) |
(1 << Character.COMBINING_SPACING_MARK) |
(1 << Character.DECIMAL_DIGIT_NUMBER) |
! (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
! != 0 ||
! JOIN_CONTROL.is(ch);
! }
! },
! JOIN_CONTROL {
! // 200C..200D PropList.txt:Join_Control
! public boolean is(int ch) {
! return (ch == 0x200C || ch == 0x200D);
}
- };
- private static final HashMap<String, String> posix = new HashMap<>();
- private static final HashMap<String, String> aliases = new HashMap<>();
static {
! posix.put("ALPHA", "ALPHABETIC");
! posix.put("LOWER", "LOWERCASE");
! posix.put("UPPER", "UPPERCASE");
! posix.put("SPACE", "WHITE_SPACE");
! posix.put("PUNCT", "PUNCTUATION");
! posix.put("XDIGIT","HEX_DIGIT");
! posix.put("ALNUM", "ALNUM");
! posix.put("CNTRL", "CONTROL");
! posix.put("DIGIT", "DIGIT");
! posix.put("BLANK", "BLANK");
! posix.put("GRAPH", "GRAPH");
! posix.put("PRINT", "PRINT");
!
! aliases.put("WHITESPACE", "WHITE_SPACE");
! aliases.put("HEXDIGIT","HEX_DIGIT");
! aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
! aliases.put("JOINCONTROL", "JOIN_CONTROL");
! }
!
! public static UnicodeProp forName(String propName) {
! propName = propName.toUpperCase(Locale.ENGLISH);
! String alias = aliases.get(propName);
! if (alias != null)
! propName = alias;
try {
! return valueOf (propName);
! } catch (IllegalArgumentException x) {}
return null;
}
! public static UnicodeProp forPOSIXName(String propName) {
! propName = posix.get(propName.toUpperCase(Locale.ENGLISH));
! if (propName == null)
return null;
- return valueOf (propName);
}
! public abstract boolean is(int ch);
}
--- 25,378 ----
package java.util.regex;
import java.util.HashMap;
import java.util.Locale;
+ import java.util.regex.Pattern.CharPredicate;
+ import java.util.regex.Pattern.BmpCharPredicate;
! class CharPredicates {
! static CharPredicate ALPHABETIC = Character::isAlphabetic;
! // \p{gc=Decimal_Number}
! static CharPredicate DIGIT = Character::isDigit;
! static CharPredicate LETTER = Character::isLetter;
! static CharPredicate IDEOGRAPHIC = Character::isIdeographic;
! static CharPredicate LOWERCASE = Character::isLowerCase;
! static CharPredicate UPPERCASE = Character::isUpperCase;
!
! static CharPredicate TITLECASE = Character::isTitleCase;
// \p{Whitespace}
! static CharPredicate WHITE_SPACE = ch ->
! ((((1 << Character.SPACE_SEPARATOR) |
(1 << Character.LINE_SEPARATOR) |
(1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
!= 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
// \p{gc=Control}
! static CharPredicate CONTROL = ch ->
! Character.getType(ch) == Character.CONTROL;
// \p{gc=Punctuation}
! static CharPredicate PUNCTUATION = ch ->
! ((((1 << Character.CONNECTOR_PUNCTUATION) |
(1 << Character.DASH_PUNCTUATION) |
(1 << Character.START_PUNCTUATION) |
(1 << Character.END_PUNCTUATION) |
(1 << Character.OTHER_PUNCTUATION) |
(1 << Character.INITIAL_QUOTE_PUNCTUATION) |
(1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
!= 0;
// \p{gc=Decimal_Number}
// \p{Hex_Digit} -> PropList.txt: Hex_Digit
! static CharPredicate HEX_DIGIT = DIGIT.union(
! ch -> (ch >= 0x0030 && ch <= 0x0039) ||
(ch >= 0x0041 && ch <= 0x0046) ||
(ch >= 0x0061 && ch <= 0x0066) ||
(ch >= 0xFF10 && ch <= 0xFF19) ||
(ch >= 0xFF21 && ch <= 0xFF26) ||
! (ch >= 0xFF41 && ch <= 0xFF46));
! static CharPredicate ASSIGNED = ch ->
! Character.getType(ch) != Character.UNASSIGNED;
// PropList.txt:Noncharacter_Code_Point
! static CharPredicate NONCHARACTER_CODE_POINT = ch ->
! (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
// \p{alpha}
// \p{digit}
! static CharPredicate ALNUM = ALPHABETIC.union(DIGIT);
// \p{Whitespace} --
// [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
// \p{gc=Line_Separator}
// \p{gc=Paragraph_Separator}]
! static CharPredicate BLANK = ch ->
! Character.getType(ch) == Character.SPACE_SEPARATOR ||
ch == 0x9; // \N{HT}
// [^
// \p{space}
// \p{gc=Control}
// \p{gc=Surrogate}
// \p{gc=Unassigned}]
! static CharPredicate GRAPH = ch ->
! ((((1 << Character.SPACE_SEPARATOR) |
(1 << Character.LINE_SEPARATOR) |
(1 << Character.PARAGRAPH_SEPARATOR) |
(1 << Character.CONTROL) |
(1 << Character.SURROGATE) |
(1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
== 0;
// \p{graph}
// \p{blank}
// -- \p{cntrl}
! static CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate());
!
! // 200C..200D PropList.txt:Join_Control
! static CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D;
// \p{alpha}
// \p{gc=Mark}
// \p{digit}
// \p{gc=Connector_Punctuation}
// \p{Join_Control} 200C..200D
! static CharPredicate WORD =
! ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) |
(1 << Character.ENCLOSING_MARK) |
(1 << Character.COMBINING_SPACING_MARK) |
(1 << Character.DECIMAL_DIGIT_NUMBER) |
! (1 << Character.CONNECTOR_PUNCTUATION))
! >> Character.getType(ch)) & 1) != 0,
! JOIN_CONTROL);
! /////////////////////////////////////////////////////////////////////////////
!
! private static final HashMap<String, CharPredicate> posix = new HashMap<>();
! private static final HashMap<String, CharPredicate> uprops = new HashMap<>();
!
! private static void defPosix(String name, CharPredicate p) {
! posix.put(name, p);
! }
! private static void defUProp(String name, CharPredicate p) {
! uprops.put(name, p);
}
static {
! defPosix("ALPHA", ALPHABETIC);
! defPosix("LOWER", LOWERCASE);
! defPosix("UPPER", UPPERCASE);
! defPosix("SPACE", WHITE_SPACE);
! defPosix("PUNCT", PUNCTUATION);
! defPosix("XDIGIT",HEX_DIGIT);
! defPosix("ALNUM", ALNUM);
! defPosix("CNTRL", CONTROL);
! defPosix("DIGIT", DIGIT);
! defPosix("BLANK", BLANK);
! defPosix("GRAPH", GRAPH);
! defPosix("PRINT", PRINT);
!
! defUProp("ALPHABETIC", ALPHABETIC);
! defUProp("ASSIGNED", ASSIGNED);
! defUProp("CONTROL", CONTROL);
! defUProp("HEXDIGIT", HEX_DIGIT);
! defUProp("IDEOGRAPHIC", IDEOGRAPHIC);
! defUProp("JOINCONTROL", JOIN_CONTROL);
! defUProp("LETTER", LETTER);
! defUProp("LOWERCASE", LOWERCASE);
! defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
! defUProp("TITLECASE", TITLECASE);
! defUProp("PUNCTUATION", PUNCTUATION);
! defUProp("UPPERCASE", UPPERCASE);
! defUProp("WHITESPACE", WHITE_SPACE);
! defUProp("WORD", WORD);
! defUProp("WHITE_SPACE", WHITE_SPACE);
! defUProp("HEX_DIGIT", HEX_DIGIT);
! defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT);
! defUProp("JOIN_CONTROL", JOIN_CONTROL);
! }
!
! public static CharPredicate forUnicodeProperty(String propName) {
! propName = propName.toUpperCase(Locale.ROOT);
! CharPredicate p = uprops.get(propName);
! if (p != null)
! return p;
! return posix.get(propName);
! }
!
! public static CharPredicate forPOSIXName(String propName) {
! return posix.get(propName.toUpperCase(Locale.ENGLISH));
! }
!
! /////////////////////////////////////////////////////////////////////////////
!
! /**
! * Returns a predicate matching all characters belonging to a named
! * UnicodeScript.
! */
! static CharPredicate forUnicodeScript(String name) {
! final Character.UnicodeScript script;
try {
! script = Character.UnicodeScript.forName(name);
! return ch -> script == Character.UnicodeScript.of(ch);
! } catch (IllegalArgumentException iae) {}
return null;
}
! /**
! * Returns a predicate matching all characters in a UnicodeBlock.
! */
! static CharPredicate forUnicodeBlock(String name) {
! final Character.UnicodeBlock block;
! try {
! block = Character.UnicodeBlock.forName(name);
! return ch -> block == Character.UnicodeBlock.of(ch);
! } catch (IllegalArgumentException iae) {}
return null;
}
! /////////////////////////////////////////////////////////////////////////////
!
! // unicode categories, aliases, properties, java methods ...
!
! private static final HashMap<String, CharPredicate> props = new HashMap<>();
!
! /**
! * Returns a predicate matching all characters in a named property.
! */
! static CharPredicate forProperty(String name) {
! return props.get(name);
! }
!
! private static void defProp(String name, CharPredicate p) {
! // PrintPattern.pmap.put(p, name);
! props.put(name, p);
! }
!
! private static void defCategory(String name, final int typeMask) {
! CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0;
! // PrintPattern.pmap.put(p, name);
! props.put(name, p);
! }
!
! private static void defRange(String name, final int lower, final int upper) {
! BmpCharPredicate p = ch -> lower <= ch && ch <= upper;
! // PrintPattern.pmap.put(p, name);
! props.put(name, p);
! }
!
! private static void defCtype(String name, final int ctype) {
! BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype);
! // PrintPattern.pmap.put(p, name);
! props.put(name, p);
! }
!
! static {
! // Unicode character property aliases, defined in
! // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
! defCategory("Cn", 1<<Character.UNASSIGNED);
! defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
! defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
! defCategory("Lt", 1<<Character.TITLECASE_LETTER);
! defCategory("Lm", 1<<Character.MODIFIER_LETTER);
! defCategory("Lo", 1<<Character.OTHER_LETTER);
! defCategory("Mn", 1<<Character.NON_SPACING_MARK);
! defCategory("Me", 1<<Character.ENCLOSING_MARK);
! defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
! defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
! defCategory("Nl", 1<<Character.LETTER_NUMBER);
! defCategory("No", 1<<Character.OTHER_NUMBER);
! defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
! defCategory("Zl", 1<<Character.LINE_SEPARATOR);
! defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
! defCategory("Cc", 1<<Character.CONTROL);
! defCategory("Cf", 1<<Character.FORMAT);
! defCategory("Co", 1<<Character.PRIVATE_USE);
! defCategory("Cs", 1<<Character.SURROGATE);
! defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
! defCategory("Ps", 1<<Character.START_PUNCTUATION);
! defCategory("Pe", 1<<Character.END_PUNCTUATION);
! defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
! defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
! defCategory("Sm", 1<<Character.MATH_SYMBOL);
! defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
! defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
! defCategory("So", 1<<Character.OTHER_SYMBOL);
! defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
! defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
! defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER) |
! (1<<Character.MODIFIER_LETTER) |
! (1<<Character.OTHER_LETTER)));
! defCategory("M", ((1<<Character.NON_SPACING_MARK) |
! (1<<Character.ENCLOSING_MARK) |
! (1<<Character.COMBINING_SPACING_MARK)));
! defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
! (1<<Character.LETTER_NUMBER) |
! (1<<Character.OTHER_NUMBER)));
! defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
! (1<<Character.LINE_SEPARATOR) |
! (1<<Character.PARAGRAPH_SEPARATOR)));
! defCategory("C", ((1<<Character.CONTROL) |
! (1<<Character.FORMAT) |
! (1<<Character.PRIVATE_USE) |
! (1<<Character.SURROGATE))); // Other
! defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
! (1<<Character.START_PUNCTUATION) |
! (1<<Character.END_PUNCTUATION) |
! (1<<Character.CONNECTOR_PUNCTUATION) |
! (1<<Character.OTHER_PUNCTUATION) |
! (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
! (1<<Character.FINAL_QUOTE_PUNCTUATION)));
! defCategory("S", ((1<<Character.MATH_SYMBOL) |
! (1<<Character.CURRENCY_SYMBOL) |
! (1<<Character.MODIFIER_SYMBOL) |
! (1<<Character.OTHER_SYMBOL)));
! defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER)));
! defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
! (1<<Character.LOWERCASE_LETTER) |
! (1<<Character.TITLECASE_LETTER) |
! (1<<Character.MODIFIER_LETTER) |
! (1<<Character.OTHER_LETTER) |
! (1<<Character.DECIMAL_DIGIT_NUMBER)));
! defRange("L1", 0x00, 0xFF); // Latin-1
! props.put("all", ch -> true);
!
! // Posix regular expression character classes, defined in
! // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
! defRange("ASCII", 0x00, 0x7F); // ASCII
! defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
! defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
! defCtype("Blank", ASCII.BLANK); // Space and tab characters
! defCtype("Cntrl", ASCII.CNTRL); // Control characters
! defRange("Digit", '0', '9'); // Numeric characters
! defCtype("Graph", ASCII.GRAPH); // printable and visible
! defRange("Lower", 'a', 'z'); // Lower-case alphabetic
! defRange("Print", 0x20, 0x7E); // Printable characters
! defCtype("Punct", ASCII.PUNCT); // Punctuation characters
! defCtype("Space", ASCII.SPACE); // Space characters
! defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
! defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
!
! // Java character properties, defined by methods in Character.java
! defProp("javaLowerCase", java.lang.Character::isLowerCase);
! defProp("javaUpperCase", Character::isUpperCase);
! defProp("javaAlphabetic", java.lang.Character::isAlphabetic);
! defProp("javaIdeographic", java.lang.Character::isIdeographic);
! defProp("javaTitleCase", java.lang.Character::isTitleCase);
! defProp("javaDigit", java.lang.Character::isDigit);
! defProp("javaDefined", java.lang.Character::isDefined);
! defProp("javaLetter", java.lang.Character::isLetter);
! defProp("javaLetterOrDigit", java.lang.Character::isLetterOrDigit);
! defProp("javaJavaIdentifierStart", java.lang.Character::isJavaIdentifierStart);
! defProp("javaJavaIdentifierPart", java.lang.Character::isJavaIdentifierPart);
! defProp("javaUnicodeIdentifierStart", java.lang.Character::isUnicodeIdentifierStart);
! defProp("javaUnicodeIdentifierPart", java.lang.Character::isUnicodeIdentifierPart);
! defProp("javaIdentifierIgnorable", java.lang.Character::isIdentifierIgnorable);
! defProp("javaSpaceChar", java.lang.Character::isSpaceChar);
! defProp("javaWhitespace", java.lang.Character::isWhitespace);
! defProp("javaISOControl", java.lang.Character::isISOControl);
! defProp("javaMirrored", java.lang.Character::isMirrored);
! }
!
! /////////////////////////////////////////////////////////////////////////////
!
! /**
! * Posix ASCII variants, not in the lookup map
! */
! static BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch);
! static BmpCharPredicate ASCII_WORD = ch -> ch < 128 && ASCII.isWord(ch);
! static BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch);
!
}