--- old/src/share/classes/java/util/regex/Pattern.java 2011-04-28 15:33:12.334988133 -0700 +++ new/src/share/classes/java/util/regex/Pattern.java 2011-04-28 15:33:11.999231406 -0700 @@ -206,13 +206,15 @@ * Equivalent to java.lang.Character.isMirrored() * *   - * Classes for Unicode scripts, blocks and categories + * Classes for Unicode scripts, blocks, categories and binary properties * * \p{IsLatin} - * A Latin script character (simple script) + * A Latin script character (script) * \p{InGreek} - * A character in the Greek block (simple block) + * A character in the Greek block (block) * \p{Lu} - * An uppercase letter (simple category) + * An uppercase letter (category) + * \p{IsAlphabetic} + * An alphabetic character (binary property) * \p{Sc} * A currency symbol * \P{InGreek} @@ -328,10 +330,11 @@ * X, as a named-capturing group * (?:X) * X, as a non-capturing group - * (?idmsux-idmsux)  + * (?idmsuxU-idmsuxU)  * Nothing, but turns match flags i * d m s - * u x on - off + * u x U + * on - off * (?idmsux-idmsux:X)   * X, as a non-capturing group with the * given flags i d @@ -518,61 +521,140 @@ * *

This class is in conformance with Level 1 of Unicode Technical - * Standard #18: Unicode Regular Expression Guidelines, plus RL2.1 + * Standard #18: Unicode Regular Expressions, plus RL2.1 * Canonical Equivalents. - * - *

Unicode escape sequences such as \u2014 in Java source code + *

+ * Unicode escape sequences such as \u2014 in Java source code * are processed as described in section 3.3 of * The Java™ Language Specification. - * Such escape sequences are also - * implemented directly by the regular-expression parser so that Unicode - * escapes can be used in expressions that are read from files or from the - * keyboard. Thus the strings "\u2014" and "\\u2014", - * while not equal, compile into the same pattern, which matches the character - * with hexadecimal value 0x2014. - * - *

A Unicode character can also be represented in a regular-expression by - * using its hexadecimal code point value directly as described in construct + * Such escape sequences are also implemented directly by the regular-expression + * parser so that Unicode escapes can be used in expressions that are read from + * files or from the keyboard. Thus the strings "\u2014" and + * "\\u2014", while not equal, compile into the same pattern, which + * matches the character with hexadecimal value 0x2014. + *

+ * A Unicode character can also be represented in a regular-expression by - * using its hexadecimal code point value directly as described in construct + * using its Hex notation (hexadecimal code point value) directly as described in construct * \x{...}, for example a supplementary character U+2011F * can be specified as \x{2011F}, instead of two consecutive * Unicode escape sequences of the surrogate pair * \uD840\uDD1F. - * - * - *

Unicode scripts, blocks and categories are written with the \p and - * \P constructs as in Perl. \p{prop} matches if + *

+ * Unicode scripts, blocks, categories and binary properties are written with + * the \p and \P constructs as in Perl. + * \p{prop} matches if * the input has the property prop, while \P{prop} * does not match if the input has that property. *

- * Scripts are specified either with the prefix {@code Is}, as in + * Scripts, blocks, categories and binary properties can be used both inside + * and outside of a character class. + * + *

+ * Scripts are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. *

- * Blocks are specified with the prefix {@code In}, as in + * The script names supported by Pattern are the valid script names + * accepted and defined by + * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. + * + *

+ * Blocks are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. *

- * Categories may be specified with the optional prefix {@code Is}: + * The block names supported by Pattern are the valid block names + * accepted and defined by + * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. + *

+ * + * Categories may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. *

- * Scripts, blocks and categories can be used both inside and outside of a - * character class. - *

The supported categories are those of + * The supported categories are those of * * The Unicode Standard in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. - * The script names supported by Pattern are the valid script names - * accepted and defined by - * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. - * The block names supported by Pattern are the valid block names - * accepted and defined by - * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. *

- *

Categories that behave like the java.lang.Character + * + * Binary properties are specified with the prefix {@code Is}, as in + * {@code IsAlphabetic}. The binary properties supported by Pattern + * are + *

+ + + *

+ * Predefined Character classes and POSIX character classes are in + * conformance with the recommendation of Annex C: Compatibility Properties + * of Unicode Regular Expressions + * , when the {@link #UNICODE_CHARACTER_CLASS} flag is specified. + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ClassesMatches
\p{Lower}A lowercase character:\p{IsLowercase}
\p{Upper}An uppercase character:\p{IsUppercase}
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:\p{IsAlphabetic}
\p{Digit}A decimal digit character:\p{IsDigit}
\p{Alnum}An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
\p{Punct}A punctuation character:\p{IsPunctuation}
\p{Graph}A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
\p{Print}A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
\p{Blank}A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
\p{Cntrl}A control character: \p{gc=Cc}
\p{XDigit}A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
\p{Space}A whitespace character:\p{IsWhite_Space}
\dA digit: \p{IsDigit}
\DA non-digit: [^\d]
\sA whitespace character: \p{IsWhite_Space}
\SA non-whitespace character: [^\s]
\wA word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
\WA non-word character: [^\w]
+ *

+ * + * Categories that behave like the java.lang.Character * boolean ismethodname methods (except for the deprecated ones) are * available through the same \p{prop} syntax where * the specified property has the name javamethodname. @@ -796,6 +878,28 @@ */ public static final int CANON_EQ = 0x80; + /** + * Enables the Unicode version of Predefined character classes and + * POSIX character classes. + * + *

When this flag is specified then the (US-ASCII only) + * Predefined character classes and POSIX character classes + * are in conformance with + * Unicode Technical + * Standard #18: Unicode Regular Expressions + * Annex C: Compatibility Properties. + *

+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded + * flag expression (?U). + *

+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case + * folding. + *

+ * Specifying this flag may impose a performance penalty.

+ * @since 1.7 + */ + public static final int UNICODE_CHARACTER_CLASS = 0x100; + /* Pattern has only two serialized components: The pattern string * and the flags, which are all that is needed to recompile the pattern * when it is deserialized. @@ -918,7 +1022,8 @@ * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, - * {@link #LITERAL} and {@link #COMMENTS} + * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} + * and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined @@ -1209,6 +1314,10 @@ pattern = p; flags = f; + // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present + if ((flags & UNICODE_CHARACTER_CLASS) != 0) + flags |= UNICODE_CASE; + // Reset group index count capturingGroupCount = 1; localCount = 0; @@ -2164,12 +2273,14 @@ return -1; case 'B': if (inclass) break; - if (create) root = new Bound(Bound.NONE); + if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': - if (create) root = new Ctype(ASCII.DIGIT).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT).complement() + : new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': @@ -2191,14 +2302,18 @@ case 'R': break; case 'S': - if (create) root = new Ctype(ASCII.SPACE).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE).complement() + : new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': case 'V': break; case 'W': - if (create) root = new Ctype(ASCII.WORD).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? 
new Utype(UnicodeProp.WORD).complement() + : new Ctype(ASCII.WORD).complement(); return -1; case 'X': case 'Y': @@ -2216,12 +2331,14 @@ return '\007'; case 'b': if (inclass) break; - if (create) root = new Bound(Bound.BOTH); + if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); return -1; case 'c': return c(); case 'd': - if (create) root = new Ctype(ASCII.DIGIT); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT) + : new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; @@ -2259,7 +2376,9 @@ case 'r': return '\r'; case 's': - if (create) root = new Ctype(ASCII.SPACE); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE) + : new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; @@ -2268,7 +2387,9 @@ case 'v': return '\013'; case 'w': - if (create) root = new Ctype(ASCII.WORD); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WORD) + : new Ctype(ASCII.WORD); return -1; case 'x': return x(); @@ -2490,7 +2611,7 @@ { next(); String name; - CharProperty node; + CharProperty node = null; if (singleLetter) { int c = temp[cursor]; @@ -2536,11 +2657,21 @@ } else if (name.startsWith("Is")) { // \p{isGeneralCategory} and \p{isScriptName} name = name.substring(2); - node = CharPropertyNames.charPropertyFor(name); + UnicodeProp uprop = UnicodeProp.forName(name); + if (uprop != null) + node = new Utype(uprop); + if (node == null) + node = CharPropertyNames.charPropertyFor(name); if (node == null) node = unicodeScriptPropertyFor(name); } else { - node = charPropertyNodeFor(name); + if (has(UNICODE_CHARACTER_CLASS)) { + UnicodeProp uprop = UnicodeProp.forPOSIXName(name); + if (uprop != null) + node = new Utype(uprop); + } + if (node == null) + node = charPropertyNodeFor(name); } } if (maybeComplement) { @@ -2822,6 +2953,9 @@ case 'x': flags |= COMMENTS; break; + case 'U': + flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; case '-': // subFlag then 
fall through ch = next(); subFlag(); @@ -2861,6 +2995,9 @@ case 'x': flags &= ~COMMENTS; break; + case 'U': + flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; // without this, 'U' falls into default and returns early, unlike every other flag case default: return; } @@ -3664,6 +3801,18 @@ } /** + * Node class that matches a Unicode "type" + */ + static final class Utype extends CharProperty { + final UnicodeProp uprop; + Utype(UnicodeProp uprop) { this.uprop = uprop; } + boolean isSatisfiedBy(int ch) { + return uprop.is(ch); + } + } + + + /** * Node class that matches a POSIX type. */ static final class Ctype extends BmpCharProperty { @@ -5025,9 +5174,17 @@ static int BOTH = 0x3; static int NONE = 0x4; int type; - Bound(int n) { + boolean useUWORD; + Bound(int n, boolean useUWORD) { type = n; + this.useUWORD = useUWORD; + } + + boolean isWord(int ch) { + return useUWORD ? UnicodeProp.WORD.is(ch) + : (ch == '_' || Character.isLetterOrDigit(ch)); } + int check(Matcher matcher, int i, CharSequence seq) { int ch; boolean left = false; @@ -5039,14 +5196,14 @@ } if (i > startIndex) { ch = Character.codePointBefore(seq, i); - left = (ch == '_' || Character.isLetterOrDigit(ch) || + left = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i-1, seq))); } boolean right = false; if (i < endIndex) { ch = Character.codePointAt(seq, i); - right = (ch == '_' || Character.isLetterOrDigit(ch) || + right = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i, seq))); } else { @@ -5428,6 +5585,12 @@ defClone("javaUpperCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return Character.isUpperCase(ch);}}); + defClone("javaAlphabetic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isAlphabetic(ch);}}); + defClone("javaIdeographic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isIdeographic(ch);}}); defClone("javaTitleCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return
Character.isTitleCase(ch);}});