--- old/src/share/classes/java/util/regex/Pattern.java 2011-04-28 15:33:12.334988133 -0700 +++ new/src/share/classes/java/util/regex/Pattern.java 2011-04-28 15:33:11.999231406 -0700 @@ -206,13 +206,15 @@ * Equivalent to java.lang.Character.isMirrored() * *   - * Classes for Unicode scripts, blocks and categories + * Classes for Unicode scripts, blocks, categories and binary properties * * \p{IsLatin} - * A Latin script character (simple script) + * A Latin script character (script) * \p{InGreek} - * A character in the Greek block (simple block) + * A character in the Greek block (block) * \p{Lu} - * An uppercase letter (simple category) + * An uppercase letter (category) + * \p{isAlphabetic} + * An alphabetic character (binary property) * \p{Sc} * A currency symbol * \P{InGreek} @@ -328,10 +330,11 @@ * X, as a named-capturing group * (?:X) * X, as a non-capturing group - * (?idmsux-idmsux)  + * (?idmsuxU-idmsuxU)  * Nothing, but turns match flags i * d m s - * u x on - off + * u x U + * on - off * (?idmsux-idmsux:X)   * X, as a non-capturing group with the * given flags i d @@ -518,61 +521,140 @@ * *

This class is in conformance with Level 1 of Unicode Technical - * Standard #18: Unicode Regular Expression Guidelines, plus RL2.1 + * Standard #18: Unicode Regular Expression, plus RL2.1 * Canonical Equivalents. - * - *

Unicode escape sequences such as \u2014 in Java source code + *

+ * Unicode escape sequences such as \u2014 in Java source code * are processed as described in section 3.3 of * The Java™ Language Specification. - * Such escape sequences are also - * implemented directly by the regular-expression parser so that Unicode - * escapes can be used in expressions that are read from files or from the - * keyboard. Thus the strings "\u2014" and "\\u2014", - * while not equal, compile into the same pattern, which matches the character - * with hexadecimal value 0x2014. - * - *

A Unicode character can also be represented in a regular-expression by - * using its hexadecimal code point value directly as described in construct + * Such escape sequences are also implemented directly by the regular-expression + * parser so that Unicode escapes can be used in expressions that are read from + * files or from the keyboard. Thus the strings "\u2014" and + * "\\u2014", while not equal, compile into the same pattern, which + * matches the character with hexadecimal value 0x2014. + *

+ * A Unicode character can also be represented in a regular-expression by + * using its Hex notation (hexadecimal code point value) directly as described in construct * \x{...}, for example a supplementary character U+2011F * can be specified as \x{2011F}, instead of two consecutive * Unicode escape sequences of the surrogate pair * \uD840\uDD1F. - * - * - *

Unicode scripts, blocks and categories are written with the \p and - * \P constructs as in Perl. \p{prop} matches if + *

+ * Unicode scripts, blocks, categories and binary properties are written with + * the \p and \P constructs as in Perl. + * \p{prop} matches if * the input has the property prop, while \P{prop} * does not match if the input has that property. *

- * Scripts are specified either with the prefix {@code Is}, as in + * Scripts, blocks, categories and binary properties can be used both inside + * and outside of a character class. + * + *

+ * Scripts are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. *

- * Blocks are specified with the prefix {@code In}, as in + * The script names supported by Pattern are the valid script names + * accepted and defined by + * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. + * + *

+ * Blocks are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. *

- * Categories may be specified with the optional prefix {@code Is}: + * The block names supported by Pattern are the valid block names + * accepted and defined by + * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. + *

+ * + * Categories may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. *

- * Scripts, blocks and categories can be used both inside and outside of a - * character class. - *

The supported categories are those of + * The supported categories are those of * * The Unicode Standard in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. - * The script names supported by Pattern are the valid script names - * accepted and defined by - * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. - * The block names supported by Pattern are the valid block names - * accepted and defined by - * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. *

- *

Categories that behave like the java.lang.Character + * + * Binary properties are specified with the prefix {@code Is}, as in + * {@code IsAlphabetic}. The binary properties supported by Pattern + * are + *

+ + + *

+ * Predefined Character classes and POSIX character classes are in + * conformance with the recommendation of Annex C: Compatibility Properties + * of Unicode Regular Expression + * , when {@link #UNICODE_CHARACTER_CLASS} flag is specified. + *

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ClassesMatches
\p{Lower}A lowercase character:\p{IsLowercase}
\p{Upper}An uppercase character:\p{IsUppercase}
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:\p{IsAlphabetic}
\p{Digit}A decimal digit character:\p{IsDigit}
\p{Alnum}An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
\p{Punct}A punctuation character:\p{IsPunctuation}
\p{Graph}A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
\p{Print}A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
\p{Blank}A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
\p{Cntrl}A control character: \p{gc=Cc}
\p{XDigit}A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
\p{Space}A whitespace character:\p{IsWhite_Space}
\dA digit: \p{IsDigit}
\DA non-digit: [^\d]
\sA whitespace character: \p{IsWhite_Space}
\SA non-whitespace character: [^\s]
\wA word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
\WA non-word character: [^\w]
+ *

+ * + * Categories that behave like the java.lang.Character * boolean ismethodname methods (except for the deprecated ones) are * available through the same \p{prop} syntax where * the specified property has the name javamethodname. @@ -796,6 +878,28 @@ */ public static final int CANON_EQ = 0x80; + /** + * Enables the Unicode version of Predefined character classes and + * POSIX character classes. + * + *

When this flag is specified then the (US-ASCII only) + * Predefined character classes and POSIX character classes + * are in conformance with + * Unicode Technical + * Standard #18: Unicode Regular Expression + * Annex C: Compatibility Properties. + *

+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded + * flag expression (?U). + *

+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case + * folding. + *

+ * Specifying this flag may impose a performance penalty.

+ * @since 1.7 + */ + public static final int UNICODE_CHARACTER_CLASS = 0x100; + /* Pattern has only two serialized components: The pattern string * and the flags, which are all that is needed to recompile the pattern * when it is deserialized. @@ -918,7 +1022,8 @@ * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, - * {@link #LITERAL} and {@link #COMMENTS} + * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} + * and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined @@ -1209,6 +1314,10 @@ pattern = p; flags = f; + // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present + if ((flags & UNICODE_CHARACTER_CLASS) != 0) + flags |= UNICODE_CASE; + // Reset group index count capturingGroupCount = 1; localCount = 0; @@ -2164,12 +2273,14 @@ return -1; case 'B': if (inclass) break; - if (create) root = new Bound(Bound.NONE); + if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': - if (create) root = new Ctype(ASCII.DIGIT).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT).complement() + : new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': @@ -2191,14 +2302,18 @@ case 'R': break; case 'S': - if (create) root = new Ctype(ASCII.SPACE).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE).complement() + : new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': case 'V': break; case 'W': - if (create) root = new Ctype(ASCII.WORD).complement(); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? 
new Utype(UnicodeProp.WORD).complement() + : new Ctype(ASCII.WORD).complement(); return -1; case 'X': case 'Y': @@ -2216,12 +2331,14 @@ return '\007'; case 'b': if (inclass) break; - if (create) root = new Bound(Bound.BOTH); + if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); return -1; case 'c': return c(); case 'd': - if (create) root = new Ctype(ASCII.DIGIT); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.DIGIT) + : new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; @@ -2259,7 +2376,9 @@ case 'r': return '\r'; case 's': - if (create) root = new Ctype(ASCII.SPACE); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WHITE_SPACE) + : new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; @@ -2268,7 +2387,9 @@ case 'v': return '\013'; case 'w': - if (create) root = new Ctype(ASCII.WORD); + if (create) root = has(UNICODE_CHARACTER_CLASS) + ? new Utype(UnicodeProp.WORD) + : new Ctype(ASCII.WORD); return -1; case 'x': return x(); @@ -2490,7 +2611,7 @@ { next(); String name; - CharProperty node; + CharProperty node = null; if (singleLetter) { int c = temp[cursor]; @@ -2536,11 +2657,21 @@ } else if (name.startsWith("Is")) { // \p{isGeneralCategory} and \p{isScriptName} name = name.substring(2); - node = CharPropertyNames.charPropertyFor(name); + UnicodeProp uprop = UnicodeProp.forName(name); + if (uprop != null) + node = new Utype(uprop); + if (node == null) + node = CharPropertyNames.charPropertyFor(name); if (node == null) node = unicodeScriptPropertyFor(name); } else { - node = charPropertyNodeFor(name); + if (has(UNICODE_CHARACTER_CLASS)) { + UnicodeProp uprop = UnicodeProp.forPOSIXName(name); + if (uprop != null) + node = new Utype(uprop); + } + if (node == null) + node = charPropertyNodeFor(name); } } if (maybeComplement) { @@ -2822,6 +2953,9 @@ case 'x': flags |= COMMENTS; break; + case 'U': + flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; case '-': // subFlag then 
fall through ch = next(); subFlag(); @@ -2861,6 +2995,8 @@ case 'x': flags &= ~COMMENTS; break; + case 'U': + flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); default: return; } @@ -3664,6 +3800,18 @@ } /** + * Node class that matches a Unicode "type" + */ + static final class Utype extends CharProperty { + final UnicodeProp uprop; + Utype(UnicodeProp uprop) { this.uprop = uprop; } + boolean isSatisfiedBy(int ch) { + return uprop.is(ch); + } + } + + + /** * Node class that matches a POSIX type. */ static final class Ctype extends BmpCharProperty { @@ -5025,9 +5173,17 @@ static int BOTH = 0x3; static int NONE = 0x4; int type; - Bound(int n) { + boolean useUWORD; + Bound(int n, boolean useUWORD) { type = n; + this.useUWORD = useUWORD; + } + + boolean isWord(int ch) { + return useUWORD ? UnicodeProp.WORD.is(ch) + : (ch == '_' || Character.isLetterOrDigit(ch)); } + int check(Matcher matcher, int i, CharSequence seq) { int ch; boolean left = false; @@ -5039,14 +5195,14 @@ } if (i > startIndex) { ch = Character.codePointBefore(seq, i); - left = (ch == '_' || Character.isLetterOrDigit(ch) || + left = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i-1, seq))); } boolean right = false; if (i < endIndex) { ch = Character.codePointAt(seq, i); - right = (ch == '_' || Character.isLetterOrDigit(ch) || + right = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i, seq))); } else { @@ -5428,6 +5584,12 @@ defClone("javaUpperCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return Character.isUpperCase(ch);}}); + defClone("javaAlphabetic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isAlphabetic(ch);}}); + defClone("javaIdeographic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isIdeographic(ch);}}); defClone("javaTitleCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return 
Character.isTitleCase(ch);}}); --- /dev/null 2011-01-11 11:52:10.886369177 -0800 +++ new/src/share/classes/java/util/regex/UnicodeProp.java 2011-04-28 15:33:14.698997106 -0700 @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package java.util.regex; + +import java.util.HashMap; +import java.util.Locale; + +enum UnicodeProp { + + ALPHABETIC { + public boolean is(int ch) { + return Character.isAlphabetic(ch); + } + }, + + LETTER { + public boolean is(int ch) { + return Character.isLetter(ch); + } + }, + + IDEOGRAPHIC { + public boolean is(int ch) { + return Character.isIdeographic(ch); + } + }, + + LOWERCASE { + public boolean is(int ch) { + return Character.isLowerCase(ch); + } + }, + + UPPERCASE { + public boolean is(int ch) { + return Character.isUpperCase(ch); + } + }, + + TITLECASE { + public boolean is(int ch) { + return Character.isTitleCase(ch); + } + }, + + WHITE_SPACE { + // \p{Whitespace} + public boolean is(int ch) { + return ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) + != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); + } + }, + + CONTROL { + // \p{gc=Control} + public boolean is(int ch) { + return Character.getType(ch) == Character.CONTROL; + } + }, + + PUNCTUATION { + // \p{gc=Punctuation} + public boolean is(int ch) { + return ((((1 << Character.CONNECTOR_PUNCTUATION) | + (1 << Character.DASH_PUNCTUATION) | + (1 << Character.START_PUNCTUATION) | + (1 << Character.END_PUNCTUATION) | + (1 << Character.OTHER_PUNCTUATION) | + (1 << Character.INITIAL_QUOTE_PUNCTUATION) | + (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + } + }, + + HEX_DIGIT { + // \p{gc=Decimal_Number} + // \p{Hex_Digit} -> PropList.txt: Hex_Digit + public boolean is(int ch) { + return DIGIT.is(ch) || + (ch >= 0x0030 && ch <= 0x0039) || + (ch >= 0x0041 && ch <= 0x0046) || + (ch >= 0x0061 && ch <= 0x0066) || + (ch >= 0xFF10 && ch <= 0xFF19) || + (ch >= 0xFF21 && ch <= 0xFF26) || + (ch >= 0xFF41 && ch <= 0xFF46); + } + }, + + ASSIGNED { + public boolean is(int ch) { + return Character.getType(ch) != Character.UNASSIGNED; + } + }, + + NONCHARACTER_CODE_POINT { + // 
PropList.txt:Noncharacter_Code_Point + public boolean is(int ch) { + return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); + } + }, + + DIGIT { + // \p{gc=Decimal_Number} + public boolean is(int ch) { + return Character.isDigit(ch); + } + }, + + ALNUM { + // \p{alpha} + // \p{digit} + public boolean is(int ch) { + return ALPHABETIC.is(ch) || DIGIT.is(ch); + } + }, + + BLANK { + // \p{Whitespace} -- + // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 + // \p{gc=Line_Separator} + // \p{gc=Paragraph_Separator}] + public boolean is(int ch) { + return Character.getType(ch) == Character.SPACE_SEPARATOR || + ch == 0x9; // \N{HT} + } + }, + + GRAPH { + // [^ + // \p{space} + // \p{gc=Control} + // \p{gc=Surrogate} + // \p{gc=Unassigned}] + public boolean is(int ch) { + return ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR) | + (1 << Character.CONTROL) | + (1 << Character.SURROGATE) | + (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) + == 0; + } + }, + + PRINT { + // \p{graph} + // \p{blank} + // -- \p{cntrl} + public boolean is(int ch) { + return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch); + } + }, + + WORD { + // \p{alpha} + // \p{gc=Mark} + // \p{digit} + // \p{gc=Connector_Punctuation} + + public boolean is(int ch) { + return ALPHABETIC.is(ch) || + ((((1 << Character.NON_SPACING_MARK) | + (1 << Character.ENCLOSING_MARK) | + (1 << Character.COMBINING_SPACING_MARK) | + (1 << Character.DECIMAL_DIGIT_NUMBER) | + (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + } + }; + + private final static HashMap posix = new HashMap<>(); + private final static HashMap aliases = new HashMap<>(); + static { + posix.put("ALPHA", "ALPHABETIC"); + posix.put("LOWER", "LOWERCASE"); + posix.put("UPPER", "UPPERCASE"); + posix.put("SPACE", "WHITE_SPACE"); + posix.put("PUNCT", "PUNCTUATION"); + posix.put("XDIGIT","HEX_DIGIT"); + posix.put("ALNUM", 
"ALNUM"); + posix.put("CNTRL", "CONTROL"); + posix.put("DIGIT", "DIGIT"); + posix.put("BLANK", "BLANK"); + posix.put("GRAPH", "GRAPH"); + posix.put("PRINT", "PRINT"); + + aliases.put("WHITESPACE", "WHITE_SPACE"); + aliases.put("HEXDIGIT","HEX_DIGIT"); + aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT"); + } + + public static UnicodeProp forName(String propName) { + propName = propName.toUpperCase(Locale.ENGLISH); + String alias = aliases.get(propName); + if (alias != null) + propName = alias; + try { + return valueOf (propName); + } catch (IllegalArgumentException x) {} + return null; + } + + public static UnicodeProp forPOSIXName(String propName) { + propName = posix.get(propName.toUpperCase(Locale.ENGLISH)); + if (propName == null) + return null; + return valueOf (propName); + } + + public abstract boolean is(int ch); +} --- old/test/java/util/regex/RegExTest.java 2011-04-28 15:33:16.165605921 -0700 +++ new/test/java/util/regex/RegExTest.java 2011-04-28 15:33:15.736361230 -0700 @@ -32,7 +32,7 @@ * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 - * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 + * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066 */ import java.util.regex.*; @@ -137,6 +137,7 @@ nonBmpClassComplementTest(); unicodePropertiesTest(); unicodeHexNotationTest(); + unicodeClassesTest(); if (failure) throw new RuntimeException("Failure in the RE handling."); else @@ -3656,5 +3657,146 @@ failCount++; } report("unicodeHexNotation"); - } + } + + private static void unicodeClassesTest() throws Exception { + + Matcher lower = Pattern.compile("\\p{Lower}").matcher(""); + Matcher upper = Pattern.compile("\\p{Upper}").matcher(""); + Matcher ASCII = Pattern.compile("\\p{ASCII}").matcher(""); + Matcher alpha = Pattern.compile("\\p{Alpha}").matcher(""); + 
Matcher digit = Pattern.compile("\\p{Digit}").matcher(""); + Matcher alnum = Pattern.compile("\\p{Alnum}").matcher(""); + Matcher punct = Pattern.compile("\\p{Punct}").matcher(""); + Matcher graph = Pattern.compile("\\p{Graph}").matcher(""); + Matcher print = Pattern.compile("\\p{Print}").matcher(""); + Matcher blank = Pattern.compile("\\p{Blank}").matcher(""); + Matcher cntrl = Pattern.compile("\\p{Cntrl}").matcher(""); + Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher(""); + Matcher space = Pattern.compile("\\p{Space}").matcher(""); + Matcher bound = Pattern.compile("\\b").matcher(""); + Matcher word = Pattern.compile("\\w++").matcher(""); + // UNICODE_CHARACTER_CLASS + Matcher lowerU = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher upperU = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher ASCIIU = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher alphaU = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher digitU = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher alnumU = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher punctU = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher graphU = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher printU = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher blankU = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher cntrlU = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher spaceU = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher boundU = Pattern.compile("\\b", 
Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher wordU = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + // embedded flag (?U) + Matcher lowerEU = Pattern.compile("(?U)\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher graphEU = Pattern.compile("(?U)\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher wordEU = Pattern.compile("(?U)\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + + Matcher bwb = Pattern.compile("\\b\\w\\b").matcher(""); + Matcher bwbU = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + Matcher bwbEU = Pattern.compile("(?U)\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher(""); + // properties + Matcher lowerP = Pattern.compile("\\p{IsLowerCase}").matcher(""); + Matcher upperP = Pattern.compile("\\p{IsUpperCase}").matcher(""); + Matcher titleP = Pattern.compile("\\p{IsTitleCase}").matcher(""); + Matcher letterP = Pattern.compile("\\p{IsLetter}").matcher(""); + Matcher alphaP = Pattern.compile("\\p{IsAlphabetic}").matcher(""); + Matcher ideogP = Pattern.compile("\\p{IsIdeographic}").matcher(""); + Matcher cntrlP = Pattern.compile("\\p{IsControl}").matcher(""); + Matcher spaceP = Pattern.compile("\\p{IsWhiteSpace}").matcher(""); + Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher(""); + Matcher nonCCPP = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher(""); + + // javaMethod + Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher(""); + Matcher upperJ = Pattern.compile("\\p{javaUpperCase}").matcher(""); + Matcher alphaJ = Pattern.compile("\\p{javaAlphabetic}").matcher(""); + Matcher ideogJ = Pattern.compile("\\p{javaIdeographic}").matcher(""); + + for (int cp = 1; cp < 0x30000; cp++) { + String str = new String(Character.toChars(cp)); + int type = Character.getType(cp); + if (// lower + POSIX_ASCII.isLower(cp) != lower.reset(str).matches() || + Character.isLowerCase(cp) != lowerU.reset(str).matches() || + 
Character.isLowerCase(cp) != lowerP.reset(str).matches() || + Character.isLowerCase(cp) != lowerEU.reset(str).matches()|| + Character.isLowerCase(cp) != lowerJ.reset(str).matches()|| + // upper + POSIX_ASCII.isUpper(cp) != upper.reset(str).matches() || + POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() || + Character.isUpperCase(cp) != upperP.reset(str).matches() || + Character.isUpperCase(cp) != upperJ.reset(str).matches() || + // alpha + POSIX_ASCII.isAlpha(cp) != alpha.reset(str).matches() || + POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() || + Character.isAlphabetic(cp)!= alphaP.reset(str).matches() || + Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() || + // digit + POSIX_ASCII.isDigit(cp) != digit.reset(str).matches() || + Character.isDigit(cp) != digitU.reset(str).matches() || + // alnum + POSIX_ASCII.isAlnum(cp) != alnum.reset(str).matches() || + POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() || + // punct + POSIX_ASCII.isPunct(cp) != punct.reset(str).matches() || + POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() || + // graph + POSIX_ASCII.isGraph(cp) != graph.reset(str).matches() || + POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() || + POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()|| + // blank + POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK) + != blank.reset(str).matches() || + POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() || + // print + POSIX_ASCII.isPrint(cp) != print.reset(str).matches() || + POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() || + // cntrl + POSIX_ASCII.isCntrl(cp) != cntrl.reset(str).matches() || + POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() || + (Character.CONTROL == type) != cntrlP.reset(str).matches() || + // hexdigit + POSIX_ASCII.isHexDigit(cp) != xdigit.reset(str).matches() || + POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() || + // space + POSIX_ASCII.isSpace(cp) != space.reset(str).matches() || + 
POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() || + POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() || + // word + POSIX_ASCII.isWord(cp) != word.reset(str).matches() || + POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() || + POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()|| + // bwordb + POSIX_ASCII.isWord(cp) != bwb.reset(str).matches() || + POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() || + // properties + Character.isTitleCase(cp) != titleP.reset(str).matches() || + Character.isLetter(cp) != letterP.reset(str).matches()|| + Character.isIdeographic(cp) != ideogP.reset(str).matches() || + Character.isIdeographic(cp) != ideogJ.reset(str).matches() || + (Character.UNASSIGNED == type) == definedP.reset(str).matches() || + POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches()) + failCount++; + } + + // bounds/word align + twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10); + if (!bwbU.reset("\u0180sherman\u0400").matches()) + failCount++; + twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11); + if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches()) + failCount++; + twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4); + if (!bwbU.reset("\u0724\u0739\u0724").matches()) + failCount++; + if (!bwbEU.reset("\u0724\u0739\u0724").matches()) + failCount++; + report("unicodePredefinedClasses"); + } } --- /dev/null 2011-01-11 11:52:10.886369177 -0800 +++ new/test/java/util/regex/POSIX_Unicode.java 2011-04-28 15:33:19.821266489 -0700 @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +import java.util.HashMap; +import java.util.Locale; + +final public class POSIX_Unicode { + + public static boolean isAlpha(int ch) { + return Character.isAlphabetic(ch); + } + + public static boolean isLower(int ch) { + return Character.isLowerCase(ch); + } + + public static boolean isUpper(int ch) { + return Character.isUpperCase(ch); + } + + // \p{Whitespace} + public static boolean isSpace(int ch) { + return ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) + != 0 || + (ch >= 0x9 && ch <= 0xd) || + (ch == 0x85); + } + + // \p{gc=Control} + public static boolean isCntrl(int ch) { + return Character.getType(ch) == Character.CONTROL; + } + + // \p{gc=Punctuation} + public static boolean isPunct(int ch) { + return ((((1 << Character.CONNECTOR_PUNCTUATION) | + (1 << Character.DASH_PUNCTUATION) | + (1 << Character.START_PUNCTUATION) | + (1 << Character.END_PUNCTUATION) | + (1 << Character.OTHER_PUNCTUATION) | + (1 << Character.INITIAL_QUOTE_PUNCTUATION) | + (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + } + + // \p{gc=Decimal_Number} + // \p{Hex_Digit} -> PropList.txt: Hex_Digit + public static boolean isHexDigit(int 
ch) { + return Character.isDigit(ch) || + (ch >= 0x0030 && ch <= 0x0039) || + (ch >= 0x0041 && ch <= 0x0046) || + (ch >= 0x0061 && ch <= 0x0066) || + (ch >= 0xFF10 && ch <= 0xFF19) || + (ch >= 0xFF21 && ch <= 0xFF26) || + (ch >= 0xFF41 && ch <= 0xFF46); + } + + // \p{gc=Decimal_Number} + public static boolean isDigit(int ch) { + return Character.isDigit(ch); + }; + + // \p{alpha} + // \p{digit} + public static boolean isAlnum(int ch) { + return Character.isAlphabetic(ch) || Character.isDigit(ch); + } + + // \p{Whitespace} -- + // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 + // \p{gc=Line_Separator} + // \p{gc=Paragraph_Separator}] + public static boolean isBlank(int ch) { + int type = Character.getType(ch); + return isSpace(ch) && + ch != 0xa & ch != 0xb && ch !=0xc && ch != 0xd && ch != 0x85 && + type != Character.LINE_SEPARATOR && + type != Character.PARAGRAPH_SEPARATOR; + } + + // [^ + // \p{space} + // \p{gc=Control} + // \p{gc=Surrogate} + // \p{gc=Unassigned}] + public static boolean isGraph(int ch) { + int type = Character.getType(ch); + return !(isSpace(ch) || + Character.CONTROL == type || + Character.SURROGATE == type || + Character.UNASSIGNED == type); + } + + // \p{graph} + // \p{blank} + // -- \p{cntrl} + public static boolean isPrint(int ch) { + return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch); + } + + // PropList.txt:Noncharacter_Code_Point + public static boolean isNoncharacterCodePoint(int ch) { + return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); + } + + // \p{alpha} + // \p{gc=Mark} + // \p{digit} + // \p{gc=Connector_Punctuation} + public static boolean isWord(int ch) { + return isAlpha(ch) || + ((((1 << Character.NON_SPACING_MARK) | + (1 << Character.ENCLOSING_MARK) | + (1 << Character.COMBINING_SPACING_MARK) | + (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0 || + isDigit(ch); + } +} --- /dev/null 2011-01-11 11:52:10.886369177 -0800 +++ 
new/test/java/util/regex/POSIX_ASCII.java 2011-04-28 15:33:21.289582914 -0700 @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
/*
 */

/**
 * Table-driven POSIX character-class predicates for the 7-bit ASCII range.
 *
 * <p>Each entry of the lookup table is an {@code int} whose bits
 * {@code 0x100} and above are class flags ({@link #UPPER}, {@link #LOWER},
 * ...). For digits and letters the bits below {@code 0x100} additionally
 * carry a small number (0-9 for '0'-'9', 10-35 for 'A'-'Z' and 'a'-'z');
 * that is why those entries are built with {@code +} rather than {@code |}.
 * Code points outside 0x00..0x7F have type 0 and therefore belong to no
 * class.
 */
final class POSIX_ASCII {

    // ---- single-bit class flags ----

    static final int UPPER = 0x00000100;

    static final int LOWER = 0x00000200;

    static final int DIGIT = 0x00000400;

    static final int SPACE = 0x00000800;

    static final int PUNCT = 0x00001000;

    static final int CNTRL = 0x00002000;

    static final int BLANK = 0x00004000;

    static final int HEX   = 0x00008000;

    static final int UNDER = 0x00010000;

    // ---- composite masks ----

    static final int ASCII = 0x0000FF00;

    static final int ALPHA = (UPPER|LOWER);

    static final int ALNUM = (UPPER|LOWER|DIGIT);

    static final int GRAPH = (PUNCT|UPPER|LOWER|DIGIT);

    static final int WORD  = (UPPER|LOWER|UNDER|DIGIT);

    static final int XDIGIT = (HEX);

    /** Type word for each code point 0x00..0x7F, indexed by code point. */
    private static final int[] ctype = new int[] {
        CNTRL,                  /* 00 (NUL) */
        CNTRL,                  /* 01 (SOH) */
        CNTRL,                  /* 02 (STX) */
        CNTRL,                  /* 03 (ETX) */
        CNTRL,                  /* 04 (EOT) */
        CNTRL,                  /* 05 (ENQ) */
        CNTRL,                  /* 06 (ACK) */
        CNTRL,                  /* 07 (BEL) */
        CNTRL,                  /* 08 (BS)  */
        SPACE+CNTRL+BLANK,      /* 09 (HT)  */
        SPACE+CNTRL,            /* 0A (LF)  */
        SPACE+CNTRL,            /* 0B (VT)  */
        SPACE+CNTRL,            /* 0C (FF)  */
        SPACE+CNTRL,            /* 0D (CR)  */
        CNTRL,                  /* 0E (SO)  */
        CNTRL,                  /* 0F (SI)  */
        CNTRL,                  /* 10 (DLE) */
        CNTRL,                  /* 11 (DC1) */
        CNTRL,                  /* 12 (DC2) */
        CNTRL,                  /* 13 (DC3) */
        CNTRL,                  /* 14 (DC4) */
        CNTRL,                  /* 15 (NAK) */
        CNTRL,                  /* 16 (SYN) */
        CNTRL,                  /* 17 (ETB) */
        CNTRL,                  /* 18 (CAN) */
        CNTRL,                  /* 19 (EM)  */
        CNTRL,                  /* 1A (SUB) */
        CNTRL,                  /* 1B (ESC) */
        CNTRL,                  /* 1C (FS)  */
        CNTRL,                  /* 1D (GS)  */
        CNTRL,                  /* 1E (RS)  */
        CNTRL,                  /* 1F (US)  */
        SPACE+BLANK,            /* 20 SPACE */
        PUNCT,                  /* 21 !     */
        PUNCT,                  /* 22 "     */
        PUNCT,                  /* 23 #     */
        PUNCT,                  /* 24 $     */
        PUNCT,                  /* 25 %     */
        PUNCT,                  /* 26 &     */
        PUNCT,                  /* 27 '     */
        PUNCT,                  /* 28 (     */
        PUNCT,                  /* 29 )     */
        PUNCT,                  /* 2A *     */
        PUNCT,                  /* 2B +     */
        PUNCT,                  /* 2C ,     */
        PUNCT,                  /* 2D -     */
        PUNCT,                  /* 2E .     */
        PUNCT,                  /* 2F /     */
        DIGIT+HEX+0,            /* 30 0     */
        DIGIT+HEX+1,            /* 31 1     */
        DIGIT+HEX+2,            /* 32 2     */
        DIGIT+HEX+3,            /* 33 3     */
        DIGIT+HEX+4,            /* 34 4     */
        DIGIT+HEX+5,            /* 35 5     */
        DIGIT+HEX+6,            /* 36 6     */
        DIGIT+HEX+7,            /* 37 7     */
        DIGIT+HEX+8,            /* 38 8     */
        DIGIT+HEX+9,            /* 39 9     */
        PUNCT,                  /* 3A :     */
        PUNCT,                  /* 3B ;     */
        PUNCT,                  /* 3C <     */
        PUNCT,                  /* 3D =     */
        PUNCT,                  /* 3E >     */
        PUNCT,                  /* 3F ?     */
        PUNCT,                  /* 40 @     */
        UPPER+HEX+10,           /* 41 A     */
        UPPER+HEX+11,           /* 42 B     */
        UPPER+HEX+12,           /* 43 C     */
        UPPER+HEX+13,           /* 44 D     */
        UPPER+HEX+14,           /* 45 E     */
        UPPER+HEX+15,           /* 46 F     */
        UPPER+16,               /* 47 G     */
        UPPER+17,               /* 48 H     */
        UPPER+18,               /* 49 I     */
        UPPER+19,               /* 4A J     */
        UPPER+20,               /* 4B K     */
        UPPER+21,               /* 4C L     */
        UPPER+22,               /* 4D M     */
        UPPER+23,               /* 4E N     */
        UPPER+24,               /* 4F O     */
        UPPER+25,               /* 50 P     */
        UPPER+26,               /* 51 Q     */
        UPPER+27,               /* 52 R     */
        UPPER+28,               /* 53 S     */
        UPPER+29,               /* 54 T     */
        UPPER+30,               /* 55 U     */
        UPPER+31,               /* 56 V     */
        UPPER+32,               /* 57 W     */
        UPPER+33,               /* 58 X     */
        UPPER+34,               /* 59 Y     */
        UPPER+35,               /* 5A Z     */
        PUNCT,                  /* 5B [     */
        PUNCT,                  /* 5C \     */
        PUNCT,                  /* 5D ]     */
        PUNCT,                  /* 5E ^     */
        PUNCT|UNDER,            /* 5F _     */
        PUNCT,                  /* 60 `     */
        LOWER+HEX+10,           /* 61 a     */
        LOWER+HEX+11,           /* 62 b     */
        LOWER+HEX+12,           /* 63 c     */
        LOWER+HEX+13,           /* 64 d     */
        LOWER+HEX+14,           /* 65 e     */
        LOWER+HEX+15,           /* 66 f     */
        LOWER+16,               /* 67 g     */
        LOWER+17,               /* 68 h     */
        LOWER+18,               /* 69 i     */
        LOWER+19,               /* 6A j     */
        LOWER+20,               /* 6B k     */
        LOWER+21,               /* 6C l     */
        LOWER+22,               /* 6D m     */
        LOWER+23,               /* 6E n     */
        LOWER+24,               /* 6F o     */
        LOWER+25,               /* 70 p     */
        LOWER+26,               /* 71 q     */
        LOWER+27,               /* 72 r     */
        LOWER+28,               /* 73 s     */
        LOWER+29,               /* 74 t     */
        LOWER+30,               /* 75 u     */
        LOWER+31,               /* 76 v     */
        LOWER+32,               /* 77 w     */
        LOWER+33,               /* 78 x     */
        LOWER+34,               /* 79 y     */
        LOWER+35,               /* 7A z     */
        PUNCT,                  /* 7B {     */
        PUNCT,                  /* 7C |     */
        PUNCT,                  /* 7D }     */
        PUNCT,                  /* 7E ~     */
        CNTRL,                  /* 7F (DEL) */
    };

    /**
     * Returns the type word for {@code ch}, or 0 when {@code ch} is outside
     * 0x00..0x7F (any bit above the low seven set, which includes negatives).
     */
    static int getType(int ch) {
        return ((ch & 0xFFFFFF80) == 0 ? ctype[ch] : 0);
    }

    /** Tests whether the type word of {@code ch} shares any bit with {@code type}. */
    static boolean isType(int ch, int type) {
        return (getType(ch) & type) != 0;
    }

    /** Tests whether {@code ch} lies in 0x00..0x7F. */
    static boolean isAscii(int ch) {
        return ((ch & 0xFFFFFF80) == 0);
    }

    static boolean isAlpha(int ch) {
        return isType(ch, ALPHA);
    }

    /**
     * Tests whether {@code ch} is in '0'..'9'. The OR of the two differences
     * is negative (sign bit set) as soon as either bound is violated.
     */
    static boolean isDigit(int ch) {
        return ((ch-'0')|('9'-ch)) >= 0;
    }

    static boolean isAlnum(int ch) {
        return isType(ch, ALNUM);
    }

    static boolean isGraph(int ch) {
        return isType(ch, GRAPH);
    }

    /** Tests whether {@code ch} is in 0x20..0x7E (same sign-bit range test as {@link #isDigit}). */
    static boolean isPrint(int ch) {
        return ((ch-0x20)|(0x7E-ch)) >= 0;
    }

    static boolean isPunct(int ch) {
        return isType(ch, PUNCT);
    }

    static boolean isSpace(int ch) {
        return isType(ch, SPACE);
    }

    static boolean isHexDigit(int ch) {
        return isType(ch, HEX);
    }

    static boolean isCntrl(int ch) {
        return isType(ch, CNTRL);
    }

    /** Tests whether {@code ch} is in 'a'..'z'. */
    static boolean isLower(int ch) {
        return ((ch-'a')|('z'-ch)) >= 0;
    }

    /** Tests whether {@code ch} is in 'A'..'Z'. */
    static boolean isUpper(int ch) {
        return ((ch-'A')|('Z'-ch)) >= 0;
    }

    static boolean isWord(int ch) {
        return isType(ch, WORD);
    }
}