src/share/classes/java/util/regex/Pattern.java

Print this page

        

*** 204,220 **** * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr> * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td> * <td>Equivalent to java.lang.Character.isMirrored()</td></tr> * * <tr><th>&nbsp;</th></tr> ! * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr> * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td> ! * <td headers="matches">A Latin&nbsp;script character (simple <a href="#ubc">script</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td> ! * <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td> ! * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td> * <td headers="matches">A currency symbol</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td> * <td headers="matches">Any character except one in the Greek block (negation)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td> --- 204,222 ---- * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr> * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td> * <td>Equivalent to java.lang.Character.isMirrored()</td></tr> * * <tr><th>&nbsp;</th></tr> ! * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr> * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td> ! * <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td> ! 
* <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td> ! * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr> ! * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td> ! * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td> * <td headers="matches">A currency symbol</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td> * <td headers="matches">Any character except one in the Greek block (negation)</td></tr> * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td>
*** 326,339 **** * * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td> * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr> * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td> * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr> ! * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td> * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a> * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> ! * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr> * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td> * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > * <a href="#COMMENTS">x</a> on - off</td></tr> --- 328,342 ---- * * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td> * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr> * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td> * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr> ! * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td> * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a> * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> ! * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> ! 
* on - off</td></tr> * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td> * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > * <a href="#COMMENTS">x</a> on - off</td></tr>
*** 516,580 **** * * <h4> Unicode support </h4> * * <p> This class is in conformance with Level 1 of <a * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical ! * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1 * Canonical Equivalents. ! * ! * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code * are processed as described in section 3.3 of * <cite>The Java&trade; Language Specification</cite>. ! * Such escape sequences are also ! * implemented directly by the regular-expression parser so that Unicode ! * escapes can be used in expressions that are read from files or from the ! * keyboard. Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>, ! * while not equal, compile into the same pattern, which matches the character ! * with hexadecimal value <tt>0x2014</tt>. ! * ! * <p> A Unicode character can also be represented in a regular-expression by ! * using its hexadecimal code point value directly as described in construct * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive * Unicode escape sequences of the surrogate pair * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>. ! * ! * <a name="ubc"> ! * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and ! * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> * does not match if the input has that property. * <p> ! * Scripts are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. * <p> ! * Blocks are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. 
* <p> ! * Categories may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. * <p> ! * Scripts, blocks and categories can be used both inside and outside of a ! * character class. ! * <p> The supported categories are those of * <a href="http://www.unicode.org/unicode/standard/standard.html"> * <i>The Unicode Standard</i></a> in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. - * The script names supported by <code>Pattern</code> are the valid script names - * accepted and defined by - * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. - * The block names supported by <code>Pattern</code> are the valid block names - * accepted and defined by - * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. * <p> ! * <a name="jcc"> <p>Categories that behave like the java.lang.Character * boolean is<i>methodname</i> methods (except for the deprecated ones) are * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where * the specified property has the name <tt>java<i>methodname</i></tt>. * * <h4> Comparison to Perl 5 </h4> --- 519,662 ---- * * <h4> Unicode support </h4> * * <p> This class is in conformance with Level 1 of <a * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical ! * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1 * Canonical Equivalents. ! * <p> ! * <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code * are processed as described in section 3.3 of * <cite>The Java&trade; Language Specification</cite>. ! 
* Such escape sequences are also implemented directly by the regular-expression ! * parser so that Unicode escapes can be used in expressions that are read from ! * files or from the keyboard. Thus the strings <tt>"&#92;u2014"</tt> and ! * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which ! * matches the character with hexadecimal value <tt>0x2014</tt>. ! * <p> ! * A Unicode character can also be represented in a regular-expression by ! * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive * Unicode escape sequences of the surrogate pair * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>. ! * <p> ! * Unicode scripts, blocks, categories and binary properties are written with ! * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl. ! * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> * does not match if the input has that property. * <p> ! * Scripts, blocks, categories and binary properties can be used both inside ! * and outside of a character class. ! * <a name="usc"> ! * <p> ! * <b>Scripts</b> are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}. * <p> ! * The script names supported by <code>Pattern</code> are the valid script names ! * accepted and defined by ! * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. ! * <a name="ubc"> ! * <p> ! * <b>Blocks</b> are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. * <p> ! 
* The block names supported by <code>Pattern</code> are the valid block names ! * accepted and defined by ! * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. ! * <p> ! * <a name="ucc"> ! * <b>Categories</b> may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. * <p> ! * The supported categories are those of * <a href="http://www.unicode.org/unicode/standard/standard.html"> * <i>The Unicode Standard</i></a> in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. * <p> ! * <a name="ubpc"> ! * <b>Binary properties</b> are specified with the prefix {@code Is}, as in ! * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code> ! * are ! * <ul> ! * <li> Alphabetic ! * <li> Ideographic ! * <li> Letter ! * <li> Lowercase ! * <li> Uppercase ! * <li> Titlecase ! * <li> Punctuation ! * <li> Control ! * <li> White_Space ! * <li> Digit ! * <li> Hex_Digit ! * <li> Noncharacter_Code_Point ! * <li> Assigned ! * </ul> ! ! ! * <p> ! * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in ! * conformance with the recommendation of <i>Annex C: Compatibility Properties</i> ! * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression ! * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified. ! * <p> ! * <table border="0" cellpadding="1" cellspacing="0" ! * summary="predefined and posix character classes in Unicode mode"> ! * <tr align="left"> ! * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th> ! * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th> ! *</tr> ! 
* <tr><td><tt>\p{Lower}</tt></td> ! * <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr> ! * <tr><td><tt>\p{Upper}</tt></td> ! * <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr> ! * <tr><td><tt>\p{ASCII}</tt></td> ! * <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr> ! * <tr><td><tt>\p{Alpha}</tt></td> ! * <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr> ! * <tr><td><tt>\p{Digit}</tt></td> ! * <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr> ! * <tr><td><tt>\p{Alnum}</tt></td> ! * <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr> ! * <tr><td><tt>\p{Punct}</tt></td> ! * <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr> ! * <tr><td><tt>\p{Graph}</tt></td> ! * <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr> ! * <tr><td><tt>\p{Print}</tt></td> ! * <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr> ! * <tr><td><tt>\p{Blank}</tt></td> ! * <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr> ! * <tr><td><tt>\p{Cntrl}</tt></td> ! * <td>A control character: <tt>\p{gc=Cc}</tt></td></tr> ! * <tr><td><tt>\p{XDigit}</tt></td> ! * <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr> ! * <tr><td><tt>\p{Space}</tt></td> ! * <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr> ! * <tr><td><tt>\d</tt></td> ! * <td>A digit: <tt>\p{IsDigit}</tt></td></tr> ! * <tr><td><tt>\D</tt></td> ! * <td>A non-digit: <tt>[^\d]</tt></td></tr> ! * <tr><td><tt>\s</tt></td> ! * <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr> ! * <tr><td><tt>\S</tt></td> ! * <td>A non-whitespace character: <tt>[^\s]</tt></td></tr> ! * <tr><td><tt>\w</tt></td> ! * <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr> ! * <tr><td><tt>\W</tt></td> ! * <td>A non-word character: <tt>[^\w]</tt></td></tr> ! * </table> ! * <p> ! 
* <a name="jcc"> ! * Categories that behave like the java.lang.Character * boolean is<i>methodname</i> methods (except for the deprecated ones) are * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where * the specified property has the name <tt>java<i>methodname</i></tt>. * * <h4> Comparison to Perl 5 </h4>
*** 794,803 **** --- 876,907 ---- * * <p> Specifying this flag may impose a performance penalty. </p> */ public static final int CANON_EQ = 0x80; + /** + * Enables the Unicode version of <i>Predefined character classes</i> and + * <i>POSIX character classes</i>. + * + * <p> When this flag is specified then the (US-ASCII only) + * <i>Predefined character classes</i> and <i>POSIX character classes</i> + * are in conformance with + * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical + * Standard #18: Unicode Regular Expression</i></a> + * <i>Annex C: Compatibility Properties</i>. + * <p> + * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded + * flag expression&nbsp;<tt>(?U)</tt>. + * <p> + * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case + * folding. + * <p> + * Specifying this flag may impose a performance penalty. </p> + * @since 1.7 + */ + public static final int UNICODE_CHARACTER_CLASS = 0x100; + /* Pattern has only two serialized components: The pattern string * and the flags, which are all that is needed to recompile the pattern * when it is deserialized. */
*** 916,926 **** * * @param flags * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, ! * {@link #LITERAL} and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in <tt>flags</tt> * --- 1020,1031 ---- * * @param flags * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, ! * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} ! * and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in <tt>flags</tt> *
*** 1207,1216 **** --- 1312,1325 ---- */ private Pattern(String p, int f) { pattern = p; flags = f; + // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present + if ((flags & UNICODE_CHARACTER_CLASS) != 0) + flags |= UNICODE_CASE; + // Reset group index count capturingGroupCount = 1; localCount = 0; if (pattern.length() > 0) {
*** 2162,2177 **** if (inclass) break; if (create) root = new Begin(); return -1; case 'B': if (inclass) break; ! if (create) root = new Bound(Bound.NONE); return -1; case 'C': break; case 'D': ! if (create) root = new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': break; case 'G': --- 2271,2288 ---- if (inclass) break; if (create) root = new Begin(); return -1; case 'B': if (inclass) break; ! if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); return -1; case 'C': break; case 'D': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT).complement() ! : new Ctype(ASCII.DIGIT).complement(); return -1; case 'E': case 'F': break; case 'G':
*** 2189,2206 **** case 'P': case 'Q': case 'R': break; case 'S': ! if (create) root = new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': case 'V': break; case 'W': ! if (create) root = new Ctype(ASCII.WORD).complement(); return -1; case 'X': case 'Y': break; case 'Z': --- 2300,2321 ---- case 'P': case 'Q': case 'R': break; case 'S': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE).complement() ! : new Ctype(ASCII.SPACE).complement(); return -1; case 'T': case 'U': case 'V': break; case 'W': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD).complement() ! : new Ctype(ASCII.WORD).complement(); return -1; case 'X': case 'Y': break; case 'Z':
*** 2214,2229 **** return -1; case 'a': return '\007'; case 'b': if (inclass) break; ! if (create) root = new Bound(Bound.BOTH); return -1; case 'c': return c(); case 'd': ! if (create) root = new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; case 'f': return '\f'; --- 2329,2346 ---- return -1; case 'a': return '\007'; case 'b': if (inclass) break; ! if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); return -1; case 'c': return c(); case 'd': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.DIGIT) ! : new Ctype(ASCII.DIGIT); return -1; case 'e': return '\033'; case 'f': return '\f';
*** 2257,2276 **** case 'q': break; case 'r': return '\r'; case 's': ! if (create) root = new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; case 'u': return u(); case 'v': return '\013'; case 'w': ! if (create) root = new Ctype(ASCII.WORD); return -1; case 'x': return x(); case 'y': break; --- 2374,2397 ---- case 'q': break; case 'r': return '\r'; case 's': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WHITE_SPACE) ! : new Ctype(ASCII.SPACE); return -1; case 't': return '\t'; case 'u': return u(); case 'v': return '\013'; case 'w': ! if (create) root = has(UNICODE_CHARACTER_CLASS) ! ? new Utype(UnicodeProp.WORD) ! : new Ctype(ASCII.WORD); return -1; case 'x': return x(); case 'y': break;
*** 2488,2498 **** private CharProperty family(boolean singleLetter, boolean maybeComplement) { next(); String name; ! CharProperty node; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c); --- 2609,2619 ---- private CharProperty family(boolean singleLetter, boolean maybeComplement) { next(); String name; ! CharProperty node = null; if (singleLetter) { int c = temp[cursor]; if (!Character.isSupplementaryCodePoint(c)) { name = String.valueOf((char)c);
*** 2534,2547 **** --- 2655,2678 ---- // \p{inBlockName} node = unicodeBlockPropertyFor(name.substring(2)); } else if (name.startsWith("Is")) { // \p{isGeneralCategory} and \p{isScriptName} name = name.substring(2); + UnicodeProp uprop = UnicodeProp.forName(name); + if (uprop != null) + node = new Utype(uprop); + if (node == null) node = CharPropertyNames.charPropertyFor(name); if (node == null) node = unicodeScriptPropertyFor(name); } else { + if (has(UNICODE_CHARACTER_CLASS)) { + UnicodeProp uprop = UnicodeProp.forPOSIXName(name); + if (uprop != null) + node = new Utype(uprop); + } + if (node == null) node = charPropertyNodeFor(name); } } if (maybeComplement) { if (node instanceof Category || node instanceof Block)
*** 2820,2829 **** --- 2951,2963 ---- flags |= CANON_EQ; break; case 'x': flags |= COMMENTS; break; + case 'U': + flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; case '-': // subFlag then fall through ch = next(); subFlag(); default: return;
*** 2859,2868 **** --- 2993,3004 ---- flags &= ~CANON_EQ; break; case 'x': flags &= ~COMMENTS; break; + case 'U': + flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); + break; /* 'U' is a recognized flag: keep scanning instead of falling into default */ default: return; } ch = next(); }
*** 3662,3671 **** --- 3798,3819 ---- return (typeMask & (1 << Character.getType(ch))) != 0; } } /** + * Node class that matches a Unicode "type" + */ + static final class Utype extends CharProperty { + final UnicodeProp uprop; + Utype(UnicodeProp uprop) { this.uprop = uprop; } + boolean isSatisfiedBy(int ch) { + return uprop.is(ch); + } + } + + + /** * Node class that matches a POSIX type. */ static final class Ctype extends BmpCharProperty { final int ctype; Ctype(int ctype) { this.ctype = ctype; }
*** 5023,5035 **** static int LEFT = 0x1; static int RIGHT= 0x2; static int BOTH = 0x3; static int NONE = 0x4; int type; ! Bound(int n) { type = n; } int check(Matcher matcher, int i, CharSequence seq) { int ch; boolean left = false; int startIndex = matcher.from; int endIndex = matcher.to; --- 5171,5191 ---- static int LEFT = 0x1; static int RIGHT= 0x2; static int BOTH = 0x3; static int NONE = 0x4; int type; ! boolean useUWORD; ! Bound(int n, boolean useUWORD) { type = n; + this.useUWORD = useUWORD; + } + + boolean isWord(int ch) { + return useUWORD ? UnicodeProp.WORD.is(ch) + : (ch == '_' || Character.isLetterOrDigit(ch)); } + int check(Matcher matcher, int i, CharSequence seq) { int ch; boolean left = false; int startIndex = matcher.from; int endIndex = matcher.to;
*** 5037,5054 **** startIndex = 0; endIndex = matcher.getTextLength(); } if (i > startIndex) { ch = Character.codePointBefore(seq, i); ! left = (ch == '_' || Character.isLetterOrDigit(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i-1, seq))); } boolean right = false; if (i < endIndex) { ch = Character.codePointAt(seq, i); ! right = (ch == '_' || Character.isLetterOrDigit(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i, seq))); } else { // Tried to access char past the end matcher.hitEnd = true; --- 5193,5210 ---- startIndex = 0; endIndex = matcher.getTextLength(); } if (i > startIndex) { ch = Character.codePointBefore(seq, i); ! left = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i-1, seq))); } boolean right = false; if (i < endIndex) { ch = Character.codePointAt(seq, i); ! right = (isWord(ch) || ((Character.getType(ch) == Character.NON_SPACING_MARK) && hasBaseCharacter(matcher, i, seq))); } else { // Tried to access char past the end matcher.hitEnd = true;
*** 5426,5435 **** --- 5582,5597 ---- boolean isSatisfiedBy(int ch) { return Character.isLowerCase(ch);}}); defClone("javaUpperCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return Character.isUpperCase(ch);}}); + defClone("javaAlphabetic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isAlphabetic(ch);}}); + defClone("javaIdeographic", new CloneableProperty() { + boolean isSatisfiedBy(int ch) { + return Character.isIdeographic(ch);}}); defClone("javaTitleCase", new CloneableProperty() { boolean isSatisfiedBy(int ch) { return Character.isTitleCase(ch);}}); defClone("javaDigit", new CloneableProperty() { boolean isSatisfiedBy(int ch) {