src/share/classes/java/util/regex/Pattern.java
Print this page
*** 204,220 ****
* <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
* <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
*
* <tr><th> </th></tr>
! * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
* * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
! * <td headers="matches">A Latin script character (simple <a href="#ubc">script</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
! * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
! * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
* <td headers="matches">A currency symbol</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
* <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
--- 204,222 ----
* <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
* <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
*
* <tr><th> </th></tr>
! * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
* * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
! * <td headers="matches">A Latin script character (<a href="#usc">script</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
! * <td headers="matches">A character in the Greek block (<a href="#ubc">block</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
! * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
! * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
! * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
* <td headers="matches">A currency symbol</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
* <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
*** 326,339 ****
*
* <tr><td valign="top" headers="construct special"><tt>(?<<a href="#groupname">name</a>></tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
! * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
! * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
* <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
* given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
* <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
* <a href="#COMMENTS">x</a> on - off</td></tr>
--- 328,342 ----
*
* <tr><td valign="top" headers="construct special"><tt>(?<<a href="#groupname">name</a>></tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
! * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU) </tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
! * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
! * on - off</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
* <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
* given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
* <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
* <a href="#COMMENTS">x</a> on - off</td></tr>
*** 516,580 ****
*
* <h4> Unicode support </h4>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
! * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
* Canonical Equivalents.
! *
! * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
* are processed as described in section 3.3 of
* <cite>The Java™ Language Specification</cite>.
! * Such escape sequences are also
! * implemented directly by the regular-expression parser so that Unicode
! * escapes can be used in expressions that are read from files or from the
! * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
! * while not equal, compile into the same pattern, which matches the character
! * with hexadecimal value <tt>0x2014</tt>.
! *
! * <p> A Unicode character can also be represented in a regular-expression by
! * using its hexadecimal code point value directly as described in construct
* <tt>\x{...}</tt>, for example a supplementary character U+2011F
* can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
* Unicode escape sequences of the surrogate pair
* <tt>\uD840</tt><tt>\uDD1F</tt>.
! *
! * <a name="ubc">
! * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
! * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
* does not match if the input has that property.
* <p>
! * Scripts are specified either with the prefix {@code Is}, as in
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
* form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
* <p>
! * Blocks are specified with the prefix {@code In}, as in
* {@code InMongolian}, or by using the keyword {@code block} (or its short
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
* <p>
! * Categories may be specified with the optional prefix {@code Is}:
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
* letters. Same as scripts and blocks, categories can also be specified
* by using the keyword {@code general_category} (or its short form
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
* <p>
! * Scripts, blocks and categories can be used both inside and outside of a
! * character class.
! * <p> The supported categories are those of
* <a href="http://www.unicode.org/unicode/standard/standard.html">
* <i>The Unicode Standard</i></a> in the version specified by the
* {@link java.lang.Character Character} class. The category names are those
* defined in the Standard, both normative and informative.
- * The script names supported by <code>Pattern</code> are the valid script names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
- * The block names supported by <code>Pattern</code> are the valid block names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
* <p>
! * <a name="jcc"> <p>Categories that behave like the java.lang.Character
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
* the specified property has the name <tt>java<i>methodname</i></tt>.
*
* <h4> Comparison to Perl 5 </h4>
--- 519,662 ----
*
* <h4> Unicode support </h4>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
! * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
* Canonical Equivalents.
! * <p>
! * <b>Unicode escape sequences</b> such as <tt>\u2014</tt> in Java source code
* are processed as described in section 3.3 of
* <cite>The Java™ Language Specification</cite>.
! * Such escape sequences are also implemented directly by the regular-expression
! * parser so that Unicode escapes can be used in expressions that are read from
! * files or from the keyboard. Thus the strings <tt>"\u2014"</tt> and
! * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
! * matches the character with hexadecimal value <tt>0x2014</tt>.
! * <p>
! * A Unicode character can also be represented in a regular-expression by
! * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
* <tt>\x{...}</tt>, for example a supplementary character U+2011F
* can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
* Unicode escape sequences of the surrogate pair
* <tt>\uD840</tt><tt>\uDD1F</tt>.
! * <p>
! * Unicode scripts, blocks, categories and binary properties are written with
! * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
! * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
* does not match if the input has that property.
* <p>
! * Scripts, blocks, categories and binary properties can be used both inside
! * and outside of a character class.
! * <a name="usc">
! * <p>
! * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
* form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
* <p>
! * The script names supported by <code>Pattern</code> are the valid script names
! * accepted and defined by
! * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
! * <a name="ubc">
! * <p>
! * <b>Blocks</b> are specified with the prefix {@code In}, as in
* {@code InMongolian}, or by using the keyword {@code block} (or its short
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
* <p>
! * The block names supported by <code>Pattern</code> are the valid block names
! * accepted and defined by
! * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
! * <p>
! * <a name="ucc">
! * <b>Categories</b> may be specified with the optional prefix {@code Is}:
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
* letters. Same as scripts and blocks, categories can also be specified
* by using the keyword {@code general_category} (or its short form
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
* <p>
! * The supported categories are those of
* <a href="http://www.unicode.org/unicode/standard/standard.html">
* <i>The Unicode Standard</i></a> in the version specified by the
* {@link java.lang.Character Character} class. The category names are those
* defined in the Standard, both normative and informative.
* <p>
! * <a name="ubpc">
! * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
! * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
! * are
! * <ul>
! * <li> Alphabetic
! * <li> Ideographic
! * <li> Letter
! * <li> Lowercase
! * <li> Uppercase
! * <li> Titlecase
! * <li> Punctuation
! * <li> Control
! * <li> White_Space
! * <li> Digit
! * <li> Hex_Digit
! * <li> Noncharacter_Code_Point
! * <li> Assigned
! * </ul>
!
!
! * <p>
! * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
! * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
! * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
! * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
! * <p>
! * <table border="0" cellpadding="1" cellspacing="0"
! * summary="predefined and posix character classes in Unicode mode">
! * <tr align="left">
! * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
! * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
! *</tr>
! * <tr><td><tt>\p{Lower}</tt></td>
! * <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
! * <tr><td><tt>\p{Upper}</tt></td>
! * <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
! * <tr><td><tt>\p{ASCII}</tt></td>
! * <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
! * <tr><td><tt>\p{Alpha}</tt></td>
! * <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
! * <tr><td><tt>\p{Digit}</tt></td>
! * <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
! * <tr><td><tt>\p{Alnum}</tt></td>
! * <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
! * <tr><td><tt>\p{Punct}</tt></td>
! * <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
! * <tr><td><tt>\p{Graph}</tt></td>
! * <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
! * <tr><td><tt>\p{Print}</tt></td>
! * <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
! * <tr><td><tt>\p{Blank}</tt></td>
! * <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
! * <tr><td><tt>\p{Cntrl}</tt></td>
! * <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
! * <tr><td><tt>\p{XDigit}</tt></td>
! * <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
! * <tr><td><tt>\p{Space}</tt></td>
! * <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
! * <tr><td><tt>\d</tt></td>
! * <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
! * <tr><td><tt>\D</tt></td>
! * <td>A non-digit: <tt>[^\d]</tt></td></tr>
! * <tr><td><tt>\s</tt></td>
! * <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
! * <tr><td><tt>\S</tt></td>
! * <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
! * <tr><td><tt>\w</tt></td>
! * <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
! * <tr><td><tt>\W</tt></td>
! * <td>A non-word character: <tt>[^\w]</tt></td></tr>
! * </table>
! * <p>
! * <a name="jcc">
! * Categories that behave like the java.lang.Character
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
* the specified property has the name <tt>java<i>methodname</i></tt>.
*
* <h4> Comparison to Perl 5 </h4>
*** 794,803 ****
--- 876,907 ----
*
* <p> Specifying this flag may impose a performance penalty. </p>
*/
public static final int CANON_EQ = 0x80;
+ /**
+ * Enables the Unicode version of <i>Predefined character classes</i> and
+ * <i>POSIX character classes</i>.
+ *
+ * <p> When this flag is specified then the (US-ASCII only)
+ * <i>Predefined character classes</i> and <i>POSIX character classes</i>
+ * are in conformance with
+ * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
+ * Standard #18: Unicode Regular Expressions</i></a>
+ * <i>Annex C: Compatibility Properties</i>.
+ * <p>
+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+ * flag expression <tt>(?U)</tt>.
+ * <p>
+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+ * folding.
+ * <p>
+ * Specifying this flag may impose a performance penalty. </p>
+ * @since 1.7
+ */
+ public static final int UNICODE_CHARACTER_CLASS = 0x100;
+
/* Pattern has only two serialized components: The pattern string
* and the flags, which are all that is needed to recompile the pattern
* when it is deserialized.
*/
*** 916,926 ****
*
* @param flags
* Match flags, a bit mask that may include
* {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
* {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
! * {@link #LITERAL} and {@link #COMMENTS}
*
* @throws IllegalArgumentException
* If bit values other than those corresponding to the defined
* match flags are set in <tt>flags</tt>
*
--- 1020,1031 ----
*
* @param flags
* Match flags, a bit mask that may include
* {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
* {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
! * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
! * and {@link #COMMENTS}
*
* @throws IllegalArgumentException
* If bit values other than those corresponding to the defined
* match flags are set in <tt>flags</tt>
*
*** 1207,1216 ****
--- 1312,1325 ----
*/
private Pattern(String p, int f) {
pattern = p;
flags = f;
+ // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
+ if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+ flags |= UNICODE_CASE;
+
// Reset group index count
capturingGroupCount = 1;
localCount = 0;
if (pattern.length() > 0) {
*** 2162,2177 ****
if (inclass) break;
if (create) root = new Begin();
return -1;
case 'B':
if (inclass) break;
! if (create) root = new Bound(Bound.NONE);
return -1;
case 'C':
break;
case 'D':
! if (create) root = new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
break;
case 'G':
--- 2271,2288 ----
if (inclass) break;
if (create) root = new Begin();
return -1;
case 'B':
if (inclass) break;
! if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.DIGIT).complement()
! : new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
break;
case 'G':
*** 2189,2206 ****
case 'P':
case 'Q':
case 'R':
break;
case 'S':
! if (create) root = new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
case 'V':
break;
case 'W':
! if (create) root = new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
case 'Y':
break;
case 'Z':
--- 2300,2321 ----
case 'P':
case 'Q':
case 'R':
break;
case 'S':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WHITE_SPACE).complement()
! : new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
case 'V':
break;
case 'W':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WORD).complement()
! : new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
case 'Y':
break;
case 'Z':
*** 2214,2229 ****
return -1;
case 'a':
return '\007';
case 'b':
if (inclass) break;
! if (create) root = new Bound(Bound.BOTH);
return -1;
case 'c':
return c();
case 'd':
! if (create) root = new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
--- 2329,2346 ----
return -1;
case 'a':
return '\007';
case 'b':
if (inclass) break;
! if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'c':
return c();
case 'd':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.DIGIT)
! : new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
*** 2257,2276 ****
case 'q':
break;
case 'r':
return '\r';
case 's':
! if (create) root = new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
case 'u':
return u();
case 'v':
return '\013';
case 'w':
! if (create) root = new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
case 'y':
break;
--- 2374,2397 ----
case 'q':
break;
case 'r':
return '\r';
case 's':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WHITE_SPACE)
! : new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
case 'u':
return u();
case 'v':
return '\013';
case 'w':
! if (create) root = has(UNICODE_CHARACTER_CLASS)
! ? new Utype(UnicodeProp.WORD)
! : new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
case 'y':
break;
*** 2488,2498 ****
private CharProperty family(boolean singleLetter,
boolean maybeComplement)
{
next();
String name;
! CharProperty node;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
--- 2609,2619 ----
private CharProperty family(boolean singleLetter,
boolean maybeComplement)
{
next();
String name;
! CharProperty node = null;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
*** 2534,2547 ****
--- 2655,2678 ----
// \p{inBlockName}
node = unicodeBlockPropertyFor(name.substring(2));
} else if (name.startsWith("Is")) {
// \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2);
+ UnicodeProp uprop = UnicodeProp.forName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ if (node == null)
node = CharPropertyNames.charPropertyFor(name);
if (node == null)
node = unicodeScriptPropertyFor(name);
} else {
+ if (has(UNICODE_CHARACTER_CLASS)) {
+ UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ }
+ if (node == null)
node = charPropertyNodeFor(name);
}
}
if (maybeComplement) {
if (node instanceof Category || node instanceof Block)
*** 2820,2829 ****
--- 2951,2963 ----
flags |= CANON_EQ;
break;
case 'x':
flags |= COMMENTS;
break;
+ case 'U':
+ flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ break;
case '-': // subFlag then fall through
ch = next();
subFlag();
default:
return;
*** 2859,2868 ****
--- 2993,3004 ----
flags &= ~CANON_EQ;
break;
case 'x':
flags &= ~COMMENTS;
break;
+ case 'U':
+ flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
default:
return;
}
ch = next();
}
*** 3662,3671 ****
--- 3798,3819 ----
return (typeMask & (1 << Character.getType(ch))) != 0;
}
}
/**
+ * Node class that matches a Unicode "type"
+ *
+ * <p> A code point satisfies this node exactly when {@code uprop.is(ch)}
+ * returns true for the {@code UnicodeProp} supplied to the constructor.
+ */
+ static final class Utype extends CharProperty {
+ final UnicodeProp uprop;
+ Utype(UnicodeProp uprop) { this.uprop = uprop; }
+ boolean isSatisfiedBy(int ch) {
+ return uprop.is(ch);
+ }
+ }
+
+
+ /**
* Node class that matches a POSIX type.
*/
static final class Ctype extends BmpCharProperty {
final int ctype;
Ctype(int ctype) { this.ctype = ctype; }
*** 5023,5035 ****
static int LEFT = 0x1;
static int RIGHT= 0x2;
static int BOTH = 0x3;
static int NONE = 0x4;
int type;
! Bound(int n) {
type = n;
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
boolean left = false;
int startIndex = matcher.from;
int endIndex = matcher.to;
--- 5171,5191 ----
static int LEFT = 0x1;
static int RIGHT= 0x2;
static int BOTH = 0x3;
static int NONE = 0x4;
int type;
! boolean useUWORD;
! Bound(int n, boolean useUWORD) { // n is stored as the boundary type (callers pass Bound.BOTH / Bound.NONE)
type = n;
+ this.useUWORD = useUWORD; // selects which word test isWord() applies
+ }
+
+ boolean isWord(int ch) { // word-character test used by check() on both sides of the position
+ return useUWORD ? UnicodeProp.WORD.is(ch) // useUWORD set: defer to UnicodeProp.WORD
+ : (ch == '_' || Character.isLetterOrDigit(ch)); // otherwise: underscore, letter or digit
+ }
+
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
boolean left = false;
int startIndex = matcher.from;
int endIndex = matcher.to;
*** 5037,5054 ****
startIndex = 0;
endIndex = matcher.getTextLength();
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
! left = (ch == '_' || Character.isLetterOrDigit(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
! right = (ch == '_' || Character.isLetterOrDigit(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
} else {
// Tried to access char past the end
matcher.hitEnd = true;
--- 5193,5210 ----
startIndex = 0;
endIndex = matcher.getTextLength();
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
! left = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
! right = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
} else {
// Tried to access char past the end
matcher.hitEnd = true;
*** 5426,5435 ****
--- 5582,5597 ----
boolean isSatisfiedBy(int ch) {
return Character.isLowerCase(ch);}});
defClone("javaUpperCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isUpperCase(ch);}});
+ defClone("javaAlphabetic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isAlphabetic(ch);}});
+ defClone("javaIdeographic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isIdeographic(ch);}});
defClone("javaTitleCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isTitleCase(ch);}});
defClone("javaDigit", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {