jdk Cdiff src/share/classes/java/util/regex/Pattern.java

src/share/classes/java/util/regex/Pattern.java


*** 204,220 ****
   *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
   * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
   *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
   *
   * <tr><th>&nbsp;</th></tr>
!  * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
   * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
!  *     <td headers="matches">A Latin&nbsp;script character (simple <a href="#ubc">script</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
!  *     <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
!  *     <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
   *     <td headers="matches">A currency symbol</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
   *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td>
--- 204,222 ----
   *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
   * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
   *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
   *
   * <tr><th>&nbsp;</th></tr>
!  * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
   * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
!  *     <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
!  *     <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
!  *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
!  * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
!  *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
   *     <td headers="matches">A currency symbol</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
   *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
   * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]]&nbsp;</tt></td>
*** 326,339 ****
   *
   * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
!  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
   *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
   * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
!  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
   *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
   *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
   * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
   * <a href="#COMMENTS">x</a> on - off</td></tr>
--- 328,342 ----
   *
   * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
!  * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td>
   *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
   * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
!  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
!  * on - off</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
   *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
   *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
   * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
   * <a href="#COMMENTS">x</a> on - off</td></tr>
*** 516,580 ****
   *
   * <h4> Unicode support </h4>
   *
   * <p> This class is in conformance with Level 1 of <a
   * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
!  * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
   * Canonical Equivalents.
!  *
!  * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
   * are processed as described in section 3.3 of
   * <cite>The Java&trade; Language Specification</cite>.
!  * Such escape sequences are also
!  * implemented directly by the regular-expression parser so that Unicode
!  * escapes can be used in expressions that are read from files or from the
!  * keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
!  * while not equal, compile into the same pattern, which matches the character
!  * with hexadecimal value <tt>0x2014</tt>.
!  *
!  * <p> A Unicode character can also be represented in a regular-expression by
!  * using its hexadecimal code point value directly as described in construct
   * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
   * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
   * Unicode escape sequences of the surrogate pair
   * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
!  *
!  * <a name="ubc">
!  * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
!  * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
   * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
   * does not match if the input has that property.
   * <p>
!  * Scripts are specified either with the prefix {@code Is}, as in
   * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
   * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
   * <p>
!  * Blocks are specified with the prefix {@code In}, as in
   * {@code InMongolian}, or by using the keyword {@code block} (or its short
   * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
   * <p>
!  * Categories may be specified with the optional prefix {@code Is}:
   * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
   * letters. Same as scripts and blocks, categories can also be specified
   * by using the keyword {@code general_category} (or its short form
   * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
   * <p>
!  * Scripts, blocks and categories can be used both inside and outside of a
!  * character class.
!  * <p> The supported categories are those of
   * <a href="http://www.unicode.org/unicode/standard/standard.html">
   * <i>The Unicode Standard</i></a> in the version specified by the
   * {@link java.lang.Character Character} class. The category names are those
   * defined in the Standard, both normative and informative.
-  * The script names supported by <code>Pattern</code> are the valid script names
-  * accepted and defined by
-  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
-  * The block names supported by <code>Pattern</code> are the valid block names
-  * accepted and defined by
-  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
   * <p>
!  * <a name="jcc"> <p>Categories that behave like the java.lang.Character
   * boolean is<i>methodname</i> methods (except for the deprecated ones) are
   * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
   * the specified property has the name <tt>java<i>methodname</i></tt>.
   *
   * <h4> Comparison to Perl 5 </h4>
--- 519,662 ----
   *
   * <h4> Unicode support </h4>
   *
   * <p> This class is in conformance with Level 1 of <a
   * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
!  * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
   * Canonical Equivalents.
!  * <p>
!  * <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code
   * are processed as described in section 3.3 of
   * <cite>The Java&trade; Language Specification</cite>.
!  * Such escape sequences are also implemented directly by the regular-expression
!  * parser so that Unicode escapes can be used in expressions that are read from
!  * files or from the keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and
!  * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
!  * matches the character with hexadecimal value <tt>0x2014</tt>.
!  * <p>
!  * A Unicode character can also be represented in a regular-expression by
!  * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
   * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
   * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
   * Unicode escape sequences of the surrogate pair
   * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
!  * <p>
!  * Unicode scripts, blocks, categories and binary properties are written with
!  * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
!  * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
   * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
   * does not match if the input has that property.
   * <p>
!  * Scripts, blocks, categories and binary properties can be used both inside
!  * and outside of a character class.
!  * <a name="usc">
!  * <p>
!  * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
   * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
   * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
   * <p>
!  * The script names supported by <code>Pattern</code> are the valid script names
!  * accepted and defined by
!  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
!  * <a name="ubc">
!  * <p>
!  * <b>Blocks</b> are specified with the prefix {@code In}, as in
   * {@code InMongolian}, or by using the keyword {@code block} (or its short
   * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
   * <p>
!  * The block names supported by <code>Pattern</code> are the valid block names
!  * accepted and defined by
!  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
!  * <p>
!  * <a name="ucc">
!  * <b>Categories</b> may be specified with the optional prefix {@code Is}:
   * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
   * letters. Same as scripts and blocks, categories can also be specified
   * by using the keyword {@code general_category} (or its short form
   * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
   * <p>
!  * The supported categories are those of
   * <a href="http://www.unicode.org/unicode/standard/standard.html">
   * <i>The Unicode Standard</i></a> in the version specified by the
   * {@link java.lang.Character Character} class. The category names are those
   * defined in the Standard, both normative and informative.
   * <p>
!  * <a name="ubpc">
!  * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
!  * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
!  * are
!  * <ul>
!  *   <li> Alphabetic
!  *   <li> Ideographic
!  *   <li> Letter
!  *   <li> Lowercase
!  *   <li> Uppercase
!  *   <li> Titlecase
!  *   <li> Punctuation
!  *   <li> Control
!  *   <li> White_Space
!  *   <li> Digit
!  *   <li> Hex_Digit
!  *   <li> Noncharacter_Code_Point
!  *   <li> Assigned
!  * </ul>
! 
! 
!  * <p>
!  * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
!  * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
!  * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
!  * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
!  * <p>
!  * <table border="0" cellpadding="1" cellspacing="0"
!  *  summary="predefined and posix character classes in Unicode mode">
!  * <tr align="left">
!  * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
!  * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
!  *</tr>
!  * <tr><td><tt>\p{Lower}</tt></td>
!  *     <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
!  * <tr><td><tt>\p{Upper}</tt></td>
!  *     <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
!  * <tr><td><tt>\p{ASCII}</tt></td>
!  *     <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
!  * <tr><td><tt>\p{Alpha}</tt></td>
!  *     <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
!  * <tr><td><tt>\p{Digit}</tt></td>
!  *     <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
!  * <tr><td><tt>\p{Alnum}</tt></td>
!  *     <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
!  * <tr><td><tt>\p{Punct}</tt></td>
!  *     <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
!  * <tr><td><tt>\p{Graph}</tt></td>
!  *     <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
!  * <tr><td><tt>\p{Print}</tt></td>
!  *     <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
!  * <tr><td><tt>\p{Blank}</tt></td>
!  *     <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
!  * <tr><td><tt>\p{Cntrl}</tt></td>
!  *     <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
!  * <tr><td><tt>\p{XDigit}</tt></td>
!  *     <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
!  * <tr><td><tt>\p{Space}</tt></td>
!  *     <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
!  * <tr><td><tt>\d</tt></td>
!  *     <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
!  * <tr><td><tt>\D</tt></td>
!  *     <td>A non-digit: <tt>[^\d]</tt></td></tr>
!  * <tr><td><tt>\s</tt></td>
!  *     <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
!  * <tr><td><tt>\S</tt></td>
!  *     <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
!  * <tr><td><tt>\w</tt></td>
!  *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
!  * <tr><td><tt>\W</tt></td>
!  *     <td>A non-word character: <tt>[^\w]</tt></td></tr>
!  * </table>
!  * <p>
!  * <a name="jcc">
!  * Categories that behave like the java.lang.Character
   * boolean is<i>methodname</i> methods (except for the deprecated ones) are
   * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
   * the specified property has the name <tt>java<i>methodname</i></tt>.
   *
   * <h4> Comparison to Perl 5 </h4>
*** 794,803 ****
--- 876,907 ----
       *
       * <p> Specifying this flag may impose a performance penalty.  </p>
       */
      public static final int CANON_EQ = 0x80;
  
+     /**
+      * Enables Unicode version of <i>Predefined character classes</i> and
+      * <i>POSIX character classes</i>.
+      *
+      * <p> When this flag is specified then the (US-ASCII only)
+      * <i>Predefined character classes</i> and <i>POSIX character classes</i>
+      * are in conformance with
+      * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
+      * Standard #18: Unicode Regular Expressions</i></a>
+      * <i>Annex C: Compatibility Properties</i>.
+      * <p>
+      * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+      * flag expression&nbsp;<tt>(?U)</tt>.
+      * <p>
+      * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+      * folding.
+      * <p>
+      * Specifying this flag may impose a performance penalty.  </p>
+      * @since 1.7
+      */
+     public static final int UNICODE_CHARACTER_CLASS = 0x100;
+ 
      /* Pattern has only two serialized components: The pattern string
       * and the flags, which are all that is needed to recompile the pattern
       * when it is deserialized.
       */
  
*** 916,926 ****
       *
       * @param  flags
       *         Match flags, a bit mask that may include
       *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
       *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
!      *         {@link #LITERAL} and {@link #COMMENTS}
       *
       * @throws  IllegalArgumentException
       *          If bit values other than those corresponding to the defined
       *          match flags are set in <tt>flags</tt>
       *
--- 1020,1031 ----
       *
       * @param  flags
       *         Match flags, a bit mask that may include
       *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
       *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
!      *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
!      *         and {@link #COMMENTS}
       *
       * @throws  IllegalArgumentException
       *          If bit values other than those corresponding to the defined
       *          match flags are set in <tt>flags</tt>
       *
*** 1207,1216 ****
--- 1312,1325 ----
       */
      private Pattern(String p, int f) {
          pattern = p;
          flags = f;
  
+         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
+         if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+             flags |= UNICODE_CASE;
+ 
          // Reset group index count
          capturingGroupCount = 1;
          localCount = 0;
  
          if (pattern.length() > 0) {
*** 2162,2177 ****
              if (inclass) break;
              if (create) root = new Begin();
              return -1;
          case 'B':
              if (inclass) break;
!             if (create) root = new Bound(Bound.NONE);
              return -1;
          case 'C':
              break;
          case 'D':
!             if (create) root = new Ctype(ASCII.DIGIT).complement();
              return -1;
          case 'E':
          case 'F':
              break;
          case 'G':
--- 2271,2288 ----
              if (inclass) break;
              if (create) root = new Begin();
              return -1;
          case 'B':
              if (inclass) break;
!             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
              return -1;
          case 'C':
              break;
          case 'D':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.DIGIT).complement()
!                                : new Ctype(ASCII.DIGIT).complement();
              return -1;
          case 'E':
          case 'F':
              break;
          case 'G':
*** 2189,2206 ****
          case 'P':
          case 'Q':
          case 'R':
              break;
          case 'S':
!             if (create) root = new Ctype(ASCII.SPACE).complement();
              return -1;
          case 'T':
          case 'U':
          case 'V':
              break;
          case 'W':
!             if (create) root = new Ctype(ASCII.WORD).complement();
              return -1;
          case 'X':
          case 'Y':
              break;
          case 'Z':
--- 2300,2321 ----
          case 'P':
          case 'Q':
          case 'R':
              break;
          case 'S':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
!                                : new Ctype(ASCII.SPACE).complement();
              return -1;
          case 'T':
          case 'U':
          case 'V':
              break;
          case 'W':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.WORD).complement()
!                                : new Ctype(ASCII.WORD).complement();
              return -1;
          case 'X':
          case 'Y':
              break;
          case 'Z':
*** 2214,2229 ****
              return -1;
          case 'a':
              return '\007';
          case 'b':
              if (inclass) break;
!             if (create) root = new Bound(Bound.BOTH);
              return -1;
          case 'c':
              return c();
          case 'd':
!             if (create) root = new Ctype(ASCII.DIGIT);
              return -1;
          case 'e':
              return '\033';
          case 'f':
              return '\f';
--- 2329,2346 ----
              return -1;
          case 'a':
              return '\007';
          case 'b':
              if (inclass) break;
!             if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
              return -1;
          case 'c':
              return c();
          case 'd':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.DIGIT)
!                                : new Ctype(ASCII.DIGIT);
              return -1;
          case 'e':
              return '\033';
          case 'f':
              return '\f';
*** 2257,2276 ****
          case 'q':
              break;
          case 'r':
              return '\r';
          case 's':
!             if (create) root = new Ctype(ASCII.SPACE);
              return -1;
          case 't':
              return '\t';
          case 'u':
              return u();
          case 'v':
              return '\013';
          case 'w':
!             if (create) root = new Ctype(ASCII.WORD);
              return -1;
          case 'x':
              return x();
          case 'y':
              break;
--- 2374,2397 ----
          case 'q':
              break;
          case 'r':
              return '\r';
          case 's':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.WHITE_SPACE)
!                                : new Ctype(ASCII.SPACE);
              return -1;
          case 't':
              return '\t';
          case 'u':
              return u();
          case 'v':
              return '\013';
          case 'w':
!             if (create) root = has(UNICODE_CHARACTER_CLASS)
!                                ? new Utype(UnicodeProp.WORD)
!                                : new Ctype(ASCII.WORD);
              return -1;
          case 'x':
              return x();
          case 'y':
              break;
*** 2488,2498 ****
      private CharProperty family(boolean singleLetter,
                                  boolean maybeComplement)
      {
          next();
          String name;
!         CharProperty node;
  
          if (singleLetter) {
              int c = temp[cursor];
              if (!Character.isSupplementaryCodePoint(c)) {
                  name = String.valueOf((char)c);
--- 2609,2619 ----
      private CharProperty family(boolean singleLetter,
                                  boolean maybeComplement)
      {
          next();
          String name;
!         CharProperty node = null;
  
          if (singleLetter) {
              int c = temp[cursor];
              if (!Character.isSupplementaryCodePoint(c)) {
                  name = String.valueOf((char)c);
*** 2534,2547 ****
--- 2655,2678 ----
                  // \p{inBlockName}
                  node = unicodeBlockPropertyFor(name.substring(2));
              } else if (name.startsWith("Is")) {
                  // \p{isGeneralCategory} and \p{isScriptName}
                  name = name.substring(2);
+                 UnicodeProp uprop = UnicodeProp.forName(name);
+                 if (uprop != null)
+                     node = new Utype(uprop);
+                 if (node == null)
                      node = CharPropertyNames.charPropertyFor(name);
                  if (node == null)
                      node = unicodeScriptPropertyFor(name);
              } else {
+                 if (has(UNICODE_CHARACTER_CLASS)) {
+                     UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+                     if (uprop != null)
+                         node = new Utype(uprop);
+                 }
+                 if (node == null)
                      node = charPropertyNodeFor(name);
              }
          }
          if (maybeComplement) {
              if (node instanceof Category || node instanceof Block)
*** 2820,2829 ****
--- 2951,2963 ----
                  flags |= CANON_EQ;
                  break;
              case 'x':
                  flags |= COMMENTS;
                  break;
+             case 'U':
+                 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+                 break;
              case '-': // subFlag then fall through
                  ch = next();
                  subFlag();
              default:
                  return;
*** 2859,2868 ****
--- 2993,3005 ----
                  flags &= ~CANON_EQ;
                  break;
              case 'x':
                  flags &= ~COMMENTS;
                  break;
+             case 'U':
+                 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+                 break; // consume 'U' like the other flags; falling into default would return without advancing
              default:
                  return;
              }
              ch = next();
          }
*** 3662,3671 ****
--- 3798,3819 ----
              return (typeMask & (1 << Character.getType(ch))) != 0;
          }
      }
  
      /**
+      * Node class that matches a Unicode "type"
+      */
+     static final class Utype extends CharProperty {
+         final UnicodeProp uprop;
+         Utype(UnicodeProp uprop) { this.uprop = uprop; }
+         boolean isSatisfiedBy(int ch) {
+             return uprop.is(ch);
+         }
+     }
+ 
+ 
+     /**
       * Node class that matches a POSIX type.
       */
      static final class Ctype extends BmpCharProperty {
          final int ctype;
          Ctype(int ctype) { this.ctype = ctype; }
*** 5023,5035 ****
          static int LEFT = 0x1;
          static int RIGHT= 0x2;
          static int BOTH = 0x3;
          static int NONE = 0x4;
          int type;
!         Bound(int n) {
              type = n;
          }
          int check(Matcher matcher, int i, CharSequence seq) {
              int ch;
              boolean left = false;
              int startIndex = matcher.from;
              int endIndex = matcher.to;
--- 5171,5191 ----
          static int LEFT = 0x1;
          static int RIGHT= 0x2;
          static int BOTH = 0x3;
          static int NONE = 0x4;
          int type;
!         boolean useUWORD;
!         Bound(int n, boolean useUWORD) {
              type = n;
+             this.useUWORD = useUWORD;
+         }
+ 
+         boolean isWord(int ch) {
+             return useUWORD ? UnicodeProp.WORD.is(ch)
+                             : (ch == '_' || Character.isLetterOrDigit(ch));
          }
+ 
          int check(Matcher matcher, int i, CharSequence seq) {
              int ch;
              boolean left = false;
              int startIndex = matcher.from;
              int endIndex = matcher.to;
*** 5037,5054 ****
                  startIndex = 0;
                  endIndex = matcher.getTextLength();
              }
              if (i > startIndex) {
                  ch = Character.codePointBefore(seq, i);
!                 left = (ch == '_' || Character.isLetterOrDigit(ch) ||
                      ((Character.getType(ch) == Character.NON_SPACING_MARK)
                       && hasBaseCharacter(matcher, i-1, seq)));
              }
              boolean right = false;
              if (i < endIndex) {
                  ch = Character.codePointAt(seq, i);
!                 right = (ch == '_' || Character.isLetterOrDigit(ch) ||
                      ((Character.getType(ch) == Character.NON_SPACING_MARK)
                       && hasBaseCharacter(matcher, i, seq)));
              } else {
                  // Tried to access char past the end
                  matcher.hitEnd = true;
--- 5193,5210 ----
                  startIndex = 0;
                  endIndex = matcher.getTextLength();
              }
              if (i > startIndex) {
                  ch = Character.codePointBefore(seq, i);
!                 left = (isWord(ch) ||
                      ((Character.getType(ch) == Character.NON_SPACING_MARK)
                       && hasBaseCharacter(matcher, i-1, seq)));
              }
              boolean right = false;
              if (i < endIndex) {
                  ch = Character.codePointAt(seq, i);
!                 right = (isWord(ch) ||
                      ((Character.getType(ch) == Character.NON_SPACING_MARK)
                       && hasBaseCharacter(matcher, i, seq)));
              } else {
                  // Tried to access char past the end
                  matcher.hitEnd = true;
*** 5426,5435 ****
--- 5582,5597 ----
                  boolean isSatisfiedBy(int ch) {
                      return Character.isLowerCase(ch);}});
              defClone("javaUpperCase", new CloneableProperty() {
                  boolean isSatisfiedBy(int ch) {
                      return Character.isUpperCase(ch);}});
+             defClone("javaAlphabetic", new CloneableProperty() {
+                 boolean isSatisfiedBy(int ch) {
+                     return Character.isAlphabetic(ch);}});
+             defClone("javaIdeographic", new CloneableProperty() {
+                 boolean isSatisfiedBy(int ch) {
+                     return Character.isIdeographic(ch);}});
              defClone("javaTitleCase", new CloneableProperty() {
                  boolean isSatisfiedBy(int ch) {
                      return Character.isTitleCase(ch);}});
              defClone("javaDigit", new CloneableProperty() {
                  boolean isSatisfiedBy(int ch) {