src/java.base/share/classes/java/util/regex/Pattern.java

Print this page




  92  *     <td headers="matches">The backslash character</td></tr>
  93  * <tr><td valign="top" headers="construct characters">{@code \0}<i>n</i></td>
  94  *     <td headers="matches">The character with octal value {@code 0}<i>n</i>
  95  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
  96  * <tr><td valign="top" headers="construct characters">{@code \0}<i>nn</i></td>
  97  *     <td headers="matches">The character with octal value {@code 0}<i>nn</i>
  98  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
  99  * <tr><td valign="top" headers="construct characters">{@code \0}<i>mnn</i></td>
 100  *     <td headers="matches">The character with octal value {@code 0}<i>mnn</i>
 101  *         (0&nbsp;{@code <=}&nbsp;<i>m</i>&nbsp;{@code <=}&nbsp;3,
 102  *         0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
 103  * <tr><td valign="top" headers="construct characters">{@code \x}<i>hh</i></td>
 104  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hh</i></td></tr>
 105  * <tr><td valign="top" headers="construct characters"><code>&#92;u</code><i>hhhh</i></td>
 106  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hhhh</i></td></tr>
 107  * <tr><td valign="top" headers="construct characters"><code>&#92;x</code><i>{h...h}</i></td>
 108  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>h...h</i>
 109  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
 110  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
 111  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>


 112  * <tr><td valign="top" headers="construct characters">{@code \t}</td>
 113  *     <td headers="matches">The tab character (<code>'&#92;u0009'</code>)</td></tr>
 114  * <tr><td valign="top" headers="construct characters">{@code \n}</td>
 115  *     <td headers="matches">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>
 116  * <tr><td valign="top" headers="construct characters">{@code \r}</td>
 117  *     <td headers="matches">The carriage-return character (<code>'&#92;u000D'</code>)</td></tr>
 118  * <tr><td valign="top" headers="construct characters">{@code \f}</td>
 119  *     <td headers="matches">The form-feed character (<code>'&#92;u000C'</code>)</td></tr>
 120  * <tr><td valign="top" headers="construct characters">{@code \a}</td>
 121  *     <td headers="matches">The alert (bell) character (<code>'&#92;u0007'</code>)</td></tr>
 122  * <tr><td valign="top" headers="construct characters">{@code \e}</td>
 123  *     <td headers="matches">The escape character (<code>'&#92;u001B'</code>)</td></tr>
 124  * <tr><td valign="top" headers="construct characters">{@code \c}<i>x</i></td>
 125  *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>
 126  *
 127  * <tr><th>&nbsp;</th></tr>
 128  * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
 129  *
 130  * <tr><td valign="top" headers="construct classes">{@code [abc]}</td>
 131  *     <td headers="matches">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>


 226  * <tr><td valign="top" headers="construct unicode">{@code \p{Lu}}</td>
 227  *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
 228  * <tr><td valign="top" headers="construct unicode">{@code \p{IsAlphabetic}}</td>
 229  *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
 230  * <tr><td valign="top" headers="construct unicode">{@code \p{Sc}}</td>
 231  *     <td headers="matches">A currency symbol</td></tr>
 232  * <tr><td valign="top" headers="construct unicode">{@code \P{InGreek}}</td>
 233  *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
 234  * <tr><td valign="top" headers="construct unicode">{@code [\p{L}&&[^\p{Lu}]]}</td>
 235  *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
 236  *
 237  * <tr><th>&nbsp;</th></tr>
 238  * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
 239  *
 240  * <tr><td valign="top" headers="construct bounds">{@code ^}</td>
 241  *     <td headers="matches">The beginning of a line</td></tr>
 242  * <tr><td valign="top" headers="construct bounds">{@code $}</td>
 243  *     <td headers="matches">The end of a line</td></tr>
 244  * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
 245  *     <td headers="matches">A word boundary</td></tr>


 246  * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
 247  *     <td headers="matches">A non-word boundary</td></tr>
 248  * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
 249  *     <td headers="matches">The beginning of the input</td></tr>
 250  * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
 251  *     <td headers="matches">The end of the previous match</td></tr>
 252  * <tr><td valign="top" headers="construct bounds">{@code \Z}</td>
 253  *     <td headers="matches">The end of the input but for the final
 254  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>
 255  * <tr><td valign="top" headers="construct bounds">{@code \z}</td>
 256  *     <td headers="matches">The end of the input</td></tr>
 257  *
 258  * <tr><th>&nbsp;</th></tr>
 259  * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
 260  * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
 261  *     <td headers="matches">Any Unicode linebreak sequence; this is equivalent to
 262  *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
 263  *     </code></td></tr>
 264  *
 265  * <tr><th>&nbsp;</th></tr>





 266  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
 267  *
 268  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
 269  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
 270  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
 271  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
 272  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code +}</td>
 273  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
 274  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i><code>}</code></td>
 275  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
 276  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<code>}</code></td>
 277  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
 278  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></td>
 279  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
 280  *
 281  * <tr><th>&nbsp;</th></tr>
 282  * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
 283  *
 284  * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code ??}</td>
 285  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>


 529  * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups
 530  * that do not capture text and do not count towards the group total, or
 531  * <i>named-capturing</i> groups.
 532  *
 533  * <h3> Unicode support </h3>
 534  *
 535  * <p> This class is in conformance with Level 1 of <a
 536  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
 537  * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
 538  * Canonical Equivalents.
 539  * <p>
 540  * <b>Unicode escape sequences</b> such as <code>&#92;u2014</code> in Java source code
 541  * are processed as described in section 3.3 of
 542  * <cite>The Java&trade; Language Specification</cite>.
 543  * Such escape sequences are also implemented directly by the regular-expression
 544  * parser so that Unicode escapes can be used in expressions that are read from
 545  * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and
 546  * {@code "\\u2014"}, while not equal, compile into the same pattern, which
 547  * matches the character with hexadecimal value {@code 0x2014}.
 548  * <p>
 549  * A Unicode character can also be represented in a regular-expression by
 550  * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
 551  * <code>&#92;x{...}</code>, for example a supplementary character U+2011F
 552  * can be specified as <code>&#92;x{2011F}</code>, instead of two consecutive
 553  * Unicode escape sequences of the surrogate pair
 554  * <code>&#92;uD840</code><code>&#92;uDD1F</code>.









 555  * <p>
 556  * Unicode scripts, blocks, categories and binary properties are written with
 557  * the {@code \p} and {@code \P} constructs as in Perl.
 558  * <code>\p{</code><i>prop</i><code>}</code> matches if
 559  * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
 560  * does not match if the input has that property.
 561  * <p>
 562  * Scripts, blocks, categories and binary properties can be used both inside
 563  * and outside of a character class.
 564  *
 565  * <p>
 566  * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
 567  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
 568  * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
 569  * <p>
 570  * The script names supported by {@code Pattern} are the valid script names
 571  * accepted and defined by
 572  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
 573  *
 574  * <p>


 662  * <tr><td>{@code \w}</td>
 663  *     <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>
 664  * <tr><td>{@code \W}</td>
 665  *     <td>A non-word character: {@code [^\w]}</td></tr>
 666  * </table>
 667  * <p>
 668  * <a name="jcc">
 669  * Categories that behave like the java.lang.Character
 670  * boolean is<i>methodname</i> methods (except for the deprecated ones) are
 671  * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where
 672  * the specified property has the name <code>java<i>methodname</i></code></a>.
 673  *
 674  * <h3> Comparison to Perl 5 </h3>
 675  *
 676  * <p>The {@code Pattern} engine performs traditional NFA-based matching
 677  * with ordered alternation as occurs in Perl 5.
 678  *
 679  * <p> Perl constructs not supported by this class: </p>
 680  *
 681  * <ul>
 682  *    <li><p> Predefined character classes (Unicode character)
 683  *    <p><code>\X&nbsp;&nbsp;&nbsp;&nbsp;</code>Match Unicode
 684  *    <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
 685  *    <i>extended grapheme cluster</i></a>
 686  *    </p></li>
 687  *
 688  *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
 689  *    the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
 690  *    <code>\g{</code><i>name</i><code>}</code> for
 691  *    <a href="#groupname">named-capturing group</a>.
 692  *    </p></li>
 693  *
 694  *    <li><p> The named character construct, <code>\N{</code><i>name</i><code>}</code>
 695  *    for a Unicode character by its name.
 696  *    </p></li>
 697  *
 698  *    <li><p> The conditional constructs
 699  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
 700  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
 701  *    </p></li>
 702  *
 703  *    <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>
 704  *    and <code>(??{</code><i>code</i><code>})</code>,</p></li>
 705  *
 706  *    <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>
 707  *
 708  *    <li><p> The preprocessing operations {@code \l}, <code>&#92;u</code>,
 709  *    {@code \L}, and {@code \U}.  </p></li>
 710  *
 711  * </ul>
 712  *
 713  * <p> Constructs supported by this class but not by Perl: </p>
 714  *
 715  * <ul>
 716  *
 717  *    <li><p> Character-class union and intersection as described


2340         case 'D':
2341             if (create) root = has(UNICODE_CHARACTER_CLASS)
2342                                ? new Utype(UnicodeProp.DIGIT).complement()
2343                                : new Ctype(ASCII.DIGIT).complement();
2344             return -1;
2345         case 'E':
2346         case 'F':
2347             break;
2348         case 'G':
2349             if (inclass) break;
2350             if (create) root = new LastMatch();
2351             return -1;
2352         case 'H':
2353             if (create) root = new HorizWS().complement();
2354             return -1;
2355         case 'I':
2356         case 'J':
2357         case 'K':
2358         case 'L':
2359         case 'M':

2360         case 'N':

2361         case 'O':
2362         case 'P':
2363         case 'Q':
2364             break;
2365         case 'R':
2366             if (inclass) break;
2367             if (create) root = new LineEnding();
2368             return -1;
2369         case 'S':
2370             if (create) root = has(UNICODE_CHARACTER_CLASS)
2371                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2372                                : new Ctype(ASCII.SPACE).complement();
2373             return -1;
2374         case 'T':
2375         case 'U':
2376             break;
2377         case 'V':
2378             if (create) root = new VertWS().complement();
2379             return -1;
2380         case 'W':
2381             if (create) root = has(UNICODE_CHARACTER_CLASS)
2382                                ? new Utype(UnicodeProp.WORD).complement()
2383                                : new Ctype(ASCII.WORD).complement();
2384             return -1;
2385         case 'X':





2386         case 'Y':
2387             break;
2388         case 'Z':
2389             if (inclass) break;
2390             if (create) {
2391                 if (has(UNIX_LINES))
2392                     root = new UnixDollar(false);
2393                 else
2394                     root = new Dollar(false);
2395             }
2396             return -1;
2397         case 'a':
2398             return '\007';
2399         case 'b':
2400             if (inclass) break;
2401             if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));












2402             return -1;
2403         case 'c':
2404             return c();
2405         case 'd':
2406             if (create) root = has(UNICODE_CHARACTER_CLASS)
2407                                ? new Utype(UnicodeProp.DIGIT)
2408                                : new Ctype(ASCII.DIGIT);
2409             return -1;
2410         case 'e':
2411             return '\033';
2412         case 'f':
2413             return '\f';
2414         case 'g':
2415             break;
2416         case 'h':
2417             if (create) root = new HorizWS();
2418             return -1;
2419         case 'i':
2420         case 'j':
2421             break;


3258             }
3259             n = n * 16 + ASCII.toDigit(ch);
3260         }
3261         return n;
3262     }
3263 
3264     private int u() {
             // Reads one escape value and, when that value is a high surrogate
             // that is followed by a second backslash-'u' escape holding a low
             // surrogate, merges the pair into a single supplementary code point.
             // Returns the merged code point, or the first value on its own.
3265         int n = uxxxx(); // first escape value (presumably four hex digits -- confirm in uxxxx())
3266         if (Character.isHighSurrogate((char)n)) {
3267             int cur = cursor(); // remember the position so the look-ahead below can be undone
3268             if (read() == '\\' && read() == 'u') {
3269                 int n2 = uxxxx();
3270                 if (Character.isLowSurrogate((char)n2))
3271                     return Character.toCodePoint((char)n, (char)n2);
3272             }
3273             setcursor(cur); // no low-surrogate escape followed: rewind and fall through with the lone value
3274         }
3275         return n;
3276     }
3277 
















3278     //
3279     // Utility methods for code point support
3280     //
3281 
3282     private static final int countChars(CharSequence seq, int index,
3283                                         int lengthInCodePoints) {
3284         // optimization
3285         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
3286             assert (index >= 0 && index < seq.length());
3287             return 1;
3288         }
3289         int length = seq.length();
3290         int x = index;
3291         if (lengthInCodePoints >= 0) {
3292             assert (index >= 0 && index < length);
3293             for (int i = 0; x < length && i < lengthInCodePoints; i++) {
3294                 if (Character.isHighSurrogate(seq.charAt(x++))) {
3295                     if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
3296                         x++;
3297                     }
3298                 }
3299             }
3300             return x - index;
3301         }


3941     static final class VertWS extends BmpCharProperty {
             // Accepts the code points U+000A..U+000D (LF, VT, FF, CR),
             // U+0085 (NEL), U+2028 (LINE SEPARATOR) and U+2029
             // (PARAGRAPH SEPARATOR); every other code point is rejected.
3942         boolean isSatisfiedBy(int cp) {
3943             return (cp >= 0x0A && cp <= 0x0D) ||
3944                    cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3945         }
3946     }
3947 
3948     /**
3949      * Node class that matches a Perl horizontal whitespace
3950      */
3951     static final class HorizWS extends BmpCharProperty {
             // Accepts U+0009 (tab), U+0020 (space), U+00A0 (no-break space),
             // U+1680, U+180E, the range U+2000..U+200A, U+202F, U+205F and
             // U+3000; every other code point is rejected.
3952         boolean isSatisfiedBy(int cp) {
3953             return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3954                    cp == 0x1680 || cp == 0x180e ||
3955                    cp >= 0x2000 && cp <= 0x200a ||   // && binds tighter than ||, so this line is one range test
3956                    cp == 0x202f || cp == 0x205f || cp == 0x3000;
3957         }
3958     }
3959 
3960     /**
























































3961      * Base class for all Slice nodes
3962      */
3963     static class SliceNode extends Node {
3964         int[] buffer; // values held by this node; its length drives the bookkeeping in study()
3965         SliceNode(int[] buf) {
3966             buffer = buf; // stores the caller's array reference directly (no copy is made)
3967         }
             // Adds buffer.length to both the minimum and the maximum length
             // recorded in info, then continues the analysis with the next node
             // and returns that node's result.
3968         boolean study(TreeInfo info) {
3969             info.minLength += buffer.length;
3970             info.maxLength += buffer.length;
3971             return next.study(info);
3972         }
3973     }
3974 
3975     /**
3976      * Node class for a case sensitive/BMP-only sequence of literal
3977      * characters.
3978      */
3979     static class Slice extends SliceNode {
3980         Slice(int[] buf) {




  92  *     <td headers="matches">The backslash character</td></tr>
  93  * <tr><td valign="top" headers="construct characters">{@code \0}<i>n</i></td>
  94  *     <td headers="matches">The character with octal value {@code 0}<i>n</i>
  95  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
  96  * <tr><td valign="top" headers="construct characters">{@code \0}<i>nn</i></td>
  97  *     <td headers="matches">The character with octal value {@code 0}<i>nn</i>
  98  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
  99  * <tr><td valign="top" headers="construct characters">{@code \0}<i>mnn</i></td>
 100  *     <td headers="matches">The character with octal value {@code 0}<i>mnn</i>
 101  *         (0&nbsp;{@code <=}&nbsp;<i>m</i>&nbsp;{@code <=}&nbsp;3,
 102  *         0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
 103  * <tr><td valign="top" headers="construct characters">{@code \x}<i>hh</i></td>
 104  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hh</i></td></tr>
 105  * <tr><td valign="top" headers="construct characters"><code>&#92;u</code><i>hhhh</i></td>
 106  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hhhh</i></td></tr>
 107  * <tr><td valign="top" headers="construct characters"><code>&#92;x</code><i>{h...h}</i></td>
 108  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>h...h</i>
 109  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
 110  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
 111  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
 112  * <tr><td valign="top" headers="construct characters"><code>&#92;N{</code><i>name</i><code>}</code></td>
 113  *     <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr>
 114  * <tr><td valign="top" headers="construct characters">{@code \t}</td>
 115  *     <td headers="matches">The tab character (<code>'&#92;u0009'</code>)</td></tr>
 116  * <tr><td valign="top" headers="construct characters">{@code \n}</td>
 117  *     <td headers="matches">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>
 118  * <tr><td valign="top" headers="construct characters">{@code \r}</td>
 119  *     <td headers="matches">The carriage-return character (<code>'&#92;u000D'</code>)</td></tr>
 120  * <tr><td valign="top" headers="construct characters">{@code \f}</td>
 121  *     <td headers="matches">The form-feed character (<code>'&#92;u000C'</code>)</td></tr>
 122  * <tr><td valign="top" headers="construct characters">{@code \a}</td>
 123  *     <td headers="matches">The alert (bell) character (<code>'&#92;u0007'</code>)</td></tr>
 124  * <tr><td valign="top" headers="construct characters">{@code \e}</td>
 125  *     <td headers="matches">The escape character (<code>'&#92;u001B'</code>)</td></tr>
 126  * <tr><td valign="top" headers="construct characters">{@code \c}<i>x</i></td>
 127  *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>
 128  *
 129  * <tr><th>&nbsp;</th></tr>
 130  * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
 131  *
 132  * <tr><td valign="top" headers="construct classes">{@code [abc]}</td>
 133  *     <td headers="matches">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>


 228  * <tr><td valign="top" headers="construct unicode">{@code \p{Lu}}</td>
 229  *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
 230  * <tr><td valign="top" headers="construct unicode">{@code \p{IsAlphabetic}}</td>
 231  *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
 232  * <tr><td valign="top" headers="construct unicode">{@code \p{Sc}}</td>
 233  *     <td headers="matches">A currency symbol</td></tr>
 234  * <tr><td valign="top" headers="construct unicode">{@code \P{InGreek}}</td>
 235  *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
 236  * <tr><td valign="top" headers="construct unicode">{@code [\p{L}&&[^\p{Lu}]]}</td>
 237  *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
 238  *
 239  * <tr><th>&nbsp;</th></tr>
 240  * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
 241  *
 242  * <tr><td valign="top" headers="construct bounds">{@code ^}</td>
 243  *     <td headers="matches">The beginning of a line</td></tr>
 244  * <tr><td valign="top" headers="construct bounds">{@code $}</td>
 245  *     <td headers="matches">The end of a line</td></tr>
 246  * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
 247  *     <td headers="matches">A word boundary</td></tr>
 248  * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td>
 249  *     <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr>
 250  * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
 251  *     <td headers="matches">A non-word boundary</td></tr>
 252  * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
 253  *     <td headers="matches">The beginning of the input</td></tr>
 254  * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
 255  *     <td headers="matches">The end of the previous match</td></tr>
 256  * <tr><td valign="top" headers="construct bounds">{@code \Z}</td>
 257  *     <td headers="matches">The end of the input but for the final
 258  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>
 259  * <tr><td valign="top" headers="construct bounds">{@code \z}</td>
 260  *     <td headers="matches">The end of the input</td></tr>
 261  *
 262  * <tr><th>&nbsp;</th></tr>
 263  * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
 264  * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
 265  *     <td headers="matches">Any Unicode linebreak sequence; this is equivalent to
 266  *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
 267  *     </code></td></tr>
 268  *
 269  * <tr><th>&nbsp;</th></tr>
 270  * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr>
 271  * <tr><td valign="top" headers="construct grapheme">{@code \X}</td>
 272  *     <td headers="matches">Any Unicode extended grapheme cluster</td></tr>
 273  *
 274  * <tr><th>&nbsp;</th></tr>
 275  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
 276  *
 277  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
 278  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
 279  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
 280  *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
 281  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code +}</td>
 282  *     <td headers="matches"><i>X</i>, one or more times</td></tr>
 283  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i><code>}</code></td>
 284  *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
 285  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<code>}</code></td>
 286  *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
 287  * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></td>
 288  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
 289  *
 290  * <tr><th>&nbsp;</th></tr>
 291  * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
 292  *
 293  * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code ??}</td>
 294  *     <td headers="matches"><i>X</i>, once or not at all</td></tr>


 538  * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups
 539  * that do not capture text and do not count towards the group total, or
 540  * <i>named-capturing</i> groups.
 541  *
 542  * <h3> Unicode support </h3>
 543  *
 544  * <p> This class is in conformance with Level 1 of <a
 545  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
 546  * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
 547  * Canonical Equivalents.
 548  * <p>
 549  * <b>Unicode escape sequences</b> such as <code>&#92;u2014</code> in Java source code
 550  * are processed as described in section 3.3 of
 551  * <cite>The Java&trade; Language Specification</cite>.
 552  * Such escape sequences are also implemented directly by the regular-expression
 553  * parser so that Unicode escapes can be used in expressions that are read from
 554  * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and
 555  * {@code "\\u2014"}, while not equal, compile into the same pattern, which
 556  * matches the character with hexadecimal value {@code 0x2014}.
 557  * <p>
 558  * A Unicode character can also be represented by using its <b>Hex notation</b>
 559  * (hexadecimal code point value) directly as described in construct
 560  * <code>&#92;x{...}</code>, for example a supplementary character U+2011F can be
 561  * specified as <code>&#92;x{2011F}</code>, instead of two consecutive Unicode escape
 562  * sequences of the surrogate pair <code>&#92;uD840</code><code>&#92;uDD1F</code>.
 563  * <p>
 564  * <b>Unicode character names</b> are supported by the named character construct
 565  * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>
 566  * specifies character <code>&#92;u263A</code>. The character names supported
 567  * by this class are the valid Unicode character names matched by
 568  * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
 569  * <p>
 570  * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
 571  * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme
 572  * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.
 573  * <p>
 574  * Unicode scripts, blocks, categories and binary properties are written with
 575  * the {@code \p} and {@code \P} constructs as in Perl.
 576  * <code>\p{</code><i>prop</i><code>}</code> matches if
 577  * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
 578  * does not match if the input has that property.
 579  * <p>
 580  * Scripts, blocks, categories and binary properties can be used both inside
 581  * and outside of a character class.
 582  *
 583  * <p>
 584  * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
 585  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
 586  * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
 587  * <p>
 588  * The script names supported by {@code Pattern} are the valid script names
 589  * accepted and defined by
 590  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
 591  *
 592  * <p>


 680  * <tr><td>{@code \w}</td>
 681  *     <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>
 682  * <tr><td>{@code \W}</td>
 683  *     <td>A non-word character: {@code [^\w]}</td></tr>
 684  * </table>
 685  * <p>
 686  * <a name="jcc">
 687  * Categories that behave like the java.lang.Character
 688  * boolean is<i>methodname</i> methods (except for the deprecated ones) are
 689  * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where
 690  * the specified property has the name <code>java<i>methodname</i></code></a>.
 691  *
 692  * <h3> Comparison to Perl 5 </h3>
 693  *
 694  * <p>The {@code Pattern} engine performs traditional NFA-based matching
 695  * with ordered alternation as occurs in Perl 5.
 696  *
 697  * <p> Perl constructs not supported by this class: </p>
 698  *
 699  * <ul>






 700  *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
 701  *    the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
 702  *    <code>\g{</code><i>name</i><code>}</code> for
 703  *    <a href="#groupname">named-capturing group</a>.
 704  *    </p></li>
 705  *




 706  *    <li><p> The conditional constructs
 707  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
 708  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
 709  *    </p></li>
 710  *
 711  *    <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>
 712  *    and <code>(??{</code><i>code</i><code>})</code>,</p></li>
 713  *
 714  *    <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>
 715  *
 716  *    <li><p> The preprocessing operations {@code \l}, <code>&#92;u</code>,
 717  *    {@code \L}, and {@code \U}.  </p></li>
 718  *
 719  * </ul>
 720  *
 721  * <p> Constructs supported by this class but not by Perl: </p>
 722  *
 723  * <ul>
 724  *
 725  *    <li><p> Character-class union and intersection as described


2348         case 'D':
2349             if (create) root = has(UNICODE_CHARACTER_CLASS)
2350                                ? new Utype(UnicodeProp.DIGIT).complement()
2351                                : new Ctype(ASCII.DIGIT).complement();
2352             return -1;
2353         case 'E':
2354         case 'F':
2355             break;
2356         case 'G':
2357             if (inclass) break;
2358             if (create) root = new LastMatch();
2359             return -1;
2360         case 'H':
2361             if (create) root = new HorizWS().complement();
2362             return -1;
2363         case 'I':
2364         case 'J':
2365         case 'K':
2366         case 'L':
2367         case 'M':
2368             break;
2369         case 'N':
2370             return N();
2371         case 'O':
2372         case 'P':
2373         case 'Q':
2374             break;
2375         case 'R':
2376             if (inclass) break;
2377             if (create) root = new LineEnding();
2378             return -1;
2379         case 'S':
2380             if (create) root = has(UNICODE_CHARACTER_CLASS)
2381                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382                                : new Ctype(ASCII.SPACE).complement();
2383             return -1;
2384         case 'T':
2385         case 'U':
2386             break;
2387         case 'V':
2388             if (create) root = new VertWS().complement();
2389             return -1;
2390         case 'W':
2391             if (create) root = has(UNICODE_CHARACTER_CLASS)
2392                                ? new Utype(UnicodeProp.WORD).complement()
2393                                : new Ctype(ASCII.WORD).complement();
2394             return -1;
2395         case 'X':
2396             if (inclass) break;
2397             if (create) {
2398                 root = new XGrapheme();
2399             }
2400             return -1;
2401         case 'Y':
2402             break;
2403         case 'Z':
2404             if (inclass) break;
2405             if (create) {
2406                 if (has(UNIX_LINES))
2407                     root = new UnixDollar(false);
2408                 else
2409                     root = new Dollar(false);
2410             }
2411             return -1;
2412         case 'a':
2413             return '\007';
2414         case 'b':
2415             if (inclass) break;
2416             if (create) {
2417                 if (peek() == '{') {
2418                     if (skip() == 'g') {
2419                         if (read() == '}') {
2420                             root = new GraphemeBound();
2421                             return -1;
2422                         }
2423                         break;  // error missing trailing }
2424                     }
2425                     unread(); unread();
2426                 }
2427                 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428             }
2429             return -1;
2430         case 'c':
2431             return c();
2432         case 'd':
2433             if (create) root = has(UNICODE_CHARACTER_CLASS)
2434                                ? new Utype(UnicodeProp.DIGIT)
2435                                : new Ctype(ASCII.DIGIT);
2436             return -1;
2437         case 'e':
2438             return '\033';
2439         case 'f':
2440             return '\f';
2441         case 'g':
2442             break;
2443         case 'h':
2444             if (create) root = new HorizWS();
2445             return -1;
2446         case 'i':
2447         case 'j':
2448             break;


3285             }
3286             n = n * 16 + ASCII.toDigit(ch);
3287         }
3288         return n;
3289     }
3290 
3291     private int u() {
3292         int n = uxxxx();
3293         if (Character.isHighSurrogate((char)n)) {
3294             int cur = cursor();
3295             if (read() == '\\' && read() == 'u') {
3296                 int n2 = uxxxx();
3297                 if (Character.isLowSurrogate((char)n2))
3298                     return Character.toCodePoint((char)n, (char)n2);
3299             }
3300             setcursor(cur);
3301         }
3302         return n;
3303     }
3304 
3305     private int N() {
3306         if (read() == '{') {
3307             int i = cursor;
3308             while (cursor < patternLength && read() != '}') {}
3309             if (cursor > patternLength)
3310                 throw error("Unclosed character name escape sequence");
3311             String name = new String(temp, i, cursor - i - 1);
3312             try {
3313                 return Character.codePointOf(name);
3314             } catch (IllegalArgumentException x) {
3315                 throw error("Unknown character name [" + name + "]");
3316             }
3317         }
3318         throw error("Illegal character name escape sequence");
3319     }
3320 
3321     //
3322     // Utility methods for code point support
3323     //

3324     private static final int countChars(CharSequence seq, int index,
3325                                         int lengthInCodePoints) {
3326         // optimization
3327         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
3328             assert (index >= 0 && index < seq.length());
3329             return 1;
3330         }
3331         int length = seq.length();
3332         int x = index;
3333         if (lengthInCodePoints >= 0) {
3334             assert (index >= 0 && index < length);
3335             for (int i = 0; x < length && i < lengthInCodePoints; i++) {
3336                 if (Character.isHighSurrogate(seq.charAt(x++))) {
3337                     if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
3338                         x++;
3339                     }
3340                 }
3341             }
3342             return x - index;
3343         }


3983     static final class VertWS extends BmpCharProperty {
3984         boolean isSatisfiedBy(int cp) {
3985             return (cp >= 0x0A && cp <= 0x0D) ||
3986                    cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987         }
3988     }
3989 
3990     /**
3991      * Node class that matches a Perl horizontal whitespace
3992      */
3993     static final class HorizWS extends BmpCharProperty {
3994         boolean isSatisfiedBy(int cp) {
3995             return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996                    cp == 0x1680 || cp == 0x180e ||
3997                    cp >= 0x2000 && cp <= 0x200a ||
3998                    cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999         }
4000     }
4001 
4002     /**
4003      * Node class that matches an unicode extended grapheme cluster
4004      */
4005     static class XGrapheme extends Node {
4006         boolean match(Matcher matcher, int i, CharSequence seq) {
4007             if (i < matcher.to) {
4008                 int ch0 = Character.codePointAt(seq, i);
4009                     i += Character.charCount(ch0);
4010                 while (i < matcher.to) {
4011                     int ch1 = Character.codePointAt(seq, i);
4012                     if (Grapheme.isBoundary(ch0, ch1))
4013                         break;
4014                     ch0 = ch1;                    
4015                     i += Character.charCount(ch1);
4016                 }
4017                 return next.match(matcher, i, seq);                        
4018             }
4019             matcher.hitEnd = true;
4020             return false;
4021         }
4022 
4023         boolean study(TreeInfo info) {
4024             info.minLength++;
4025             info.deterministic = false;
4026             return next.study(info);
4027         }
4028     }
4029 
4030     /**
4031      * Node class that handles grapheme boundaries
4032      */
4033     static class GraphemeBound extends Node {
4034         boolean match(Matcher matcher, int i, CharSequence seq) {
4035             int startIndex = matcher.from;
4036             int endIndex = matcher.to;
4037             if (matcher.transparentBounds) {
4038                 startIndex = 0;
4039                 endIndex = matcher.getTextLength();
4040             }
4041             if (i == startIndex) {
4042                 return next.match(matcher, i, seq);
4043             }
4044             if (i < endIndex) {
4045                 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
4046                     !Grapheme.isBoundary(Character.codePointBefore(seq, i),
4047                                          Character.codePointAt(seq, i))) {
4048                     return false;
4049                 }
4050             } else {
4051                 matcher.hitEnd = true;
4052                 matcher.requireEnd = true;
4053             }
4054             return next.match(matcher, i, seq);
4055         }
4056     }
4057 
4058     /**
4059      * Base class for all Slice nodes
4060      */
4061     static class SliceNode extends Node {
4062         int[] buffer;
4063         SliceNode(int[] buf) {
4064             buffer = buf;
4065         }
4066         boolean study(TreeInfo info) {
4067             info.minLength += buffer.length;
4068             info.maxLength += buffer.length;
4069             return next.study(info);
4070         }
4071     }
4072 
4073     /**
4074      * Node class for a case sensitive/BMP-only sequence of literal
4075      * characters.
4076      */
4077     static class Slice extends SliceNode {
4078         Slice(int[] buf) {