92 * <td headers="matches">The backslash character</td></tr>
93 * <tr><td valign="top" headers="construct characters">{@code \0}<i>n</i></td>
94 * <td headers="matches">The character with octal value {@code 0}<i>n</i>
95 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
96 * <tr><td valign="top" headers="construct characters">{@code \0}<i>nn</i></td>
97 * <td headers="matches">The character with octal value {@code 0}<i>nn</i>
98 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
99 * <tr><td valign="top" headers="construct characters">{@code \0}<i>mnn</i></td>
100 * <td headers="matches">The character with octal value {@code 0}<i>mnn</i>
101 * (0 {@code <=} <i>m</i> {@code <=} 3,
102 * 0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
103 * <tr><td valign="top" headers="construct characters">{@code \x}<i>hh</i></td>
104 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr>
105 * <tr><td valign="top" headers="construct characters"><code>&#92;u</code><i>hhhh</i></td>
106 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>hhhh</i></td></tr>
107 * <tr><td valign="top" headers="construct characters"><code>\x</code><i>{h...h}</i></td>
108 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>h...h</i>
109 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
110 * {@code <=} {@code 0x}<i>h...h</i> {@code <=}
111 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
112 * <tr><td valign="top" headers="construct characters">{@code \t}</td>
113 * <td headers="matches">The tab character (<code>'\u0009'</code>)</td></tr>
114 * <tr><td valign="top" headers="construct characters">{@code \n}</td>
115 * <td headers="matches">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
116 * <tr><td valign="top" headers="construct characters">{@code \r}</td>
117 * <td headers="matches">The carriage-return character (<code>'\u000D'</code>)</td></tr>
118 * <tr><td valign="top" headers="construct characters">{@code \f}</td>
119 * <td headers="matches">The form-feed character (<code>'\u000C'</code>)</td></tr>
120 * <tr><td valign="top" headers="construct characters">{@code \a}</td>
121 * <td headers="matches">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
122 * <tr><td valign="top" headers="construct characters">{@code \e}</td>
123 * <td headers="matches">The escape character (<code>'\u001B'</code>)</td></tr>
124 * <tr><td valign="top" headers="construct characters">{@code \c}<i>x</i></td>
125 * <td headers="matches">The control character corresponding to <i>x</i></td></tr>
126 *
127 * <tr><th> </th></tr>
128 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
129 *
130 * <tr><td valign="top" headers="construct classes">{@code [abc]}</td>
131 * <td headers="matches">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
226 * <tr><td valign="top" headers="construct unicode">{@code \p{Lu}}</td>
227 * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
228 * <tr><td valign="top" headers="construct unicode">{@code \p{IsAlphabetic}}</td>
229 * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
230 * <tr><td valign="top" headers="construct unicode">{@code \p{Sc}}</td>
231 * <td headers="matches">A currency symbol</td></tr>
232 * <tr><td valign="top" headers="construct unicode">{@code \P{InGreek}}</td>
233 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
234 * <tr><td valign="top" headers="construct unicode">{@code [\p{L}&&[^\p{Lu}]]}</td>
235 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
236 *
237 * <tr><th> </th></tr>
238 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
239 *
240 * <tr><td valign="top" headers="construct bounds">{@code ^}</td>
241 * <td headers="matches">The beginning of a line</td></tr>
242 * <tr><td valign="top" headers="construct bounds">{@code $}</td>
243 * <td headers="matches">The end of a line</td></tr>
244 * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
245 * <td headers="matches">A word boundary</td></tr>
246 * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
247 * <td headers="matches">A non-word boundary</td></tr>
248 * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
249 * <td headers="matches">The beginning of the input</td></tr>
250 * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
251 * <td headers="matches">The end of the previous match</td></tr>
252 * <tr><td valign="top" headers="construct bounds">{@code \Z}</td>
253 * <td headers="matches">The end of the input but for the final
254 * <a href="#lt">terminator</a>, if any</td></tr>
255 * <tr><td valign="top" headers="construct bounds">{@code \z}</td>
256 * <td headers="matches">The end of the input</td></tr>
257 *
258 * <tr><th> </th></tr>
259 * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
260 * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
261 * <td headers="matches">Any Unicode linebreak sequence; equivalent to
262 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]
263 * </code></td></tr>
264 *
265 * <tr><th> </th></tr>
266 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
267 *
268 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
269 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
270 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
271 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
272 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code +}</td>
273 * <td headers="matches"><i>X</i>, one or more times</td></tr>
274 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i><code>}</code></td>
275 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
276 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}}</td>
277 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
278 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></td>
279 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
280 *
281 * <tr><th> </th></tr>
282 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
283 *
284 * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code ??}</td>
285 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
529 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups
530 * that do not capture text and do not count towards the group total, or
531 * <i>named-capturing</i> groups.
532 *
533 * <h3> Unicode support </h3>
534 *
535 * <p> This class is in conformance with Level 1 of <a
536 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
537 * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
538 * Canonical Equivalents.
539 * <p>
540 * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code
541 * are processed as described in section 3.3 of
542 * <cite>The Java™ Language Specification</cite>.
543 * Such escape sequences are also implemented directly by the regular-expression
544 * parser so that Unicode escapes can be used in expressions that are read from
545 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and
546 * {@code "\\u2014"}, while not equal, compile into the same pattern, which
547 * matches the character with hexadecimal value {@code 0x2014}.
548 * <p>
549 * A Unicode character can also be represented in a regular-expression by
550 * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
551 * <code>\x{...}</code>, for example a supplementary character U+2011F
552 * can be specified as <code>\x{2011F}</code>, instead of two consecutive
553 * Unicode escape sequences of the surrogate pair
554 * <code>\uD840</code><code>\uDD1F</code>.
555 * <p>
556 * Unicode scripts, blocks, categories and binary properties are written with
557 * the {@code \p} and {@code \P} constructs as in Perl.
558 * <code>\p{</code><i>prop</i><code>}</code> matches if
559 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
560 * does not match if the input has that property.
561 * <p>
562 * Scripts, blocks, categories and binary properties can be used both inside
563 * and outside of a character class.
564 *
565 * <p>
566 * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
567 * {@code IsHiragana}, or by using the {@code script} keyword (or its short
568 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
569 * <p>
570 * The script names supported by {@code Pattern} are the valid script names
571 * accepted and defined by
572 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
573 *
574 * <p>
662 * <tr><td>{@code \w}</td>
663 * <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>
664 * <tr><td>{@code \W}</td>
665 * <td>A non-word character: {@code [^\w]}</td></tr>
666 * </table>
667 * <p>
668 * <a name="jcc">
669 * Categories that behave like the java.lang.Character
670 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
671 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where
672 * the specified property has the name <code>java<i>methodname</i></code></a>.
673 *
674 * <h3> Comparison to Perl 5 </h3>
675 *
676 * <p>The {@code Pattern} engine performs traditional NFA-based matching
677 * with ordered alternation as occurs in Perl 5.
678 *
679 * <p> Perl constructs not supported by this class: </p>
680 *
681 * <ul>
682 * <li><p> Predefined character classes (Unicode character)
683 * <p><code>\X </code>Match Unicode
684 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
685 * <i>extended grapheme cluster</i></a>
686 * </p></li>
687 *
688 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
689 * the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
690 * <code>\g{</code><i>name</i><code>}</code> for
691 * <a href="#groupname">named-capturing group</a>.
692 * </p></li>
693 *
694 * <li><p> The named character construct, <code>\N{</code><i>name</i><code>}</code>
695 * for a Unicode character by its name.
696 * </p></li>
697 *
698 * <li><p> The conditional constructs
699 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
700 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
701 * </p></li>
702 *
703 * <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>
704 * and <code>(??{</code><i>code</i><code>})</code>,</p></li>
705 *
706 * <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>
707 *
708 * <li><p> The preprocessing operations {@code \l}, <code>&#92;u</code>,
709 * {@code \L}, and {@code \U}. </p></li>
710 *
711 * </ul>
712 *
713 * <p> Constructs supported by this class but not by Perl: </p>
714 *
715 * <ul>
716 *
717 * <li><p> Character-class union and intersection as described
2340 case 'D':
2341 if (create) root = has(UNICODE_CHARACTER_CLASS)
2342 ? new Utype(UnicodeProp.DIGIT).complement()
2343 : new Ctype(ASCII.DIGIT).complement();
2344 return -1;
2345 case 'E':
2346 case 'F':
2347 break;
2348 case 'G':
2349 if (inclass) break;
2350 if (create) root = new LastMatch();
2351 return -1;
2352 case 'H':
2353 if (create) root = new HorizWS().complement();
2354 return -1;
2355 case 'I':
2356 case 'J':
2357 case 'K':
2358 case 'L':
2359 case 'M':
2360 case 'N':
2361 case 'O':
2362 case 'P':
2363 case 'Q':
2364 break;
2365 case 'R':
2366 if (inclass) break;
2367 if (create) root = new LineEnding();
2368 return -1;
2369 case 'S':
2370 if (create) root = has(UNICODE_CHARACTER_CLASS)
2371 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2372 : new Ctype(ASCII.SPACE).complement();
2373 return -1;
2374 case 'T':
2375 case 'U':
2376 break;
2377 case 'V':
2378 if (create) root = new VertWS().complement();
2379 return -1;
2380 case 'W':
2381 if (create) root = has(UNICODE_CHARACTER_CLASS)
2382 ? new Utype(UnicodeProp.WORD).complement()
2383 : new Ctype(ASCII.WORD).complement();
2384 return -1;
2385 case 'X':
2386 case 'Y':
2387 break;
2388 case 'Z':
2389 if (inclass) break;
2390 if (create) {
2391 if (has(UNIX_LINES))
2392 root = new UnixDollar(false);
2393 else
2394 root = new Dollar(false);
2395 }
2396 return -1;
2397 case 'a':
2398 return '\007';
2399 case 'b':
2400 if (inclass) break;
2401 if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2402 return -1;
2403 case 'c':
2404 return c();
2405 case 'd':
2406 if (create) root = has(UNICODE_CHARACTER_CLASS)
2407 ? new Utype(UnicodeProp.DIGIT)
2408 : new Ctype(ASCII.DIGIT);
2409 return -1;
2410 case 'e':
2411 return '\033';
2412 case 'f':
2413 return '\f';
2414 case 'g':
2415 break;
2416 case 'h':
2417 if (create) root = new HorizWS();
2418 return -1;
2419 case 'i':
2420 case 'j':
2421 break;
3258 }
3259 n = n * 16 + ASCII.toDigit(ch);
3260 }
3261 return n;
3262 }
3263
3264 private int u() {
3265 int n = uxxxx();
3266 if (Character.isHighSurrogate((char)n)) {
3267 int cur = cursor();
3268 if (read() == '\\' && read() == 'u') {
3269 int n2 = uxxxx();
3270 if (Character.isLowSurrogate((char)n2))
3271 return Character.toCodePoint((char)n, (char)n2);
3272 }
3273 setcursor(cur);
3274 }
3275 return n;
3276 }
3277
3278 //
3279 // Utility methods for code point support
3280 //
3281
3282 private static final int countChars(CharSequence seq, int index,
3283 int lengthInCodePoints) {
3284 // optimization
3285 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
3286 assert (index >= 0 && index < seq.length());
3287 return 1;
3288 }
3289 int length = seq.length();
3290 int x = index;
3291 if (lengthInCodePoints >= 0) {
3292 assert (index >= 0 && index < length);
3293 for (int i = 0; x < length && i < lengthInCodePoints; i++) {
3294 if (Character.isHighSurrogate(seq.charAt(x++))) {
3295 if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
3296 x++;
3297 }
3298 }
3299 }
3300 return x - index;
3301 }
3941 static final class VertWS extends BmpCharProperty {
3942 boolean isSatisfiedBy(int cp) {
3943 return (cp >= 0x0A && cp <= 0x0D) ||
3944 cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3945 }
3946 }
3947
3948 /**
3949 * Node class that matches a Perl horizontal whitespace
3950 */
3951 static final class HorizWS extends BmpCharProperty {
3952 boolean isSatisfiedBy(int cp) {
3953 return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3954 cp == 0x1680 || cp == 0x180e ||
3955 cp >= 0x2000 && cp <= 0x200a ||
3956 cp == 0x202f || cp == 0x205f || cp == 0x3000;
3957 }
3958 }
3959
3960 /**
3961 * Base class for all Slice nodes
3962 */
3963 static class SliceNode extends Node {
3964 int[] buffer;
3965 SliceNode(int[] buf) {
3966 buffer = buf;
3967 }
3968 boolean study(TreeInfo info) {
3969 info.minLength += buffer.length;
3970 info.maxLength += buffer.length;
3971 return next.study(info);
3972 }
3973 }
3974
3975 /**
3976 * Node class for a case sensitive/BMP-only sequence of literal
3977 * characters.
3978 */
3979 static class Slice extends SliceNode {
3980 Slice(int[] buf) {
|
92 * <td headers="matches">The backslash character</td></tr>
93 * <tr><td valign="top" headers="construct characters">{@code \0}<i>n</i></td>
94 * <td headers="matches">The character with octal value {@code 0}<i>n</i>
95 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
96 * <tr><td valign="top" headers="construct characters">{@code \0}<i>nn</i></td>
97 * <td headers="matches">The character with octal value {@code 0}<i>nn</i>
98 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
99 * <tr><td valign="top" headers="construct characters">{@code \0}<i>mnn</i></td>
100 * <td headers="matches">The character with octal value {@code 0}<i>mnn</i>
101 * (0 {@code <=} <i>m</i> {@code <=} 3,
102 * 0 {@code <=} <i>n</i> {@code <=} 7)</td></tr>
103 * <tr><td valign="top" headers="construct characters">{@code \x}<i>hh</i></td>
104 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr>
105 * <tr><td valign="top" headers="construct characters"><code>&#92;u</code><i>hhhh</i></td>
106 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>hhhh</i></td></tr>
107 * <tr><td valign="top" headers="construct characters"><code>\x</code><i>{h...h}</i></td>
108 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>h...h</i>
109 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
110 * {@code <=} {@code 0x}<i>h...h</i> {@code <=}
111 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
112 * <tr><td valign="top" headers="construct characters"><code>\N{</code><i>name</i><code>}</code></td>
113 * <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr>
114 * <tr><td valign="top" headers="construct characters">{@code \t}</td>
115 * <td headers="matches">The tab character (<code>'\u0009'</code>)</td></tr>
116 * <tr><td valign="top" headers="construct characters">{@code \n}</td>
117 * <td headers="matches">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
118 * <tr><td valign="top" headers="construct characters">{@code \r}</td>
119 * <td headers="matches">The carriage-return character (<code>'\u000D'</code>)</td></tr>
120 * <tr><td valign="top" headers="construct characters">{@code \f}</td>
121 * <td headers="matches">The form-feed character (<code>'\u000C'</code>)</td></tr>
122 * <tr><td valign="top" headers="construct characters">{@code \a}</td>
123 * <td headers="matches">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
124 * <tr><td valign="top" headers="construct characters">{@code \e}</td>
125 * <td headers="matches">The escape character (<code>'\u001B'</code>)</td></tr>
126 * <tr><td valign="top" headers="construct characters">{@code \c}<i>x</i></td>
127 * <td headers="matches">The control character corresponding to <i>x</i></td></tr>
128 *
129 * <tr><th> </th></tr>
130 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
131 *
132 * <tr><td valign="top" headers="construct classes">{@code [abc]}</td>
133 * <td headers="matches">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
228 * <tr><td valign="top" headers="construct unicode">{@code \p{Lu}}</td>
229 * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
230 * <tr><td valign="top" headers="construct unicode">{@code \p{IsAlphabetic}}</td>
231 * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
232 * <tr><td valign="top" headers="construct unicode">{@code \p{Sc}}</td>
233 * <td headers="matches">A currency symbol</td></tr>
234 * <tr><td valign="top" headers="construct unicode">{@code \P{InGreek}}</td>
235 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
236 * <tr><td valign="top" headers="construct unicode">{@code [\p{L}&&[^\p{Lu}]]}</td>
237 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
238 *
239 * <tr><th> </th></tr>
240 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
241 *
242 * <tr><td valign="top" headers="construct bounds">{@code ^}</td>
243 * <td headers="matches">The beginning of a line</td></tr>
244 * <tr><td valign="top" headers="construct bounds">{@code $}</td>
245 * <td headers="matches">The end of a line</td></tr>
246 * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
247 * <td headers="matches">A word boundary</td></tr>
248 * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td>
249 * <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr>
250 * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
251 * <td headers="matches">A non-word boundary</td></tr>
252 * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
253 * <td headers="matches">The beginning of the input</td></tr>
254 * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
255 * <td headers="matches">The end of the previous match</td></tr>
256 * <tr><td valign="top" headers="construct bounds">{@code \Z}</td>
257 * <td headers="matches">The end of the input but for the final
258 * <a href="#lt">terminator</a>, if any</td></tr>
259 * <tr><td valign="top" headers="construct bounds">{@code \z}</td>
260 * <td headers="matches">The end of the input</td></tr>
261 *
262 * <tr><th> </th></tr>
263 * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
264 * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
265 * <td headers="matches">Any Unicode linebreak sequence; equivalent to
266 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]
267 * </code></td></tr>
268 *
269 * <tr><th> </th></tr>
270 * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr>
271 * <tr><td valign="top" headers="construct grapheme">{@code \X}</td>
272 * <td headers="matches">Any Unicode extended grapheme cluster</td></tr>
273 *
274 * <tr><th> </th></tr>
275 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
276 *
277 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
278 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
279 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
280 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
281 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code +}</td>
282 * <td headers="matches"><i>X</i>, one or more times</td></tr>
283 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i><code>}</code></td>
284 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
285 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}}</td>
286 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
287 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></td>
288 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
289 *
290 * <tr><th> </th></tr>
291 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
292 *
293 * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code ??}</td>
294 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
538 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups
539 * that do not capture text and do not count towards the group total, or
540 * <i>named-capturing</i> groups.
541 *
542 * <h3> Unicode support </h3>
543 *
544 * <p> This class is in conformance with Level 1 of <a
545 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
546 * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
547 * Canonical Equivalents.
548 * <p>
549 * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code
550 * are processed as described in section 3.3 of
551 * <cite>The Java™ Language Specification</cite>.
552 * Such escape sequences are also implemented directly by the regular-expression
553 * parser so that Unicode escapes can be used in expressions that are read from
554 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and
555 * {@code "\\u2014"}, while not equal, compile into the same pattern, which
556 * matches the character with hexadecimal value {@code 0x2014}.
557 * <p>
558 * A Unicode character can also be represented by using its <b>Hex notation</b>
559 * (hexadecimal code point value) directly as described in construct
560 * <code>\x{...}</code>, for example a supplementary character U+2011F can be
561 * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape
562 * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>.
563 * <p>
564 * <b>Unicode character names</b> are supported by the named character construct
565 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>
566 * specifies character <code>\u263A</code>. The character names supported
567 * by this class are the valid Unicode character names matched by
568 * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
569 * <p>
570 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
571 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme
572 * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.
573 * <p>
574 * Unicode scripts, blocks, categories and binary properties are written with
575 * the {@code \p} and {@code \P} constructs as in Perl.
576 * <code>\p{</code><i>prop</i><code>}</code> matches if
577 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
578 * does not match if the input has that property.
579 * <p>
580 * Scripts, blocks, categories and binary properties can be used both inside
581 * and outside of a character class.
582 *
583 * <p>
584 * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
585 * {@code IsHiragana}, or by using the {@code script} keyword (or its short
586 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
587 * <p>
588 * The script names supported by {@code Pattern} are the valid script names
589 * accepted and defined by
590 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
591 *
592 * <p>
680 * <tr><td>{@code \w}</td>
681 * <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>
682 * <tr><td>{@code \W}</td>
683 * <td>A non-word character: {@code [^\w]}</td></tr>
684 * </table>
685 * <p>
686 * <a name="jcc">
687 * Categories that behave like the java.lang.Character
688 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
689 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where
690 * the specified property has the name <code>java<i>methodname</i></code></a>.
691 *
692 * <h3> Comparison to Perl 5 </h3>
693 *
694 * <p>The {@code Pattern} engine performs traditional NFA-based matching
695 * with ordered alternation as occurs in Perl 5.
696 *
697 * <p> Perl constructs not supported by this class: </p>
698 *
699 * <ul>
700 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
701 * the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
702 * <code>\g{</code><i>name</i><code>}</code> for
703 * <a href="#groupname">named-capturing group</a>.
704 * </p></li>
705 *
706 * <li><p> The conditional constructs
707 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
708 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
709 * </p></li>
710 *
711 * <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>
712 * and <code>(??{</code><i>code</i><code>})</code>,</p></li>
713 *
714 * <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>
715 *
716 * <li><p> The preprocessing operations {@code \l}, <code>&#92;u</code>,
717 * {@code \L}, and {@code \U}. </p></li>
718 *
719 * </ul>
720 *
721 * <p> Constructs supported by this class but not by Perl: </p>
722 *
723 * <ul>
724 *
725 * <li><p> Character-class union and intersection as described
2348 case 'D':
2349 if (create) root = has(UNICODE_CHARACTER_CLASS)
2350 ? new Utype(UnicodeProp.DIGIT).complement()
2351 : new Ctype(ASCII.DIGIT).complement();
2352 return -1;
2353 case 'E':
2354 case 'F':
2355 break;
2356 case 'G':
2357 if (inclass) break;
2358 if (create) root = new LastMatch();
2359 return -1;
2360 case 'H':
2361 if (create) root = new HorizWS().complement();
2362 return -1;
2363 case 'I':
2364 case 'J':
2365 case 'K':
2366 case 'L':
2367 case 'M':
2368 break;
2369 case 'N':
2370 return N();
2371 case 'O':
2372 case 'P':
2373 case 'Q':
2374 break;
2375 case 'R':
2376 if (inclass) break;
2377 if (create) root = new LineEnding();
2378 return -1;
2379 case 'S':
2380 if (create) root = has(UNICODE_CHARACTER_CLASS)
2381 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382 : new Ctype(ASCII.SPACE).complement();
2383 return -1;
2384 case 'T':
2385 case 'U':
2386 break;
2387 case 'V':
2388 if (create) root = new VertWS().complement();
2389 return -1;
2390 case 'W':
2391 if (create) root = has(UNICODE_CHARACTER_CLASS)
2392 ? new Utype(UnicodeProp.WORD).complement()
2393 : new Ctype(ASCII.WORD).complement();
2394 return -1;
2395 case 'X':
2396 if (inclass) break;
2397 if (create) {
2398 root = new XGrapheme();
2399 }
2400 return -1;
2401 case 'Y':
2402 break;
2403 case 'Z':
2404 if (inclass) break;
2405 if (create) {
2406 if (has(UNIX_LINES))
2407 root = new UnixDollar(false);
2408 else
2409 root = new Dollar(false);
2410 }
2411 return -1;
2412 case 'a':
2413 return '\007';
2414 case 'b':
2415 if (inclass) break;
2416 if (create) {
2417 if (peek() == '{') {
2418 if (skip() == 'g') {
2419 if (read() == '}') {
2420 root = new GraphemeBound();
2421 return -1;
2422 }
2423 break; // error missing trailing }
2424 }
2425 unread(); unread();
2426 }
2427 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428 }
2429 return -1;
2430 case 'c':
2431 return c();
2432 case 'd':
2433 if (create) root = has(UNICODE_CHARACTER_CLASS)
2434 ? new Utype(UnicodeProp.DIGIT)
2435 : new Ctype(ASCII.DIGIT);
2436 return -1;
2437 case 'e':
2438 return '\033';
2439 case 'f':
2440 return '\f';
2441 case 'g':
2442 break;
2443 case 'h':
2444 if (create) root = new HorizWS();
2445 return -1;
2446 case 'i':
2447 case 'j':
2448 break;
3285 }
3286 n = n * 16 + ASCII.toDigit(ch);
3287 }
3288 return n;
3289 }
3290
3291 private int u() {
3292 int n = uxxxx();
3293 if (Character.isHighSurrogate((char)n)) {
3294 int cur = cursor();
3295 if (read() == '\\' && read() == 'u') {
3296 int n2 = uxxxx();
3297 if (Character.isLowSurrogate((char)n2))
3298 return Character.toCodePoint((char)n, (char)n2);
3299 }
3300 setcursor(cur);
3301 }
3302 return n;
3303 }
3304
3305 private int N() {
3306 if (read() == '{') {
3307 int i = cursor;
3308 while (cursor < patternLength && read() != '}') {}
3309 if (cursor > patternLength)
3310 throw error("Unclosed character name escape sequence");
3311 String name = new String(temp, i, cursor - i - 1);
3312 try {
3313 return Character.codePointOf(name);
3314 } catch (IllegalArgumentException x) {
3315 throw error("Unknown character name [" + name + "]");
3316 }
3317 }
3318 throw error("Illegal character name escape sequence");
3319 }
3320
3321 //
3322 // Utility methods for code point support
3323 //
3324 private static final int countChars(CharSequence seq, int index,
3325 int lengthInCodePoints) {
3326 // optimization
3327 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
3328 assert (index >= 0 && index < seq.length());
3329 return 1;
3330 }
3331 int length = seq.length();
3332 int x = index;
3333 if (lengthInCodePoints >= 0) {
3334 assert (index >= 0 && index < length);
3335 for (int i = 0; x < length && i < lengthInCodePoints; i++) {
3336 if (Character.isHighSurrogate(seq.charAt(x++))) {
3337 if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
3338 x++;
3339 }
3340 }
3341 }
3342 return x - index;
3343 }
3983 static final class VertWS extends BmpCharProperty {
3984 boolean isSatisfiedBy(int cp) {
3985 return (cp >= 0x0A && cp <= 0x0D) ||
3986 cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987 }
3988 }
3989
3990 /**
3991 * Node class that matches a Perl horizontal whitespace
3992 */
3993 static final class HorizWS extends BmpCharProperty {
3994 boolean isSatisfiedBy(int cp) {
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996 cp == 0x1680 || cp == 0x180e ||
3997 cp >= 0x2000 && cp <= 0x200a ||
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999 }
4000 }
4001
4002 /**
4003 * Node class that matches an unicode extended grapheme cluster
4004 */
4005 static class XGrapheme extends Node {
4006 boolean match(Matcher matcher, int i, CharSequence seq) {
4007 if (i < matcher.to) {
4008 int ch0 = Character.codePointAt(seq, i);
4009 i += Character.charCount(ch0);
4010 while (i < matcher.to) {
4011 int ch1 = Character.codePointAt(seq, i);
4012 if (Grapheme.isBoundary(ch0, ch1))
4013 break;
4014 ch0 = ch1;
4015 i += Character.charCount(ch1);
4016 }
4017 return next.match(matcher, i, seq);
4018 }
4019 matcher.hitEnd = true;
4020 return false;
4021 }
4022
4023 boolean study(TreeInfo info) {
4024 info.minLength++;
4025 info.deterministic = false;
4026 return next.study(info);
4027 }
4028 }
4029
4030 /**
4031 * Node class that handles grapheme boundaries
4032 */
4033 static class GraphemeBound extends Node {
4034 boolean match(Matcher matcher, int i, CharSequence seq) {
4035 int startIndex = matcher.from;
4036 int endIndex = matcher.to;
4037 if (matcher.transparentBounds) {
4038 startIndex = 0;
4039 endIndex = matcher.getTextLength();
4040 }
4041 if (i == startIndex) {
4042 return next.match(matcher, i, seq);
4043 }
4044 if (i < endIndex) {
4045 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
4046 !Grapheme.isBoundary(Character.codePointBefore(seq, i),
4047 Character.codePointAt(seq, i))) {
4048 return false;
4049 }
4050 } else {
4051 matcher.hitEnd = true;
4052 matcher.requireEnd = true;
4053 }
4054 return next.match(matcher, i, seq);
4055 }
4056 }
4057
4058 /**
4059 * Base class for all Slice nodes
4060 */
4061 static class SliceNode extends Node {
4062 int[] buffer;
4063 SliceNode(int[] buf) {
4064 buffer = buf;
4065 }
4066 boolean study(TreeInfo info) {
4067 info.minLength += buffer.length;
4068 info.maxLength += buffer.length;
4069 return next.study(info);
4070 }
4071 }
4072
4073 /**
4074 * Node class for a case sensitive/BMP-only sequence of literal
4075 * characters.
4076 */
4077 static class Slice extends SliceNode {
4078 Slice(int[] buf) {
|