189 * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
190 * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
191 * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
192 * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
193 * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
194 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
195 *
196 * <tr><th> </th></tr>
197 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
198 *
199 * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
200 * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
201 * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
202 * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
203 * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
204 * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
205 * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
206 * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
207 *
208 * <tr><th> </th></tr>
209 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
210 * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
211 * <td headers="matches">A Latin script character (simple <a href="#ubc">script</a>)</td></tr>
212 * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
213 * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
214 * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
215 * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
216 * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
217 * <td headers="matches">A currency symbol</td></tr>
218 * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
219 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
220 * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
221 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
222 *
223 * <tr><th> </th></tr>
224 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
225 *
226 * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
227 * <td headers="matches">The beginning of a line</td></tr>
228 * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
229 * <td headers="matches">The end of a line</td></tr>
230 * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
231 * <td headers="matches">A word boundary</td></tr>
232 * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
233 * <td headers="matches">A non-word boundary</td></tr>
234 * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
235 * <td headers="matches">The beginning of the input</td></tr>
311 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
312 *
313 * <tr><th> </th></tr>
314 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
315 *
316 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
317 * <td headers="matches">Nothing, but quotes the following character</td></tr>
318 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
319 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
320 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
321 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
322 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
323 *
324 * <tr><th> </th></tr>
325 * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
326 *
327 * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
328 * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
329 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
330 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
331 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
332 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
333 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
334 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
335 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
336 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
337 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
338 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a>
339 * <a href="#COMMENTS">x</a> on - off</td></tr>
340 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
341 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
342 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
343 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
344 * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>
345 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
346 * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>
347 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
348 * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>
349 * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
350 *
351 * </table>
352 *
353 * <hr>
354 *
501 *
502 * <p> A <tt>named-capturing group</tt> is still numbered as described in
503 * <a href="#gnumber">Group number</a>.
504 *
505 * <p> The captured input associated with a group is always the subsequence
506 * that the group most recently matched. If a group is evaluated a second time
507 * because of quantification then its previously-captured value, if any, will
508 * be retained if the second evaluation fails. Matching the string
509 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
510 * group two set to <tt>"b"</tt>. All captured input is discarded at the
511 * beginning of each match.
512 *
513 * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
514 * that do not capture text and do not count towards the group total, or
515 * <i>named-capturing</i> groups.
516 *
517 * <h4> Unicode support </h4>
518 *
519 * <p> This class is in conformance with Level 1 of <a
520 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
521 * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
522 * Canonical Equivalents.
523 *
524 * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
525 * are processed as described in section 3.3 of
526 * <cite>The Java™ Language Specification</cite>.
527 * Such escape sequences are also
528 * implemented directly by the regular-expression parser so that Unicode
529 * escapes can be used in expressions that are read from files or from the
530 * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
531 * while not equal, compile into the same pattern, which matches the character
532 * with hexadecimal value <tt>0x2014</tt>.
533 *
534 * <p> A Unicode character can also be represented in a regular-expression by
535 * using its hexadecimal code point value directly as described in construct
536 * <tt>\x{...}</tt>, for example a supplementary character U+2011F
537 * can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
538 * Unicode escape sequences of the surrogate pair
539 * <tt>\uD840</tt><tt>\uDD1F</tt>.
540 *
541 * <a name="ubc"></a>
542 * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
543 * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
544 * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
545 * does not match if the input has that property.
546 * <p>
547 * Scripts are specified either with the prefix {@code Is}, as in
548 * {@code IsHiragana}, or by using the {@code script} keyword (or its short
549 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
550 * <p>
551 * Blocks are specified with the prefix {@code In}, as in
552 * {@code InMongolian}, or by using the keyword {@code block} (or its short
553 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
554 * <p>
555 * Categories may be specified with the optional prefix {@code Is}:
556 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
557 * letters. Same as scripts and blocks, categories can also be specified
558 * by using the keyword {@code general_category} (or its short form
559 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
560 * <p>
561 * Scripts, blocks and categories can be used both inside and outside of a
562 * character class.
563 * <p> The supported categories are those of
564 * <a href="http://www.unicode.org/unicode/standard/standard.html">
565 * <i>The Unicode Standard</i></a> in the version specified by the
566 * {@link java.lang.Character Character} class. The category names are those
567 * defined in the Standard, both normative and informative.
568 * The script names supported by <code>Pattern</code> are the valid script names
569 * accepted and defined by
570 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
571 * The block names supported by <code>Pattern</code> are the valid block names
572 * accepted and defined by
573 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
574 * <a name="jcc"></a>
575 * <p>Categories that behave like the java.lang.Character
576 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
577 * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
578 * the specified property has the name <tt>java<i>methodname</i></tt>.
579 *
580 * <h4> Comparison to Perl 5 </h4>
581 *
582 * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
583 * with ordered alternation as occurs in Perl 5.
584 *
585 * <p> Perl constructs not supported by this class: </p>
586 *
587 * <ul>
588 *
589 * <li><p> The conditional constructs <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and
590 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
591 * </p></li>
592 *
593 * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
594 * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
595 *
779 * <p> Specifying this flag may impose a performance penalty. </p>
780 */
781 public static final int UNICODE_CASE = 0x40;
782
783 /**
784 * Enables canonical equivalence.
785 *
786 * <p> When this flag is specified then two characters will be considered
787 * to match if, and only if, their full canonical decompositions match.
788 * The expression <tt>"a\u030A"</tt>, for example, will match the
789 * string <tt>"\u00E5"</tt> when this flag is specified. By default,
790 * matching does not take canonical equivalence into account.
791 *
792 * <p> There is no embedded flag character for enabling canonical
793 * equivalence.
794 *
795 * <p> Specifying this flag may impose a performance penalty. </p>
796 */
// NOTE(review): the inline-flag parsers in this file do handle the letter 'c'
// by setting/clearing CANON_EQ, which disagrees with the "no embedded flag
// character" statement above -- confirm whether that is intentionally undocumented.
797 public static final int CANON_EQ = 0x80;
798
799 /* Pattern has only two serialized components: The pattern string
800 * and the flags, which are all that is needed to recompile the pattern
801 * when it is deserialized.
802 */
803
804 /** Fixed serialization version identifier; the value is the serialVersionUID from Merlin b59, kept for interoperability. */
805 private static final long serialVersionUID = 5073258162644648461L;
806
807 /**
808 * The original regular-expression pattern string, stored exactly as it was
808 * handed to the constructor and returned unchanged by {@link #pattern()}.
809 *
810 * @serial
811 */
812 private String pattern;
813
814 /**
815 * The original pattern flags.
816 *
817 * @serial
818 */
901 * The expression to be compiled
902 *
903 * @throws PatternSyntaxException
904 * If the expression's syntax is invalid
905 */
906 public static Pattern compile(String regex) {
907 return new Pattern(regex, 0);
908 }
909
910 /**
911 * Compiles the given regular expression into a pattern with the given
912 * flags. </p>
913 *
914 * @param regex
915 * The expression to be compiled
916 *
917 * @param flags
918 * Match flags, a bit mask that may include
919 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
920 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
921 * {@link #LITERAL} and {@link #COMMENTS}
922 *
923 * @throws IllegalArgumentException
924 * If bit values other than those corresponding to the defined
925 * match flags are set in <tt>flags</tt>
926 *
927 * @throws PatternSyntaxException
928 * If the expression's syntax is invalid
929 */
930 public static Pattern compile(String regex, int flags) {
931 return new Pattern(regex, flags);
932 }
933
934 /**
935 * Returns the regular expression from which this pattern was compiled.
936 *
937 *
938 * @return The source of this pattern
939 */
940 public String pattern() {
941 return pattern;
1192
1193 // if length > 0, the Pattern is lazily compiled
1194 compiled = false;
1195 if (pattern.length() == 0) {
1196 root = new Start(lastAccept);
1197 matchRoot = lastAccept;
1198 compiled = true;
1199 }
1200 }
1201
/**
 * Sole constructor behind every Pattern instance. A pattern is fully
 * described by its source string and its match flags, so those are the
 * only inputs.
 *
 * <p> A non-empty pattern string is handed to {@code compile()}. An empty
 * pattern string skips compilation and yields an object tree made of just
 * a Start node placed in front of {@code lastAccept}.
 *
 * @param  p
 *         The regular-expression source string
 *
 * @param  f
 *         The match flags bit mask
 */
private Pattern(String p, int f) {
    pattern = p;
    flags = f;

    // Reset group index count
    capturingGroupCount = 1;
    localCount = 0;

    if (pattern.length() <= 0) {
        // Nothing to parse: build the trivial tree directly.
        root = new Start(lastAccept);
        matchRoot = lastAccept;
    } else {
        compile();
    }
}
1223
1224 /**
1225 * The pattern is converted to normalized form D and then a pure group
1226 * is constructed to match canonical equivalences of the characters.
1227 */
1228 private void normalize() {
1229 boolean inCharClass = false;
1230 int lastCodePoint = -1;
1231
2147 case '1':
2148 case '2':
2149 case '3':
2150 case '4':
2151 case '5':
2152 case '6':
2153 case '7':
2154 case '8':
2155 case '9':
2156 if (inclass) break;
2157 if (create) {
2158 root = ref((ch - '0'));
2159 }
2160 return -1;
2161 case 'A':
2162 if (inclass) break;
2163 if (create) root = new Begin();
2164 return -1;
2165 case 'B':
2166 if (inclass) break;
2167 if (create) root = new Bound(Bound.NONE);
2168 return -1;
2169 case 'C':
2170 break;
2171 case 'D':
2172 if (create) root = new Ctype(ASCII.DIGIT).complement();
2173 return -1;
2174 case 'E':
2175 case 'F':
2176 break;
2177 case 'G':
2178 if (inclass) break;
2179 if (create) root = new LastMatch();
2180 return -1;
2181 case 'H':
2182 case 'I':
2183 case 'J':
2184 case 'K':
2185 case 'L':
2186 case 'M':
2187 case 'N':
2188 case 'O':
2189 case 'P':
2190 case 'Q':
2191 case 'R':
2192 break;
2193 case 'S':
2194 if (create) root = new Ctype(ASCII.SPACE).complement();
2195 return -1;
2196 case 'T':
2197 case 'U':
2198 case 'V':
2199 break;
2200 case 'W':
2201 if (create) root = new Ctype(ASCII.WORD).complement();
2202 return -1;
2203 case 'X':
2204 case 'Y':
2205 break;
2206 case 'Z':
2207 if (inclass) break;
2208 if (create) {
2209 if (has(UNIX_LINES))
2210 root = new UnixDollar(false);
2211 else
2212 root = new Dollar(false);
2213 }
2214 return -1;
2215 case 'a':
2216 return '\007';
2217 case 'b':
2218 if (inclass) break;
2219 if (create) root = new Bound(Bound.BOTH);
2220 return -1;
2221 case 'c':
2222 return c();
2223 case 'd':
2224 if (create) root = new Ctype(ASCII.DIGIT);
2225 return -1;
2226 case 'e':
2227 return '\033';
2228 case 'f':
2229 return '\f';
2230 case 'g':
2231 case 'h':
2232 case 'i':
2233 case 'j':
2234 break;
2235 case 'k':
2236 if (inclass)
2237 break;
2238 if (read() != '<')
2239 throw error("\\k is not followed by '<' for named capturing group");
2240 String name = groupname(read());
2241 if (!namedGroups().containsKey(name))
2242 throw error("(named capturing group <"+ name+"> does not exit");
2243 if (create) {
2244 if (has(CASE_INSENSITIVE))
2245 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2246 else
2247 root = new BackRef(namedGroups().get(name));
2248 }
2249 return -1;
2250 case 'l':
2251 case 'm':
2252 break;
2253 case 'n':
2254 return '\n';
2255 case 'o':
2256 case 'p':
2257 case 'q':
2258 break;
2259 case 'r':
2260 return '\r';
2261 case 's':
2262 if (create) root = new Ctype(ASCII.SPACE);
2263 return -1;
2264 case 't':
2265 return '\t';
2266 case 'u':
2267 return u();
2268 case 'v':
2269 return '\013';
2270 case 'w':
2271 if (create) root = new Ctype(ASCII.WORD);
2272 return -1;
2273 case 'x':
2274 return x();
2275 case 'y':
2276 break;
2277 case 'z':
2278 if (inclass) break;
2279 if (create) root = new End();
2280 return -1;
2281 default:
2282 return ch;
2283 }
2284 throw error("Illegal/unsupported escape sequence");
2285 }
2286
2287 /**
2288 * Parse a character class, and return the node that matches it.
2289 *
2290 * Consumes a ] on the way out if consume is true. Usually consume
2291 * is true except for the case of [abc&&def] where def is a separate
2473
2474 private int single() {
2475 int ch = peek();
2476 switch (ch) {
2477 case '\\':
2478 return escape(true, false);
2479 default:
2480 next();
2481 return ch;
2482 }
2483 }
2484
2485 /**
2486 * Parses a Unicode character family and returns its representative node.
2487 */
2488 private CharProperty family(boolean singleLetter,
2489 boolean maybeComplement)
2490 {
2491 next();
2492 String name;
2493 CharProperty node;
2494
2495 if (singleLetter) {
2496 int c = temp[cursor];
2497 if (!Character.isSupplementaryCodePoint(c)) {
2498 name = String.valueOf((char)c);
2499 } else {
2500 name = new String(temp, cursor, 1);
2501 }
2502 read();
2503 } else {
2504 int i = cursor;
2505 mark('}');
2506 while(read() != '}') {
2507 }
2508 mark('\000');
2509 int j = cursor;
2510 if (j > patternLength)
2511 throw error("Unclosed character family");
2512 if (i + 1 >= j)
2513 throw error("Empty character family");
2519 // property construct \p{name=value}
2520 String value = name.substring(i + 1);
2521 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2522 if ("sc".equals(name) || "script".equals(name)) {
2523 node = unicodeScriptPropertyFor(value);
2524 } else if ("blk".equals(name) || "block".equals(name)) {
2525 node = unicodeBlockPropertyFor(value);
2526 } else if ("gc".equals(name) || "general_category".equals(name)) {
2527 node = charPropertyNodeFor(value);
2528 } else {
2529 throw error("Unknown Unicode property {name=<" + name + ">, "
2530 + "value=<" + value + ">}");
2531 }
2532 } else {
2533 if (name.startsWith("In")) {
2534 // \p{inBlockName}
2535 node = unicodeBlockPropertyFor(name.substring(2));
2536 } else if (name.startsWith("Is")) {
2537 // \p{isGeneralCategory} and \p{isScriptName}
2538 name = name.substring(2);
2539 node = CharPropertyNames.charPropertyFor(name);
2540 if (node == null)
2541 node = unicodeScriptPropertyFor(name);
2542 } else {
2543 node = charPropertyNodeFor(name);
2544 }
2545 }
2546 if (maybeComplement) {
2547 if (node instanceof Category || node instanceof Block)
2548 hasSupplementary = true;
2549 node = node.complement();
2550 }
2551 return node;
2552 }
2553
2554
2555 /**
2556 * Returns a CharProperty matching all characters belonging to
2557 * a UnicodeScript.
2558 */
2559 private CharProperty unicodeScriptPropertyFor(String name) {
2560 final Character.UnicodeScript script;
2561 try {
2562 script = Character.UnicodeScript.forName(name);
2805 flags |= CASE_INSENSITIVE;
2806 break;
2807 case 'm':
2808 flags |= MULTILINE;
2809 break;
2810 case 's':
2811 flags |= DOTALL;
2812 break;
2813 case 'd':
2814 flags |= UNIX_LINES;
2815 break;
2816 case 'u':
2817 flags |= UNICODE_CASE;
2818 break;
2819 case 'c':
2820 flags |= CANON_EQ;
2821 break;
2822 case 'x':
2823 flags |= COMMENTS;
2824 break;
2825 case '-': // subFlag then fall through
2826 ch = next();
2827 subFlag();
2828 default:
2829 return;
2830 }
2831 ch = next();
2832 }
2833 }
2834
2835 /**
2836 * Parses the second part of inlined match flags and turns off
2837 * flags appropriately.
2838 */
2839 private void subFlag() {
2840 int ch = peek();
2841 for (;;) {
2842 switch (ch) {
2843 case 'i':
2844 flags &= ~CASE_INSENSITIVE;
2845 break;
2846 case 'm':
2847 flags &= ~MULTILINE;
2848 break;
2849 case 's':
2850 flags &= ~DOTALL;
2851 break;
2852 case 'd':
2853 flags &= ~UNIX_LINES;
2854 break;
2855 case 'u':
2856 flags &= ~UNICODE_CASE;
2857 break;
2858 case 'c':
2859 flags &= ~CANON_EQ;
2860 break;
2861 case 'x':
2862 flags &= ~COMMENTS;
2863 break;
2864 default:
2865 return;
2866 }
2867 ch = next();
2868 }
2869 }
2870
2871 static final int MAX_REPS = 0x7FFFFFFF; // largest positive int value (same bits as Integer.MAX_VALUE)
2872
// NOTE(review): GREEDY, LAZY, POSSESSIVE and INDEPENDENT look like mode
// selectors consumed by the repetition/group handling that follows; their
// users are outside this view -- confirm before relying on the numeric values.
2873 static final int GREEDY = 0;
2874
2875 static final int LAZY = 1;
2876
2877 static final int POSSESSIVE = 2;
2878
2879 static final int INDEPENDENT = 3;
2880
2881 /**
2882 * Processes repetition. If the next character peeked is a quantifier
2883 * then new nodes must be appended to handle the repetition.
3647 Script(Character.UnicodeScript script) {
3648 this.script = script;
3649 }
3650 boolean isSatisfiedBy(int ch) {
3651 return script == Character.UnicodeScript.of(ch);
3652 }
3653 }
3654
3655 /**
3656 * Node class that matches a Unicode category.
3657 */
3658 static final class Category extends CharProperty {
3659 final int typeMask;
3660 Category(int typeMask) { this.typeMask = typeMask; }
3661 boolean isSatisfiedBy(int ch) {
3662 return (typeMask & (1 << Character.getType(ch))) != 0;
3663 }
3664 }
3665
3666 /**
3667 * Node class that matches a POSIX type.
3668 */
3669 static final class Ctype extends BmpCharProperty {
3670 final int ctype;
3671 Ctype(int ctype) { this.ctype = ctype; }
3672 boolean isSatisfiedBy(int ch) {
3673 return ch < 128 && ASCII.isType(ch, ctype);
3674 }
3675 }
3676
3677 /**
3678 * Base class for all Slice nodes
3679 */
3680 static class SliceNode extends Node {
3681 int[] buffer;
3682 SliceNode(int[] buf) {
3683 buffer = buf;
3684 }
3685 boolean study(TreeInfo info) {
3686 info.minLength += buffer.length;
5008 private static CharProperty setDifference(final CharProperty lhs,
5009 final CharProperty rhs) {
5010 return new CharProperty() {
5011 boolean isSatisfiedBy(int ch) {
5012 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
5013 }
5014
5015 /**
5016 * Handles word boundaries. Includes a field to allow this one class to
5017 * deal with the different types of word boundaries we can match. The word
5018 * characters include underscores, letters, and digits. Non spacing marks
5019 * can are also part of a word if they have a base character, otherwise
5020 * they are ignored for purposes of finding word boundaries.
5021 */
5022 static final class Bound extends Node {
5023 static int LEFT = 0x1;
5024 static int RIGHT= 0x2;
5025 static int BOTH = 0x3;
5026 static int NONE = 0x4;
5027 int type;
5028 Bound(int n) {
5029 type = n;
5030 }
5031 int check(Matcher matcher, int i, CharSequence seq) {
5032 int ch;
5033 boolean left = false;
5034 int startIndex = matcher.from;
5035 int endIndex = matcher.to;
5036 if (matcher.transparentBounds) {
5037 startIndex = 0;
5038 endIndex = matcher.getTextLength();
5039 }
5040 if (i > startIndex) {
5041 ch = Character.codePointBefore(seq, i);
5042 left = (ch == '_' || Character.isLetterOrDigit(ch) ||
5043 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5044 && hasBaseCharacter(matcher, i-1, seq)));
5045 }
5046 boolean right = false;
5047 if (i < endIndex) {
5048 ch = Character.codePointAt(seq, i);
5049 right = (ch == '_' || Character.isLetterOrDigit(ch) ||
5050 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5051 && hasBaseCharacter(matcher, i, seq)));
5052 } else {
5053 // Tried to access char past the end
5054 matcher.hitEnd = true;
5055 // The addition of another char could wreck a boundary
5056 matcher.requireEnd = true;
5057 }
5058 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
5059 }
5060 boolean match(Matcher matcher, int i, CharSequence seq) {
5061 return (check(matcher, i, seq) & type) > 0
5062 && next.match(matcher, i, seq);
5063 }
5064 }
5065
5066 /**
5067 * Non spacing marks only count as word characters in bounds calculations
5068 * if they have a base character.
5069 */
5411 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5412 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5413 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5414 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5415 defRange("Digit", '0', '9'); // Numeric characters
5416 defCtype("Graph", ASCII.GRAPH); // printable and visible
5417 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5418 defRange("Print", 0x20, 0x7E); // Printable characters
5419 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5420 defCtype("Space", ASCII.SPACE); // Space characters
5421 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5422 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5423
5424 // Java character properties, defined by methods in Character.java
5425 defClone("javaLowerCase", new CloneableProperty() {
5426 boolean isSatisfiedBy(int ch) {
5427 return Character.isLowerCase(ch);}});
5428 defClone("javaUpperCase", new CloneableProperty() {
5429 boolean isSatisfiedBy(int ch) {
5430 return Character.isUpperCase(ch);}});
5431 defClone("javaTitleCase", new CloneableProperty() {
5432 boolean isSatisfiedBy(int ch) {
5433 return Character.isTitleCase(ch);}});
5434 defClone("javaDigit", new CloneableProperty() {
5435 boolean isSatisfiedBy(int ch) {
5436 return Character.isDigit(ch);}});
5437 defClone("javaDefined", new CloneableProperty() {
5438 boolean isSatisfiedBy(int ch) {
5439 return Character.isDefined(ch);}});
5440 defClone("javaLetter", new CloneableProperty() {
5441 boolean isSatisfiedBy(int ch) {
5442 return Character.isLetter(ch);}});
5443 defClone("javaLetterOrDigit", new CloneableProperty() {
5444 boolean isSatisfiedBy(int ch) {
5445 return Character.isLetterOrDigit(ch);}});
5446 defClone("javaJavaIdentifierStart", new CloneableProperty() {
5447 boolean isSatisfiedBy(int ch) {
5448 return Character.isJavaIdentifierStart(ch);}});
5449 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5450 boolean isSatisfiedBy(int ch) {
|
189 * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
190 * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
191 * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
192 * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
193 * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
194 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
195 *
196 * <tr><th> </th></tr>
197 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
198 *
199 * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
200 * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
201 * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
202 * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
203 * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
204 * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
205 * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
206 * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
207 *
208 * <tr><th> </th></tr>
209 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
210 * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
211 * <td headers="matches">A Latin script character (<a href="#usc">script</a>)</td></tr>
212 * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
213 * <td headers="matches">A character in the Greek block (<a href="#ubc">block</a>)</td></tr>
214 * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
215 * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
216 * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
217 * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
218 * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
219 * <td headers="matches">A currency symbol</td></tr>
220 * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
221 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
222 * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
223 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
224 *
225 * <tr><th> </th></tr>
226 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
227 *
228 * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
229 * <td headers="matches">The beginning of a line</td></tr>
230 * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
231 * <td headers="matches">The end of a line</td></tr>
232 * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
233 * <td headers="matches">A word boundary</td></tr>
234 * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
235 * <td headers="matches">A non-word boundary</td></tr>
236 * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
237 * <td headers="matches">The beginning of the input</td></tr>
313 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
314 *
315 * <tr><th> </th></tr>
316 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
317 *
318 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
319 * <td headers="matches">Nothing, but quotes the following character</td></tr>
320 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
321 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
322 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
323 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
324 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
325 *
326 * <tr><th> </th></tr>
327 * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
328 *
329 * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
330 * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
331 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
332 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
333 * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU) </tt></td>
334 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
335 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
336 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
337 * on - off</td></tr>
338 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
339 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
340 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
341 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a>
342 * <a href="#COMMENTS">x</a> on - off</td></tr>
343 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
344 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
345 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
346 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
347 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
348 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
349 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
350 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
351 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
352 * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
353 *
354 * </table>
355 *
356 * <hr>
357 *
504 *
505 * <p> A <tt>named-capturing group</tt> is still numbered as described in
506 * <a href="#gnumber">Group number</a>.
507 *
508 * <p> The captured input associated with a group is always the subsequence
509 * that the group most recently matched. If a group is evaluated a second time
510 * because of quantification then its previously-captured value, if any, will
511 * be retained if the second evaluation fails. Matching the string
512 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
513 * group two set to <tt>"b"</tt>. All captured input is discarded at the
514 * beginning of each match.
515 *
516 * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
517 * that do not capture text and do not count towards the group total, or
518 * <i>named-capturing</i> groups.
519 *
520 * <h4> Unicode support </h4>
521 *
522 * <p> This class is in conformance with Level 1 of <a
523 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
524 * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
525 * Canonical Equivalents.
526 * <p>
527 * <b>Unicode escape sequences</b> such as <tt>\u2014</tt> in Java source code
528 * are processed as described in section 3.3 of
529 * <cite>The Java™ Language Specification</cite>.
530 * Such escape sequences are also implemented directly by the regular-expression
531 * parser so that Unicode escapes can be used in expressions that are read from
532 * files or from the keyboard. Thus the strings <tt>"\u2014"</tt> and
533 * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
534 * matches the character with hexadecimal value <tt>0x2014</tt>.
535 * <p>
536 * A Unicode character can also be represented in a regular-expression by
537 * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
538 * <tt>\x{...}</tt>, for example a supplementary character U+2011F
539 * can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
540 * Unicode escape sequences of the surrogate pair
541 * <tt>\uD840</tt><tt>\uDD1F</tt>.
542 * <p>
543 * Unicode scripts, blocks, categories and binary properties are written with
544 * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
545 * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
546 * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
547 * does not match if the input has that property.
548 * <p>
549 * Scripts, blocks, categories and binary properties can be used both inside
550 * and outside of a character class.
551 * <a name="usc"></a>
552 * <p>
553 * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
554 * {@code IsHiragana}, or by using the {@code script} keyword (or its short
555 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
556 * <p>
557 * The script names supported by <code>Pattern</code> are the valid script names
558 * accepted and defined by
559 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
560 * <a name="ubc"></a>
561 * <p>
562 * <b>Blocks</b> are specified with the prefix {@code In}, as in
563 * {@code InMongolian}, or by using the keyword {@code block} (or its short
564 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
565 * <p>
566 * The block names supported by <code>Pattern</code> are the valid block names
567 * accepted and defined by
568 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
569 * <p>
570 * <a name="ucc"></a>
571 * <b>Categories</b> may be specified with the optional prefix {@code Is}:
572 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
573 * letters. As with scripts and blocks, categories can also be specified
574 * by using the keyword {@code general_category} (or its short form
575 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
576 * <p>
577 * The supported categories are those of
578 * <a href="http://www.unicode.org/unicode/standard/standard.html">
579 * <i>The Unicode Standard</i></a> in the version specified by the
580 * {@link java.lang.Character Character} class. The category names are those
581 * defined in the Standard, both normative and informative.
582 * <p>
583 * <a name="ubpc"></a>
584 * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
585 * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
586 * are
587 * <ul>
588 * <li> Alphabetic
589 * <li> Ideographic
590 * <li> Letter
591 * <li> Lowercase
592 * <li> Uppercase
593 * <li> Titlecase
594 * <li> Punctuation
595 * <li> Control
596 * <li> White_Space
597 * <li> Digit
598 * <li> Hex_Digit
599 * <li> Noncharacter_Code_Point
600 * <li> Assigned
601 * </ul>
602
603
604 * <p>
605 * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
606 * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
607 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
608 * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
609 * <p>
610 * <table border="0" cellpadding="1" cellspacing="0"
611 * summary="predefined and posix character classes in Unicode mode">
612 * <tr align="left">
613 * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
614 * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
615 *</tr>
616 * <tr><td><tt>\p{Lower}</tt></td>
617 * <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
618 * <tr><td><tt>\p{Upper}</tt></td>
619 * <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
620 * <tr><td><tt>\p{ASCII}</tt></td>
621 * <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
622 * <tr><td><tt>\p{Alpha}</tt></td>
623 * <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
624 * <tr><td><tt>\p{Digit}</tt></td>
625 * <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
626 * <tr><td><tt>\p{Alnum}</tt></td>
627 * <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
628 * <tr><td><tt>\p{Punct}</tt></td>
629 * <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
630 * <tr><td><tt>\p{Graph}</tt></td>
631 * <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
632 * <tr><td><tt>\p{Print}</tt></td>
633 * <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
634 * <tr><td><tt>\p{Blank}</tt></td>
635 * <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
636 * <tr><td><tt>\p{Cntrl}</tt></td>
637 * <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
638 * <tr><td><tt>\p{XDigit}</tt></td>
639 * <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
640 * <tr><td><tt>\p{Space}</tt></td>
641 * <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
642 * <tr><td><tt>\d</tt></td>
643 * <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
644 * <tr><td><tt>\D</tt></td>
645 * <td>A non-digit: <tt>[^\d]</tt></td></tr>
646 * <tr><td><tt>\s</tt></td>
647 * <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
648 * <tr><td><tt>\S</tt></td>
649 * <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
650 * <tr><td><tt>\w</tt></td>
651 * <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
652 * <tr><td><tt>\W</tt></td>
653 * <td>A non-word character: <tt>[^\w]</tt></td></tr>
654 * </table>
655 * <p>
656 * <a name="jcc"></a>
657 * Categories that behave like the java.lang.Character
658 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
659 * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
660 * the specified property has the name <tt>java<i>methodname</i></tt>.
661 *
662 * <h4> Comparison to Perl 5 </h4>
663 *
664 * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
665 * with ordered alternation as occurs in Perl 5.
666 *
667 * <p> Perl constructs not supported by this class: </p>
668 *
669 * <ul>
670 *
671 * <li><p> The conditional constructs <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and
672 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
673 * </p></li>
674 *
675 * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
676 * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
677 *
861 * <p> Specifying this flag may impose a performance penalty. </p>
862 */
863 public static final int UNICODE_CASE = 0x40;
864
865 /**
866 * Enables canonical equivalence.
867 *
868 * <p> When this flag is specified then two characters will be considered
869 * to match if, and only if, their full canonical decompositions match.
870 * The expression <tt>"a\u030A"</tt>, for example, will match the
871 * string <tt>"\u00E5"</tt> when this flag is specified. By default,
872 * matching does not take canonical equivalence into account.
873 *
874 * <p> There is no embedded flag character for enabling canonical
875 * equivalence.
876 *
877 * <p> Specifying this flag may impose a performance penalty. </p>
878 */
879 public static final int CANON_EQ = 0x80; // NOTE(review): subFlag() still handles an inline 'c' for this bit; confirm that it is intentionally undocumented
880
881 /**
882 * Enables the Unicode version of <i>Predefined character classes</i> and
883 * <i>POSIX character classes</i>.
884 *
885 * <p> When this flag is specified then the (otherwise US-ASCII only)
886 * <i>Predefined character classes</i> and <i>POSIX character classes</i>
887 * are in conformance with
888 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
889 * Standard #18: Unicode Regular Expressions</i></a>
890 * <i>Annex C: Compatibility Properties</i>.
891 * <p>
892 * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
893 * flag expression <tt>(?U)</tt>.
894 * <p>
895 * The flag implies {@link #UNICODE_CASE}, that is, it enables Unicode-aware case
896 * folding.
897 * <p>
898 * Specifying this flag may impose a performance penalty. </p>
899 * @since 1.7
900 */
901 public static final int UNICODE_CHARACTER_CLASS = 0x100;
902
903 /* Pattern has only two serialized components: the pattern string
904 * and the flags, which are all that is needed to recompile the pattern
905 * when it is deserialized.
906 */
907
908 /** Fixed serialVersionUID, taken from Merlin b59, for interoperability. */
909 private static final long serialVersionUID = 5073258162644648461L;
910
911 /**
912 * The original regular-expression pattern string, as handed to the constructor.
913 *
914 * @serial
915 */
916 private String pattern;
917
918 /**
919 * The original pattern flags.
920 *
921 * @serial
922 */
1005 * The expression to be compiled
1006 *
1007 * @throws PatternSyntaxException
1008 * If the expression's syntax is invalid
1009 */
1010 public static Pattern compile(String regex) {
1011 return new Pattern(regex, 0);
1012 }
1013
1014 /**
1015 * Compiles the given regular expression into a pattern with the given
1016 * flags.
1017 *
1018 * @param regex
1019 * The expression to be compiled
1020 *
1021 * @param flags
1022 * Match flags, a bit mask that may include
1023 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
1024 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
1025 * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
1026 * and {@link #COMMENTS}
1027 * @return The pattern compiled from <tt>regex</tt> with <tt>flags</tt>
1028 * @throws IllegalArgumentException
1029 * If bit values other than those corresponding to the defined
1030 * match flags are set in <tt>flags</tt>
1031 *
1032 * @throws PatternSyntaxException
1033 * If the expression's syntax is invalid
1034 */
1035 public static Pattern compile(String regex, int flags) {
1036 return new Pattern(regex, flags); // NOTE(review): no flag-bit check here or in the private constructor; confirm where the documented IllegalArgumentException is raised
1037 }
1038
1039 /**
1040 * Returns the regular expression from which this pattern was compiled.
1041 *
1042 *
1043 * @return The source of this pattern
1044 */
1045 public String pattern() {
1046 return pattern;
1297
1298 // if length > 0, the Pattern is lazily compiled
1299 compiled = false;
1300 if (pattern.length() == 0) {
1301 root = new Start(lastAccept);
1302 matchRoot = lastAccept;
1303 compiled = true;
1304 }
1305 }
1306
1307 /**
1308 * Private constructor used to create all Patterns. A Pattern is fully
1309 * described by its pattern string and its match flags. When the pattern
1310 * string is empty nothing is compiled and the object tree consists of
1311 * only a Start node and a LastNode node.
1312 */
1313 private Pattern(String p, int f) {
1314 pattern = p;
1315
1316 // UNICODE_CHARACTER_CLASS always switches UNICODE_CASE on as well
1317 flags = ((f & UNICODE_CHARACTER_CLASS) == 0) ? f : (f | UNICODE_CASE);
1318
1319 // Start the group counters from their initial values
1320 capturingGroupCount = 1;
1321 localCount = 0;
1322
1323 if (pattern.length() == 0) {
1324 // Nothing to parse: the tree is just Start -> lastAccept
1325 root = new Start(lastAccept);
1326 matchRoot = lastAccept;
1327 } else {
1328 // Non-empty pattern strings are compiled right away
1329 compile();
1330 }
1331 }
1332
1333 /**
1334 * The pattern is converted to normalization form D (NFD) and then a pure group
1335 * is constructed to match canonical equivalences of the characters.
1336 */
1337 private void normalize() {
1338 boolean inCharClass = false;
1339 int lastCodePoint = -1;
1340
2256 case '1':
2257 case '2':
2258 case '3':
2259 case '4':
2260 case '5':
2261 case '6':
2262 case '7':
2263 case '8':
2264 case '9':
2265 if (inclass) break;
2266 if (create) {
2267 root = ref((ch - '0'));
2268 }
2269 return -1;
2270 case 'A':
2271 if (inclass) break;
2272 if (create) root = new Begin();
2273 return -1;
2274 case 'B':
2275 if (inclass) break;
2276 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2277 return -1;
2278 case 'C':
2279 break;
2280 case 'D':
2281 if (create) root = has(UNICODE_CHARACTER_CLASS)
2282 ? new Utype(UnicodeProp.DIGIT).complement()
2283 : new Ctype(ASCII.DIGIT).complement();
2284 return -1;
2285 case 'E':
2286 case 'F':
2287 break;
2288 case 'G':
2289 if (inclass) break;
2290 if (create) root = new LastMatch();
2291 return -1;
2292 case 'H':
2293 case 'I':
2294 case 'J':
2295 case 'K':
2296 case 'L':
2297 case 'M':
2298 case 'N':
2299 case 'O':
2300 case 'P':
2301 case 'Q':
2302 case 'R':
2303 break;
2304 case 'S':
2305 if (create) root = has(UNICODE_CHARACTER_CLASS)
2306 ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2307 : new Ctype(ASCII.SPACE).complement();
2308 return -1;
2309 case 'T':
2310 case 'U':
2311 case 'V':
2312 break;
2313 case 'W':
2314 if (create) root = has(UNICODE_CHARACTER_CLASS)
2315 ? new Utype(UnicodeProp.WORD).complement()
2316 : new Ctype(ASCII.WORD).complement();
2317 return -1;
2318 case 'X':
2319 case 'Y':
2320 break;
2321 case 'Z':
2322 if (inclass) break;
2323 if (create) {
2324 if (has(UNIX_LINES))
2325 root = new UnixDollar(false);
2326 else
2327 root = new Dollar(false);
2328 }
2329 return -1;
2330 case 'a':
2331 return '\007';
2332 case 'b':
2333 if (inclass) break;
2334 if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2335 return -1;
2336 case 'c':
2337 return c();
2338 case 'd':
2339 if (create) root = has(UNICODE_CHARACTER_CLASS)
2340 ? new Utype(UnicodeProp.DIGIT)
2341 : new Ctype(ASCII.DIGIT);
2342 return -1;
2343 case 'e':
2344 return '\033';
2345 case 'f':
2346 return '\f';
2347 case 'g':
2348 case 'h':
2349 case 'i':
2350 case 'j':
2351 break;
2352 case 'k':
2353 if (inclass)
2354 break;
2355 if (read() != '<')
2356 throw error("\\k is not followed by '<' for named capturing group");
2357 String name = groupname(read());
2358 if (!namedGroups().containsKey(name))
2359 throw error("(named capturing group <"+ name+"> does not exit");
2360 if (create) {
2361 if (has(CASE_INSENSITIVE))
2362 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2363 else
2364 root = new BackRef(namedGroups().get(name));
2365 }
2366 return -1;
2367 case 'l':
2368 case 'm':
2369 break;
2370 case 'n':
2371 return '\n';
2372 case 'o':
2373 case 'p':
2374 case 'q':
2375 break;
2376 case 'r':
2377 return '\r';
2378 case 's':
2379 if (create) root = has(UNICODE_CHARACTER_CLASS)
2380 ? new Utype(UnicodeProp.WHITE_SPACE)
2381 : new Ctype(ASCII.SPACE);
2382 return -1;
2383 case 't':
2384 return '\t';
2385 case 'u':
2386 return u();
2387 case 'v':
2388 return '\013';
2389 case 'w':
2390 if (create) root = has(UNICODE_CHARACTER_CLASS)
2391 ? new Utype(UnicodeProp.WORD)
2392 : new Ctype(ASCII.WORD);
2393 return -1;
2394 case 'x':
2395 return x();
2396 case 'y':
2397 break;
2398 case 'z':
2399 if (inclass) break;
2400 if (create) root = new End();
2401 return -1;
2402 default:
2403 return ch;
2404 }
2405 throw error("Illegal/unsupported escape sequence");
2406 }
2407
2408 /**
2409 * Parse a character class, and return the node that matches it.
2410 *
2411 * Consumes a ] on the way out if consume is true. Usually consume
2412 * is true except for the case of [abc&&def] where def is a separate
2594
2595 private int single() {
2596 int ch = peek();
2597 switch (ch) {
2598 case '\\':
2599 return escape(true, false);
2600 default:
2601 next();
2602 return ch;
2603 }
2604 }
2605
2606 /**
2607 * Parses a Unicode character family name and returns the node that matches it; singleLetter
2608 * selects the one-code-point name form (no braces), maybeComplement returns the complemented node. */
2609 private CharProperty family(boolean singleLetter,
2610 boolean maybeComplement)
2611 {
2612 next();
2613 String name;
2614 CharProperty node = null;
2615
2616 if (singleLetter) {
2617 int c = temp[cursor];
2618 if (!Character.isSupplementaryCodePoint(c)) {
2619 name = String.valueOf((char)c);
2620 } else {
2621 name = new String(temp, cursor, 1); // assumes temp is the int[] code point buffer (String(int[], offset, count)) - TODO confirm
2622 }
2623 read();
2624 } else {
2625 int i = cursor;
2626 mark('}');
2627 while(read() != '}') {
2628 }
2629 mark('\000');
2630 int j = cursor;
2631 if (j > patternLength)
2632 throw error("Unclosed character family");
2633 if (i + 1 >= j)
2634 throw error("Empty character family");
2640 // property construct \p{name=value}
2641 String value = name.substring(i + 1);
2642 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); // keyword is compared in lower case; value keeps its case
2643 if ("sc".equals(name) || "script".equals(name)) {
2644 node = unicodeScriptPropertyFor(value);
2645 } else if ("blk".equals(name) || "block".equals(name)) {
2646 node = unicodeBlockPropertyFor(value);
2647 } else if ("gc".equals(name) || "general_category".equals(name)) {
2648 node = charPropertyNodeFor(value);
2649 } else {
2650 throw error("Unknown Unicode property {name=<" + name + ">, "
2651 + "value=<" + value + ">}");
2652 }
2653 } else {
2654 if (name.startsWith("In")) {
2655 // \p{InBlockName}: the text after the "In" prefix is looked up as a Unicode block
2656 node = unicodeBlockPropertyFor(name.substring(2));
2657 } else if (name.startsWith("Is")) {
2658 // \p{IsName}: tried in order as a UnicodeProp, a named char property (category), then a script
2659 name = name.substring(2);
2660 UnicodeProp uprop = UnicodeProp.forName(name);
2661 if (uprop != null)
2662 node = new Utype(uprop);
2663 if (node == null)
2664 node = CharPropertyNames.charPropertyFor(name);
2665 if (node == null)
2666 node = unicodeScriptPropertyFor(name);
2667 } else {
2668 if (has(UNICODE_CHARACTER_CLASS)) { // Unicode mode: a POSIX name is first looked up as a UnicodeProp
2669 UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
2670 if (uprop != null)
2671 node = new Utype(uprop);
2672 }
2673 if (node == null)
2674 node = charPropertyNodeFor(name);
2675 }
2676 }
2677 if (maybeComplement) {
2678 if (node instanceof Category || node instanceof Block)
2679 hasSupplementary = true;
2680 node = node.complement();
2681 }
2682 return node;
2683 }
2684
2685
2686 /**
2687 * Returns a CharProperty matching all characters belonging to
2688 * a UnicodeScript.
2689 */
2690 private CharProperty unicodeScriptPropertyFor(String name) {
2691 final Character.UnicodeScript script;
2692 try {
2693 script = Character.UnicodeScript.forName(name);
2936 flags |= CASE_INSENSITIVE;
2937 break;
2938 case 'm':
2939 flags |= MULTILINE;
2940 break;
2941 case 's':
2942 flags |= DOTALL;
2943 break;
2944 case 'd':
2945 flags |= UNIX_LINES;
2946 break;
2947 case 'u':
2948 flags |= UNICODE_CASE;
2949 break;
2950 case 'c':
2951 flags |= CANON_EQ;
2952 break;
2953 case 'x':
2954 flags |= COMMENTS;
2955 break;
2956 case 'U':
2957 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
2958 break;
2959 case '-': // subFlag then fall through
2960 ch = next();
2961 subFlag();
2962 default:
2963 return;
2964 }
2965 ch = next();
2966 }
2967 }
2968
2969 /**
2970 * Parses the second part of inlined match flags (the letters after '-') and clears the
2971 * matching bits in flags; returns, without consuming it, at the first non-flag character.
2972 */
2973 private void subFlag() {
2974 int ch = peek();
2975 for (;;) {
2976 switch (ch) {
2977 case 'i':
2978 flags &= ~CASE_INSENSITIVE;
2979 break;
2980 case 'm':
2981 flags &= ~MULTILINE;
2982 break;
2983 case 's':
2984 flags &= ~DOTALL;
2985 break;
2986 case 'd':
2987 flags &= ~UNIX_LINES;
2988 break;
2989 case 'u':
2990 flags &= ~UNICODE_CASE;
2991 break;
2992 case 'c':
2993 flags &= ~CANON_EQ;
2994 break;
2995 case 'x':
2996 flags &= ~COMMENTS;
2997 break;
2998 case 'U': // turning 'U' off also clears UNICODE_CASE, mirroring how 'U' is turned on
2999 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3000 break; // was missing: 'U' fell into default and returned without the next() every other flag gets
3001 default: return; // not a flag letter: stop here and leave it for the caller
3002 }
3003 ch = next();
3004 }
3005 }
3006
3007 static final int MAX_REPS = 0x7FFFFFFF; // same value as Integer.MAX_VALUE
3008
3009 static final int GREEDY = 0; // NOTE(review): GREEDY..INDEPENDENT look like quantifier-mode codes; their users are outside this excerpt - confirm
3010
3011 static final int LAZY = 1;
3012
3013 static final int POSSESSIVE = 2;
3014
3015 static final int INDEPENDENT = 3;
3016
3017 /**
3018 * Processes repetition. If the next character peeked is a quantifier
3019 * then new nodes must be appended to handle the repetition.
3783 Script(Character.UnicodeScript script) {
3784 this.script = script;
3785 }
3786 boolean isSatisfiedBy(int ch) {
3787 return script == Character.UnicodeScript.of(ch);
3788 }
3789 }
3790
3791 /**
3792 * Node class matching code points whose Character.getType() bit is set in typeMask.
3793 */
3794 static final class Category extends CharProperty {
3795 final int typeMask;
3796 Category(int mask) { typeMask = mask; }
3797 boolean isSatisfiedBy(int ch) {
3798 return ((1 << Character.getType(ch)) & typeMask) != 0;
3799 }
3800 }
3801
3802 /**
3803 * Node class that defers the match decision to a UnicodeProp constant.
3804 */
3805 static final class Utype extends CharProperty {
3806 final UnicodeProp uprop;
3807 Utype(UnicodeProp prop) { uprop = prop; }
3808 boolean isSatisfiedBy(int ch) {
3809 return this.uprop.is(ch);
3810 }
3811 }
3812
3813
3814 /**
3815 * Node class that matches an ASCII (POSIX) type; values of 128 and above never match.
3816 */
3817 static final class Ctype extends BmpCharProperty {
3818 final int ctype;
3819 Ctype(int type) { ctype = type; }
3820 boolean isSatisfiedBy(int ch) {
3821 return (ch >= 128) ? false : ASCII.isType(ch, ctype);
3822 }
3823 }
3824
3825 /**
3826 * Base class for all Slice nodes
3827 */
3828 static class SliceNode extends Node {
3829 int[] buffer;
3830 SliceNode(int[] buf) {
3831 buffer = buf;
3832 }
3833 boolean study(TreeInfo info) {
3834 info.minLength += buffer.length;
5156 private static CharProperty setDifference(final CharProperty lhs,
5157 final CharProperty rhs) {
5158 // Accepts ch when rhs rejects it and lhs accepts it; rhs is always asked first.
5159 return new CharProperty() { boolean isSatisfiedBy(int ch) {
5160 return rhs.isSatisfiedBy(ch) ? false : lhs.isSatisfiedBy(ch);}};
5161 }
5162
5163 /**
5164 * Handles word boundaries. Includes a field to allow this one class to
5165 * deal with the different types of word boundaries we can match. The word
5166 * characters include underscores, letters, and digits. Non spacing marks
5167 * are also part of a word if they have a base character, otherwise
5168 * they are ignored for purposes of finding word boundaries.
5169 */
5170 static final class Bound extends Node {
5171 static int LEFT = 0x1; // check(): word character at i but not before it
5172 static int RIGHT= 0x2; // check(): word character before i but not at it
5173 static int BOTH = 0x3; // LEFT | RIGHT, accepts either edge
5174 static int NONE = 0x4; // check(): both sides agree, no boundary at i
5175 int type; // mask of check() results that match() accepts
5176 boolean useUWORD; // true: isWord() asks UnicodeProp.WORD
5177 Bound(int n, boolean useUWORD) {
5178 type = n;
5179 this.useUWORD = useUWORD;
5180 }
5181
5182 boolean isWord(int ch) {
5183 return useUWORD ? UnicodeProp.WORD.is(ch)
5184 : (ch == '_' || Character.isLetterOrDigit(ch));
5185 }
5186
5187 int check(Matcher matcher, int i, CharSequence seq) {
5188 int ch;
5189 boolean left = false;
5190 int startIndex = matcher.from;
5191 int endIndex = matcher.to;
5192 if (matcher.transparentBounds) { // transparent bounds: look at the whole text, not just [from, to)
5193 startIndex = 0;
5194 endIndex = matcher.getTextLength();
5195 }
5196 if (i > startIndex) {
5197 ch = Character.codePointBefore(seq, i);
5198 left = (isWord(ch) ||
5199 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5200 && hasBaseCharacter(matcher, i-1, seq)));
5201 }
5202 boolean right = false;
5203 if (i < endIndex) {
5204 ch = Character.codePointAt(seq, i);
5205 right = (isWord(ch) ||
5206 ((Character.getType(ch) == Character.NON_SPACING_MARK)
5207 && hasBaseCharacter(matcher, i, seq)));
5208 } else {
5209 // Tried to access char past the end
5210 matcher.hitEnd = true;
5211 // The addition of another char could wreck a boundary
5212 matcher.requireEnd = true;
5213 }
5214 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); // exactly one side is a word character => an edge
5215 }
5216 boolean match(Matcher matcher, int i, CharSequence seq) {
5217 return (check(matcher, i, seq) & type) > 0
5218 && next.match(matcher, i, seq);
5219 }
5220 }
5221
5222 /**
5223 * Non spacing marks only count as word characters in bounds calculations
5224 * if they have a base character.
5225 */
5567 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
5568 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
5569 defCtype("Blank", ASCII.BLANK); // Space and tab characters
5570 defCtype("Cntrl", ASCII.CNTRL); // Control characters
5571 defRange("Digit", '0', '9'); // Numeric characters
5572 defCtype("Graph", ASCII.GRAPH); // printable and visible
5573 defRange("Lower", 'a', 'z'); // Lower-case alphabetic
5574 defRange("Print", 0x20, 0x7E); // Printable characters
5575 defCtype("Punct", ASCII.PUNCT); // Punctuation characters
5576 defCtype("Space", ASCII.SPACE); // Space characters
5577 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
5578 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5579
5580 // Java character properties, defined by methods in Character.java
5581 defClone("javaLowerCase", new CloneableProperty() {
5582 boolean isSatisfiedBy(int ch) {
5583 return Character.isLowerCase(ch);}});
5584 defClone("javaUpperCase", new CloneableProperty() {
5585 boolean isSatisfiedBy(int ch) {
5586 return Character.isUpperCase(ch);}});
5587 defClone("javaAlphabetic", new CloneableProperty() {
5588 boolean isSatisfiedBy(int ch) {
5589 return Character.isAlphabetic(ch);}});
5590 defClone("javaIdeographic", new CloneableProperty() {
5591 boolean isSatisfiedBy(int ch) {
5592 return Character.isIdeographic(ch);}});
5593 defClone("javaTitleCase", new CloneableProperty() {
5594 boolean isSatisfiedBy(int ch) {
5595 return Character.isTitleCase(ch);}});
5596 defClone("javaDigit", new CloneableProperty() {
5597 boolean isSatisfiedBy(int ch) {
5598 return Character.isDigit(ch);}});
5599 defClone("javaDefined", new CloneableProperty() {
5600 boolean isSatisfiedBy(int ch) {
5601 return Character.isDefined(ch);}});
5602 defClone("javaLetter", new CloneableProperty() {
5603 boolean isSatisfiedBy(int ch) {
5604 return Character.isLetter(ch);}});
5605 defClone("javaLetterOrDigit", new CloneableProperty() {
5606 boolean isSatisfiedBy(int ch) {
5607 return Character.isLetterOrDigit(ch);}});
5608 defClone("javaJavaIdentifierStart", new CloneableProperty() {
5609 boolean isSatisfiedBy(int ch) {
5610 return Character.isJavaIdentifierStart(ch);}});
5611 defClone("javaJavaIdentifierPart", new CloneableProperty() {
5612 boolean isSatisfiedBy(int ch) {
|