12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26 package java.util.regex;
27
28 import java.security.AccessController;
29 import java.security.PrivilegedAction;
30 import java.text.CharacterIterator;
31 import java.text.Normalizer;
32 import java.util.ArrayList;
33 import java.util.HashMap;
34 import java.util.Arrays;
35
36
37 /**
38 * A compiled representation of a regular expression.
39 *
40 * <p> A regular expression, specified as a string, must first be compiled into
41 * an instance of this class. The resulting pattern can then be used to create
42 * a {@link Matcher} object that can match arbitrary {@link
43 * java.lang.CharSequence </code>character sequences<code>} against the regular
44 * expression. All of the state involved in performing a match resides in the
45 * matcher, so many matchers can share the same pattern.
46 *
47 * <p> A typical invocation sequence is thus
48 *
49 * <blockquote><pre>
50 * Pattern p = Pattern.{@link #compile compile}("a*b");
51 * Matcher m = p.{@link #matcher matcher}("aaaaab");
281 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
282 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
283 *
284 * <tr><th> </th></tr>
285 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
286 *
287 * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
288 * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
289 * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
290 * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
291 * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
292 * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
293 *
294 * <tr><th> </th></tr>
295 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
296 *
297 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
298 * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
299 * <a href="#cg">capturing group</a> matched</td></tr>
300 *
301 * <tr><th> </th></tr>
302 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
303 *
304 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
305 * <td headers="matches">Nothing, but quotes the following character</td></tr>
306 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
307 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
308 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
309 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
310 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
311 *
312 * <tr><th> </th></tr>
313 * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
314 *
315 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
316 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
317 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
318 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
319 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
320 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
321 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
322 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
323 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
324 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
325 * <a href="#COMMENTS">x</a> on - off</td></tr>
326 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
327 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
328 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
329 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
330 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
331 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
332 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
333 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
334 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
432 *
433 * <li> A paragraph-separator character (<tt>'\u2029</tt>).
434 *
435 * </ul>
436 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
437 * recognized are newline characters.
438 *
439 * <p> The regular expression <tt>.</tt> matches any character except a line
440 * terminator unless the {@link #DOTALL} flag is specified.
441 *
442 * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
443 * line terminators and only match at the beginning and the end, respectively,
444 * of the entire input sequence. If {@link #MULTILINE} mode is activated then
445 * <tt>^</tt> matches at the beginning of input and after any line terminator
446 * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
447 * matches just before a line terminator or the end of the input sequence.
448 *
449 * <a name="cg">
450 * <h4> Groups and capturing </h4>
451 *
452 * <p> Capturing groups are numbered by counting their opening parentheses from
453 * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
454 * are four such groups: </p>
455 *
456 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
457 * <tr><th>1 </th>
458 * <td><tt>((A)(B(C)))</tt></td></tr>
459 * <tr><th>2 </th>
460 * <td><tt>(A)</tt></td></tr>
461 * <tr><th>3 </th>
462 * <td><tt>(B(C))</tt></td></tr>
463 * <tr><th>4 </th>
464 * <td><tt>(C)</tt></td></tr>
465 * </table></blockquote>
466 *
467 * <p> Group zero always stands for the entire expression.
468 *
469 * <p> Capturing groups are so named because, during a match, each subsequence
470 * of the input sequence that matches such a group is saved. The captured
471 * subsequence may be used later in the expression, via a back reference, and
472 * may also be retrieved from the matcher once the match operation is complete.
473 *
474 * <p> The captured input associated with a group is always the subsequence
475 * that the group most recently matched. If a group is evaluated a second time
476 * because of quantification then its previously-captured value, if any, will
477 * be retained if the second evaluation fails. Matching the string
478 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
479 * group two set to <tt>"b"</tt>. All captured input is discarded at the
480 * beginning of each match.
481 *
482 * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
483 * that do not capture text and do not count towards the group total.
484 *
485 *
486 * <h4> Unicode support </h4>
487 *
488 * <p> This class is in conformance with Level 1 of <a
489 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
490 * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
491 * Canonical Equivalents.
492 *
493 * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
494 * are processed as described in <a
495 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
496 * of the Java Language Specification. Such escape sequences are also
497 * implemented directly by the regular-expression parser so that Unicode
498 * escapes can be used in expressions that are read from files or from the
499 * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
500 * while not equal, compile into the same pattern, which matches the character
501 * with hexadecimal value <tt>0x2014</tt>.
502 *
503 * <a name="ubc"> <p>Unicode blocks and categories are written with the
504 * <tt>\p</tt> and <tt>\P</tt> constructs as in
505 * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
778
779 /**
780 * The starting point of state machine for the find operation. This allows
781 * a match to start anywhere in the input.
782 */
783 transient Node root;
784
785 /**
786 * The root of object tree for a match operation. The pattern is matched
787 * at the beginning. This may include a find that uses BnM or a First
788 * node.
789 */
790 transient Node matchRoot;
791
792 /**
793 * Temporary storage used by parsing pattern slice.
794 */
795 transient int[] buffer;
796
797 /**
798 * Temporary storage used while parsing group references.
799 */
800 transient GroupHead[] groupNodes;
801
802 /**
803 * Temporary null terminated code point array used by pattern compiling.
804 */
805 private transient int[] temp;
806
807 /**
808 * The number of capturing groups in this Pattern. Used by matchers to
809 * allocate storage needed to perform a match.
810 */
811 transient int capturingGroupCount;
812
813 /**
814 * The local variable count used by parsing tree. Used by matchers to
815 * allocate storage needed to perform a match.
816 */
817 transient int localCount;
1450
1451 boolean hasSupplementary = false;
1452 int c, count = 0;
1453 // Convert all chars into code points
1454 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1455 c = normalizedPattern.codePointAt(x);
1456 if (isSupplementary(c)) {
1457 hasSupplementary = true;
1458 }
1459 temp[count++] = c;
1460 }
1461
1462 patternLength = count; // patternLength now in code points
1463
1464 if (! has(LITERAL))
1465 RemoveQEQuoting();
1466
1467 // Allocate all temporary objects here.
1468 buffer = new int[32];
1469 groupNodes = new GroupHead[10];
1470
1471 if (has(LITERAL)) {
1472 // Literal pattern handling
1473 matchRoot = newSlice(temp, patternLength, hasSupplementary);
1474 matchRoot.next = lastAccept;
1475 } else {
1476 // Start recursive descent parsing
1477 matchRoot = expr(lastAccept);
1478 // Check extra pattern characters
1479 if (patternLength != cursor) {
1480 if (peek() == ')') {
1481 throw error("Unmatched closing ')'");
1482 } else {
1483 throw error("Unexpected internal error");
1484 }
1485 }
1486 }
1487
1488 // Peephole optimization
1489 if (matchRoot instanceof Slice) {
1490 root = BnM.optimize(matchRoot);
1491 if (root == matchRoot) {
1492 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1493 }
1494 } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1495 root = matchRoot;
1496 } else {
1497 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1498 }
1499
1500 // Release temporary storage
1501 temp = null;
1502 buffer = null;
1503 groupNodes = null;
1504 patternLength = 0;
1505 compiled = true;
1506 }
1507
1508 /**
1509 * Used to print out a subtree of the Pattern to help with debugging.
1510 */
1511 private static void printObjectTree(Node node) {
1512 while(node != null) {
1513 if (node instanceof Prolog) {
1514 System.out.println(node);
1515 printObjectTree(((Prolog)node).loop);
1516 System.out.println("**** end contents prolog loop");
1517 } else if (node instanceof Loop) {
1518 System.out.println(node);
1519 printObjectTree(((Loop)node).body);
1520 System.out.println("**** end contents Loop body");
1521 } else if (node instanceof Curly) {
1522 System.out.println(node);
1523 printObjectTree(((Curly)node).atom);
1524 System.out.println("**** end contents Curly body");
1525 } else if (node instanceof GroupCurly) {
1526 System.out.println(node);
1527 printObjectTree(((GroupCurly)node).atom);
2139 return -1;
2140 case 'a':
2141 return '\007';
2142 case 'b':
2143 if (inclass) break;
2144 if (create) root = new Bound(Bound.BOTH);
2145 return -1;
2146 case 'c':
2147 return c();
2148 case 'd':
2149 if (create) root = new Ctype(ASCII.DIGIT);
2150 return -1;
2151 case 'e':
2152 return '\033';
2153 case 'f':
2154 return '\f';
2155 case 'g':
2156 case 'h':
2157 case 'i':
2158 case 'j':
2159 case 'k':
2160 case 'l':
2161 case 'm':
2162 break;
2163 case 'n':
2164 return '\n';
2165 case 'o':
2166 case 'p':
2167 case 'q':
2168 break;
2169 case 'r':
2170 return '\r';
2171 case 's':
2172 if (create) root = new Ctype(ASCII.SPACE);
2173 return -1;
2174 case 't':
2175 return '\t';
2176 case 'u':
2177 return u();
2178 case 'v':
2179 return '\013';
2439 block = Character.UnicodeBlock.forName(name);
2440 } catch (IllegalArgumentException iae) {
2441 throw error("Unknown character block name {" + name + "}");
2442 }
2443 return new CharProperty() {
2444 boolean isSatisfiedBy(int ch) {
2445 return block == Character.UnicodeBlock.of(ch);}};
2446 }
2447
2448 /**
2449 * Returns a CharProperty matching all characters in a named property.
2450 */
2451 private CharProperty charPropertyNodeFor(String name) {
2452 CharProperty p = CharPropertyNames.charPropertyFor(name);
2453 if (p == null)
2454 throw error("Unknown character property name {" + name + "}");
2455 return p;
2456 }
2457
2458 /**
2459 * Parses a group and returns the head node of a set of nodes that process
2460 * the group. Sometimes a double return system is used where the tail is
2461 * returned in root.
2462 */
2463 private Node group0() {
2464 boolean capturingGroup = false;
2465 Node head = null;
2466 Node tail = null;
2467 int save = flags;
2468 root = null;
2469 int ch = next();
2470 if (ch == '?') {
2471 ch = skip();
2472 switch (ch) {
2473 case ':': // (?:xxx) pure group
2474 head = createGroup(true);
2475 tail = root;
2476 head.next = expr(tail);
2477 break;
2478 case '=': // (?=xxx) and (?!xxx) lookahead
2479 case '!':
2480 head = createGroup(true);
2481 tail = root;
2482 head.next = expr(tail);
2483 if (ch == '=') {
2484 head = tail = new Pos(head);
2485 } else {
2486 head = tail = new Neg(head);
2487 }
2488 break;
2489 case '>': // (?>xxx) independent group
2490 head = createGroup(true);
2491 tail = root;
2492 head.next = expr(tail);
2493 head = tail = new Ques(head, INDEPENDENT);
2494 break;
2495 case '<': // (?<xxx) look behind
2496 ch = read();
2497 int start = cursor;
2498 head = createGroup(true);
2499 tail = root;
2500 head.next = expr(tail);
2501 tail.next = lookbehindEnd;
2502 TreeInfo info = new TreeInfo();
2503 head.study(info);
2504 if (info.maxValid == false) {
2505 throw error("Look-behind group does not have "
2506 + "an obvious maximum length");
2507 }
2508 boolean hasSupplementary = findSupplementary(start, patternLength);
2509 if (ch == '=') {
2510 head = tail = (hasSupplementary ?
2511 new BehindS(head, info.maxLength,
2512 info.minLength) :
2513 new Behind(head, info.maxLength,
2514 info.minLength));
2515 } else if (ch == '!') {
2516 head = tail = (hasSupplementary ?
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26 package java.util.regex;
27
28 import java.security.AccessController;
29 import java.security.PrivilegedAction;
30 import java.text.CharacterIterator;
31 import java.text.Normalizer;
32 import java.util.Map;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.Arrays;
36
37
38 /**
39 * A compiled representation of a regular expression.
40 *
41 * <p> A regular expression, specified as a string, must first be compiled into
42 * an instance of this class. The resulting pattern can then be used to create
43 * a {@link Matcher} object that can match arbitrary {@link
44 * java.lang.CharSequence </code>character sequences<code>} against the regular
45 * expression. All of the state involved in performing a match resides in the
46 * matcher, so many matchers can share the same pattern.
47 *
48 * <p> A typical invocation sequence is thus
49 *
50 * <blockquote><pre>
51 * Pattern p = Pattern.{@link #compile compile}("a*b");
52 * Matcher m = p.{@link #matcher matcher}("aaaaab");
282 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
283 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
284 *
285 * <tr><th> </th></tr>
286 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
287 *
288 * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
289 * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
290 * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
291 * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
292 * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
293 * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
294 *
295 * <tr><th> </th></tr>
296 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
297 *
298 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
299 * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
300 * <a href="#cg">capturing group</a> matched</td></tr>
301 *
302 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i><<i>name</i>></td>
303 * <td valign="bottom" headers="matches">Whatever the
304 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
305 *
306 * <tr><th> </th></tr>
307 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
308 *
309 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
310 * <td headers="matches">Nothing, but quotes the following character</td></tr>
311 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
312 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
313 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
314 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
315 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
316 *
317 * <tr><th> </th></tr>
318 * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
319 *
320 * <tr><td valign="top" headers="construct special"><tt>(?<<a href="#groupname">name</a>></tt><i>X</i><tt>)</tt></td>
321 * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
322 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
323 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
324 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
325 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
326 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
327 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
328 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
329 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
330 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
331 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
332 * <a href="#COMMENTS">x</a> on - off</td></tr>
333 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
334 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
335 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
336 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
337 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
338 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
339 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
340 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
341 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
439 *
440 * <li> A paragraph-separator character (<tt>'\u2029</tt>).
441 *
442 * </ul>
443 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
444 * recognized are newline characters.
445 *
446 * <p> The regular expression <tt>.</tt> matches any character except a line
447 * terminator unless the {@link #DOTALL} flag is specified.
448 *
449 * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
450 * line terminators and only match at the beginning and the end, respectively,
451 * of the entire input sequence. If {@link #MULTILINE} mode is activated then
452 * <tt>^</tt> matches at the beginning of input and after any line terminator
453 * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
454 * matches just before a line terminator or the end of the input sequence.
455 *
456 * <a name="cg">
457 * <h4> Groups and capturing </h4>
458 *
459 * <a name="gnumber">
460 * <h5> Group number </h5>
461 * <p> Capturing groups are numbered by counting their opening parentheses from
462 * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
463 * are four such groups: </p>
464 *
465 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
466 * <tr><th>1 </th>
467 * <td><tt>((A)(B(C)))</tt></td></tr>
468 * <tr><th>2 </th>
469 * <td><tt>(A)</tt></td></tr>
470 * <tr><th>3 </th>
471 * <td><tt>(B(C))</tt></td></tr>
472 * <tr><th>4 </th>
473 * <td><tt>(C)</tt></td></tr>
474 * </table></blockquote>
475 *
476 * <p> Group zero always stands for the entire expression.
477 *
478 * <p> Capturing groups are so named because, during a match, each subsequence
479 * of the input sequence that matches such a group is saved. The captured
480 * subsequence may be used later in the expression, via a back reference, and
481 * may also be retrieved from the matcher once the match operation is complete.
482 *
483 * <a name="groupname">
484 * <h5> Group name </h5>
485 * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>,
486 * and then be back-referenced later by the "name". Group names are composed of
487 * the following characters:
488 *
489 * <ul>
490 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
491 * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>),
492 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
493 * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>),
494 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
495 * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>),
496 * </ul>
497 *
498 * <p> A <tt>named-capturing group</tt> is still numbered as described in
499 * <a href="#gnumber">Group number</a>.
500 *
501 * <p> The captured input associated with a group is always the subsequence
502 * that the group most recently matched. If a group is evaluated a second time
503 * because of quantification then its previously-captured value, if any, will
504 * be retained if the second evaluation fails. Matching the string
505 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
506 * group two set to <tt>"b"</tt>. All captured input is discarded at the
507 * beginning of each match.
508 *
509 * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
510 * that do not capture text and do not count towards the group total, or
511 * <i>named-capturing</i> group.
512 *
513 * <h4> Unicode support </h4>
514 *
515 * <p> This class is in conformance with Level 1 of <a
516 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
517 * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
518 * Canonical Equivalents.
519 *
520 * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
521 * are processed as described in <a
522 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
523 * of the Java Language Specification. Such escape sequences are also
524 * implemented directly by the regular-expression parser so that Unicode
525 * escapes can be used in expressions that are read from files or from the
526 * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
527 * while not equal, compile into the same pattern, which matches the character
528 * with hexadecimal value <tt>0x2014</tt>.
529 *
530 * <a name="ubc"> <p>Unicode blocks and categories are written with the
531 * <tt>\p</tt> and <tt>\P</tt> constructs as in
532 * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
805
806 /**
807 * The starting point of state machine for the find operation. This allows
808 * a match to start anywhere in the input.
809 */
810 transient Node root;
811
812 /**
813 * The root of object tree for a match operation. The pattern is matched
814 * at the beginning. This may include a find that uses BnM or a First
815 * node.
816 */
817 transient Node matchRoot;
818
819 /**
820 * Temporary storage used by parsing pattern slice.
821 */
822 transient int[] buffer;
823
824 /**
825 * Map the "name" of the "named capturing group" to its group id
826 * node.
827 */
828 transient volatile Map<String, Integer> namedGroups;
829
830 /**
831 * Temporary storage used while parsing group references.
832 */
833 transient GroupHead[] groupNodes;
834
835 /**
836 * Temporary null terminated code point array used by pattern compiling.
837 */
838 private transient int[] temp;
839
840 /**
841 * The number of capturing groups in this Pattern. Used by matchers to
842 * allocate storage needed to perform a match.
843 */
844 transient int capturingGroupCount;
845
846 /**
847 * The local variable count used by parsing tree. Used by matchers to
848 * allocate storage needed to perform a match.
849 */
850 transient int localCount;
1483
1484 boolean hasSupplementary = false;
1485 int c, count = 0;
1486 // Convert all chars into code points
1487 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1488 c = normalizedPattern.codePointAt(x);
1489 if (isSupplementary(c)) {
1490 hasSupplementary = true;
1491 }
1492 temp[count++] = c;
1493 }
1494
1495 patternLength = count; // patternLength now in code points
1496
1497 if (! has(LITERAL))
1498 RemoveQEQuoting();
1499
1500 // Allocate all temporary objects here.
1501 buffer = new int[32];
1502 groupNodes = new GroupHead[10];
1503 namedGroups = null;
1504
1505 if (has(LITERAL)) {
1506 // Literal pattern handling
1507 matchRoot = newSlice(temp, patternLength, hasSupplementary);
1508 matchRoot.next = lastAccept;
1509 } else {
1510 // Start recursive descent parsing
1511 matchRoot = expr(lastAccept);
1512 // Check extra pattern characters
1513 if (patternLength != cursor) {
1514 if (peek() == ')') {
1515 throw error("Unmatched closing ')'");
1516 } else {
1517 throw error("Unexpected internal error");
1518 }
1519 }
1520 }
1521
1522 // Peephole optimization
1523 if (matchRoot instanceof Slice) {
1524 root = BnM.optimize(matchRoot);
1525 if (root == matchRoot) {
1526 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1527 }
1528 } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1529 root = matchRoot;
1530 } else {
1531 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1532 }
1533
1534 // Release temporary storage
1535 temp = null;
1536 buffer = null;
1537 groupNodes = null;
1538 patternLength = 0;
1539 compiled = true;
1540 }
1541
1542 Map<String, Integer> namedGroups() {
1543 if (namedGroups == null)
1544 namedGroups = new HashMap<String, Integer>(2);
1545 return namedGroups;
1546 }
1547
1548 /**
1549 * Used to print out a subtree of the Pattern to help with debugging.
1550 */
1551 private static void printObjectTree(Node node) {
1552 while(node != null) {
1553 if (node instanceof Prolog) {
1554 System.out.println(node);
1555 printObjectTree(((Prolog)node).loop);
1556 System.out.println("**** end contents prolog loop");
1557 } else if (node instanceof Loop) {
1558 System.out.println(node);
1559 printObjectTree(((Loop)node).body);
1560 System.out.println("**** end contents Loop body");
1561 } else if (node instanceof Curly) {
1562 System.out.println(node);
1563 printObjectTree(((Curly)node).atom);
1564 System.out.println("**** end contents Curly body");
1565 } else if (node instanceof GroupCurly) {
1566 System.out.println(node);
1567 printObjectTree(((GroupCurly)node).atom);
2179 return -1;
2180 case 'a':
2181 return '\007';
2182 case 'b':
2183 if (inclass) break;
2184 if (create) root = new Bound(Bound.BOTH);
2185 return -1;
2186 case 'c':
2187 return c();
2188 case 'd':
2189 if (create) root = new Ctype(ASCII.DIGIT);
2190 return -1;
2191 case 'e':
2192 return '\033';
2193 case 'f':
2194 return '\f';
2195 case 'g':
2196 case 'h':
2197 case 'i':
2198 case 'j':
2199 break;
2200 case 'k':
2201 if (inclass)
2202 break;
2203 if (read() != '<')
2204 throw error("\\k is not followed by '<' for named capturing group");
2205 String name = groupname(read());
2206 if (!namedGroups().containsKey(name))
2207 throw error("(named capturing group <"+ name+"> does not exit");
2208 if (create) {
2209 if (has(CASE_INSENSITIVE))
2210 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2211 else
2212 root = new BackRef(namedGroups().get(name));
2213 }
2214 return -1;
2215 case 'l':
2216 case 'm':
2217 break;
2218 case 'n':
2219 return '\n';
2220 case 'o':
2221 case 'p':
2222 case 'q':
2223 break;
2224 case 'r':
2225 return '\r';
2226 case 's':
2227 if (create) root = new Ctype(ASCII.SPACE);
2228 return -1;
2229 case 't':
2230 return '\t';
2231 case 'u':
2232 return u();
2233 case 'v':
2234 return '\013';
2494 block = Character.UnicodeBlock.forName(name);
2495 } catch (IllegalArgumentException iae) {
2496 throw error("Unknown character block name {" + name + "}");
2497 }
2498 return new CharProperty() {
2499 boolean isSatisfiedBy(int ch) {
2500 return block == Character.UnicodeBlock.of(ch);}};
2501 }
2502
2503 /**
2504 * Returns a CharProperty matching all characters in a named property.
2505 */
2506 private CharProperty charPropertyNodeFor(String name) {
2507 CharProperty p = CharPropertyNames.charPropertyFor(name);
2508 if (p == null)
2509 throw error("Unknown character property name {" + name + "}");
2510 return p;
2511 }
2512
2513 /**
2514 * Parses and returns the name of a "named capturing group", the trailing
2515 * ">" is consumed after parsing.
2516 */
2517 private String groupname(int ch) {
2518 StringBuilder sb = new StringBuilder();
2519 sb.append(Character.toChars(ch));
2520 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2521 ASCII.isDigit(ch)) {
2522 sb.append(Character.toChars(ch));
2523 }
2524 if (sb.length() == 0)
2525 throw error("named capturing group has 0 length name");
2526 if (ch != '>')
2527 throw error("named capturing group is missing trailing '>'");
2528 return sb.toString();
2529 }
2530
2531 /**
2532 * Parses a group and returns the head node of a set of nodes that process
2533 * the group. Sometimes a double return system is used where the tail is
2534 * returned in root.
2535 */
2536 private Node group0() {
2537 boolean capturingGroup = false;
2538 Node head = null;
2539 Node tail = null;
2540 int save = flags;
2541 root = null;
2542 int ch = next();
2543 if (ch == '?') {
2544 ch = skip();
2545 switch (ch) {
2546 case ':': // (?:xxx) pure group
2547 head = createGroup(true);
2548 tail = root;
2549 head.next = expr(tail);
2550 break;
2551 case '=': // (?=xxx) and (?!xxx) lookahead
2552 case '!':
2553 head = createGroup(true);
2554 tail = root;
2555 head.next = expr(tail);
2556 if (ch == '=') {
2557 head = tail = new Pos(head);
2558 } else {
2559 head = tail = new Neg(head);
2560 }
2561 break;
2562 case '>': // (?>xxx) independent group
2563 head = createGroup(true);
2564 tail = root;
2565 head.next = expr(tail);
2566 head = tail = new Ques(head, INDEPENDENT);
2567 break;
2568 case '<': // (?<xxx) look behind
2569 ch = read();
2570 if (Character.isLetter(ch)) { // named captured group
2571 String name = groupname(ch);
2572 if (namedGroups().containsKey(name))
2573 throw error("Named capturing group <" + name
2574 + "> is already defined");
2575 capturingGroup = true;
2576 head = createGroup(false);
2577 tail = root;
2578 namedGroups().put(name, capturingGroupCount-1);
2579 head.next = expr(tail);
2580 break;
2581 }
2582 int start = cursor;
2583 head = createGroup(true);
2584 tail = root;
2585 head.next = expr(tail);
2586 tail.next = lookbehindEnd;
2587 TreeInfo info = new TreeInfo();
2588 head.study(info);
2589 if (info.maxValid == false) {
2590 throw error("Look-behind group does not have "
2591 + "an obvious maximum length");
2592 }
2593 boolean hasSupplementary = findSupplementary(start, patternLength);
2594 if (ch == '=') {
2595 head = tail = (hasSupplementary ?
2596 new BehindS(head, info.maxLength,
2597 info.minLength) :
2598 new Behind(head, info.maxLength,
2599 info.minLength));
2600 } else if (ch == '!') {
2601 head = tail = (hasSupplementary ?
|