src/share/classes/java/util/regex/Pattern.java

Print this page




  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  22  * CA 95054 USA or visit www.sun.com if you need additional information or
  23  * have any questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.security.AccessController;
  29 import java.security.PrivilegedAction;
  30 import java.text.CharacterIterator;
  31 import java.text.Normalizer;

  32 import java.util.ArrayList;
  33 import java.util.HashMap;
  34 import java.util.Arrays;
  35 
  36 
  37 /**
  38  * A compiled representation of a regular expression.
  39  *
  40  * <p> A regular expression, specified as a string, must first be compiled into
  41  * an instance of this class.  The resulting pattern can then be used to create
  42  * a {@link Matcher} object that can match arbitrary {@link
  43  * java.lang.CharSequence </code>character sequences<code>} against the regular
  44  * expression.  All of the state involved in performing a match resides in the
  45  * matcher, so many matchers can share the same pattern.
  46  *
  47  * <p> A typical invocation sequence is thus
  48  *
  49  * <blockquote><pre>
  50  * Pattern p = Pattern.{@link #compile compile}("a*b");
  51  * Matcher m = p.{@link #matcher matcher}("aaaaab");


 281  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
 282  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
 283  *
 284  * <tr><th>&nbsp;</th></tr>
 285  * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
 286  *
 287  * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
 288  *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
 289  * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
 290  *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
 291  * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
 292  *     <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
 293  *
 294  * <tr><th>&nbsp;</th></tr>
 295  * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
 296  *
 297  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
 298  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
 299  *     <a href="#cg">capturing group</a> matched</td></tr>
 300  *




 301  * <tr><th>&nbsp;</th></tr>
 302  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
 303  *
 304  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
 305  *     <td headers="matches">Nothing, but quotes the following character</td></tr>
 306  * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
 307  *     <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
 308  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
 309  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
 310  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
 311  *
 312  * <tr><th>&nbsp;</th></tr>
 313  * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
 314  *


 315  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
 316  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
 317  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
 318  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
 319  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
 320  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
 321  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
 322  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
 323  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
 324  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
 325  * <a href="#COMMENTS">x</a> on - off</td></tr>
 326  * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
 327  *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
 328  * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
 329  *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
 330  * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>
 331  *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
 332  * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>
 333  *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
 334  * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>


 432  *
 433  *   <li> A paragraph-separator character&nbsp;(<tt>'&#92;u2029</tt>).
 434  *
 435  * </ul>
 436  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
 437  * recognized are newline characters.
 438  *
 439  * <p> The regular expression <tt>.</tt> matches any character except a line
 440  * terminator unless the {@link #DOTALL} flag is specified.
 441  *
 442  * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
 443  * line terminators and only match at the beginning and the end, respectively,
 444  * of the entire input sequence. If {@link #MULTILINE} mode is activated then
 445  * <tt>^</tt> matches at the beginning of input and after any line terminator
 446  * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
 447  * matches just before a line terminator or the end of the input sequence.
 448  *
 449  * <a name="cg">
 450  * <h4> Groups and capturing </h4>
 451  *


 452  * <p> Capturing groups are numbered by counting their opening parentheses from
 453  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
 454  * are four such groups: </p>
 455  *
 456  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
 457  * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
 458  *     <td><tt>((A)(B(C)))</tt></td></tr>
 459  * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
 460  *     <td><tt>(A)</tt></td></tr>
 461  * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
 462  *     <td><tt>(B(C))</tt></td></tr>
 463  * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
 464  *     <td><tt>(C)</tt></td></tr>
 465  * </table></blockquote>
 466  *
 467  * <p> Group zero always stands for the entire expression.
 468  *
 469  * <p> Capturing groups are so named because, during a match, each subsequence
 470  * of the input sequence that matches such a group is saved.  The captured
 471  * subsequence may be used later in the expression, via a back reference, and
 472  * may also be retrieved from the matcher once the match operation is complete.
 473  *


















 474  * <p> The captured input associated with a group is always the subsequence
 475  * that the group most recently matched.  If a group is evaluated a second time
 476  * because of quantification then its previously-captured value, if any, will
 477  * be retained if the second evaluation fails.  Matching the string
 478  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
 479  * group two set to <tt>"b"</tt>.  All captured input is discarded at the
 480  * beginning of each match.
 481  *
 482  * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
 483  * that do not capture text and do not count towards the group total.

 484  *
 485  *
 486  * <h4> Unicode support </h4>
 487  *
 488  * <p> This class is in conformance with Level 1 of <a
 489  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
 490  * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
 491  * Canonical Equivalents.
 492  *
 493  * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
 494  * are processed as described in <a
 495  * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
 496  * of the Java Language Specification.  Such escape sequences are also
 497  * implemented directly by the regular-expression parser so that Unicode
 498  * escapes can be used in expressions that are read from files or from the
 499  * keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
 500  * while not equal, compile into the same pattern, which matches the character
 501  * with hexadecimal value <tt>0x2014</tt>.
 502  *
 503  * <a name="ubc"> <p>Unicode blocks and categories are written with the
 504  * <tt>\p</tt> and <tt>\P</tt> constructs as in
 505  * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the


 778 
 779     /**
 780      * The starting point of state machine for the find operation.  This allows
 781      * a match to start anywhere in the input.
 782      */
 783     transient Node root;
 784 
 785     /**
 786      * The root of object tree for a match operation.  The pattern is matched
 787      * at the beginning.  This may include a find that uses BnM or a First
 788      * node.
 789      */
 790     transient Node matchRoot;
 791 
 792     /**
 793      * Temporary storage used by parsing pattern slice.
 794      */
 795     transient int[] buffer;
 796 
 797     /**






 798      * Temporary storage used while parsing group references.
 799      */
 800     transient GroupHead[] groupNodes;
 801 
 802     /**
 803      * Temporary null terminated code point array used by pattern compiling.
 804      */
 805     private transient int[] temp;
 806 
 807     /**
 808      * The number of capturing groups in this Pattern. Used by matchers to
 809      * allocate storage needed to perform a match.
 810      */
 811     transient int capturingGroupCount;
 812 
 813     /**
 814      * The local variable count used by parsing tree. Used by matchers to
 815      * allocate storage needed to perform a match.
 816      */
 817     transient int localCount;


1450 
1451         boolean hasSupplementary = false;
1452         int c, count = 0;
1453         // Convert all chars into code points
1454         for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1455             c = normalizedPattern.codePointAt(x);
1456             if (isSupplementary(c)) {
1457                 hasSupplementary = true;
1458             }
1459             temp[count++] = c;
1460         }
1461 
1462         patternLength = count;   // patternLength now in code points
1463 
1464         if (! has(LITERAL))
1465             RemoveQEQuoting();
1466 
1467         // Allocate all temporary objects here.
1468         buffer = new int[32];
1469         groupNodes = new GroupHead[10];

1470 
1471         if (has(LITERAL)) {
1472             // Literal pattern handling
1473             matchRoot = newSlice(temp, patternLength, hasSupplementary);
1474             matchRoot.next = lastAccept;
1475         } else {
1476             // Start recursive descent parsing
1477             matchRoot = expr(lastAccept);
1478             // Check extra pattern characters
1479             if (patternLength != cursor) {
1480                 if (peek() == ')') {
1481                     throw error("Unmatched closing ')'");
1482                 } else {
1483                     throw error("Unexpected internal error");
1484                 }
1485             }
1486         }
1487 
1488         // Peephole optimization
1489         if (matchRoot instanceof Slice) {
1490             root = BnM.optimize(matchRoot);
1491             if (root == matchRoot) {
1492                 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1493             }
1494         } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1495             root = matchRoot;
1496         } else {
1497             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1498         }
1499 
1500         // Release temporary storage
1501         temp = null;
1502         buffer = null;
1503         groupNodes = null;
1504         patternLength = 0;
1505         compiled = true;
1506     }
1507 






1508     /**
1509      * Used to print out a subtree of the Pattern to help with debugging.
1510      */
1511     private static void printObjectTree(Node node) {
1512         while(node != null) {
1513             if (node instanceof Prolog) {
1514                 System.out.println(node);
1515                 printObjectTree(((Prolog)node).loop);
1516                 System.out.println("**** end contents prolog loop");
1517             } else if (node instanceof Loop) {
1518                 System.out.println(node);
1519                 printObjectTree(((Loop)node).body);
1520                 System.out.println("**** end contents Loop body");
1521             } else if (node instanceof Curly) {
1522                 System.out.println(node);
1523                 printObjectTree(((Curly)node).atom);
1524                 System.out.println("**** end contents Curly body");
1525             } else if (node instanceof GroupCurly) {
1526                 System.out.println(node);
1527                 printObjectTree(((GroupCurly)node).atom);


2139             return -1;
2140         case 'a':
2141             return '\007';
2142         case 'b':
2143             if (inclass) break;
2144             if (create) root = new Bound(Bound.BOTH);
2145             return -1;
2146         case 'c':
2147             return c();
2148         case 'd':
2149             if (create) root = new Ctype(ASCII.DIGIT);
2150             return -1;
2151         case 'e':
2152             return '\033';
2153         case 'f':
2154             return '\f';
2155         case 'g':
2156         case 'h':
2157         case 'i':
2158         case 'j':

2159         case 'k':














2160         case 'l':
2161         case 'm':
2162             break;
2163         case 'n':
2164             return '\n';
2165         case 'o':
2166         case 'p':
2167         case 'q':
2168             break;
2169         case 'r':
2170             return '\r';
2171         case 's':
2172             if (create) root = new Ctype(ASCII.SPACE);
2173             return -1;
2174         case 't':
2175             return '\t';
2176         case 'u':
2177             return u();
2178         case 'v':
2179             return '\013';


2439             block = Character.UnicodeBlock.forName(name);
2440         } catch (IllegalArgumentException iae) {
2441             throw error("Unknown character block name {" + name + "}");
2442         }
2443         return new CharProperty() {
2444                 boolean isSatisfiedBy(int ch) {
2445                     return block == Character.UnicodeBlock.of(ch);}};
2446     }
2447 
2448     /**
2449      * Returns a CharProperty matching all characters in a named property.
2450      */
2451     private CharProperty charPropertyNodeFor(String name) {
2452         CharProperty p = CharPropertyNames.charPropertyFor(name);
2453         if (p == null)
2454             throw error("Unknown character property name {" + name + "}");
2455         return p;
2456     }
2457 
2458     /**


















2459      * Parses a group and returns the head node of a set of nodes that process
2460      * the group. Sometimes a double return system is used where the tail is
2461      * returned in root.
2462      */
2463     private Node group0() {
2464         boolean capturingGroup = false;
2465         Node head = null;
2466         Node tail = null;
2467         int save = flags;
2468         root = null;
2469         int ch = next();
2470         if (ch == '?') {
2471             ch = skip();
2472             switch (ch) {
2473             case ':':   //  (?:xxx) pure group
2474                 head = createGroup(true);
2475                 tail = root;
2476                 head.next = expr(tail);
2477                 break;
2478             case '=':   // (?=xxx) and (?!xxx) lookahead
2479             case '!':
2480                 head = createGroup(true);
2481                 tail = root;
2482                 head.next = expr(tail);
2483                 if (ch == '=') {
2484                     head = tail = new Pos(head);
2485                 } else {
2486                     head = tail = new Neg(head);
2487                 }
2488                 break;
2489             case '>':   // (?>xxx)  independent group
2490                 head = createGroup(true);
2491                 tail = root;
2492                 head.next = expr(tail);
2493                 head = tail = new Ques(head, INDEPENDENT);
2494                 break;
2495             case '<':   // (?<xxx)  look behind
2496                 ch = read();












2497                 int start = cursor;
2498                 head = createGroup(true);
2499                 tail = root;
2500                 head.next = expr(tail);
2501                 tail.next = lookbehindEnd;
2502                 TreeInfo info = new TreeInfo();
2503                 head.study(info);
2504                 if (info.maxValid == false) {
2505                     throw error("Look-behind group does not have "
2506                                 + "an obvious maximum length");
2507                 }
2508                 boolean hasSupplementary = findSupplementary(start, patternLength);
2509                 if (ch == '=') {
2510                     head = tail = (hasSupplementary ?
2511                                    new BehindS(head, info.maxLength,
2512                                                info.minLength) :
2513                                    new Behind(head, info.maxLength,
2514                                               info.minLength));
2515                 } else if (ch == '!') {
2516                     head = tail = (hasSupplementary ?




  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  22  * CA 95054 USA or visit www.sun.com if you need additional information or
  23  * have any questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.security.AccessController;
  29 import java.security.PrivilegedAction;
  30 import java.text.CharacterIterator;
  31 import java.text.Normalizer;
  32 import java.util.Map;
  33 import java.util.ArrayList;
  34 import java.util.HashMap;
  35 import java.util.Arrays;
  36 
  37 
  38 /**
  39  * A compiled representation of a regular expression.
  40  *
  41  * <p> A regular expression, specified as a string, must first be compiled into
  42  * an instance of this class.  The resulting pattern can then be used to create
  43  * a {@link Matcher} object that can match arbitrary {@link
  44  * java.lang.CharSequence </code>character sequences<code>} against the regular
  45  * expression.  All of the state involved in performing a match resides in the
  46  * matcher, so many matchers can share the same pattern.
  47  *
  48  * <p> A typical invocation sequence is thus
  49  *
  50  * <blockquote><pre>
  51  * Pattern p = Pattern.{@link #compile compile}("a*b");
  52  * Matcher m = p.{@link #matcher matcher}("aaaaab");


 282  * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
 283  *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
 284  *
 285  * <tr><th>&nbsp;</th></tr>
 286  * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
 287  *
 288  * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
 289  *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
 290  * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
 291  *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
 292  * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
 293  *     <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
 294  *
 295  * <tr><th>&nbsp;</th></tr>
 296  * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
 297  *
 298  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
 299  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
 300  *     <a href="#cg">capturing group</a> matched</td></tr>
 301  *
 302  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i>&lt;<i>name</i>&gt;</td>
 303  *     <td valign="bottom" headers="matches">Whatever the 
 304  *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
 305  *
 306  * <tr><th>&nbsp;</th></tr>
 307  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
 308  *
 309  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
 310  *     <td headers="matches">Nothing, but quotes the following character</td></tr>
 311  * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
 312  *     <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
 313  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
 314  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
 315  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
 316  *
 317  * <tr><th>&nbsp;</th></tr>
 318  * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
 319  *
 320  * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
 321  *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
 322  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
 323  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
 324  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
 325  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
 326  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
 327  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
 328  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
 329  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
 330  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
 331  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
 332  * <a href="#COMMENTS">x</a> on - off</td></tr>
 333  * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
 334  *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
 335  * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
 336  *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
 337  * <tr><td valign="top" headers="construct special"><tt>(?&lt;=</tt><i>X</i><tt>)</tt></td>
 338  *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
 339  * <tr><td valign="top" headers="construct special"><tt>(?&lt;!</tt><i>X</i><tt>)</tt></td>
 340  *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
 341  * <tr><td valign="top" headers="construct special"><tt>(?&gt;</tt><i>X</i><tt>)</tt></td>


 439  *
 440  *   <li> A paragraph-separator character&nbsp;(<tt>'&#92;u2029</tt>).
 441  *
 442  * </ul>
 443  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
 444  * recognized are newline characters.
 445  *
 446  * <p> The regular expression <tt>.</tt> matches any character except a line
 447  * terminator unless the {@link #DOTALL} flag is specified.
 448  *
 449  * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
 450  * line terminators and only match at the beginning and the end, respectively,
 451  * of the entire input sequence. If {@link #MULTILINE} mode is activated then
 452  * <tt>^</tt> matches at the beginning of input and after any line terminator
 453  * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
 454  * matches just before a line terminator or the end of the input sequence.
 455  *
 456  * <a name="cg">
 457  * <h4> Groups and capturing </h4>
 458  *
 459  * <a name="gnumber">
 460  * <h5> Group number </h5>
 461  * <p> Capturing groups are numbered by counting their opening parentheses from
 462  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
 463  * are four such groups: </p>
 464  *
 465  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
 466  * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
 467  *     <td><tt>((A)(B(C)))</tt></td></tr>
 468  * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
 469  *     <td><tt>(A)</tt></td></tr>
 470  * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
 471  *     <td><tt>(B(C))</tt></td></tr>
 472  * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
 473  *     <td><tt>(C)</tt></td></tr>
 474  * </table></blockquote>
 475  *
 476  * <p> Group zero always stands for the entire expression.
 477  *
 478  * <p> Capturing groups are so named because, during a match, each subsequence
 479  * of the input sequence that matches such a group is saved.  The captured
 480  * subsequence may be used later in the expression, via a back reference, and
 481  * may also be retrieved from the matcher once the match operation is complete.
 482  *
 483  * <a name="groupname">
 484  * <h5> Group name </h5>
 485  * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>,
 486  * and then be back-referenced later by the "name". Group names are composed of
 487  * the following characters:
 488  *
 489  * <ul>
 490  *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
 491  *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
 492  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
 493  *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
 494  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
 495  *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),
 496  * </ul>
 497  *
 498  * <p> A <tt>named-capturing group</tt> is still numbered as described in
 499  * <a href="#gnumber">Group number</a>.
 500  *
 501  * <p> The captured input associated with a group is always the subsequence
 502  * that the group most recently matched.  If a group is evaluated a second time
 503  * because of quantification then its previously-captured value, if any, will
 504  * be retained if the second evaluation fails.  Matching the string
 505  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
 506  * group two set to <tt>"b"</tt>.  All captured input is discarded at the
 507  * beginning of each match.
 508  *
 509  * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
 510  * that do not capture text and do not count towards the group total, or
 511  * <i>named-capturing</i> group.
 512  *

 513  * <h4> Unicode support </h4>
 514  *
 515  * <p> This class is in conformance with Level 1 of <a
 516  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
 517  * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
 518  * Canonical Equivalents.
 519  *
 520  * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
 521  * are processed as described in <a
 522  * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
 523  * of the Java Language Specification.  Such escape sequences are also
 524  * implemented directly by the regular-expression parser so that Unicode
 525  * escapes can be used in expressions that are read from files or from the
 526  * keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
 527  * while not equal, compile into the same pattern, which matches the character
 528  * with hexadecimal value <tt>0x2014</tt>.
 529  *
 530  * <a name="ubc"> <p>Unicode blocks and categories are written with the
 531  * <tt>\p</tt> and <tt>\P</tt> constructs as in
 532  * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the


 805 
 806     /**
 807      * The starting point of state machine for the find operation.  This allows
 808      * a match to start anywhere in the input.
 809      */
 810     transient Node root;
 811 
 812     /**
 813      * The root of object tree for a match operation.  The pattern is matched
 814      * at the beginning.  This may include a find that uses BnM or a First
 815      * node.
 816      */
 817     transient Node matchRoot;
 818 
 819     /**
 820      * Temporary storage used by parsing pattern slice.
 821      */
 822     transient int[] buffer;
 823 
 824     /**
 825      * Map the "name" of the "named capturing group" to its group id
 826      * node.
 827      */
 828     transient volatile Map<String, Integer> namedGroups;
 829 
 830     /**
 831      * Temporary storage used while parsing group references.
 832      */
 833     transient GroupHead[] groupNodes;
 834 
 835     /**
 836      * Temporary null terminated code point array used by pattern compiling.
 837      */
 838     private transient int[] temp;
 839 
 840     /**
 841      * The number of capturing groups in this Pattern. Used by matchers to
 842      * allocate storage needed to perform a match.
 843      */
 844     transient int capturingGroupCount;
 845 
 846     /**
 847      * The local variable count used by parsing tree. Used by matchers to
 848      * allocate storage needed to perform a match.
 849      */
 850     transient int localCount;


1483 
1484         boolean hasSupplementary = false;
1485         int c, count = 0;
1486         // Convert all chars into code points
1487         for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1488             c = normalizedPattern.codePointAt(x);
1489             if (isSupplementary(c)) {
1490                 hasSupplementary = true;
1491             }
1492             temp[count++] = c;
1493         }
1494 
1495         patternLength = count;   // patternLength now in code points
1496 
1497         if (! has(LITERAL))
1498             RemoveQEQuoting();
1499 
1500         // Allocate all temporary objects here.
1501         buffer = new int[32];
1502         groupNodes = new GroupHead[10];
1503         namedGroups = null;
1504 
1505         if (has(LITERAL)) {
1506             // Literal pattern handling
1507             matchRoot = newSlice(temp, patternLength, hasSupplementary);
1508             matchRoot.next = lastAccept;
1509         } else {
1510             // Start recursive descent parsing
1511             matchRoot = expr(lastAccept);
1512             // Check extra pattern characters
1513             if (patternLength != cursor) {
1514                 if (peek() == ')') {
1515                     throw error("Unmatched closing ')'");
1516                 } else {
1517                     throw error("Unexpected internal error");
1518                 }
1519             }
1520         }
1521 
1522         // Peephole optimization
1523         if (matchRoot instanceof Slice) {
1524             root = BnM.optimize(matchRoot);
1525             if (root == matchRoot) {
1526                 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1527             }
1528         } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1529             root = matchRoot;
1530         } else {
1531             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1532         }
1533 
1534         // Release temporary storage
1535         temp = null;
1536         buffer = null;
1537         groupNodes = null;
1538         patternLength = 0;
1539         compiled = true;
1540     }
1541 
1542     Map<String, Integer> namedGroups() {
1543         if (namedGroups == null)
1544             namedGroups = new HashMap<String, Integer>(2);
1545         return namedGroups;
1546     }
1547 
1548     /**
1549      * Used to print out a subtree of the Pattern to help with debugging.
1550      */
1551     private static void printObjectTree(Node node) {
1552         while(node != null) {
1553             if (node instanceof Prolog) {
1554                 System.out.println(node);
1555                 printObjectTree(((Prolog)node).loop);
1556                 System.out.println("**** end contents prolog loop");
1557             } else if (node instanceof Loop) {
1558                 System.out.println(node);
1559                 printObjectTree(((Loop)node).body);
1560                 System.out.println("**** end contents Loop body");
1561             } else if (node instanceof Curly) {
1562                 System.out.println(node);
1563                 printObjectTree(((Curly)node).atom);
1564                 System.out.println("**** end contents Curly body");
1565             } else if (node instanceof GroupCurly) {
1566                 System.out.println(node);
1567                 printObjectTree(((GroupCurly)node).atom);


2179             return -1;
2180         case 'a':
2181             return '\007';
2182         case 'b':
2183             if (inclass) break;
2184             if (create) root = new Bound(Bound.BOTH);
2185             return -1;
2186         case 'c':
2187             return c();
2188         case 'd':
2189             if (create) root = new Ctype(ASCII.DIGIT);
2190             return -1;
2191         case 'e':
2192             return '\033';
2193         case 'f':
2194             return '\f';
2195         case 'g':
2196         case 'h':
2197         case 'i':
2198         case 'j':
2199             break;
2200         case 'k':
2201             if (inclass)
2202                 break;
2203             if (read() != '<')
2204                 throw error("\\k is not followed by '<' for named capturing group");
2205             String name = groupname(read());
2206             if (!namedGroups().containsKey(name))
2207                 throw error("(named capturing group <"+ name+"> does not exit");
2208             if (create) {
2209                 if (has(CASE_INSENSITIVE))
2210                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2211                 else
2212                     root = new BackRef(namedGroups().get(name));
2213             }
2214             return -1;
2215         case 'l':
2216         case 'm':
2217             break;
2218         case 'n':
2219             return '\n';
2220         case 'o':
2221         case 'p':
2222         case 'q':
2223             break;
2224         case 'r':
2225             return '\r';
2226         case 's':
2227             if (create) root = new Ctype(ASCII.SPACE);
2228             return -1;
2229         case 't':
2230             return '\t';
2231         case 'u':
2232             return u();
2233         case 'v':
2234             return '\013';


2494             block = Character.UnicodeBlock.forName(name);
2495         } catch (IllegalArgumentException iae) {
2496             throw error("Unknown character block name {" + name + "}");
2497         }
2498         return new CharProperty() {
2499                 boolean isSatisfiedBy(int ch) {
2500                     return block == Character.UnicodeBlock.of(ch);}};
2501     }
2502 
2503     /**
2504      * Returns a CharProperty matching all characters in a named property.
2505      */
2506     private CharProperty charPropertyNodeFor(String name) {
2507         CharProperty p = CharPropertyNames.charPropertyFor(name);
2508         if (p == null)
2509             throw error("Unknown character property name {" + name + "}");
2510         return p;
2511     }
2512 
2513     /**
2514      * Parses and returns the name of a "named capturing group", the trailing
2515      * ">" is consumed after parsing.
2516      */
2517     private String groupname(int ch) {
2518         StringBuilder sb = new StringBuilder();
2519         sb.append(Character.toChars(ch));
2520         while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2521                ASCII.isDigit(ch)) {
2522             sb.append(Character.toChars(ch));
2523         }
2524         if (sb.length() == 0)
2525             throw error("named capturing group has 0 length name");
2526         if (ch != '>')
2527             throw error("named capturing group is missing trailing '>'");
2528         return sb.toString();
2529     }
2530 
2531     /**
2532      * Parses a group and returns the head node of a set of nodes that process
2533      * the group. Sometimes a double return system is used where the tail is
2534      * returned in root.
2535      */
2536     private Node group0() {
2537         boolean capturingGroup = false;
2538         Node head = null;
2539         Node tail = null;
2540         int save = flags;
2541         root = null;
2542         int ch = next();
2543         if (ch == '?') {
2544             ch = skip();
2545             switch (ch) {
2546             case ':':   //  (?:xxx) pure group
2547                 head = createGroup(true);
2548                 tail = root;
2549                 head.next = expr(tail);
2550                 break;
2551             case '=':   // (?=xxx) and (?!xxx) lookahead
2552             case '!':
2553                 head = createGroup(true);
2554                 tail = root;
2555                 head.next = expr(tail);
2556                 if (ch == '=') {
2557                     head = tail = new Pos(head);
2558                 } else {
2559                     head = tail = new Neg(head);
2560                 }
2561                 break;
2562             case '>':   // (?>xxx)  independent group
2563                 head = createGroup(true);
2564                 tail = root;
2565                 head.next = expr(tail);
2566                 head = tail = new Ques(head, INDEPENDENT);
2567                 break;
2568             case '<':   // (?<xxx)  look behind
2569                 ch = read();
2570                 if (Character.isLetter(ch)) {     // named captured group
2571                     String name = groupname(ch);
2572                     if (namedGroups().containsKey(name))
2573                         throw error("Named capturing group <" + name
2574                                     + "> is already defined");
2575                     capturingGroup = true;
2576                     head = createGroup(false);
2577                     tail = root;
2578                     namedGroups().put(name, capturingGroupCount-1);
2579                     head.next = expr(tail);
2580                     break;
2581                 }
2582                 int start = cursor;
2583                 head = createGroup(true);
2584                 tail = root;
2585                 head.next = expr(tail);
2586                 tail.next = lookbehindEnd;
2587                 TreeInfo info = new TreeInfo();
2588                 head.study(info);
2589                 if (info.maxValid == false) {
2590                     throw error("Look-behind group does not have "
2591                                 + "an obvious maximum length");
2592                 }
2593                 boolean hasSupplementary = findSupplementary(start, patternLength);
2594                 if (ch == '=') {
2595                     head = tail = (hasSupplementary ?
2596                                    new BehindS(head, info.maxLength,
2597                                                info.minLength) :
2598                                    new Behind(head, info.maxLength,
2599                                               info.minLength));
2600                 } else if (ch == '!') {
2601                     head = tail = (hasSupplementary ?