jdk Cdiff src/share/classes/java/util/regex/Pattern.java

src/share/classes/java/util/regex/Pattern.java


*** 27,36 ****
--- 27,37 ----
  
  import java.security.AccessController;
  import java.security.PrivilegedAction;
  import java.text.CharacterIterator;
  import java.text.Normalizer;
+ import java.util.Map;
  import java.util.ArrayList;
  import java.util.HashMap;
  import java.util.Arrays;
  
  
*** 296,305 ****
--- 297,310 ----
   *
   * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
   *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
   *     <a href="#cg">capturing group</a> matched</td></tr>
   *
+  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i>&lt;<i>name</i>&gt;</td>
+  *     <td valign="bottom" headers="matches">Whatever the 
+  *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
+  *
   * <tr><th>&nbsp;</th></tr>
   * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
   *
   * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
   *     <td headers="matches">Nothing, but quotes the following character</td></tr>
*** 308,319 ****
   * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
   *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
   *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
   *
   * <tr><th>&nbsp;</th></tr>
!  * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
   *
   * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
   *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
   * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
--- 313,326 ----
   * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
   *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
   *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
   *
   * <tr><th>&nbsp;</th></tr>
!  * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
   *
+  * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
+  *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
   *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
   * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
   *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
   * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
*** 447,456 ****
--- 454,465 ----
   * matches just before a line terminator or the end of the input sequence.
   *
   * <a name="cg">
   * <h4> Groups and capturing </h4>
   *
+  * <a name="gnumber">
+  * <h5> Group number </h5>
   * <p> Capturing groups are numbered by counting their opening parentheses from
   * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
   * are four such groups: </p>
   *
   * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
*** 469,490 ****
   * <p> Capturing groups are so named because, during a match, each subsequence
   * of the input sequence that matches such a group is saved.  The captured
   * subsequence may be used later in the expression, via a back reference, and
   * may also be retrieved from the matcher once the match operation is complete.
   *
   * <p> The captured input associated with a group is always the subsequence
   * that the group most recently matched.  If a group is evaluated a second time
   * because of quantification then its previously-captured value, if any, will
   * be retained if the second evaluation fails.  Matching the string
   * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
   * group two set to <tt>"b"</tt>.  All captured input is discarded at the
   * beginning of each match.
   *
!  * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
!  * that do not capture text and do not count towards the group total.
   *
-  *
   * <h4> Unicode support </h4>
   *
   * <p> This class is in conformance with Level 1 of <a
   * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
   * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
--- 478,517 ----
   * <p> Capturing groups are so named because, during a match, each subsequence
   * of the input sequence that matches such a group is saved.  The captured
   * subsequence may be used later in the expression, via a back reference, and
   * may also be retrieved from the matcher once the match operation is complete.
   *
+  * <a name="groupname">
+  * <h5> Group name </h5>
+  * <p> A capturing group can also be assigned a "name", making it a
+  * <tt>named-capturing group</tt>, and then be back-referenced later by that name.
+  * Group names are composed of the following characters:
+  *
+  * <ul>
+  *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
+  *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
+  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
+  *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
+  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
+  *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>).
+  * </ul>
+  *
+  * <p> A <tt>named-capturing group</tt> is still numbered as described in
+  * <a href="#gnumber">Group number</a>.
+  *
   * <p> The captured input associated with a group is always the subsequence
   * that the group most recently matched.  If a group is evaluated a second time
   * because of quantification then its previously-captured value, if any, will
   * be retained if the second evaluation fails.  Matching the string
   * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
   * group two set to <tt>"b"</tt>.  All captured input is discarded at the
   * beginning of each match.
   *
!  * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
!  * that do not capture text and do not count towards the group total, or
!  * <i>named-capturing</i> groups.
   *
   * <h4> Unicode support </h4>
   *
   * <p> This class is in conformance with Level 1 of <a
   * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
   * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
*** 793,802 ****
--- 820,835 ----
       * Temporary storage used by parsing pattern slice.
       */
      transient int[] buffer;
  
      /**
+      * Maps the name of each named-capturing group to the group index
+      * recorded for it when the group is parsed.
+      */
+     transient volatile Map<String, Integer> namedGroups;
+ 
+     /**
       * Temporary storage used while parsing group references.
       */
      transient GroupHead[] groupNodes;
  
      /**
*** 1465,1474 ****
--- 1498,1508 ----
              RemoveQEQuoting();
  
          // Allocate all temporary objects here.
          buffer = new int[32];
          groupNodes = new GroupHead[10];
+         namedGroups = null;
  
          if (has(LITERAL)) {
              // Literal pattern handling
              matchRoot = newSlice(temp, patternLength, hasSupplementary);
              matchRoot.next = lastAccept;
*** 1503,1512 ****
--- 1537,1552 ----
          groupNodes = null;
          patternLength = 0;
          compiled = true;
      }
  
+     Map<String, Integer> namedGroups() {   // lazy accessor for the group-name map
+         if (namedGroups == null)           // not allocated yet: create on first use
+             namedGroups = new HashMap<String, Integer>(2);   // small initial capacity
+         return namedGroups;
+     }
+ 
      /**
       * Used to print out a subtree of the Pattern to help with debugging.
       */
      private static void printObjectTree(Node node) {
          while(node != null) {
*** 2154,2164 ****
--- 2194,2219 ----
              return '\f';
          case 'g':
          case 'h':
          case 'i':
          case 'j':
+             break;
          case 'k':
+             if (inclass)
+                 break;
+             if (read() != '<')
+                 throw error("\\k is not followed by '<' for named capturing group");
+             String name = groupname(read());   // also consumes the closing '>'
+             if (!namedGroups().containsKey(name))   // only already-defined names can be referenced
+                 throw error("named capturing group <" + name + "> does not exist");
+             if (create) {
+                 if (has(CASE_INSENSITIVE))
+                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
+                 else
+                     root = new BackRef(namedGroups().get(name));
+             }
+             return -1;
          case 'l':
          case 'm':
              break;
          case 'n':
              return '\n';
*** 2454,2463 ****
--- 2509,2536 ----
              throw error("Unknown character property name {" + name + "}");
          return p;
      }
  
      /**
+      * Parses and returns the name of a "named capturing group"; ch is the
+      * first character of the name and the trailing ">" is consumed.
+      */
+     private String groupname(int ch) {
+         StringBuilder sb = new StringBuilder();
+         // Validate the first character too, so the empty-name check below is live.
+         while (ASCII.isLower(ch) || ASCII.isUpper(ch) || ASCII.isDigit(ch)) {
+             sb.append(Character.toChars(ch));
+             ch = read();
+         }
+         if (sb.length() == 0)
+             throw error("named capturing group has 0 length name");
+         if (ch != '>')
+             throw error("named capturing group is missing trailing '>'");
+         return sb.toString();
+     }
+ 
+     /**
       * Parses a group and returns the head node of a set of nodes that process
       * the group. Sometimes a double return system is used where the tail is
       * returned in root.
       */
      private Node group0() {
*** 2492,2501 ****
--- 2565,2586 ----
                  head.next = expr(tail);
                  head = tail = new Ques(head, INDEPENDENT);
                  break;
              case '<':   // (?<xxx)  look behind
                  ch = read();
+                 if (Character.isLetter(ch)) {     // named captured group
+                     String name = groupname(ch);
+                     if (namedGroups().containsKey(name))
+                         throw error("Named capturing group <" + name
+                                     + "> is already defined");
+                     capturingGroup = true;
+                     head = createGroup(false);
+                     tail = root;
+                     namedGroups().put(name, capturingGroupCount-1);
+                     head.next = expr(tail);
+                     break;
+                 }
                  int start = cursor;
                  head = createGroup(true);
                  tail = root;
                  head.next = expr(tail);
                  tail.next = lookbehindEnd;