src/share/classes/java/util/regex/Pattern.java

Print this page

        

@@ -27,10 +27,11 @@
 
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.text.CharacterIterator;
 import java.text.Normalizer;
+import java.util.Map;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Arrays;
 
 

@@ -296,10 +297,14 @@
  *
  * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
  *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
  *     <a href="#cg">capturing group</a> matched</td></tr>
  *
+ * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i>&lt;<i>name</i>&gt;</td>
+ *     <td valign="bottom" headers="matches">Whatever the 
+ *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
+ *
  * <tr><th>&nbsp;</th></tr>
  * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
  *
  * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
  *     <td headers="matches">Nothing, but quotes the following character</td></tr>

@@ -308,12 +313,14 @@
  * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
  *     <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
  *
  * <tr><th>&nbsp;</th></tr>
- * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
+ * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
  *
+ * <tr><td valign="top" headers="construct special"><tt>(?&lt;<a href="#groupname">name</a>&gt;</tt><i>X</i><tt>)</tt></td>
+ *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>

@@ -447,10 +454,12 @@
  * matches just before a line terminator or the end of the input sequence.
  *
  * <a name="cg">
  * <h4> Groups and capturing </h4>
  *
+ * <a name="gnumber">
+ * <h5> Group number </h5>
  * <p> Capturing groups are numbered by counting their opening parentheses from
  * left to right.  In the expression <tt>((A)(B(C)))</tt>, for example, there
  * are four such groups: </p>
  *
  * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">

@@ -469,22 +478,40 @@
  * <p> Capturing groups are so named because, during a match, each subsequence
  * of the input sequence that matches such a group is saved.  The captured
  * subsequence may be used later in the expression, via a back reference, and
  * may also be retrieved from the matcher once the match operation is complete.
  *
+ * <a name="groupname">
+ * <h5> Group name </h5>
+ * <p>A capturing group can also be assigned a "name", making it a
+ * <tt>named-capturing group</tt>, and can then be back-referenced later by that
+ * "name". Group names are composed of the following characters:
+ *
+ * <ul>
+ *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
+ *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
+ *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
+ *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
+ *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
+ *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),
+ * </ul>
+ *
+ * <p> A <tt>named-capturing group</tt> is still numbered as described in
+ * <a href="#gnumber">Group number</a>.
+ *
  * <p> The captured input associated with a group is always the subsequence
  * that the group most recently matched.  If a group is evaluated a second time
  * because of quantification then its previously-captured value, if any, will
  * be retained if the second evaluation fails.  Matching the string
  * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
  * group two set to <tt>"b"</tt>.  All captured input is discarded at the
  * beginning of each match.
  *
- * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
- * that do not capture text and do not count towards the group total.
+ * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
+ * that do not capture text and do not count towards the group total, or
+ * <i>named-capturing</i> groups.
  *
- *
  * <h4> Unicode support </h4>
  *
  * <p> This class is in conformance with Level 1 of <a
  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
  * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1

@@ -793,10 +820,16 @@
      * Temporary storage used by parsing pattern slice.
      */
     transient int[] buffer;
 
     /**
+     * Maps the name of each named-capturing group to the number of that
+     * group.
+     */
+    transient volatile Map<String, Integer> namedGroups;
+
+    /**
      * Temporary storage used while parsing group references.
      */
     transient GroupHead[] groupNodes;
 
     /**

@@ -1465,10 +1498,11 @@
             RemoveQEQuoting();
 
         // Allocate all temporary objects here.
         buffer = new int[32];
         groupNodes = new GroupHead[10];
+        namedGroups = null;
 
         if (has(LITERAL)) {
             // Literal pattern handling
             matchRoot = newSlice(temp, patternLength, hasSupplementary);
             matchRoot.next = lastAccept;

@@ -1503,10 +1537,16 @@
         groupNodes = null;
         patternLength = 0;
         compiled = true;
     }
 
+    /**
+     * Returns the map from group name to group number, creating an empty
+     * map on first use.
+     *
+     * @return the (possibly empty) name-to-number map; never null
+     */
+    Map<String, Integer> namedGroups() {
+        // Read the volatile field exactly once so that the value which was
+        // null-checked is the value that gets returned.
+        Map<String, Integer> groups = namedGroups;
+        if (groups == null)
+            namedGroups = groups = new HashMap<String, Integer>(2);
+        return groups;
+    }
+
     /**
      * Used to print out a subtree of the Pattern to help with debugging.
      */
     private static void printObjectTree(Node node) {
         while(node != null) {

@@ -2154,11 +2194,26 @@
             return '\f';
         case 'g':
         case 'h':
         case 'i':
         case 'j':
+            break;
         case 'k':
+            if (inclass)
+                break;
+            if (read() != '<')
+                throw error("\\k is not followed by '<' for named capturing group");
+            String name = groupname(read());
+            if (!namedGroups().containsKey(name))
+                throw error("(named capturing group <"+ name+"> does not exit");
+            if (create) {
+                if (has(CASE_INSENSITIVE))
+                    root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
+                else
+                    root = new BackRef(namedGroups().get(name));
+            }
+            return -1;
         case 'l':
         case 'm':
             break;
         case 'n':
             return '\n';

@@ -2454,10 +2509,28 @@
             throw error("Unknown character property name {" + name + "}");
         return p;
     }
 
     /**
+     * Parses and returns the name of a "named capturing group"; the trailing
+     * ">" is consumed after parsing.
+     *
+     * <p>A name consists solely of the ASCII letters 'A'-'Z' and 'a'-'z' and
+     * the digits '0'-'9'. The first character is supplied by the caller and
+     * is validated here, because the <tt>\k&lt;name&gt;</tt> path passes it
+     * unchecked and the <tt>(?&lt;name&gt;</tt> path only checks
+     * <tt>Character.isLetter</tt>, which also admits non-ASCII letters.
+     *
+     * @param  ch  the first character of the name, already read by the caller
+     * @return the parsed group name (never empty)
+     * @throws RuntimeException as built by <tt>error(String)</tt> if the name
+     *         is empty, starts with a character that is not permitted, or is
+     *         not terminated by ">"
+     */
+    private String groupname(int ch) {
+        // An immediate '>' means nothing was written between '<' and '>'.
+        if (ch == '>')
+            throw error("named capturing group has 0 length name");
+        if (!(ASCII.isLower(ch) || ASCII.isUpper(ch) || ASCII.isDigit(ch)))
+            throw error("named capturing group name contains an illegal character");
+        StringBuilder sb = new StringBuilder();
+        do {
+            sb.append(Character.toChars(ch));
+        } while (ASCII.isLower(ch = read()) || ASCII.isUpper(ch) ||
+                 ASCII.isDigit(ch));
+        // The loop stops on the first character that cannot be part of a
+        // name; anything other than the closing '>' is a syntax error.
+        if (ch != '>')
+            throw error("named capturing group is missing trailing '>'");
+        return sb.toString();
+    }
+
+    /**
      * Parses a group and returns the head node of a set of nodes that process
      * the group. Sometimes a double return system is used where the tail is
      * returned in root.
      */
     private Node group0() {

@@ -2492,10 +2565,22 @@
                 head.next = expr(tail);
                 head = tail = new Ques(head, INDEPENDENT);
                 break;
             case '<':   // (?<xxx)  look behind
                 ch = read();
+                if (Character.isLetter(ch)) {     // named captured group
+                    String name = groupname(ch);
+                    if (namedGroups().containsKey(name))
+                        throw error("Named capturing group <" + name
+                                    + "> is already defined");
+                    capturingGroup = true;
+                    head = createGroup(false);
+                    tail = root;
+                    namedGroups().put(name, capturingGroupCount-1);
+                    head.next = expr(tail);
+                    break;
+                }
                 int start = cursor;
                 head = createGroup(true);
                 tail = root;
                 head.next = expr(tail);
                 tail.next = lookbehindEnd;