src/share/classes/java/util/regex/Pattern.java
Print this page
*** 27,36 ****
--- 27,37 ----
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.text.CharacterIterator;
import java.text.Normalizer;
+ import java.util.Map;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Arrays;
*** 296,305 ****
--- 297,310 ----
*
* <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
* <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
* <a href="#cg">capturing group</a> matched</td></tr>
*
+ * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i><<i>name</i>></td>
+ * <td valign="bottom" headers="matches">Whatever the
+ * <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
+ *
* <tr><th> </th></tr>
* <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
*
* <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
* <td headers="matches">Nothing, but quotes the following character</td></tr>
*** 308,319 ****
* <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
* <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
* <!-- Metachars: !$()*+.<>?[\]^{|} -->
*
* <tr><th> </th></tr>
! * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
*
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
--- 313,326 ----
* <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
* <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
* <!-- Metachars: !$()*+.<>?[\]^{|} -->
*
* <tr><th> </th></tr>
! * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
*
+ * <tr><td valign="top" headers="construct special"><tt>(?<<a href="#groupname">name</a>></tt><i>X</i><tt>)</tt></td>
+ * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
*** 447,456 ****
--- 454,465 ----
* matches just before a line terminator or the end of the input sequence.
*
* <a name="cg">
* <h4> Groups and capturing </h4>
*
+ * <a name="gnumber">
+ * <h5> Group number </h5>
* <p> Capturing groups are numbered by counting their opening parentheses from
* left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
* are four such groups: </p>
*
* <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
*** 469,490 ****
* <p> Capturing groups are so named because, during a match, each subsequence
* of the input sequence that matches such a group is saved. The captured
* subsequence may be used later in the expression, via a back reference, and
* may also be retrieved from the matcher once the match operation is complete.
*
* <p> The captured input associated with a group is always the subsequence
* that the group most recently matched. If a group is evaluated a second time
* because of quantification then its previously-captured value, if any, will
* be retained if the second evaluation fails. Matching the string
* <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
* group two set to <tt>"b"</tt>. All captured input is discarded at the
* beginning of each match.
*
! * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
! * that do not capture text and do not count towards the group total.
*
- *
* <h4> Unicode support </h4>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
* Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
--- 478,517 ----
* <p> Capturing groups are so named because, during a match, each subsequence
* of the input sequence that matches such a group is saved. The captured
* subsequence may be used later in the expression, via a back reference, and
* may also be retrieved from the matcher once the match operation is complete.
*
+ * <a name="groupname">
+ * <h5> Group name </h5>
+ * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>,
+ * and then be back-referenced later by the "name". Group names are composed of
+ * the following characters:
+ *
+ * <ul>
+ * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
+ * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>),
+ * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
+ * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>),
+ * <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
+ * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>),
+ * </ul>
+ *
+ * <p> A <tt>named-capturing group</tt> is still numbered as described in
+ * <a href="#gnumber">Group number</a>.
+ *
* <p> The captured input associated with a group is always the subsequence
* that the group most recently matched. If a group is evaluated a second time
* because of quantification then its previously-captured value, if any, will
* be retained if the second evaluation fails. Matching the string
* <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
* group two set to <tt>"b"</tt>. All captured input is discarded at the
* beginning of each match.
*
! * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups
! * that do not capture text and do not count towards the group total, or
! * <i>named-capturing</i> group.
*
* <h4> Unicode support </h4>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
* Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
*** 793,802 ****
--- 820,835 ----
* Temporary storage used by parsing pattern slice.
*/
transient int[] buffer;
/**
+ * Map the "name" of the "named capturing group" to its group id
+ * node.
+ */
+ transient volatile Map<String, Integer> namedGroups;
+
+ /**
* Temporary storage used while parsing group references.
*/
transient GroupHead[] groupNodes;
/**
*** 1465,1474 ****
--- 1498,1508 ----
RemoveQEQuoting();
// Allocate all temporary objects here.
buffer = new int[32];
groupNodes = new GroupHead[10];
+ namedGroups = null;
if (has(LITERAL)) {
// Literal pattern handling
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
*** 1503,1512 ****
--- 1537,1552 ----
groupNodes = null;
patternLength = 0;
compiled = true;
}
+ Map<String, Integer> namedGroups() {
+ if (namedGroups == null)
+ namedGroups = new HashMap<String, Integer>(2);
+ return namedGroups;
+ }
+
/**
* Used to print out a subtree of the Pattern to help with debugging.
*/
private static void printObjectTree(Node node) {
while(node != null) {
*** 2154,2164 ****
--- 2194,2219 ----
return '\f';
case 'g':
case 'h':
case 'i':
case 'j':
+ break;
case 'k':
+ if (inclass)
+ break;
+ if (read() != '<')
+ throw error("\\k is not followed by '<' for named capturing group");
+ String name = groupname(read());
+ if (!namedGroups().containsKey(name))
+ throw error("(named capturing group <"+ name+"> does not exit");
+ if (create) {
+ if (has(CASE_INSENSITIVE))
+ root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
+ else
+ root = new BackRef(namedGroups().get(name));
+ }
+ return -1;
case 'l':
case 'm':
break;
case 'n':
return '\n';
*** 2454,2463 ****
--- 2509,2536 ----
throw error("Unknown character property name {" + name + "}");
return p;
}
/**
+ * Parses and returns the name of a "named capturing group", the trailing
+ * ">" is consumed after parsing.
+ */
+ private String groupname(int ch) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(Character.toChars(ch));
+ while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
+ ASCII.isDigit(ch)) {
+ sb.append(Character.toChars(ch));
+ }
+ if (sb.length() == 0)
+ throw error("named capturing group has 0 length name");
+ if (ch != '>')
+ throw error("named capturing group is missing trailing '>'");
+ return sb.toString();
+ }
+
+ /**
* Parses a group and returns the head node of a set of nodes that process
* the group. Sometimes a double return system is used where the tail is
* returned in root.
*/
private Node group0() {
*** 2492,2501 ****
--- 2565,2586 ----
head.next = expr(tail);
head = tail = new Ques(head, INDEPENDENT);
break;
case '<': // (?<xxx) look behind
ch = read();
+ if (Character.isLetter(ch)) { // named captured group
+ String name = groupname(ch);
+ if (namedGroups().containsKey(name))
+ throw error("Named capturing group <" + name
+ + "> is already defined");
+ capturingGroup = true;
+ head = createGroup(false);
+ tail = root;
+ namedGroups().put(name, capturingGroupCount-1);
+ head.next = expr(tail);
+ break;
+ }
int start = cursor;
head = createGroup(true);
tail = root;
head.next = expr(tail);
tail.next = lookbehindEnd;