src/java.base/share/classes/java/util/regex/Pattern.java
Print this page
*** 107,116 ****
--- 107,118 ----
* <tr><td valign="top" headers="construct characters"><code>\x</code><i>{h...h}</i></td>
* <td headers="matches">The character with hexadecimal value {@code 0x}<i>h...h</i>
* ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
* <= {@code 0x}<i>h...h</i> <=
* {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
+ * <tr><td valign="top" headers="construct characters"><code>\N{</code><i>name</i><code>}</code></td>
+ * <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr>
* <tr><td valign="top" headers="construct characters">{@code \t}</td>
* <td headers="matches">The tab character (<code>'\u0009'</code>)</td></tr>
* <tr><td valign="top" headers="construct characters">{@code \n}</td>
* <td headers="matches">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
* <tr><td valign="top" headers="construct characters">{@code \r}</td>
*** 241,250 ****
--- 243,254 ----
* <td headers="matches">The beginning of a line</td></tr>
* <tr><td valign="top" headers="construct bounds">{@code $}</td>
* <td headers="matches">The end of a line</td></tr>
* <tr><td valign="top" headers="construct bounds">{@code \b}</td>
* <td headers="matches">A word boundary</td></tr>
+ * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td>
+ * <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr>
* <tr><td valign="top" headers="construct bounds">{@code \B}</td>
* <td headers="matches">A non-word boundary</td></tr>
* <tr><td valign="top" headers="construct bounds">{@code \A}</td>
* <td headers="matches">The beginning of the input</td></tr>
* <tr><td valign="top" headers="construct bounds">{@code \G}</td>
*** 261,270 ****
--- 265,279 ----
* <td headers="matches">Any Unicode linebreak sequence; equivalent to
* <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]
* </code></td></tr>
*
* <tr><th> </th></tr>
+ * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr>
+ * <tr><td valign="top" headers="construct grapheme">{@code \X}</td>
+ * <td headers="matches">Any Unicode extended grapheme cluster</td></tr>
+ *
+ * <tr><th> </th></tr>
* <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
*
* <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
* <td headers="matches"><i>X</i>, once or not at all</td></tr>
* <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
*** 544,559 ****
* parser so that Unicode escapes can be used in expressions that are read from
* files or from the keyboard. Thus the strings <code>"\u2014"</code> and
* {@code "\\u2014"}, while not equal, compile into the same pattern, which
* matches the character with hexadecimal value {@code 0x2014}.
* <p>
! * A Unicode character can also be represented in a regular-expression by
! * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct
! * <code>\x{...}</code>, for example a supplementary character U+2011F
! * can be specified as <code>\x{2011F}</code>, instead of two consecutive
! * Unicode escape sequences of the surrogate pair
! * <code>\uD840</code><code>\uDD1F</code>.
* <p>
* Unicode scripts, blocks, categories and binary properties are written with
* the {@code \p} and {@code \P} constructs as in Perl.
* <code>\p{</code><i>prop</i><code>}</code> matches if
* the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
--- 553,577 ----
* parser so that Unicode escapes can be used in expressions that are read from
* files or from the keyboard. Thus the strings <code>"\u2014"</code> and
* {@code "\\u2014"}, while not equal, compile into the same pattern, which
* matches the character with hexadecimal value {@code 0x2014}.
* <p>
! * A Unicode character can also be represented by using its <b>Hex notation</b>
! * (hexadecimal code point value) directly, as described in construct
! * <code>\x{...}</code>. For example, the supplementary character U+2011F can be
! * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape
! * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>.
! * <p>
! * <b>Unicode character names</b> are supported by the named character construct
! * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>
! * specifies character <code>\u263A</code>. The character names supported
! * by this class are the valid Unicode character names matched by
! * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
! * <p>
! * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
! * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme
! * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.
* <p>
* Unicode scripts, blocks, categories and binary properties are written with
* the {@code \p} and {@code \P} constructs as in Perl.
* <code>\p{</code><i>prop</i><code>}</code> matches if
* the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
*** 677,702 ****
* with ordered alternation as occurs in Perl 5.
*
* <p> Perl constructs not supported by this class: </p>
*
* <ul>
- * <li><p> Predefined character classes (Unicode character)
- * <p><code>\X </code>Match Unicode
- * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
- * <i>extended grapheme cluster</i></a>
- * </p></li>
- *
* <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
* the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and
* <code>\g{</code><i>name</i><code>}</code> for
* <a href="#groupname">named-capturing group</a>.
* </p></li>
*
- * <li><p> The named character construct, <code>\N{</code><i>name</i><code>}</code>
- * for a Unicode character by its name.
- * </p></li>
- *
* <li><p> The conditional constructs
* {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
* {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
* </p></li>
*
--- 695,710 ----
*** 2355,2365 ****
--- 2363,2375 ----
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
+ break;
case 'N':
+ return N();
case 'O':
case 'P':
case 'Q':
break;
case 'R':
*** 2381,2390 ****
--- 2391,2405 ----
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD).complement()
: new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
+ if (inclass) break;
+ if (create) {
+ root = new XGrapheme();
+ }
+ return -1;
case 'Y':
break;
case 'Z':
if (inclass) break;
if (create) {
*** 2396,2406 ****
return -1;
case 'a':
return '\007';
case 'b':
if (inclass) break;
! if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'c':
return c();
case 'd':
if (create) root = has(UNICODE_CHARACTER_CLASS)
--- 2411,2433 ----
return -1;
case 'a':
return '\007';
case 'b':
if (inclass) break;
! if (create) {
! if (peek() == '{') {
! if (skip() == 'g') {
! if (read() == '}') {
! root = new GraphemeBound();
! return -1;
! }
! break; // error missing trailing }
! }
! unread(); unread();
! }
! root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
! }
return -1;
case 'c':
return c();
case 'd':
if (create) root = has(UNICODE_CHARACTER_CLASS)
*** 3273,3286 ****
setcursor(cur);
}
return n;
}
//
// Utility methods for code point support
//
-
private static final int countChars(CharSequence seq, int index,
int lengthInCodePoints) {
// optimization
if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
assert (index >= 0 && index < seq.length());
--- 3300,3328 ----
setcursor(cur);
}
return n;
}
+ /**
+  * Parses the body of a named-character escape, i.e. the
+  * <code>{</code><i>name</i><code>}</code> part that follows {@code \N},
+  * and resolves the name to a code point via
+  * {@link Character#codePointOf(String)}.
+  *
+  * NOTE(review): read() is assumed to return the pattern element at the
+  * cursor and advance the cursor by one; its definition is outside this
+  * excerpt.
+  *
+  * @return the code point denoted by the character name
+  * @throws PatternSyntaxException (as produced by error(...)) if the opening
+  *         brace is missing, the closing brace is missing, or the name is
+  *         not recognized by Character.codePointOf
+  */
+ private int N() {
+     if (read() != '{')
+         throw error("Illegal character name escape sequence");
+     int i = cursor;                  // position of the first name character
+     boolean closed = false;
+     // Scan forward for the closing brace without running past the pattern.
+     // Reaching patternLength is not enough to decide: the last character
+     // read may or may not have been '}', so record it explicitly.
+     while (cursor < patternLength) {
+         if (read() == '}') {
+             closed = true;
+             break;
+         }
+     }
+     if (!closed)
+         throw error("Unclosed character name escape sequence");
+     // The cursor now sits just past '}', hence the -1 to drop the brace.
+     String name = new String(temp, i, cursor - i - 1);
+     try {
+         return Character.codePointOf(name);
+     } catch (IllegalArgumentException x) {
+         throw error("Unknown character name [" + name + "]");
+     }
+ }
+
//
// Utility methods for code point support
//
private static final int countChars(CharSequence seq, int index,
int lengthInCodePoints) {
// optimization
if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
assert (index >= 0 && index < seq.length());
*** 3956,3965 ****
--- 3998,4063 ----
cp == 0x202f || cp == 0x205f || cp == 0x3000;
}
}
/**
+ * Node class that matches a single Unicode extended grapheme cluster.
+ * Starting at the current position, it consumes code points until
+ * Grapheme.isBoundary reports a boundary between two neighbours or the
+ * end of the matcher's region (matcher.to) is reached.
+ */
+ static class XGrapheme extends Node {
+     boolean match(Matcher matcher, int i, CharSequence seq) {
+         // Nothing left in the region: record that the end was hit and fail.
+         if (i >= matcher.to) {
+             matcher.hitEnd = true;
+             return false;
+         }
+         // The first code point always belongs to the cluster.
+         int prev = Character.codePointAt(seq, i);
+         int pos = i + Character.charCount(prev);
+         // Extend the cluster while no boundary separates prev and cur.
+         while (pos < matcher.to) {
+             int cur = Character.codePointAt(seq, pos);
+             if (Grapheme.isBoundary(prev, cur)) {
+                 break;
+             }
+             pos += Character.charCount(cur);
+             prev = cur;
+         }
+         return next.match(matcher, pos, seq);
+     }
+
+     boolean study(TreeInfo info) {
+         // A cluster contributes at least one char; its length is not fixed.
+         info.minLength += 1;
+         info.deterministic = false;
+         return next.study(info);
+     }
+ }
+
+ /**
+  * Node class that matches at a Unicode extended grapheme cluster boundary.
+  * The start of the (possibly transparent) bounds always counts as a
+  * boundary; a position at or past the end of the bounds is accepted and
+  * flags hitEnd/requireEnd; any other position is a boundary only if it
+  * does not split a surrogate pair and Grapheme.isBoundary accepts the
+  * code points on either side.
+  */
+ static class GraphemeBound extends Node {
+     boolean match(Matcher matcher, int i, CharSequence seq) {
+         // With transparent bounds, look at the whole text rather than the region.
+         final boolean transparent = matcher.transparentBounds;
+         final int startIndex = transparent ? 0 : matcher.from;
+         final int endIndex = transparent ? matcher.getTextLength() : matcher.to;
+
+         if (i != startIndex) {
+             if (i >= endIndex) {
+                 // At the end of input: more input could change the result.
+                 matcher.hitEnd = true;
+                 matcher.requireEnd = true;
+             } else if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))
+                     || !Grapheme.isBoundary(Character.codePointBefore(seq, i),
+                                             Character.codePointAt(seq, i))) {
+                 // Inside a surrogate pair or inside a cluster: not a boundary.
+                 return false;
+             }
+         }
+         return next.match(matcher, i, seq);
+     }
+ }
+
+ /**
* Base class for all Slice nodes
*/
static class SliceNode extends Node {
int[] buffer;
SliceNode(int[] buf) {