--- old/src/java.base/share/classes/java/util/regex/Pattern.java 2016-02-09 21:53:06.532409902 -0800 +++ new/src/java.base/share/classes/java/util/regex/Pattern.java 2016-02-09 21:53:06.307410882 -0800 @@ -109,6 +109,8 @@ * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} * <= {@code 0x}h...h <= * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT}) + *
\N{
name}
'\u0009'
)
- * A Unicode character can also be represented in a regular-expression by
- * using its Hex notation(hexadecimal code point value) directly as described in construct
- * \x{...}
, for example a supplementary character U+2011F
- * can be specified as \x{2011F}
, instead of two consecutive
- * Unicode escape sequences of the surrogate pair
- * \uD840
\uDD1F
.
+ * A Unicode character can also be represented by using its Hex notation
+ * (hexadecimal code point value) directly as described in construct
+ * \x{...}
, for example a supplementary character U+2011F can be
+ * specified as \x{2011F}
, instead of two consecutive Unicode escape
+ * sequences of the surrogate pair \uD840
\uDD1F
.
+ *
+ * Unicode character names are supported by the named character construct
+ * \N{
...}
, for example, \N{WHITE SMILING FACE}
+ * specifies character \u263A
. The character names supported
+ * by this class are the valid Unicode character names matched by
+ * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
+ *
+ * + * Unicode extended grapheme clusters are supported by the grapheme + * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}. *
* Unicode scripts, blocks, categories and binary properties are written with * the {@code \p} and {@code \P} constructs as in Perl. @@ -679,22 +697,12 @@ *
Perl constructs not supported by this class:
* *Predefined character classes (Unicode character) - *
\X
Match Unicode
- *
- * extended grapheme cluster
- *
The backreference constructs, \g{
n}
for
* the nth capturing group and
* \g{
name}
for
* named-capturing group.
*
The named character construct, \N{
name}
- * for a Unicode character by its name.
- *
The conditional constructs * {@code (?(}condition{@code )}X{@code )} and * {@code (?(}condition{@code )}X{@code |}Y{@code )}, @@ -2357,7 +2365,9 @@ case 'K': case 'L': case 'M': + break; case 'N': + return N(); case 'O': case 'P': case 'Q': @@ -2383,6 +2393,11 @@ : new Ctype(ASCII.WORD).complement(); return -1; case 'X': + if (inclass) break; + if (create) { + root = new XGrapheme(); + } + return -1; case 'Y': break; case 'Z': @@ -2398,7 +2413,19 @@ return '\007'; case 'b': if (inclass) break; - if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); + if (create) { + if (peek() == '{') { + if (skip() == 'g') { + if (read() == '}') { + root = new GraphemeBound(); + return -1; + } + break; // error missing trailing } + } + unread(); unread(); + } + root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); + } return -1; case 'c': return c(); @@ -3275,10 +3302,25 @@ return n; } + private int N() { + if (read() == '{') { + int i = cursor; + while (cursor < patternLength && read() != '}') {} + if (cursor > patternLength) + throw error("Unclosed character name escape sequence"); + String name = new String(temp, i, cursor - i - 1); + try { + return Character.codePointOf(name); + } catch (IllegalArgumentException x) { + throw error("Unknown character name [" + name + "]"); + } + } + throw error("Illegal character name escape sequence"); + } + // // Utility methods for code point support // - private static final int countChars(CharSequence seq, int index, int lengthInCodePoints) { // optimization @@ -3958,6 +4000,62 @@ } /** + * Node class that matches an unicode extended grapheme cluster + */ + static class XGrapheme extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + if (i < matcher.to) { + int ch0 = Character.codePointAt(seq, i); + i += Character.charCount(ch0); + while (i < matcher.to) { + int ch1 = Character.codePointAt(seq, i); + if (Grapheme.isBoundary(ch0, ch1)) + break; + ch0 = ch1; + i += Character.charCount(ch1); + } + 
return next.match(matcher, i, seq); + } + matcher.hitEnd = true; + return false; + } + + boolean study(TreeInfo info) { + info.minLength++; + info.deterministic = false; + return next.study(info); + } + } + + /** + * Node class that handles grapheme boundaries + */ + static class GraphemeBound extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + int startIndex = matcher.from; + int endIndex = matcher.to; + if (matcher.transparentBounds) { + startIndex = 0; + endIndex = matcher.getTextLength(); + } + if (i == startIndex) { + return next.match(matcher, i, seq); + } + if (i < endIndex) { + if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || + !Grapheme.isBoundary(Character.codePointBefore(seq, i), + Character.codePointAt(seq, i))) { + return false; + } + } else { + matcher.hitEnd = true; + matcher.requireEnd = true; + } + return next.match(matcher, i, seq); + } + } + + /** * Base class for all Slice nodes */ static class SliceNode extends Node {