< prev index next >
src/java.base/share/classes/java/util/regex/Pattern.java
Print this page
rev 54996 : 8221431: Support for Unicode 12.1
Reviewed-by:
@@ -538,11 +538,11 @@
* <h2> Unicode support </h2>
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
* Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
- * Canonical Equivalents.
+ * Canonical Equivalents and RL2.2 Extended Grapheme Clusters.
* <p>
* <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code
* are processed as described in section 3.3 of
* <cite>The Java™ Language Specification</cite>.
* Such escape sequences are also implemented directly by the regular-expression
@@ -1499,19 +1499,12 @@
if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
dst.append((char)ch0);
off++;
continue;
}
- int j = off + Character.charCount(ch0);
+ int j = Grapheme.nextBoundary(src, off, limit);
int ch1;
- while (j < limit) {
- ch1 = src.codePointAt(j);
- if (Grapheme.isBoundary(ch0, ch1))
- break;
- ch0 = ch1;
- j += Character.charCount(ch1);
- }
String seq = src.substring(off, j);
String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
off = j;
if (nfd.length() > 1) {
ch0 = nfd.codePointAt(0);
@@ -3973,18 +3966,11 @@
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0);
- int j = i + n;
- while (j < matcher.to) {
- int ch1 = Character.codePointAt(seq, j);
- if (Grapheme.isBoundary(ch0, ch1))
- break;
- ch0 = ch1;
- j += Character.charCount(ch1);
- }
+ int j = Grapheme.nextBoundary(seq, i, matcher.to);
if (i + n == j) { // single, assume nfc cp
if (predicate.is(ch0))
return next.match(matcher, j, seq);
} else {
while (i + n < j) {
@@ -4019,19 +4005,11 @@
* Node class that matches a Unicode extended grapheme cluster
*/
static class XGrapheme extends Node {
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
- int ch0 = Character.codePointAt(seq, i);
- i += Character.charCount(ch0);
- while (i < matcher.to) {
- int ch1 = Character.codePointAt(seq, i);
- if (Grapheme.isBoundary(ch0, ch1))
- break;
- ch0 = ch1;
- i += Character.charCount(ch1);
- }
+ i = Grapheme.nextBoundary(seq, i, matcher.to);
return next.match(matcher, i, seq);
}
matcher.hitEnd = true;
return false;
}
@@ -4057,12 +4035,13 @@
if (i == startIndex) {
return next.match(matcher, i, seq);
}
if (i < endIndex) {
if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
- !Grapheme.isBoundary(Character.codePointBefore(seq, i),
- Character.codePointAt(seq, i))) {
+ Grapheme.nextBoundary(seq,
+ i - Character.charCount(Character.codePointBefore(seq, i)),
+ i + Character.charCount(Character.codePointAt(seq, i))) > i) {
return false;
}
} else {
matcher.hitEnd = true;
matcher.requireEnd = true;
< prev index next >