13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.util.Objects;
29
30 final class Grapheme {
31
32 /**
33 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
34 * the start of the char sequence is a boundary.
35 * <p>
36 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
37 * for the extended grapheme cluster boundary rules. The following implementation
38 * is based on version 12.0 of the annex.
39 * (http://www.unicode.org/reports/tr29/tr29-35.html)
40 *
41 * @param src the {@code CharSequence} to be scanned
42 * @param off offset to start looking for the next boundary in the src
43 * @param limit limit offset in the src (exclusive)
44 * @return the next possible boundary
45 */
46 static int nextBoundary(CharSequence src, int off, int limit) {
47 Objects.checkFromToIndex(off, limit, src.length());
48
49 int ch0 = Character.codePointAt(src, 0);
50 int ret = Character.charCount(ch0);
51 int ch1;
52 // indicates whether gb11 or gb12 is underway
53 boolean gb11 = EmojiData.isExtendedPictographic(ch0);
54 int riCount = getType(ch0) == RI ? 1 : 0;
55 while (ret < limit) {
56 ch1 = Character.codePointAt(src, ret);
57 int t0 = getType(ch0);
58 int t1 = getType(ch1);
59
60 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
61 gb11 = false;
62 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
63 // continue for gb12
64 } else if (rules[t0][t1]) {
65 if (ret > off) {
66 break;
67 } else {
68 gb11 = EmojiData.isExtendedPictographic(ch1);
69 riCount = 0;
70 }
71 }
72
73 riCount += getType(ch1) == RI ? 1 : 0;
74 ch0 = ch1;
75 ret += Character.charCount(ch1);
76 }
77 return ret;
78 }
79
80 // types
81 private static final int OTHER = 0;
82 private static final int CR = 1;
83 private static final int LF = 2;
84 private static final int CONTROL = 3;
85 private static final int EXTEND = 4;
86 private static final int ZWJ = 5;
87 private static final int RI = 6;
88 private static final int PREPEND = 7;
89 private static final int SPACINGMARK = 8;
90 private static final int L = 9;
91 private static final int V = 10;
92 private static final int T = 11;
93 private static final int LV = 12;
94 private static final int LVT = 13;
146 private static final int VCOUNT = 21;
147 private static final int TCOUNT = 28;
148 private static final int NCOUNT = VCOUNT * TCOUNT; // 588
149 private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
150
151 // #tr29: SpacingMark exceptions: The following (which have
152 // General_Category = Spacing_Mark and would otherwise be included)
153 // are specifically excluded
154 private static boolean isExcludedSpacingMark(int cp) {
155 return cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
156 cp >= 0x1062 && cp <= 0x1064 ||
157 cp >= 0x1062 && cp <= 0x106D ||
158 cp == 0x1083 ||
159 cp >= 0x1087 && cp <= 0x108C ||
160 cp == 0x108F ||
161 cp >= 0x109A && cp <= 0x109C ||
162 cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
163 cp == 0xAA7B || cp == 0xAA7D;
164 }
165
166 @SuppressWarnings("fallthrough")
167 private static int getType(int cp) {
168 if (EmojiData.isExtendedPictographic(cp)) {
169 return EXTENDED_PICTOGRAPHIC;
170 }
171
172 int type = Character.getType(cp);
173 switch(type) {
174 case Character.CONTROL:
175 if (cp == 0x000D)
176 return CR;
177 if (cp == 0x000A)
178 return LF;
179 return CONTROL;
180 case Character.UNASSIGNED:
181 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
182 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
183 // so type it as "Other" to make the test happy
184 if (cp == 0x0378)
185 return OTHER;
186
187 case Character.LINE_SEPARATOR:
188 case Character.PARAGRAPH_SEPARATOR:
189 case Character.SURROGATE:
190 return CONTROL;
191 case Character.FORMAT:
192 if (cp == 0x200C ||
193 cp >= 0xE0020 && cp <= 0xE007F)
194 return EXTEND;
195 if (cp == 0x200D)
196 return ZWJ;
197 if (cp >= 0x0600 && cp <= 0x0605 ||
198 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
199 cp == 0x110BD || cp == 0x110CD)
200 return PREPEND;
201 return CONTROL;
202 case Character.NON_SPACING_MARK:
203 case Character.ENCLOSING_MARK:
204 // NOTE:
205 // #tr29 "plus a few General_Category = Spacing_Mark needed for
206 // canonical equivalence."
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.util.regex;
27
28 import java.util.Objects;
29
30 final class Grapheme {
31
32 /**
33 * Determines if there is an extended grapheme cluster boundary between two
34 * continuing characters {@code cp1} and {@code cp2}.
35 * <p>
36 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
37 * for the extended grapheme cluster boundary rules
38 * <p>
39 * Note: this method does not take care of stateful breaking.
40 */
41 static boolean isBoundary(int cp1, int cp2) {
42 return rules[getType(cp1)][getType(cp2)];
43 }
44
45 /**
46 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
47 * the start of the char sequence is a boundary.
48 * <p>
49 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
50 * for the extended grapheme cluster boundary rules. The following implementation
51 * is based on version 12.0 of the annex.
52 * (http://www.unicode.org/reports/tr29/tr29-35.html)
53 *
54 * @param src the {@code CharSequence} to be scanned
55 * @param off offset to start looking for the next boundary in the src
56 * @param limit limit offset in the src (exclusive)
57 * @return the next possible boundary
58 */
59 static int nextBoundary(CharSequence src, int off, int limit) {
60 Objects.checkFromToIndex(off, limit, src.length());
61
62 int ch0 = Character.codePointAt(src, 0);
63 int ret = Character.charCount(ch0);
64 int ch1;
65 // indicates whether gb11 or gb12 is underway
66 int t0 = getGraphemeType(ch0);
67 int riCount = t0 == RI ? 1 : 0;
68 boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
69 while (ret < limit) {
70 ch1 = Character.codePointAt(src, ret);
71 int t1 = getGraphemeType(ch1);
72
73 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
74 gb11 = false;
75 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
76 // continue for gb12
77 } else if (rules[t0][t1]) {
78 if (ret > off) {
79 break;
80 } else {
81 gb11 = t1 == EXTENDED_PICTOGRAPHIC;
82 riCount = 0;
83 }
84 }
85
86 riCount += (t1 == RI) ? 1 : 0;
87 t0 = t1;
88
89 ret += Character.charCount(ch1);
90 }
91 return ret;
92 }
93
94 // types
95 private static final int OTHER = 0;
96 private static final int CR = 1;
97 private static final int LF = 2;
98 private static final int CONTROL = 3;
99 private static final int EXTEND = 4;
100 private static final int ZWJ = 5;
101 private static final int RI = 6;
102 private static final int PREPEND = 7;
103 private static final int SPACINGMARK = 8;
104 private static final int L = 9;
105 private static final int V = 10;
106 private static final int T = 11;
107 private static final int LV = 12;
108 private static final int LVT = 13;
160 private static final int VCOUNT = 21;
161 private static final int TCOUNT = 28;
162 private static final int NCOUNT = VCOUNT * TCOUNT; // 588
163 private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
164
165 // #tr29: SpacingMark exceptions: The following (which have
166 // General_Category = Spacing_Mark and would otherwise be included)
167 // are specifically excluded
168 private static boolean isExcludedSpacingMark(int cp) {
169 return cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
170 cp >= 0x1062 && cp <= 0x1064 ||
171 cp >= 0x1062 && cp <= 0x106D ||
172 cp == 0x1083 ||
173 cp >= 0x1087 && cp <= 0x108C ||
174 cp == 0x108F ||
175 cp >= 0x109A && cp <= 0x109C ||
176 cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
177 cp == 0xAA7B || cp == 0xAA7D;
178 }
179
180 private static int getGraphemeType(int cp) {
181 if (cp < 0x007F) { // ASCII
182 if (cp < 32) { // Control characters
183 if (cp == 0x000D)
184 return CR;
185 if (cp == 0x000A)
186 return LF;
187 return CONTROL;
188 }
189 return OTHER;
190 }
191 return getType(cp);
192 }
193
194 @SuppressWarnings("fallthrough")
195 private static int getType(int cp) {
196 if (EmojiData.isExtendedPictographic(cp)) {
197 return EXTENDED_PICTOGRAPHIC;
198 }
199
200 int type = Character.getType(cp);
201 switch(type) {
202 case Character.UNASSIGNED:
203 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
204 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
205 // so type it as "Other" to make the test happy
206 if (cp == 0x0378)
207 return OTHER;
208
209 case Character.CONTROL:
210 case Character.LINE_SEPARATOR:
211 case Character.PARAGRAPH_SEPARATOR:
212 case Character.SURROGATE:
213 return CONTROL;
214 case Character.FORMAT:
215 if (cp == 0x200C ||
216 cp >= 0xE0020 && cp <= 0xE007F)
217 return EXTEND;
218 if (cp == 0x200D)
219 return ZWJ;
220 if (cp >= 0x0600 && cp <= 0x0605 ||
221 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
222 cp == 0x110BD || cp == 0x110CD)
223 return PREPEND;
224 return CONTROL;
225 case Character.NON_SPACING_MARK:
226 case Character.ENCLOSING_MARK:
227 // NOTE:
228 // #tr29 "plus a few General_Category = Spacing_Mark needed for
229 // canonical equivalence."
|