1 /* 2 * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.util.Objects; 29 30 final class Grapheme { 31 32 /** 33 * Determines if there is an extended grapheme cluster boundary between two 34 * continuing characters {@code cp1} and {@code cp2}. 35 * <p> 36 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification 37 * for the extended grapheme cluster boundary rules 38 * <p> 39 * Note: this method does not take care of stateful breaking. 40 */ 41 static boolean isBoundary(int cp1, int cp2) { 42 return rules[getType(cp1)][getType(cp2)]; 43 } 44 45 /** 46 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes 47 * the start of the char sequence is a boundary. 48 * <p> 49 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification 50 * for the extended grapheme cluster boundary rules. The following implementation 51 * is based on version 12.0 of the annex. 52 * (http://www.unicode.org/reports/tr29/tr29-35.html) 53 * 54 * @param src the {@code CharSequence} to be scanned 55 * @param off offset to start looking for the next boundary in the src 56 * @param limit limit offset in the src (exclusive) 57 * @return the next possible boundary 58 */ 59 static int nextBoundary(CharSequence src, int off, int limit) { 60 Objects.checkFromToIndex(off, limit, src.length()); 61 62 int ch0 = Character.codePointAt(src, 0); 63 int ret = Character.charCount(ch0); 64 int ch1; 65 // indicates whether gb11 or gb12 is underway 66 int t0 = getGraphemeType(ch0); 67 int riCount = t0 == RI ? 1 : 0; 68 boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; 69 while (ret < limit) { 70 ch1 = Character.codePointAt(src, ret); 71 int t1 = getGraphemeType(ch1); 72 73 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { 74 gb11 = false; 75 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { 76 // continue for gb12 77 } else if (rules[t0][t1]) { 78 if (ret > off) { 79 break; 80 } else { 81 gb11 = t1 == EXTENDED_PICTOGRAPHIC; 82 riCount = 0; 83 } 84 } 85 86 riCount += (t1 == RI) ? 1 : 0; 87 t0 = t1; 88 89 ret += Character.charCount(ch1); 90 } 91 return ret; 92 } 93 94 // types 95 private static final int OTHER = 0; 96 private static final int CR = 1; 97 private static final int LF = 2; 98 private static final int CONTROL = 3; 99 private static final int EXTEND = 4; 100 private static final int ZWJ = 5; 101 private static final int RI = 6; 102 private static final int PREPEND = 7; 103 private static final int SPACINGMARK = 8; 104 private static final int L = 9; 105 private static final int V = 10; 106 private static final int T = 11; 107 private static final int LV = 12; 108 private static final int LVT = 13; 109 private static final int EXTENDED_PICTOGRAPHIC = 14; 110 111 private static final int FIRST_TYPE = 0; 112 private static final int LAST_TYPE = 14; 113 114 private static boolean[][] rules; 115 static { 116 rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1]; 117 // GB 999 Any + Any -> default 118 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) 119 for (int j = FIRST_TYPE; j <= LAST_TYPE; j++) 120 rules[i][j] = true; 121 // GB 6 L x (L | V | LV | VT) 122 rules[L][L] = false; 123 rules[L][V] = false; 124 rules[L][LV] = false; 125 rules[L][LVT] = false; 126 // GB 7 (LV | V) x (V | T) 127 rules[LV][V] = false; 128 rules[LV][T] = false; 129 rules[V][V] = false; 130 rules[V][T] = false; 131 // GB 8 (LVT | T) x T 132 rules[LVT][T] = false; 133 rules[T][T] = false; 134 // GB 9 x (Extend|ZWJ) 135 // GB 9a x Spacing Mark 136 // GB 9b Prepend x 137 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) { 138 rules[i][EXTEND] = false; 139 rules[i][ZWJ] = false; 140 rules[i][SPACINGMARK] = false; 141 rules[PREPEND][i] = false; 142 } 143 // GB 4 (Control | CR | LF) + 144 // GB 5 + (Control | CR | LF) 145 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) 146 for (int j = CR; j <= CONTROL; j++) { 147 rules[i][j] = true; 148 rules[j][i] = true; 149 } 150 // GB 3 CR x LF 151 rules[CR][LF] = false; 152 // GB 11 Exended_Pictographic x (Extend|ZWJ) 153 rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false; 154 rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false; 155 } 156 157 // Hangul syllables 158 private static final int SYLLABLE_BASE = 0xAC00; 159 private static final int LCOUNT = 19; 160 private static final int VCOUNT = 21; 161 private static final int TCOUNT = 28; 162 private static final int NCOUNT = VCOUNT * TCOUNT; // 588 163 private static final int SCOUNT = LCOUNT * NCOUNT; // 11172 164 165 // #tr29: SpacingMark exceptions: The following (which have 166 // General_Category = Spacing_Mark and would otherwise be included) 167 // are specifically excluded 168 private static boolean isExcludedSpacingMark(int cp) { 169 return cp == 0x102B || cp == 0x102C || cp == 0x1038 || 170 cp >= 0x1062 && cp <= 0x1064 || 171 cp >= 0x1062 && cp <= 0x106D || 172 cp == 0x1083 || 173 cp >= 0x1087 && cp <= 0x108C || 174 cp == 0x108F || 175 cp >= 0x109A && cp <= 0x109C || 176 cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || 177 cp == 0xAA7B || cp == 0xAA7D; 178 } 179 180 private static int getGraphemeType(int cp) { 181 if (cp < 0x007F) { // ASCII 182 if (cp < 32) { // Control characters 183 if (cp == 0x000D) 184 return CR; 185 if (cp == 0x000A) 186 return LF; 187 return CONTROL; 188 } 189 return OTHER; 190 } 191 return getType(cp); 192 } 193 194 @SuppressWarnings("fallthrough") 195 private static int getType(int cp) { 196 if (EmojiData.isExtendedPictographic(cp)) { 197 return EXTENDED_PICTOGRAPHIC; 198 } 199 200 int type = Character.getType(cp); 201 switch(type) { 202 case Character.UNASSIGNED: 203 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control 204 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" 205 // so type it as "Other" to make the test happy 206 if (cp == 0x0378) 207 return OTHER; 208 209 case Character.CONTROL: 210 case Character.LINE_SEPARATOR: 211 case Character.PARAGRAPH_SEPARATOR: 212 case Character.SURROGATE: 213 return CONTROL; 214 case Character.FORMAT: 215 if (cp == 0x200C || 216 cp >= 0xE0020 && cp <= 0xE007F) 217 return EXTEND; 218 if (cp == 0x200D) 219 return ZWJ; 220 if (cp >= 0x0600 && cp <= 0x0605 || 221 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 || 222 cp == 0x110BD || cp == 0x110CD) 223 return PREPEND; 224 return CONTROL; 225 case Character.NON_SPACING_MARK: 226 case Character.ENCLOSING_MARK: 227 // NOTE: 228 // #tr29 "plus a few General_Category = Spacing_Mark needed for 229 // canonical equivalence." 230 // but for "extended grapheme clusters" support, there is no 231 // need actually to diff "extend" and "spackmark" given GB9, GB9a 232 return EXTEND; 233 case Character.COMBINING_SPACING_MARK: 234 if (isExcludedSpacingMark(cp)) 235 return OTHER; 236 // NOTE: 237 // 0x11720 and 0x11721 are mentioned in #tr29 as 238 // OTHER_LETTER but it appears their category has been updated to 239 // COMBING_SPACING_MARK already (verified in ver.8) 240 return SPACINGMARK; 241 case Character.OTHER_SYMBOL: 242 if (cp >= 0x1F1E6 && cp <= 0x1F1FF) 243 return RI; 244 return OTHER; 245 case Character.MODIFIER_LETTER: 246 case Character.MODIFIER_SYMBOL: 247 // WARNING: 248 // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt 249 if (cp == 0xFF9E || cp == 0xFF9F || 250 cp >= 0x1F3FB && cp <= 0x1F3FF) 251 return EXTEND; 252 return OTHER; 253 case Character.OTHER_LETTER: 254 if (cp == 0x0E33 || cp == 0x0EB3) 255 return SPACINGMARK; 256 // hangul jamo 257 if (cp >= 0x1100 && cp <= 0x11FF) { 258 if (cp <= 0x115F) 259 return L; 260 if (cp <= 0x11A7) 261 return V; 262 return T; 263 } 264 // hangul syllables 265 int sindex = cp - SYLLABLE_BASE; 266 if (sindex >= 0 && sindex < SCOUNT) { 267 268 if (sindex % TCOUNT == 0) 269 return LV; 270 return LVT; 271 } 272 // hangul jamo_extended A 273 if (cp >= 0xA960 && cp <= 0xA97C) 274 return L; 275 // hangul jamo_extended B 276 if (cp >= 0xD7B0 && cp <= 0xD7C6) 277 return V; 278 if (cp >= 0xD7CB && cp <= 0xD7FB) 279 return T; 280 281 // Prepend 282 switch (cp) { 283 case 0x0D4E: 284 case 0x111C2: 285 case 0x111C3: 286 case 0x11A3A: 287 case 0x11A84: 288 case 0x11A85: 289 case 0x11A86: 290 case 0x11A87: 291 case 0x11A88: 292 case 0x11A89: 293 case 0x11D46: 294 return PREPEND; 295 } 296 } 297 return OTHER; 298 } 299 }