/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 7071819
 * @summary tests Unicode Extended Grapheme support
 * @run main GraphemeTest
 */

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.stream.Stream;

/**
 * Checks a local grapheme-break classification ({@link #getType(int)}) and
 * pair-wise break table ({@code rules}) against two data files that are read
 * from the directory named by the {@code test.src} system property
 * (defaulting to the current directory):
 * <ul>
 *   <li>{@code GraphemeBreakProperty.txt} - code point (ranges) and the
 *       property name expected for them;</li>
 *   <li>{@code GraphemeBreakTest.txt} - code point sequences separated by
 *       DIVISION SIGN (U+00F7, "break") or MULTIPLICATION SIGN (U+00D7,
 *       "no break").</li>
 * </ul>
 * Any mismatch is reported by throwing a {@link RuntimeException}.
 */
public class GraphemeTest {

    /** Field separator used in GraphemeBreakProperty.txt lines. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Everything removed from a GraphemeBreakTest.txt line before parsing:
     * whitespace, parenthesized names and the trailing '#' comment.
     * NOTE(review): the third alternative looks like a typo for
     * "\\[[a-zA-Z]+\\]"; it is kept byte-for-byte so parsing is unchanged.
     */
    private static final Pattern BREAK_LINE_NOISE =
        Pattern.compile("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*");

    /** Splits a cleaned test line on either break mark (U+00F7 or U+00D7). */
    private static final Pattern BREAK_MARK = Pattern.compile("\u00f7|\u00d7");

    /** The mark that means "a break is allowed here" (U+00F7). */
    private static final char BREAK = '\u00f7';

    /**
     * Runs both checks against the data files located in {@code test.src}.
     *
     * @param args ignored
     * @throws Throwable if a data file cannot be read or a check fails
     */
    public static void main(String[] args) throws Throwable {
        testProps(Paths.get(System.getProperty("test.src", "."),
                            "GraphemeBreakProperty.txt"));
        testBreak(Paths.get(System.getProperty("test.src", "."),
                            "GraphemeBreakTest.txt"));
    }

    /**
     * Verifies every non-empty, non-comment line of the property file.
     *
     * @param path location of GraphemeBreakProperty.txt
     * @throws IOException if the file cannot be opened
     * @throws RuntimeException if a code point is classified differently
     */
    private static void testProps(Path path) throws IOException {
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(path)) {
            lines.filter(ln -> ln.length() != 0 && !ln.startsWith("#"))
                 .forEach(GraphemeTest::checkPropertyLine);
        }
    }

    /**
     * Checks one property line of the form
     * {@code <cp>[..<cp>] ; <Property> ...}: the first whitespace separated
     * field is the code point or range, the third is the expected name.
     */
    private static void checkPropertyLine(String ln) {
        String[] fields = WHITESPACE.split(ln);
        String range = fields[0];
        String expected = fields[2];
        int off = range.indexOf("..");
        int first;
        int last;
        if (off != -1) {
            first = Integer.parseInt(range, 0, off, 16);
            last = Integer.parseInt(range, off + 2, range.length(), 16);
        } else {
            first = last = Integer.parseInt(range, 16);
        }
        for (int cp = first; cp <= last; cp++) {
            String actual = types[getType(cp)];
            if (expected.equals(actual)) {
                continue;
            }
            // NOTE:
            // #tr29 "plus a few General_Category = Spacing_Mark needed for
            // canonical equivalence."
            // For "extended grapheme clusters" support, there is no
            // need actually to diff "Extend" and "SpacingMark" given GB9, GB9a,
            // so that one mismatch is only logged.
            if ("Extend".equals(expected) && "SpacingMark".equals(actual)) {
                System.out.printf("[%x] [%s][%d] -> [%s]%n",
                                  cp, expected, Character.getType(cp), actual);
            } else {
                throw new RuntimeException(String.format(
                    "cp=[%x], expected:[%s] result:[%s]%n",
                    cp, expected, actual));
            }
        }
    }

    /**
     * Verifies every non-empty, non-comment line of the break test file.
     *
     * @param path location of GraphemeBreakTest.txt
     * @throws IOException if the file cannot be opened
     * @throws RuntimeException if the rule table disagrees with a line
     */
    private static void testBreak(Path path) throws IOException {
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(path)) {
            lines.filter(ln -> ln.length() != 0 && !ln.startsWith("#"))
                 .forEach(GraphemeTest::checkBreakLine);
        }
    }

    /**
     * Checks one break-test line: for each adjacent pair of code points the
     * mark found between them must agree with {@code rules}.
     */
    private static void checkBreakLine(String ln) {
        String str = BREAK_LINE_NOISE.matcher(ln).replaceAll("");
        String[] cstrs = BREAK_MARK.split(str);
        int prevCp = -1;
        char prevBk = BREAK;
        int offBk = 0;      // index in str of the mark following the last code point
        for (String cstr : cstrs) {
            if (cstr.length() == 0) {   // leading empty string before the first mark
                continue;
            }
            int cp = Integer.parseInt(cstr, 16);
            if (prevCp != -1) {
                // test against the rules directly
                boolean breakAllowed = rules[getType(prevCp)][getType(cp)];
                if (breakAllowed != (prevBk == BREAK)) {
                    throw new RuntimeException(String.format(
                        "NG %x[%d] %x[%d] -> %b [%s]%n",
                        prevCp, getType(prevCp), cp, getType(cp),
                        breakAllowed,
                        ln));
                }
            }
            prevCp = cp;
            offBk += (cstr.length() + 1);
            prevBk = str.charAt(offBk);
        }
    }

    /** Property names as spelled in the data file, indexed by type constant. */
    private static final String[] types = {
        "Other", "CR", "LF", "Control", "Extend", "Regional_Indicator",
        "Prepend", "SpacingMark",
        "L", "V", "T", "LV", "LVT" };

    ///////////////////////////////////////////////////////////////////////////

    // types
    private static final int OTHER = 0;
    private static final int CR = 1;
    private static final int LF = 2;
    private static final int CONTROL = 3;
    private static final int EXTEND = 4;
    private static final int RI = 5;
    private static final int PREPEND = 6;
    private static final int SPACINGMARK = 7;
    private static final int L = 8;
    private static final int V = 9;
    private static final int T = 10;
    private static final int LV = 11;
    private static final int LVT = 12;

    private static final int FIRST_TYPE = 0;
    private static final int LAST_TYPE = 12;

    /**
     * {@code rules[a][b]} is {@code true} when a break is allowed between a
     * code point of type {@code a} and a following one of type {@code b}.
     * Later assignments override earlier ones, so the order below matters.
     */
    private static final boolean[][] rules;
    static {
        rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
        // default: break between any pair
        for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
            for (int j = FIRST_TYPE; j <= LAST_TYPE; j++) {
                rules[i][j] = true;
            }
        }
        // GB 6 L x (L | V | LV | LVT)
        rules[L][L] = false;
        rules[L][V] = false;
        rules[L][LV] = false;
        rules[L][LVT] = false;
        // GB 7 (LV | V) x (V | T)
        rules[LV][V] = false;
        rules[LV][T] = false;
        rules[V][V] = false;
        rules[V][T] = false;
        // GB 8 (LVT | T) x T
        rules[LVT][T] = false;
        rules[T][T] = false;
        // GB 8a RI x RI
        rules[RI][RI] = false;
        // GB 9 x Extend
        // GB 9a x Spacing Mark
        // GB 9b Prepend x
        for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
            rules[i][EXTEND] = false;
            rules[i][SPACINGMARK] = false;
            rules[PREPEND][i] = false;
        }
        // GB 4 break after (Control | CR | LF)
        // GB 5 break before (Control | CR | LF)
        // (relies on CR, LF, CONTROL being the consecutive values 1..3)
        for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
            for (int j = CR; j <= CONTROL; j++) {
                rules[i][j] = true;
                rules[j][i] = true;
            }
        }
        // GB 3 CR x LF
        rules[CR][LF] = false;
        // GB 10 break between any other pair -> default
    }

    // Hangul syllables
    private static final int SYLLABLE_BASE = 0xAC00;
    private static final int LCOUNT = 19;
    private static final int VCOUNT = 21;
    private static final int TCOUNT = 28;
    private static final int NCOUNT = VCOUNT * TCOUNT; // 588
    private static final int SCOUNT = LCOUNT * NCOUNT; // 11172

    /**
     * #tr29: SpacingMark exceptions: The following (which have
     * General_Category = Spacing_Mark and would otherwise be included)
     * are specifically excluded.
     *
     * @param cp code point to test
     * @return {@code true} if {@code cp} is one of the listed exceptions
     */
    private static boolean isExcludedSpacingMark(int cp) {
        return cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
               cp >= 0x1062 && cp <= 0x1064 ||
               cp >= 0x1067 && cp <= 0x106D ||
               cp == 0x1083 ||
               cp >= 0x1087 && cp <= 0x108C ||
               cp == 0x108F ||
               cp >= 0x109A && cp <= 0x109C ||
               cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
               cp == 0xAA7B || cp == 0xAA7D;
    }

    /**
     * Classifies a code point into one of the type constants above, starting
     * from its {@link Character#getType(int) general category}.
     *
     * @param cp code point to classify
     * @return one of {@code OTHER} .. {@code LVT}; usable as an index into
     *         {@code types} and {@code rules}
     */
    private static int getType(int cp) {
        switch (Character.getType(cp)) {
        case Character.CONTROL:
            if (cp == 0x000D) {
                return CR;
            }
            if (cp == 0x000A) {
                return LF;
            }
            return CONTROL;
        case Character.UNASSIGNED:
            // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
            // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
            // so type it as "Other" to make the test happy
            if (cp == 0x0378) {
                return OTHER;
            }
            return CONTROL;
        case Character.LINE_SEPARATOR:
        case Character.PARAGRAPH_SEPARATOR:
        case Character.SURROGATE:
            return CONTROL;
        case Character.FORMAT:
            if (cp == 0x200C || cp == 0x200D) {
                return EXTEND;
            }
            return CONTROL;
        case Character.NON_SPACING_MARK:
        case Character.ENCLOSING_MARK:
            // NOTE:
            // #tr29 "plus a few General_Category = Spacing_Mark needed for
            // canonical equivalence."
            // but for "extended grapheme clusters" support, there is no
            // need actually to diff "Extend" and "SpacingMark" given GB9, GB9a
            return EXTEND;
        case Character.COMBINING_SPACING_MARK:
            if (isExcludedSpacingMark(cp)) {
                return OTHER;
            }
            // NOTE:
            // 0x11720 and 0x11721 are mentioned in #tr29 as
            // OTHER_LETTER but it appears their category has been updated to
            // COMBINING_SPACING_MARK already (verified in ver.8)
            return SPACINGMARK;
        case Character.OTHER_SYMBOL:
            if (cp >= 0x1F1E6 && cp <= 0x1F1FF) {
                return RI;
            }
            return OTHER;
        case Character.MODIFIER_LETTER:
            // WARNING:
            // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
            if (cp == 0xFF9E || cp == 0xFF9F) {
                return EXTEND;
            }
            return OTHER;
        case Character.OTHER_LETTER:
            if (cp == 0x0E33 || cp == 0x0EB3) {
                return SPACINGMARK;
            }
            // hangul jamo
            if (cp >= 0x1100 && cp <= 0x11FF) {
                if (cp <= 0x115F) {
                    return L;
                }
                if (cp <= 0x11A7) {
                    return V;
                }
                return T;
            }
            // hangul syllables: every TCOUNT-th one has no trailing consonant
            int sindex = cp - SYLLABLE_BASE;
            if (sindex >= 0 && sindex < SCOUNT) {
                if (sindex % TCOUNT == 0) {
                    return LV;
                }
                return LVT;
            }
            // hangul jamo_extended A
            if (cp >= 0xA960 && cp <= 0xA97C) {
                return L;
            }
            // hangul jamo_extended B
            if (cp >= 0xD7B0 && cp <= 0xD7C6) {
                return V;
            }
            if (cp >= 0xD7CB && cp <= 0xD7FB) {
                return T;
            }
            return OTHER;
        default:
            return OTHER;
        }
    }
}