1 /* 2 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 /* 24 * test 25 * bug 4221795 26 * summary Confirm *.icu data using ICU4J Normalizer 27 */ 28 29 import java.io.BufferedReader; 30 import java.io.FileInputStream; 31 import java.io.InputStreamReader; 32 import java.nio.charset.Charset; 33 import java.nio.charset.CharsetDecoder; 34 import java.util.BitSet; 35 import java.util.StringTokenizer; 36 37 import com.ibm.icu.text.Normalizer; 38 import com.ibm.icu.impl.NormalizerImpl; 39 40 /** 41 * This is not a test program but a data validation utility. 42 * Two datafiles for Normalizer, unorm.icu and uprops.icu under 43 * sun/text/resouces, are generated using generators in ICU4C 3.2 on a 44 * BIG-ENDIAN machine. Before using them with java.text.Normalizer and 45 * sun.text.Normalizer, you may want to check these test datafile's validation. 46 * You can test datafiles using Normalizer in ICU4J 3.2. Download ICU4J 3.2 and 47 * run this test program with -cp <ICU4J 3.2>. 48 */ 49 public class DataValidationTest { 50 51 // 52 // Options to be used with com.ibm.icu.text.Normalizer 53 // 54 55 /* 56 * Default Unicode 3.2.0 normalization. 57 * 58 * - With Corrigendum 4 fix 59 * (Different from Mustang's Normalizer.) 60 * - With Public Review Issue #29 fix 61 * (Different from Mustang's Normalizer.) 62 */ 63 private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2; 64 65 /* 66 * *Incomplete* Unicode 3.2.0 normalization for IDNA/StringPrep. 67 * 68 * - With Corrigendum 4 fix 69 * - Without Public Review Issue #29 fix 70 * 71 * ICU4J's Normalizer itself doesn't support normalization for Unicode 3.2.0 72 * without Corrigendum 4 fix, which is necessary for IDNA/StringPrep. It is 73 * done in StringPrep. Therefore, we don't test the normlaization in this 74 * test program. We merely test normalization for Unicode 3.2.0 without 75 * Public Review Issue #29 fix with this test program. 76 */ 77 private static final int UNICODE_3_2_0_BEFORE_PRI_29 = 78 Normalizer.UNICODE_3_2 | 79 NormalizerImpl.BEFORE_PRI_29; 80 81 /* 82 * Default normalization. 83 * 84 * - Unicode 4.0.1 85 * (Different from Mustang's Normalizer.) 86 * - With Corrigendum 4 fix 87 * - With Public Review Issue #29 fix 88 * (Different from Mustang's Normalizer.) 89 * 90 * Because Public Review Issue #29 is fixed in Unicode 4.1.0. I think that 91 * IUC4J 3.2 should not support it. But it actually supports PRI #29 fix 92 * as default.... 93 */ 94 private static final int UNICODE_LATEST = 0x00; 95 96 /* 97 * Normalization without Public Review Issue #29 fix. 98 * 99 * - Unicode 4.0.1 100 * - Without Corrigendum 4 fix 101 * - Without Public Review Issue #29 fix 102 */ 103 static final int UNICODE_LATEST_BEFORE_PRI_29 = 104 NormalizerImpl.BEFORE_PRI_29; 105 106 // 107 // Conformance test datafiles 108 // 109 110 /* 111 * Conformance test datafile for normalization for Unicode 3.2.0 with 112 * Corrigendum 4 corrections. This is NOT an original Conformace test 113 * data. Some inconvenient test cases are commented out. 114 * About corrigendum 4, please refer 115 * http://www.unicode.org/versions/corrigendum4.html 116 * 117 * ICU4J 3.2's Normalizer itself doesn't support normalization for Unicode 118 * 3.2.0 without Corrigendum 4 corrections. StringPrep helps it. So, we 119 * don't test the normalization with this test program. 120 */ 121 static final String DATA_3_2_0 = "NormalizationTest-3.2.0.Corrigendum4.txt"; 122 123 /* 124 * Conformance test datafile for the latest Unicode which is supported 125 * by J2SE. 126 */ 127 static final String DATA_LATEST = "NormalizationTest-Latest.txt"; 128 129 /* 130 * Decorder 131 */ 132 static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder(); 133 134 /* 135 * List to pick up characters which are not listed in Part1 136 */ 137 static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1); 138 139 /* 140 * Shortcuts 141 */ 142 static final Normalizer.Mode NFC = com.ibm.icu.text.Normalizer.NFC; 143 static final Normalizer.Mode NFD = com.ibm.icu.text.Normalizer.NFD; 144 static final Normalizer.Mode NFKC = com.ibm.icu.text.Normalizer.NFKC; 145 static final Normalizer.Mode NFKD = com.ibm.icu.text.Normalizer.NFKD; 146 static final Normalizer.Mode[] modes = {NFC, NFD, NFKC, NFKD}; 147 148 149 public static void main(String[] args) throws Exception { 150 test(DATA_3_2_0, UNICODE_3_2_0); 151 test(DATA_3_2_0, UNICODE_3_2_0_BEFORE_PRI_29); 152 test(DATA_LATEST, UNICODE_LATEST); 153 // This test started failing since ICU4J 3.6. 154 // test(DATA_LATEST, UNICODE_LATEST_BEFORE_PRI_29); 155 156 /* Unconformity test */ 157 // test(DATA_3_2_0, UNICODE_LATEST); 158 // test(DATA_LATEST, UNICODE_3_2); 159 } 160 161 private static void test(String filename, int unicodeVer) throws Exception { 162 163 FileInputStream fis = new FileInputStream(filename); 164 BufferedReader in = 165 new BufferedReader(new InputStreamReader(fis, decoder)); 166 167 System.out.println("\nStart testing with " + filename + 168 " for options: " + 169 (((unicodeVer & Normalizer.UNICODE_3_2) != 0) ? 170 "Unicode 3.2.0" : "the latest Unicode") + ", " + 171 (((unicodeVer & NormalizerImpl.BEFORE_PRI_29) != 0) ? 172 "with" : "without") + " PRI #29 fix"); 173 174 int lineNo = 0; 175 String text; 176 String[] columns = new String[6]; 177 boolean part1test = false; 178 179 while ((text = in.readLine()) != null) { 180 lineNo ++; 181 182 char c = text.charAt(0); 183 if (c == '#') { 184 continue; 185 } else if (c == '@') { 186 if (text.startsWith("@Part")) { 187 System.out.println("# Testing data in " + text); 188 189 if (text.startsWith("@Part1 ")) { 190 part1test = true; 191 } else { 192 part1test = false; 193 } 194 195 continue; 196 } 197 } 198 199 prepareColumns(columns, text, filename, lineNo, part1test); 200 201 testNFC(columns, unicodeVer, filename, lineNo); 202 testNFD(columns, unicodeVer, filename, lineNo); 203 testNFKC(columns, unicodeVer, filename, lineNo); 204 testNFKD(columns, unicodeVer, filename, lineNo); 205 } 206 207 in.close(); 208 fis.close(); 209 210 if (unicodeVer == UNICODE_LATEST) { 211 System.out.println("# Testing characters which are not listed in Part1"); 212 testRemainingChars(filename, unicodeVer); 213 } 214 } 215 216 /* 217 * Test for NFC 218 * 219 * c2 == NFC(c1) == NFC(c2) == NFC(c3) 220 * c4 == NFC(c4) == NFC(c5) 221 */ 222 private static void testNFC(String[] c, int unicodeVer, 223 String file, int line) throws Exception { 224 test(2, c, 1, 3, NFC, unicodeVer, file, line); 225 test(4, c, 4, 5, NFC, unicodeVer, file, line); 226 } 227 228 /* 229 * Test for NFD 230 * 231 * c3 == NFD(c1) == NFD(c2) == NFD(c3) 232 * c5 == NFD(c4) == NFD(c5) 233 */ 234 private static void testNFD(String[] c, int unicodeVer, 235 String file, int line) throws Exception { 236 test(3, c, 1, 3, NFD, unicodeVer, file, line); 237 test(5, c, 4, 5, NFD, unicodeVer, file, line); 238 } 239 240 /* 241 * Test for NFKC 242 * 243 * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) 244 */ 245 private static void testNFKC(String[] c, int unicodeVer, 246 String file, int line) throws Exception { 247 test(4, c, 1, 5, NFKC, unicodeVer, file, line); 248 } 249 250 /* 251 * Test for NFKD 252 * 253 * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) 254 */ 255 private static void testNFKD(String[] c, int unicodeVer, 256 String file, int line) throws Exception { 257 test(5, c, 1, 5, NFKD, unicodeVer, file, line); 258 } 259 260 /* 261 * Test for characters which aren't listed in Part1 262 * 263 * X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) 264 */ 265 private static void testRemainingChars(String file, 266 int unicodeVer) throws Exception { 267 for (int i = Character.MIN_CODE_POINT; 268 i <= Character.MAX_CODE_POINT; 269 i++) { 270 if (!charList.get(i)) { 271 String from = String.valueOf(Character.toChars(i)); 272 String to; 273 274 for (int j = 0; j < modes.length; j++) { 275 Normalizer.Mode mode = modes[j]; 276 277 to = Normalizer.normalize(from, mode, unicodeVer); 278 if (!from.equals(to)) { 279 error(mode, from, from, to, file, -1); 280 // } else { 281 // okay(mode, from, from, to, file, -1); 282 } 283 284 if (!Normalizer.isNormalized(from, mode, unicodeVer)) { 285 error(mode, from, file, -1); 286 // } else { 287 // okay(mode, from, file, -1); 288 } 289 } 290 } 291 } 292 } 293 294 /* 295 * Test normalize() and isNormalized() 296 */ 297 private static void test(int col, String[] c, 298 int FROM, int TO, 299 Normalizer.Mode mode, int unicodeVer, 300 String file, int line) throws Exception { 301 for (int i = FROM; i <= TO; i++) { 302 String got = Normalizer.normalize(c[i], mode, unicodeVer); 303 if (!c[col].equals(got)) { 304 error(mode, c[i], c[col], got, file, line); 305 // } else { 306 // okay(mode, c[i], c[col], got, file, line); 307 } 308 309 /* 310 * If the original String equals its normalized String, it means 311 * that the original String is normalizerd. Thus, isNormalized() 312 * should return true. And, vice versa! 313 */ 314 if (c[col].equals(c[i])) { 315 if (!Normalizer.isNormalized(c[i], mode, unicodeVer)) { 316 error(mode, c[i], file, line); 317 // } else { 318 // okay(mode, c[i], file, line); 319 } 320 } else { 321 if (Normalizer.isNormalized(c[i], mode, unicodeVer)) { 322 error(mode, c[i], file, line); 323 // } else { 324 // okay(mode, c[i], file, line); 325 } 326 } 327 } 328 } 329 330 /* 331 * Generate an array of String from a line of conformance datafile. 332 */ 333 private static void prepareColumns(String[] col, String text, 334 String file, int line, 335 boolean part1test) throws Exception { 336 int index = text.indexOf('#'); 337 if (index != -1) { 338 text = text.substring(0, index); 339 } 340 341 StringTokenizer st = new StringTokenizer(text, ";"); 342 int tokenCount = st.countTokens(); 343 if (tokenCount < 5) { 344 throw new RuntimeException("# of tokens in datafile should be 6, but got: " + tokenCount + " at line " + line + " in " + file); 345 } 346 347 StringBuffer sb = new StringBuffer(); 348 for (int i = 1; i <= 5; i++) { 349 StringTokenizer tst = new StringTokenizer(st.nextToken(), " "); 350 351 while (tst.hasMoreTokens()) { 352 int code = Integer.parseInt(tst.nextToken(), 16); 353 sb.append(Character.toChars(code)); 354 } 355 356 col[i] = sb.toString(); 357 sb.setLength(0); 358 } 359 360 if (part1test) { 361 charList.set(col[1].codePointAt(0)); 362 } 363 } 364 365 /* 366 * Show an error message when normalize() didn't return the expected value. 367 * (An exception is sometimes convenient. Therefore, it is commented out 368 * for the moment.) 369 */ 370 private static void error(Normalizer.Mode mode, 371 String from, String to, String got, 372 String file, int line) throws Exception { 373 System.err.println("\t" + toString(mode) + ": normalize(" + 374 toHexString(from) + ") doesn't equal <" + toHexString(to) + 375 "> at line " + line + " in " + file + ". Got <" + 376 toHexString(got) + ">."); 377 // throw new RuntimeException("Normalization(" + toString(mode) + ") failed"); 378 } 379 380 /* 381 * Show an error message when isNormalize() didn't return the expected value. 382 * (An exception is sometimes convenient. Therefore, it is commented out 383 * for the moment.) 384 */ 385 private static void error(Normalizer.Mode mode, String orig, 386 String file, int line) throws Exception { 387 System.err.println("\t" + toString(mode) + ": isNormalized(" + 388 toHexString(orig) + ") returned the wrong value at line " + line + 389 " in " + file + "."); 390 // throw new RuntimeException("Normalization(" + toString(mode) +") failed"); 391 } 392 393 /* 394 * (For debugging) 395 * Shows a message when normalize() returned the expected value. 396 */ 397 private static void okay(Normalizer.Mode mode, 398 String from, String to, String got, 399 String file, int line) { 400 System.out.println("\t" + toString(mode) + ": normalize(" + 401 toHexString(from) + ") equals <" + toHexString(to) + 402 "> at line " + line + " in " + file + ". Got <" + 403 toHexString(got) + ">."); 404 } 405 406 /* 407 * (For debugging) 408 * Shows a message when isNormalized() returned the expected value. 409 */ 410 private static void okay(Normalizer.Mode mode, String orig, 411 String file, int line) { 412 System.out.println("\t" + toString(mode) + ": isNormalized(" + 413 toHexString(orig) + ") returned the correct value at line " + 414 line + " in " + file + "."); 415 } 416 417 /* 418 * Returns a spece-delimited hex String 419 */ 420 private static String toHexString(String s) { 421 StringBuffer sb = new StringBuffer(" "); 422 423 for (int i = 0; i < s.length(); i++) { 424 sb.append(Integer.toHexString(s.charAt(i))); 425 sb.append(' '); 426 } 427 428 return sb.toString(); 429 } 430 431 /* 432 * Returns the name of Normalizer.Mode 433 */ 434 private static String toString(Normalizer.Mode mode) { 435 if (mode == Normalizer.NFC) { 436 return "NFC"; 437 } else if (mode == Normalizer.NFD) { 438 return "NFD"; 439 } else if (mode == Normalizer.NFKC) { 440 return "NFKC"; 441 } else if (mode == Normalizer.NFKD) { 442 return "NFKD"; 443 } 444 445 return "unknown"; 446 } 447 }