New test/jdk/java/text/Normalizer/ConformanceTest.java

   1 /*
   2  * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 /*
  24  * @test
  25  * @bug  4221795 6565620 6959267 7070436 7198195 8032446 8174270 8221431 8239383
  26  * @summary Confirm Normalizer's fundamental behavior
  27  * @library /lib/testlibrary/java/lang
  28  * @modules java.base/sun.text java.base/jdk.internal.icu.text
  29  * @compile -XDignore.symbol.file ConformanceTest.java
  30  * @run main/timeout=3000 ConformanceTest
  31  */
  32 
  33 import java.io.BufferedReader;
  34 import java.io.File;
  35 import java.io.FileInputStream;
  36 import java.io.InputStreamReader;
  37 import java.nio.charset.Charset;
  38 import java.nio.charset.CharsetDecoder;
  39 import java.util.BitSet;
  40 import java.util.StringTokenizer;
  41 
  42 import jdk.internal.icu.text.NormalizerBase;
  43 
  44 /*
  45  * Conformance test for java.text.Normalizer and sun.text.Normalizer.
  46  */
  47 public class ConformanceTest {
  48 
  49     //
  50     // Options to be used with sun.text.Normalizer
  51     //
  52 
  53     /*
  54      * Default Unicode 3.2.0 normalization. (Provided for IDNA/StringPrep)
  55      *
  56      *   - Without Corrigendum 4 fix
  57      *     (Different from ICU4J 3.2's Normalizer.)
  58      *   - Without Public Review Issue #29 fix
  59      *     (Different from ICU4J 3.2's Normalizer.)
  60      */
  61     private static final int UNICODE_3_2_0 = sun.text.Normalizer.UNICODE_3_2;
  62 
  63     /*
  64      * Original Unicode 3.2.0 normalization. (Provided for testing only)
  65      *
  66      *   - With Corrigendum 4 fix
  67      *   - With Public Revilew Issue #29 fix
  68      */
  69     private static final int UNICODE_3_2_0_ORIGINAL =
  70                                  NormalizerBase.UNICODE_3_2;
  71 
  72     /*
  73      * Default normalization. In JDK 6,
  74      *   - Unicode 4.0.0
  75      *   - With Corrigendum 4 fix
  76      *   - Without Public Review Issue #29 fix
  77      *
  78      * In JDK 7,
  79      *   - Unicode 5.1.0
  80      *     (Different from ICU4J 3.2's Normalizer.)
  81      *   - With Corrigendum 4 fix
  82      *   - With Public Review Issue #29 fix
  83      *
  84      * In JDK 8,
  85      *   - Unicode 6.1.0
  86      *   - With Corrigendum 4 fix
  87      *   - With Public Review Issue #29 fix
  88      *
  89      *  When we support Unicode 4.1.0 or later, we need to do normalization
  90      *  with Public Review Issue #29 fix. For more details of PRI #29, see
  91      *  http://unicode.org/review/pr-29.html .
  92      */
  93     private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;
  94 
  95     //
  96     // Conformance test datafiles
  97     //
  98 
  99     /*
 100      * Conformance test datafile for Unicode 3.2.0 with Corrigendum4
 101      * corrections.
 102      * This testdata is for sun.text.Normalize(UNICODE_3_2)
 103      *
 104      * This is NOT an original Conformace test data. Some inconvenient test
 105      * cases are commented out. About corrigendum 4, please refer
 106      *   http://www.unicode.org/review/resolved-pri.html#pri29
 107      *
 108      */
 109     static final String DATA_3_2_0_CORRIGENDUM =
 110                             "NormalizationTest-3.2.0.Corrigendum4.txt";
 111 
 112     /*
 113      * Conformance test datafile for Unicode 3.2.0 without Corrigendum4
 114      * corrections. This is the original Conformace test data.
 115      *
 116      * This testdata is for sun.text.Normalize(UNICODE_3_2_IDNA)
 117      */
 118     static final String DATA_3_2_0 = "NormalizationTest-3.2.0.txt";
 119 
 120     /*
 121      * Conformance test datafile for the latest Unicode which is supported
 122      * by J2SE.
 123      * Unicode 4.0.0 is the latest version in JDK 5.0 and JDK 6. Unicode 5.1.0
 124      * in JDK 7, and 6.1.0 in JDK 8. This Unicode can be used via both
 125      * java.text.Normalizer and sun.text.Normalizer.
 126      *
 127      * This testdata is for sun.text.Normalize(UNICODE_LATEST)
 128      */
 129     static final String DATA_LATEST = "NormalizationTest.txt";
 130 
 131     /*
 132      * Conformance test datafile in ICU4J 3.2.
 133      */
 134     static final String DATA_ICU = "ICUNormalizationTest.txt";
 135 
 136     /*
 137      * Decorder
 138      */
 139     static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
 140 
 141     /*
 142      * List to pick up characters which are not listed in Part1
 143      */
 144     static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1);
 145 
 146     /*
 147      * Shortcuts
 148      */
 149     private static final java.text.Normalizer.Form NFC  =
 150         java.text.Normalizer.Form.NFC;
 151     private static final java.text.Normalizer.Form NFD  =
 152         java.text.Normalizer.Form.NFD;
 153     private static final java.text.Normalizer.Form NFKC =
 154         java.text.Normalizer.Form.NFKC;
 155     private static final java.text.Normalizer.Form NFKD =
 156         java.text.Normalizer.Form.NFKD;
 157     static final java.text.Normalizer.Form[] forms = {NFC, NFD, NFKC, NFKD};
 158 
 159 
 160     static TestNormalizer normalizer;
 161 
 162     public static void main(String[] args) throws Exception {
 163         ConformanceTest ct = new ConformanceTest();
 164         ct.test();
 165     }
 166 
 167     void test() throws Exception {
 168         normalizer = new testJavaNormalizer();
 169         test(DATA_LATEST, UNICODE_LATEST);
 170 
 171         normalizer = new testSunNormalizer();
 172         test(DATA_3_2_0_CORRIGENDUM, UNICODE_3_2_0);
 173         test(DATA_LATEST, UNICODE_LATEST);
 174         test(DATA_ICU, UNICODE_LATEST);
 175 
 176         /* Unconformity test */
 177 //      test(DATA_3_2_0, UNICODE_LATEST);
 178 //      test(DATA_LATEST, UNICODE_3_2_0);
 179     }
 180 
 181     /*
 182      * Main routine of conformance test
 183      */
 184     private static void test(String filename, int unicodeVer) throws Exception {
 185 
 186         File  f = filename.equals(DATA_LATEST) ?
 187             UCDFiles.NORMALIZATION_TEST.toFile() :
 188             new File(System.getProperty("test.src", "."), filename);
 189         FileInputStream fis = new FileInputStream(f);
 190         BufferedReader in =
 191             new BufferedReader(new InputStreamReader(fis, decoder));
 192 
 193         System.out.println("\nStart testing for " + normalizer.name +
 194             " with " + filename + " for options: " +
 195             (((unicodeVer & NormalizerBase.UNICODE_3_2) != 0) ?
 196                 "Unicode 3.2.0" : "the latest Unicode"));
 197 
 198         int lineNo = 0;
 199         String text;
 200         boolean part1test = false;
 201         boolean part1testExists = false;
 202         String[] columns = new String[6];
 203 
 204         while ((text = in.readLine()) != null) {
 205             lineNo ++;
 206 
 207             char c = text.charAt(0);
 208             if (c == '#') {
 209                 continue;
 210             } else if (c == '@') {
 211                 if (text.startsWith("@Part")) {
 212                     System.out.println("# Testing data in " + text);
 213 
 214                     if (text.startsWith("@Part1 ")) {
 215                         part1test = true;
 216                         part1testExists = true;
 217                     } else {
 218                         part1test = false;
 219                     }
 220 
 221                     continue;
 222                 }
 223             }
 224 
 225             prepareColumns(columns, text, filename, lineNo, part1test);
 226 
 227             testNFC(columns, unicodeVer, filename, lineNo);
 228             testNFD(columns, unicodeVer, filename, lineNo);
 229             testNFKC(columns, unicodeVer, filename, lineNo);
 230             testNFKD(columns, unicodeVer, filename, lineNo);
 231         }
 232 
 233         in.close();
 234         fis.close();
 235 
 236         if (part1testExists) {
 237             System.out.println("# Testing characters which are not listed in Part1");
 238             testRemainingChars(filename, unicodeVer);
 239             part1testExists = false;
 240         }
 241     }
 242 
 243     /*
 244      * Test for NFC
 245      *
 246      *   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
 247      *   c4 ==  NFC(c4) ==  NFC(c5)
 248      */
 249     private static void testNFC(String[] c, int unicodeVer,
 250                                 String file, int line) throws Exception {
 251         test(2, c, 1, 3, NFC, unicodeVer, file, line);
 252         test(4, c, 4, 5, NFC, unicodeVer, file, line);
 253     }
 254 
 255     /*
 256      * Test for NFD
 257      *
 258      *   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
 259      *   c5 ==  NFD(c4) ==  NFD(c5)
 260      */
 261     private static void testNFD(String[] c, int unicodeVer,
 262                                 String file, int line) throws Exception {
 263         test(3, c, 1, 3, NFD, unicodeVer, file, line);
 264         test(5, c, 4, 5, NFD, unicodeVer, file, line);
 265     }
 266 
 267     /*
 268      * Test for NFKC
 269      *
 270      *   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
 271      */
 272     private static void testNFKC(String[] c, int unicodeVer,
 273                                  String file, int line) throws Exception {
 274         test(4, c, 1, 5, NFKC, unicodeVer, file, line);
 275     }
 276 
 277     /*
 278      * Test for NFKD
 279      *
 280      *   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
 281      */
 282     private static void testNFKD(String[] c, int unicodeVer,
 283                                  String file, int line) throws Exception {
 284         test(5, c, 1, 5, NFKD, unicodeVer, file, line);
 285     }
 286 
 287     /*
 288      * Test for characters which aren't listed in Part1
 289      *
 290      *   X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
 291      */
 292     private static void testRemainingChars(String file,
 293                                            int unicodeVer) throws Exception {
 294         for (int i = Character.MIN_CODE_POINT;
 295              i <= Character.MAX_CODE_POINT;
 296              i++) {
 297             if (!charList.get(i)) {
 298                 String from = String.valueOf(Character.toChars(i));
 299                 String to;
 300 
 301                 for (int j = 0; j < forms.length; j++) {
 302                     java.text.Normalizer.Form form = forms[j];
 303 
 304                     to = normalizer.normalize(from, form, unicodeVer);
 305                     if (!from.equals(to)) {
 306                         error(form, from, from, to, file, -1);
 307 //                  } else {
 308 //                      okay(form, from, from, to, file, -1);
 309                     }
 310 
 311                     if (!normalizer.isNormalized(from, form, unicodeVer)) {
 312                         error(form, from, file, -1);
 313 //                  } else {
 314 //                      okay(form, from, file, -1);
 315                     }
 316                 }
 317             }
 318         }
 319     }
 320 
 321     /*
 322      * Test normalize() and isNormalized()
 323      */
 324     private static void test(int col, String[] c,
 325                              int FROM, int TO,
 326                              java.text.Normalizer.Form form, int unicodeVer,
 327                              String file, int line) throws Exception {
 328         for (int i = FROM; i <= TO; i++) {
 329             String got = normalizer.normalize(c[i], form, unicodeVer);
 330             if (!c[col].equals(got)) {
 331                 error(form, c[i], c[col], got, file, line);
 332 //          } else {
 333 //              okay(form, c[i], c[col], got, file, line);
 334             }
 335 
 336             /*
 337              * If the original String equals its normalized String, it means
 338              * that the original String is normalizerd. Thus, isNormalized()
 339              * should return true. And, vice versa!
 340              */
 341             if (c[col].equals(c[i])) {
 342                 if (!normalizer.isNormalized(c[i], form, unicodeVer)) {
 343                     error(form, c[i], file, line);
 344 //              } else {
 345 //                  okay(form, c[i], file, line);
 346                 }
 347             } else {
 348                 if (normalizer.isNormalized(c[i], form, unicodeVer)) {
 349                     error(form, c[i], file, line);
 350 //              } else {
 351 //                  okay(form, c[i], file, line);
 352                 }
 353             }
 354         }
 355     }
 356 
 357     /*
 358      * Generate an array of String from a line of conformance datafile.
 359      */
 360     private static void prepareColumns(String[] cols, String text,
 361                                            String file, int line,
 362                                            boolean part1test) throws Exception {
 363         int index = text.indexOf('#');
 364         if (index != -1) {
 365             text = text.substring(0, index);
 366         }
 367 
 368         StringTokenizer st = new StringTokenizer(text, ";");
 369         int tokenCount = st.countTokens();
 370         if (tokenCount < 5) {
 371              throw new RuntimeException("# of tokens in datafile should be 6, but got: " + tokenCount + " at line " + line + " in " + file);
 372         }
 373 
 374         StringBuffer sb = new StringBuffer();
 375         for (int i = 1; i <= 5; i++) {
 376             StringTokenizer tst = new StringTokenizer(st.nextToken(), " ");
 377 
 378             while (tst.hasMoreTokens()) {
 379                 int code = Integer.parseInt(tst.nextToken(), 16);
 380                 sb.append(Character.toChars(code));
 381             }
 382 
 383             cols[i] = sb.toString();
 384             sb.setLength(0);
 385         }
 386 
 387         if (part1test) {
 388             charList.set(cols[1].codePointAt(0));
 389         }
 390     }
 391 
 392     /*
 393      * Show an error message when normalize() didn't return the expected value.
 394      * (An exception is sometimes convenient. Therefore, it is commented out
 395      * for the moment.)
 396      */
 397     private static void error(java.text.Normalizer.Form form,
 398                               String from, String to, String got,
 399                               String file, int line) throws Exception {
 400         System.err.println("-\t" + form.toString() + ": normalize(" +
 401             toHexString(from) + ") doesn't equal <" + toHexString(to) +
 402             "> at line " + line + " in " + file + ". Got [" +
 403             toHexString(got) + "]");
 404         throw new RuntimeException("Normalization(" + form.toString() + ") failed");
 405     }
 406 
 407     /*
 408      * Show an error message when isNormalize() didn't return the expected
 409      * value.
 410      * (An exception is sometimes convenient. Therefore, it is commented out
 411      * for the moment.)
 412      */
 413     private static void error(java.text.Normalizer.Form form, String s,
 414                               String file, int line) throws Exception {
 415         System.err.println("\t" + form.toString() + ": isNormalized(" +
 416             toHexString(s) + ") returned the wrong value at line " + line +
 417             " in " + file);
 418         throw new RuntimeException("Normalization(" + form.toString() +") failed");
 419     }
 420 
 421     /*
 422      * (For debugging)
 423      * Shows a message when normalize() returned the expected value.
 424      */
 425     private static void okay(java.text.Normalizer.Form form,
 426                              String from, String to, String got,
 427                              String file, int line) {
 428         System.out.println("\t" + form.toString() + ": normalize(" +
 429             toHexString(from) + ") equals <" + toHexString(to) +
 430             "> at line " + line + " in " + file + ". Got [" +
 431             toHexString(got) + "]");
 432     }
 433 
 434     /*
 435      * (For debugging)
 436      * Shows a message when isNormalized() returned the expected value.
 437      */
 438     private static void okay(java.text.Normalizer.Form form, String s,
 439                              String file, int line) {
 440         System.out.println("\t" + form.toString() + ": isNormalized(" +
 441             toHexString(s) + ") returned the correct value at line " +
 442             line + " in " + file);
 443     }
 444 
 445     /*
 446      * Returns a spece-delimited hex String
 447      */
 448     private static String toHexString(String s) {
 449         StringBuffer sb = new StringBuffer(" ");
 450 
 451         for (int i = 0; i < s.length(); i++) {
 452             sb.append(Integer.toHexString(s.charAt(i)));
 453             sb.append(' ');
 454         }
 455 
 456         return sb.toString();
 457     }
 458 
 459     /*
 460      * Abstract class to call each Normalizer in java.text or sun.text.
 461      */
 462     private abstract class TestNormalizer {
 463         String name;
 464 
 465         TestNormalizer(String str) {
 466             name = str;
 467         }
 468 
 469         String getNormalizerName() {
 470             return name;
 471         }
 472 
 473         abstract String normalize(CharSequence cs,
 474                                   java.text.Normalizer.Form form,
 475                                   int option);
 476 
 477         abstract boolean isNormalized(CharSequence cs,
 478                                      java.text.Normalizer.Form form,
 479                                      int option);
 480     }
 481 
 482     /*
 483      * For java.text.Normalizer
 484      *   - normalize(CharSequence, Normalizer.Form)
 485      *   - isNormalized(CharSequence, Normalizer.Form)
 486      */
 487     private class testJavaNormalizer extends TestNormalizer {
 488         testJavaNormalizer() {
 489             super("java.text.Normalizer");
 490         }
 491 
 492         String normalize(CharSequence cs,
 493                          java.text.Normalizer.Form form,
 494                          int option) {
 495             return java.text.Normalizer.normalize(cs, form);
 496         }
 497 
 498         boolean isNormalized(CharSequence cs,
 499                              java.text.Normalizer.Form form,
 500                              int option) {
 501             return java.text.Normalizer.isNormalized(cs, form);
 502         }
 503     }
 504 
 505     /*
 506      * For sun.text.Normalizer
 507      *   - normalize(CharSequence, Normalizer.Form, int)
 508      *   - isNormalized(CharSequence, Normalizer.Form, int)
 509      */
 510     private class testSunNormalizer extends TestNormalizer {
 511         testSunNormalizer() {
 512             super("sun.text.Normalizer");
 513         }
 514 
 515         String normalize(CharSequence cs,
 516                          java.text.Normalizer.Form form,
 517                          int option) {
 518             return sun.text.Normalizer.normalize(cs, form, option);
 519         }
 520 
 521         boolean isNormalized(CharSequence cs,
 522                              java.text.Normalizer.Form form,
 523                              int option) {
 524             return sun.text.Normalizer.isNormalized(cs, form, option);
 525         }
 526     }
 527 }