1 /*
   2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 /*
  24  * test
  25  * bug  4221795
  26  * summary Confirm *.icu data using ICU4J Normalizer
  27  */
  28 
  29 import java.io.BufferedReader;
  30 import java.io.FileInputStream;
  31 import java.io.InputStreamReader;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.util.BitSet;
  35 import java.util.StringTokenizer;
  36 
  37 import com.ibm.icu.text.Normalizer;
  38 import com.ibm.icu.impl.NormalizerImpl;
  39 
  40 /**
  41  * This is not a test program but a data validation utility.
  42  * Two datafiles for Normalizer, unorm.icu and uprops.icu under
   43  * sun/text/resources, are generated using generators in ICU4C 3.2 on a
   44  * BIG-ENDIAN machine. Before using them with java.text.Normalizer and
   45  * sun.text.Normalizer, you may want to validate these datafiles.
  46  * You can test datafiles using Normalizer in ICU4J 3.2. Download ICU4J 3.2 and
  47  * run this test program with -cp <ICU4J 3.2>.
  48  */
  49 public class DataValidationTest {
  50 
  51     //
  52     // Options to be used with com.ibm.icu.text.Normalizer
  53     //
  54 
  55     /*
  56      * Default Unicode 3.2.0 normalization.
  57      *
  58      *   - With Corrigendum 4 fix
  59      *     (Different from Mustang's Normalizer.)
  60      *   - With Public Review Issue #29 fix
  61      *     (Different from Mustang's Normalizer.)
  62      */
  63     private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2;
  64 
  65     /*
  66      * *Incomplete* Unicode 3.2.0 normalization for IDNA/StringPrep.
  67      *
  68      *   - With Corrigendum 4 fix
  69      *   - Without Public Review Issue #29 fix
  70      *
  71      * ICU4J's Normalizer itself doesn't support normalization for Unicode 3.2.0
  72      * without Corrigendum 4 fix, which is necessary for IDNA/StringPrep. It is
  73      * done in StringPrep. Therefore, we don't test the normlaization in this
  74      * test program. We merely test normalization for Unicode 3.2.0 without
  75      * Public Review Issue #29 fix with this test program.
  76      */
  77     private static final int UNICODE_3_2_0_BEFORE_PRI_29 =
  78                                  Normalizer.UNICODE_3_2 |
  79                                  NormalizerImpl.BEFORE_PRI_29;
  80 
  81     /*
  82      * Default normalization.
  83      *
  84      *   - Unicode 4.0.1
  85      *     (Different from Mustang's Normalizer.)
  86      *   - With Corrigendum 4 fix
  87      *   - With Public Review Issue #29 fix
  88      *     (Different from Mustang's Normalizer.)
  89      *
  90      * Because Public Review Issue #29 is fixed in Unicode 4.1.0. I think that
  91      * IUC4J 3.2 should not support it. But it actually supports PRI #29 fix
  92      * as default....
  93      */
  94     private static final int UNICODE_LATEST = 0x00;
  95 
  96     /*
  97      * Normalization without Public Review Issue #29 fix.
  98      *
  99      *   - Unicode 4.0.1
 100      *   - Without Corrigendum 4 fix
 101      *   - Without Public Review Issue #29 fix
 102      */
 103     static final int UNICODE_LATEST_BEFORE_PRI_29 =
 104                          NormalizerImpl.BEFORE_PRI_29;
 105 
 106     //
 107     // Conformance test datafiles
 108     //
 109 
 110     /*
 111      * Conformance test datafile for normalization for Unicode 3.2.0 with
 112      * Corrigendum 4 corrections. This is NOT an original Conformace test
 113      * data. Some inconvenient test cases are commented out.
 114      * About corrigendum 4, please refer
 115      *   http://www.unicode.org/versions/corrigendum4.html
 116      *
 117      * ICU4J 3.2's Normalizer itself doesn't support normalization for Unicode
 118      * 3.2.0 without Corrigendum 4 corrections. StringPrep helps it. So, we
 119      * don't test the normalization with this test program.
 120      */
 121     static final String DATA_3_2_0 = "NormalizationTest-3.2.0.Corrigendum4.txt";
 122 
 123     /*
 124      * Conformance test datafile for the latest Unicode which is supported
 125      * by J2SE.
 126      */
 127     static final String DATA_LATEST = "NormalizationTest-Latest.txt";
 128 
 129    /*
 130     * Decorder
 131     */
 132     static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
 133 
 134    /*
 135     * List to pick up characters which are not listed in Part1
 136     */
 137     static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1);
 138 
 139    /*
 140     * Shortcuts
 141     */
 142     static final Normalizer.Mode NFC  = com.ibm.icu.text.Normalizer.NFC;
 143     static final Normalizer.Mode NFD  = com.ibm.icu.text.Normalizer.NFD;
 144     static final Normalizer.Mode NFKC = com.ibm.icu.text.Normalizer.NFKC;
 145     static final Normalizer.Mode NFKD = com.ibm.icu.text.Normalizer.NFKD;
 146     static final Normalizer.Mode[] modes = {NFC, NFD, NFKC, NFKD};
 147 
 148 
 149     public static void main(String[] args) throws Exception {
 150         test(DATA_3_2_0, UNICODE_3_2_0);
 151         test(DATA_3_2_0, UNICODE_3_2_0_BEFORE_PRI_29);
 152         test(DATA_LATEST, UNICODE_LATEST);
 153         // This test started failing since ICU4J 3.6.
 154 //      test(DATA_LATEST, UNICODE_LATEST_BEFORE_PRI_29);
 155 
 156         /* Unconformity test */
 157 //      test(DATA_3_2_0, UNICODE_LATEST);
 158 //      test(DATA_LATEST, UNICODE_3_2);
 159     }
 160 
 161     private static void test(String filename, int unicodeVer) throws Exception {
 162 
 163         FileInputStream fis = new FileInputStream(filename);
 164         BufferedReader in =
 165             new BufferedReader(new InputStreamReader(fis, decoder));
 166 
 167         System.out.println("\nStart testing with " + filename +
 168             " for options: " +
 169             (((unicodeVer & Normalizer.UNICODE_3_2) != 0) ?
 170                 "Unicode 3.2.0" : "the latest Unicode") + ", " +
 171             (((unicodeVer & NormalizerImpl.BEFORE_PRI_29) != 0) ?
 172                 "with" : "without") + " PRI #29 fix");
 173 
 174         int lineNo = 0;
 175         String text;
 176         String[] columns = new String[6];
 177         boolean part1test = false;
 178 
 179         while ((text = in.readLine()) != null) {
 180             lineNo ++;
 181 
 182             char c = text.charAt(0);
 183             if (c == '#') {
 184                 continue;
 185             } else if (c == '@') {
 186                 if (text.startsWith("@Part")) {
 187                     System.out.println("# Testing data in " + text);
 188 
 189                     if (text.startsWith("@Part1 ")) {
 190                         part1test = true;
 191                     } else {
 192                         part1test = false;
 193                     }
 194 
 195                     continue;
 196                 }
 197             }
 198 
 199             prepareColumns(columns, text, filename, lineNo, part1test);
 200 
 201             testNFC(columns, unicodeVer, filename, lineNo);
 202             testNFD(columns, unicodeVer, filename, lineNo);
 203             testNFKC(columns, unicodeVer, filename, lineNo);
 204             testNFKD(columns, unicodeVer, filename, lineNo);
 205         }
 206 
 207         in.close();
 208         fis.close();
 209 
 210         if (unicodeVer == UNICODE_LATEST) {
 211             System.out.println("# Testing characters which are not listed in Part1");
 212             testRemainingChars(filename, unicodeVer);
 213         }
 214     }
 215 
 216     /*
 217      * Test for NFC
 218      *
 219      *   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
 220      *   c4 ==  NFC(c4) ==  NFC(c5)
 221      */
 222     private static void testNFC(String[] c, int unicodeVer,
 223                                 String file, int line) throws Exception {
 224         test(2, c, 1, 3, NFC, unicodeVer, file, line);
 225         test(4, c, 4, 5, NFC, unicodeVer, file, line);
 226     }
 227 
 228     /*
 229      * Test for NFD
 230      *
 231      *   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
 232      *   c5 ==  NFD(c4) ==  NFD(c5)
 233      */
 234     private static void testNFD(String[] c, int unicodeVer,
 235                                 String file, int line) throws Exception {
 236         test(3, c, 1, 3, NFD, unicodeVer, file, line);
 237         test(5, c, 4, 5, NFD, unicodeVer, file, line);
 238     }
 239 
 240     /*
 241      * Test for NFKC
 242      *
 243      *   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
 244      */
 245     private static void testNFKC(String[] c, int unicodeVer,
 246                                  String file, int line) throws Exception {
 247         test(4, c, 1, 5, NFKC, unicodeVer, file, line);
 248     }
 249 
 250     /*
 251      * Test for NFKD
 252      *
 253      *   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
 254      */
 255     private static void testNFKD(String[] c, int unicodeVer,
 256                                  String file, int line) throws Exception {
 257         test(5, c, 1, 5, NFKD, unicodeVer, file, line);
 258     }
 259 
 260     /*
 261      * Test for characters which aren't listed in Part1
 262      *
 263      *   X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
 264      */
 265     private static void testRemainingChars(String file,
 266                                            int unicodeVer) throws Exception {
 267         for (int i = Character.MIN_CODE_POINT;
 268              i <= Character.MAX_CODE_POINT;
 269              i++) {
 270             if (!charList.get(i)) {
 271                 String from = String.valueOf(Character.toChars(i));
 272                 String to;
 273 
 274                 for (int j = 0; j < modes.length; j++) {
 275                     Normalizer.Mode mode = modes[j];
 276 
 277                     to = Normalizer.normalize(from, mode, unicodeVer);
 278                     if (!from.equals(to)) {
 279                         error(mode, from, from, to, file, -1);
 280 //                  } else {
 281 //                      okay(mode, from, from, to, file, -1);
 282                     }
 283 
 284                     if (!Normalizer.isNormalized(from, mode, unicodeVer)) {
 285                         error(mode, from, file, -1);
 286 //                  } else {
 287 //                      okay(mode, from, file, -1);
 288                     }
 289                 }
 290             }
 291         }
 292     }
 293 
 294     /*
 295      * Test normalize() and isNormalized()
 296      */
 297     private static void test(int col, String[] c,
 298                              int FROM, int TO,
 299                              Normalizer.Mode mode, int unicodeVer,
 300                              String file, int line) throws Exception {
 301         for (int i = FROM; i <= TO; i++) {
 302             String got = Normalizer.normalize(c[i], mode, unicodeVer);
 303             if (!c[col].equals(got)) {
 304                 error(mode, c[i], c[col], got, file, line);
 305 //          } else {
 306 //              okay(mode, c[i], c[col], got, file, line);
 307             }
 308 
 309             /*
 310              * If the original String equals its normalized String, it means
 311              * that the original String is normalizerd. Thus, isNormalized()
 312              * should return true. And, vice versa!
 313              */
 314             if (c[col].equals(c[i])) {
 315                 if (!Normalizer.isNormalized(c[i], mode, unicodeVer)) {
 316                     error(mode, c[i], file, line);
 317 //              } else {
 318 //                  okay(mode, c[i], file, line);
 319                 }
 320             } else {
 321                 if (Normalizer.isNormalized(c[i], mode, unicodeVer)) {
 322                     error(mode, c[i], file, line);
 323 //              } else {
 324 //                  okay(mode, c[i], file, line);
 325                 }
 326             }
 327         }
 328     }
 329 
 330     /*
 331      * Generate an array of String from a line of conformance datafile.
 332      */
 333     private static void prepareColumns(String[] col, String text,
 334                                        String file, int line,
 335                                        boolean part1test) throws Exception {
 336         int index = text.indexOf('#');
 337         if (index != -1) {
 338             text = text.substring(0, index);
 339         }
 340 
 341         StringTokenizer st = new StringTokenizer(text, ";");
 342         int tokenCount = st.countTokens();
 343         if (tokenCount < 5) {
 344              throw new RuntimeException("# of tokens in datafile should be 6, but got: " + tokenCount + " at line " + line + " in " + file);
 345         }
 346 
 347         StringBuffer sb = new StringBuffer();
 348         for (int i = 1; i <= 5; i++) {
 349             StringTokenizer tst = new StringTokenizer(st.nextToken(), " ");
 350 
 351             while (tst.hasMoreTokens()) {
 352                 int code = Integer.parseInt(tst.nextToken(), 16);
 353                 sb.append(Character.toChars(code));
 354             }
 355 
 356             col[i] = sb.toString();
 357             sb.setLength(0);
 358         }
 359 
 360         if (part1test) {
 361             charList.set(col[1].codePointAt(0));
 362         }
 363     }
 364 
 365     /*
 366      * Show an error message when normalize() didn't return the expected value.
 367      * (An exception is sometimes convenient. Therefore, it is commented out
 368      * for the moment.)
 369      */
 370     private static void error(Normalizer.Mode mode,
 371                               String from, String to, String got,
 372                               String file, int line) throws Exception {
 373         System.err.println("\t" + toString(mode) + ": normalize(" +
 374             toHexString(from) + ") doesn't equal <" + toHexString(to) +
 375             "> at line " + line + " in " + file + ". Got <" +
 376             toHexString(got) + ">.");
 377 //      throw new RuntimeException("Normalization(" + toString(mode) + ") failed");
 378     }
 379 
 380     /*
 381      * Show an error message when isNormalize() didn't return the expected value.
 382      * (An exception is sometimes convenient. Therefore, it is commented out
 383      * for the moment.)
 384      */
 385     private static void error(Normalizer.Mode mode, String orig,
 386                               String file, int line) throws Exception {
 387         System.err.println("\t" + toString(mode) + ": isNormalized(" +
 388             toHexString(orig) + ") returned the wrong value at line " + line +
 389             " in " + file + ".");
 390 //      throw new RuntimeException("Normalization(" + toString(mode) +") failed");
 391     }
 392 
 393     /*
 394      * (For debugging)
 395      * Shows a message when normalize() returned the expected value.
 396      */
 397     private static void okay(Normalizer.Mode mode,
 398                              String from, String to, String got,
 399                              String file, int line) {
 400         System.out.println("\t" + toString(mode) + ": normalize(" +
 401             toHexString(from) + ") equals <" + toHexString(to) +
 402             "> at line " + line + " in " + file + ". Got <" +
 403             toHexString(got) + ">.");
 404     }
 405 
 406     /*
 407      * (For debugging)
 408      * Shows a message when isNormalized() returned the expected value.
 409      */
 410     private static void okay(Normalizer.Mode mode, String orig,
 411                              String file, int line) {
 412         System.out.println("\t" + toString(mode) + ": isNormalized(" +
 413             toHexString(orig) + ") returned the correct value at line " +
 414             line + " in " + file + ".");
 415     }
 416 
 417     /*
 418      * Returns a spece-delimited hex String
 419      */
 420     private static String toHexString(String s) {
 421         StringBuffer sb = new StringBuffer(" ");
 422 
 423         for (int i = 0; i < s.length(); i++) {
 424             sb.append(Integer.toHexString(s.charAt(i)));
 425             sb.append(' ');
 426         }
 427 
 428         return sb.toString();
 429     }
 430 
 431    /*
 432     * Returns the name of Normalizer.Mode
 433     */
 434     private static String toString(Normalizer.Mode mode) {
 435         if (mode == Normalizer.NFC) {
 436             return "NFC";
 437         } else if (mode == Normalizer.NFD) {
 438             return "NFD";
 439         } else if (mode == Normalizer.NFKC) {
 440             return "NFKC";
 441         } else if (mode == Normalizer.NFKD) {
 442             return "NFKD";
 443         }
 444 
 445         return "unknown";
 446     }
 447 }