--- /dev/null 2019-05-22 15:33:53.000000000 -0700 +++ new/test/jdk/java/text/Normalizer/DataValidationTest.java 2019-05-22 15:33:53.000000000 -0700 @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + * test + * bug 4221795 + * summary Confirm *.icu data using ICU4J Normalizer + */ + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.util.BitSet; +import java.util.StringTokenizer; + +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.impl.NormalizerImpl; + +/** + * This is not a test program but a data validation utility. + * Two datafiles for Normalizer, unorm.icu and uprops.icu under + * sun/text/resouces, are generated using generators in ICU4C 3.2 on a + * BIG-ENDIAN machine. Before using them with java.text.Normalizer and + * sun.text.Normalizer, you may want to check these test datafile's validation. + * You can test datafiles using Normalizer in ICU4J 3.2. Download ICU4J 3.2 and + * run this test program with -cp . + */ +public class DataValidationTest { + + // + // Options to be used with com.ibm.icu.text.Normalizer + // + + /* + * Default Unicode 3.2.0 normalization. + * + * - With Corrigendum 4 fix + * (Different from Mustang's Normalizer.) + * - With Public Review Issue #29 fix + * (Different from Mustang's Normalizer.) + */ + private static final int UNICODE_3_2_0 = Normalizer.UNICODE_3_2; + + /* + * *Incomplete* Unicode 3.2.0 normalization for IDNA/StringPrep. + * + * - With Corrigendum 4 fix + * - Without Public Review Issue #29 fix + * + * ICU4J's Normalizer itself doesn't support normalization for Unicode 3.2.0 + * without Corrigendum 4 fix, which is necessary for IDNA/StringPrep. It is + * done in StringPrep. Therefore, we don't test the normlaization in this + * test program. We merely test normalization for Unicode 3.2.0 without + * Public Review Issue #29 fix with this test program. + */ + private static final int UNICODE_3_2_0_BEFORE_PRI_29 = + Normalizer.UNICODE_3_2 | + NormalizerImpl.BEFORE_PRI_29; + + /* + * Default normalization. + * + * - Unicode 4.0.1 + * (Different from Mustang's Normalizer.) + * - With Corrigendum 4 fix + * - With Public Review Issue #29 fix + * (Different from Mustang's Normalizer.) + * + * Because Public Review Issue #29 is fixed in Unicode 4.1.0. I think that + * IUC4J 3.2 should not support it. But it actually supports PRI #29 fix + * as default.... + */ + private static final int UNICODE_LATEST = 0x00; + + /* + * Normalization without Public Review Issue #29 fix. + * + * - Unicode 4.0.1 + * - Without Corrigendum 4 fix + * - Without Public Review Issue #29 fix + */ + static final int UNICODE_LATEST_BEFORE_PRI_29 = + NormalizerImpl.BEFORE_PRI_29; + + // + // Conformance test datafiles + // + + /* + * Conformance test datafile for normalization for Unicode 3.2.0 with + * Corrigendum 4 corrections. This is NOT an original Conformace test + * data. Some inconvenient test cases are commented out. + * About corrigendum 4, please refer + * http://www.unicode.org/versions/corrigendum4.html + * + * ICU4J 3.2's Normalizer itself doesn't support normalization for Unicode + * 3.2.0 without Corrigendum 4 corrections. StringPrep helps it. So, we + * don't test the normalization with this test program. + */ + static final String DATA_3_2_0 = "NormalizationTest-3.2.0.Corrigendum4.txt"; + + /* + * Conformance test datafile for the latest Unicode which is supported + * by J2SE. + */ + static final String DATA_LATEST = "NormalizationTest-Latest.txt"; + + /* + * Decorder + */ + static final CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder(); + + /* + * List to pick up characters which are not listed in Part1 + */ + static BitSet charList = new BitSet(Character.MAX_CODE_POINT+1); + + /* + * Shortcuts + */ + static final Normalizer.Mode NFC = com.ibm.icu.text.Normalizer.NFC; + static final Normalizer.Mode NFD = com.ibm.icu.text.Normalizer.NFD; + static final Normalizer.Mode NFKC = com.ibm.icu.text.Normalizer.NFKC; + static final Normalizer.Mode NFKD = com.ibm.icu.text.Normalizer.NFKD; + static final Normalizer.Mode[] modes = {NFC, NFD, NFKC, NFKD}; + + + public static void main(String[] args) throws Exception { + test(DATA_3_2_0, UNICODE_3_2_0); + test(DATA_3_2_0, UNICODE_3_2_0_BEFORE_PRI_29); + test(DATA_LATEST, UNICODE_LATEST); + // This test started failing since ICU4J 3.6. +// test(DATA_LATEST, UNICODE_LATEST_BEFORE_PRI_29); + + /* Unconformity test */ +// test(DATA_3_2_0, UNICODE_LATEST); +// test(DATA_LATEST, UNICODE_3_2); + } + + private static void test(String filename, int unicodeVer) throws Exception { + + FileInputStream fis = new FileInputStream(filename); + BufferedReader in = + new BufferedReader(new InputStreamReader(fis, decoder)); + + System.out.println("\nStart testing with " + filename + + " for options: " + + (((unicodeVer & Normalizer.UNICODE_3_2) != 0) ? + "Unicode 3.2.0" : "the latest Unicode") + ", " + + (((unicodeVer & NormalizerImpl.BEFORE_PRI_29) != 0) ? + "with" : "without") + " PRI #29 fix"); + + int lineNo = 0; + String text; + String[] columns = new String[6]; + boolean part1test = false; + + while ((text = in.readLine()) != null) { + lineNo ++; + + char c = text.charAt(0); + if (c == '#') { + continue; + } else if (c == '@') { + if (text.startsWith("@Part")) { + System.out.println("# Testing data in " + text); + + if (text.startsWith("@Part1 ")) { + part1test = true; + } else { + part1test = false; + } + + continue; + } + } + + prepareColumns(columns, text, filename, lineNo, part1test); + + testNFC(columns, unicodeVer, filename, lineNo); + testNFD(columns, unicodeVer, filename, lineNo); + testNFKC(columns, unicodeVer, filename, lineNo); + testNFKD(columns, unicodeVer, filename, lineNo); + } + + in.close(); + fis.close(); + + if (unicodeVer == UNICODE_LATEST) { + System.out.println("# Testing characters which are not listed in Part1"); + testRemainingChars(filename, unicodeVer); + } + } + + /* + * Test for NFC + * + * c2 == NFC(c1) == NFC(c2) == NFC(c3) + * c4 == NFC(c4) == NFC(c5) + */ + private static void testNFC(String[] c, int unicodeVer, + String file, int line) throws Exception { + test(2, c, 1, 3, NFC, unicodeVer, file, line); + test(4, c, 4, 5, NFC, unicodeVer, file, line); + } + + /* + * Test for NFD + * + * c3 == NFD(c1) == NFD(c2) == NFD(c3) + * c5 == NFD(c4) == NFD(c5) + */ + private static void testNFD(String[] c, int unicodeVer, + String file, int line) throws Exception { + test(3, c, 1, 3, NFD, unicodeVer, file, line); + test(5, c, 4, 5, NFD, unicodeVer, file, line); + } + + /* + * Test for NFKC + * + * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) + */ + private static void testNFKC(String[] c, int unicodeVer, + String file, int line) throws Exception { + test(4, c, 1, 5, NFKC, unicodeVer, file, line); + } + + /* + * Test for NFKD + * + * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) + */ + private static void testNFKD(String[] c, int unicodeVer, + String file, int line) throws Exception { + test(5, c, 1, 5, NFKD, unicodeVer, file, line); + } + + /* + * Test for characters which aren't listed in Part1 + * + * X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) + */ + private static void testRemainingChars(String file, + int unicodeVer) throws Exception { + for (int i = Character.MIN_CODE_POINT; + i <= Character.MAX_CODE_POINT; + i++) { + if (!charList.get(i)) { + String from = String.valueOf(Character.toChars(i)); + String to; + + for (int j = 0; j < modes.length; j++) { + Normalizer.Mode mode = modes[j]; + + to = Normalizer.normalize(from, mode, unicodeVer); + if (!from.equals(to)) { + error(mode, from, from, to, file, -1); +// } else { +// okay(mode, from, from, to, file, -1); + } + + if (!Normalizer.isNormalized(from, mode, unicodeVer)) { + error(mode, from, file, -1); +// } else { +// okay(mode, from, file, -1); + } + } + } + } + } + + /* + * Test normalize() and isNormalized() + */ + private static void test(int col, String[] c, + int FROM, int TO, + Normalizer.Mode mode, int unicodeVer, + String file, int line) throws Exception { + for (int i = FROM; i <= TO; i++) { + String got = Normalizer.normalize(c[i], mode, unicodeVer); + if (!c[col].equals(got)) { + error(mode, c[i], c[col], got, file, line); +// } else { +// okay(mode, c[i], c[col], got, file, line); + } + + /* + * If the original String equals its normalized String, it means + * that the original String is normalizerd. Thus, isNormalized() + * should return true. And, vice versa! + */ + if (c[col].equals(c[i])) { + if (!Normalizer.isNormalized(c[i], mode, unicodeVer)) { + error(mode, c[i], file, line); +// } else { +// okay(mode, c[i], file, line); + } + } else { + if (Normalizer.isNormalized(c[i], mode, unicodeVer)) { + error(mode, c[i], file, line); +// } else { +// okay(mode, c[i], file, line); + } + } + } + } + + /* + * Generate an array of String from a line of conformance datafile. + */ + private static void prepareColumns(String[] col, String text, + String file, int line, + boolean part1test) throws Exception { + int index = text.indexOf('#'); + if (index != -1) { + text = text.substring(0, index); + } + + StringTokenizer st = new StringTokenizer(text, ";"); + int tokenCount = st.countTokens(); + if (tokenCount < 5) { + throw new RuntimeException("# of tokens in datafile should be 6, but got: " + tokenCount + " at line " + line + " in " + file); + } + + StringBuffer sb = new StringBuffer(); + for (int i = 1; i <= 5; i++) { + StringTokenizer tst = new StringTokenizer(st.nextToken(), " "); + + while (tst.hasMoreTokens()) { + int code = Integer.parseInt(tst.nextToken(), 16); + sb.append(Character.toChars(code)); + } + + col[i] = sb.toString(); + sb.setLength(0); + } + + if (part1test) { + charList.set(col[1].codePointAt(0)); + } + } + + /* + * Show an error message when normalize() didn't return the expected value. + * (An exception is sometimes convenient. Therefore, it is commented out + * for the moment.) + */ + private static void error(Normalizer.Mode mode, + String from, String to, String got, + String file, int line) throws Exception { + System.err.println("\t" + toString(mode) + ": normalize(" + + toHexString(from) + ") doesn't equal <" + toHexString(to) + + "> at line " + line + " in " + file + ". Got <" + + toHexString(got) + ">."); +// throw new RuntimeException("Normalization(" + toString(mode) + ") failed"); + } + + /* + * Show an error message when isNormalize() didn't return the expected value. + * (An exception is sometimes convenient. Therefore, it is commented out + * for the moment.) + */ + private static void error(Normalizer.Mode mode, String orig, + String file, int line) throws Exception { + System.err.println("\t" + toString(mode) + ": isNormalized(" + + toHexString(orig) + ") returned the wrong value at line " + line + + " in " + file + "."); +// throw new RuntimeException("Normalization(" + toString(mode) +") failed"); + } + + /* + * (For debugging) + * Shows a message when normalize() returned the expected value. + */ + private static void okay(Normalizer.Mode mode, + String from, String to, String got, + String file, int line) { + System.out.println("\t" + toString(mode) + ": normalize(" + + toHexString(from) + ") equals <" + toHexString(to) + + "> at line " + line + " in " + file + ". Got <" + + toHexString(got) + ">."); + } + + /* + * (For debugging) + * Shows a message when isNormalized() returned the expected value. + */ + private static void okay(Normalizer.Mode mode, String orig, + String file, int line) { + System.out.println("\t" + toString(mode) + ": isNormalized(" + + toHexString(orig) + ") returned the correct value at line " + + line + " in " + file + "."); + } + + /* + * Returns a spece-delimited hex String + */ + private static String toHexString(String s) { + StringBuffer sb = new StringBuffer(" "); + + for (int i = 0; i < s.length(); i++) { + sb.append(Integer.toHexString(s.charAt(i))); + sb.append(' '); + } + + return sb.toString(); + } + + /* + * Returns the name of Normalizer.Mode + */ + private static String toString(Normalizer.Mode mode) { + if (mode == Normalizer.NFC) { + return "NFC"; + } else if (mode == Normalizer.NFD) { + return "NFD"; + } else if (mode == Normalizer.NFKC) { + return "NFKC"; + } else if (mode == Normalizer.NFKD) { + return "NFKD"; + } + + return "unknown"; + } +}