--- /dev/null	2019-05-22 15:33:55.000000000 -0700
+++ new/test/jdk/java/text/Normalizer/ICUBasicTest.java	2019-05-22 15:33:54.000000000 -0700
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 4221795 8032446
+ * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
+ * src/com/ibm/icu/dev/test and modified.
+ * @modules java.base/sun.text java.base/sun.text.normalizer
+ * @library /java/text/testlib
+ * @compile -XDignore.symbol.file ICUBasicTest.java
+ * @run main/timeout=30 ICUBasicTest
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2004, International Business Machines Corporation and   *
+ * others. All Rights Reserved.                                               *
+ *******************************************************************************
+ */
+
+import sun.text.Normalizer;
+import sun.text.normalizer.NormalizerBase;
+import sun.text.normalizer.NormalizerImpl;
+
+import static java.text.Normalizer.Form.*;
+import static sun.text.normalizer.NormalizerBase.Mode.*;
+
+public class ICUBasicTest extends IntlTest {
+
+    public static void main(String[] args) throws Exception {
+        new ICUBasicTest().run(args);
+    }
+
+    /*
+     * Normalization modes
+     */
+    private static final NormalizerBase.Mode NFCmode  = NormalizerBase.NFC;
+    private static final NormalizerBase.Mode NFDmode  = NormalizerBase.NFD;
+    private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
+    private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
+    private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;
+
+    /*
+     * Normalization options
+     */
+
+    /* Normal Unicode versions */
+    private static final int UNICODE_3_2_0  = Normalizer.UNICODE_3_2;
+    private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;
+
+    /*
+     * Special cases for UAX #15 bug
+     * see Unicode Public Review Issue #29
+     * at http://www.unicode.org/review/resolved-pri.html#pri29
+     *
+     * Note:
+     *   PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
+     *   different for earlier Unicode versions.
+ */
+    public void TestComposition() {
+
+        final TestCompositionCase cases[] = new TestCompositionCase[] {
+            new TestCompositionCase(NFC, UNICODE_3_2_0,
+                "\u1100\u0300\u1161\u0327",
+                "\u1100\u0300\u1161\u0327"),
+            new TestCompositionCase(NFC, UNICODE_LATEST,
+                "\u1100\u0300\u1161\u0327",
+                "\u1100\u0300\u1161\u0327"),
+
+            new TestCompositionCase(NFC, UNICODE_3_2_0,
+                "\u1100\u0300\u1161\u0327\u11a8",
+                "\u1100\u0300\u1161\u0327\u11a8"),
+            new TestCompositionCase(NFC, UNICODE_LATEST,
+                "\u1100\u0300\u1161\u0327\u11a8",
+                "\u1100\u0300\u1161\u0327\u11a8"),
+
+            new TestCompositionCase(NFC, UNICODE_3_2_0,
+                "\uac00\u0300\u0327\u11a8",
+                "\uac00\u0327\u0300\u11a8"),
+            new TestCompositionCase(NFC, UNICODE_LATEST,
+                "\uac00\u0300\u0327\u11a8",
+                "\uac00\u0327\u0300\u11a8"),
+
+            new TestCompositionCase(NFC, UNICODE_3_2_0,
+                "\u0b47\u0300\u0b3e",
+                "\u0b47\u0300\u0b3e"),
+            new TestCompositionCase(NFC, UNICODE_LATEST,
+                "\u0b47\u0300\u0b3e",
+                "\u0b47\u0300\u0b3e"),
+        };
+
+        String output;
+        int i, length;
+
+        for (i=0; i<cases.length; ++i) {
+            output = Normalizer.normalize(cases[i].input,
+                                          cases[i].form, cases[i].options);
+            if (!output.equals(cases[i].expect)) {
+                errln("unexpected result for case " + i + ". Expected="
+                      + cases[i].expect + ", Actual=" + output);
+            } else if (verbose) {
+                logln("expected result for case " + i + ". Expected="
+                      + cases[i].expect + ", Actual=" + output);
+            }
+        }
+    }
+
+    private final static class TestCompositionCase {
+        public java.text.Normalizer.Form form;
+        public int options;
+        public String input, expect;
+
+        TestCompositionCase(java.text.Normalizer.Form form,
+                            int options,
+                            String input,
+                            String expect) {
+            this.form    = form;
+            this.options = options;
+            this.input   = input;
+            this.expect  = expect;
+        }
+    }
+
+    /**
+     * Test for a problem that showed up just before ICU 1.6 release
+     * having to do with combining characters with an index of zero.
+     * Such characters do not participate in any canonical
+     * decompositions.
+     */
+    public void TestZeroIndex() throws Exception {
+        String[] DATA = {
+            // Expect col1 x COMPOSE_COMPAT => col2
+            // Expect col2 x DECOMP => col3
+            "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
+            "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
+            "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
+            "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
+            "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
+        };
+
+        for (int i=0; i<DATA.length; i+=3) {
+            String a = DATA[i];
+            String b = NormalizerBase.normalize(a, NFKC);
+            String exp = DATA[i+1];
+
+            if (b.equals(exp)) {
+                logln("Ok: " + toHexString(a) + " x COMPOSE_COMPAT => " +
+                      toHexString(b));
+            } else {
+                errln("FAIL: " + toHexString(a) + " x COMPOSE_COMPAT => " +
+                      toHexString(b) + ", expect " + toHexString(exp));
+            }
+
+            a = NormalizerBase.normalize(b, NFD);
+            exp = DATA[i+2];
+            if (a.equals(exp)) {
+                logln("Ok: " + toHexString(b) + " x DECOMP => " +
+                      toHexString(a));
+            } else {
+                errln("FAIL: " + toHexString(b) + " x DECOMP => " +
+                      toHexString(a) + ", expect " + toHexString(exp));
+            }
+        }
+    }
+
+    /**
+     * Make sure characters in the CompositionExclusion.txt list do not get
+     * composed to.
+     */
+    public void TestCompositionExclusion() throws Exception {
+        // This list is generated from CompositionExclusion.txt.
+        // Update whenever the normalizer tables are updated. Note
+        // that we test all characters listed, even those that can be
+        // derived from the Unicode DB and are therefore commented
+        // out.
+
+        /*
+         * kyuka's note:
+         *   Original data seemed to be based on Unicode 3.0.0 (the initial
+         *   Composition Exclusions list) and seemed to have some mistakes.
+         *   Updated in order to correct mistakes and to support Unicode 4.0.0.
+         *   And, this table can be used also for Unicode 3.2.0.
+         */
+        String[][] EXCLUDED_UNICODE_3_2_0 = {
+            {"\u0340"},
+            {"\u0341"},
+            {"\u0343"},
+            {"\u0344"},
+            {"\u0374"},
+            {"\u037E"},
+            {"\u0387"},
+            {"\u0958"},
+            {"\u0959", "\u095F"},
+            {"\u09DC"},
+            {"\u09DD"},
+            {"\u09DF"},
+            {"\u0A33"},
+            {"\u0A36"},
+            {"\u0A59", "\u0A5B"},
+            {"\u0A5E"},
+            {"\u0B5C"},
+            {"\u0B5D"},
+            {"\u0F43"},
+            {"\u0F4D"},
+            {"\u0F52"},
+            {"\u0F57"},
+            {"\u0F5C"},
+            {"\u0F69"},
+            {"\u0F73"},
+            {"\u0F75"},
+            {"\u0F76"},
+            {"\u0F78"},
+            {"\u0F81"},
+            {"\u0F93"},
+            {"\u0F9D"},
+            {"\u0FA2"},
+            {"\u0FA7"},
+            {"\u0FAC"},
+            {"\u0FB9"},
+            {"\u1F71"},
+            {"\u1F73"},
+            {"\u1F75"},
+            {"\u1F77"},
+            {"\u1F79"},
+            {"\u1F7B"},
+            {"\u1F7D"},
+            {"\u1FBB"},
+            {"\u1FBE"},
+            {"\u1FC9"},
+            {"\u1FCB"},
+            {"\u1FD3"},
+            {"\u1FDB"},
+            {"\u1FE3"},
+            {"\u1FEB"},
+            {"\u1FEE"},
+            {"\u1FEF"},
+            {"\u1FF9"},
+            {"\u1FFB"},
+            {"\u1FFD"},
+            {"\u2000"},
+            {"\u2001"},
+            {"\u2126"},
+            {"\u212A"},
+            {"\u212B"},
+            {"\u2329"},
+            {"\u232A"},
+            {"\u2ADC"},
+            {"\uF900", "\uFA0D"},
+            {"\uFA10"},
+            {"\uFA12"},
+            {"\uFA15", "\uFA1E"},
+            {"\uFA20"},
+            {"\uFA22"},
+            {"\uFA25"},
+            {"\uFA26"},
+            {"\uFA2A", "\uFA2D"},
+            {"\uFA30", "\uFA6A"},
+            {"\uFB1D"},
+            {"\uFB1F"},
+            {"\uFB2A", "\uFB36"},
+            {"\uFB38", "\uFB3C"},
+            {"\uFB3E"},
+            {"\uFB40"},
+            {"\uFB41"},
+            {"\uFB43"},
+            {"\uFB44"},
+            {"\uFB46", "\uFB4E"},
+            {"\uD834\uDD5E", "\uD834\uDD64"},
+            {"\uD834\uDDBB", "\uD834\uDDC0"},
+            {"\uD87E\uDC00", "\uD87E\uDE1D"}
+        };
+
+        String[][] EXCLUDED_LATEST = {
+
+        };
+
+        for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
+            if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
+                checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
+            } else {
+                int from, to;
+                from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
+                to   = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);
+
+                for (int j = from; j <= to; j++) {
+                    checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
+                }
+            }
+        }
+    }
+
+    private void checkCompositionExclusion_320(String s) throws Exception {
+        String a = String.valueOf(s);
+        String b = NormalizerBase.normalize(a, NFKD);
+        String c = NormalizerBase.normalize(b, NFC);
+
+        if (c.equals(a)) {
+            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
+                  toHexString(b) + " x COMPOSE => " +
+                  toHexString(c) + " for the latest Unicode");
+        } else if (verbose) {
+            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
+                  toHexString(b) + " x COMPOSE => " +
+                  toHexString(c) + " for the latest Unicode");
+        }
+
+        b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
+        c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
+        if (c.equals(a)) {
+            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
+                  toHexString(b) + " x COMPOSE => " +
+                  toHexString(c) + " for Unicode 3.2.0");
+        } else if (verbose) {
+            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
+                  toHexString(b) + " x COMPOSE => " +
+                  toHexString(c) + " for Unicode 3.2.0");
+        }
+    }
+
+    public void TestTibetan() throws Exception {
+        String[][] decomp = {
+            { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
+        };
+        String[][] compose = {
+            { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
+        };
+
+        staticTest(NFD,  decomp, 1);
+        staticTest(NFKD, decomp, 2);
+        staticTest(NFC,  compose, 1);
+        staticTest(NFKC, compose, 2);
+    }
+
+    public void TestExplodingBase() throws Exception {
+        // \u017f - Latin small letter long s
+        // \u0307 - combining dot above
+        // \u1e61 - Latin small letter s with dot above
+        // \u1e9b - Latin small letter long s with dot above
+        String[][] canon = {
+            // Input                Decomposed              Composed
{ "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, + { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, + }; + String[][] compat = { + // Input Decomposed Composed + { "\u017f", "s", "s" }, + { "\u1e9b", "s\u0307", "\u1e61" }, + }; + + staticTest(NFD, canon, 1); + staticTest(NFC, canon, 2); + staticTest(NFKD, compat, 1); + staticTest(NFKC, compat, 2); + } + + private String[][] canonTests = { + // Input Decomposed Composed + + { "cat", "cat", "cat" }, + { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, + + // D-dot_above + { "\u1e0a", "D\u0307", "\u1e0a" }, + + // D dot_above + { "D\u0307", "D\u0307", "\u1e0a" }, + + // D-dot_below dot_above + { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, + + // D-dot_above dot_below + { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, + + // D dot_below dot_above + { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, + + // D dot_below cedilla dot_above + { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, + + // D dot_above ogonek dot_below + { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, + + // E-macron-grave + { "\u1E14", "E\u0304\u0300", "\u1E14" }, + + // E-macron + grave + { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, + + // E-grave + macron + { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, + + // angstrom_sign + { "\u212b", "A\u030a", "\u00c5" }, + + // A-ring + { "\u00c5", "A\u030a", "\u00c5" }, + { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, + { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, + + //updated with 3.0 + { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, + { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, + + { "Henry IV", "Henry IV", "Henry IV" }, + { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, + + // ga(Zenkaku-Katakana) + { "\u30AC", "\u30AB\u3099", "\u30AC" }, + + // ka(Zenkaku-Katakana) + ten(Zenkaku) + { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, + + // ka(Hankaku-Katakana) + ten(Hankaku-Katakana) + { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, + + // ka(Zenkaku-Katakana) + ten(Hankaku) + { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, + // ka(Hankaku-Katakana) + ten(Zenkaku) + { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, + + { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, + + { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e", + "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65", + "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" }, + }; + + private String[][] compatTests = { + // Input Decomposed Composed + + { "cat", "cat", "cat" }, + + // Alef-Lamed vs. 
+        // Alef-Lamed vs. Alef, Lamed
+        { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", },
+
+        { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
+
+        // ffi ligature -> f + f + i
+        { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" },
+
+        // updated for 3.0
+        { "\u00fdffin", "y\u0301ffin", "\u00fdffin" },
+
+        // ffi ligature -> f + f + i
+        { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" },
+
+        { "Henry IV", "Henry IV", "Henry IV" },
+        { "Henry \u2163", "Henry IV", "Henry IV" },
+
+        // ga(Zenkaku-Katakana)
+        { "\u30AC", "\u30AB\u3099", "\u30AC" },
+
+        // ka(Zenkaku-Katakana) + ten(Zenkaku)
+        { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" },
+
+        // ka(Hankaku-Katakana) + ten(Zenkaku)
+        { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" },
+
+        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later */
+        // ka(Hankaku-Katakana) + ten(Hankaku)
+        { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" },
+
+        // ka(Zenkaku-Katakana) + ten(Hankaku)
+        { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" },
+    };
+
+    public void TestNFD() throws Exception {
+        staticTest(NFD, canonTests, 1);
+    }
+
+    public void TestNFC() throws Exception {
+        staticTest(NFC, canonTests, 2);
+    }
+
+    public void TestNFKD() throws Exception {
+        staticTest(NFKD, compatTests, 1);
+    }
+
+    public void TestNFKC() throws Exception {
+        staticTest(NFKC, compatTests, 2);
+    }
+
+    private void staticTest(java.text.Normalizer.Form form,
+                            String[][] tests,
+                            int outCol) throws Exception {
+        for (int i = 0; i < tests.length; i++) {
+            String input = tests[i][0];
+            logln("Normalizing '" + input + "' (" + toHexString(input) + ")");
+
+            String expect = tests[i][outCol];
+            String output = java.text.Normalizer.normalize(input, form);
+
+            if (!output.equals(expect)) {
+                errln("FAIL: case " + i
+                      + " expected '" + expect + "' (" + toHexString(expect) + ")"
+                      + " but got '" + output + "' (" + toHexString(output) + ")");
+            }
+        }
+    }
+
+    // With Canonical decomposition, Hangul syllables should get decomposed
+    // into Jamo, but Jamo characters should not be decomposed into
+    // conjoining Jamo
+    private String[][] hangulCanon = {
+        // Input                 Decomposed              Composed
+        { "\ud4db",              "\u1111\u1171\u11b6",   "\ud4db" },
+        { "\u1111\u1171\u11b6",  "\u1111\u1171\u11b6",   "\ud4db" },
+    };
+
+    public void TestHangulCompose() throws Exception {
+        logln("Canonical composition...");
+        staticTest(NFC, hangulCanon, 2);
+    }
+
+    public void TestHangulDecomp() throws Exception {
+        logln("Canonical decomposition...");
+        staticTest(NFD, hangulCanon, 1);
+    }
+
+}
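
For reference, the normalization behavior that staticTest checks against the data tables can be reproduced with the public java.text.Normalizer API alone. Below is a minimal standalone sketch, not part of the patch itself; the class name NormalizeDemo and the hex helper are illustrative, and the inputs come from the canonTests and compatTests tables above:

    import java.text.Normalizer;
    import java.text.Normalizer.Form;

    public class NormalizeDemo {
        public static void main(String[] args) {
            // canonTests row: ANGSTROM SIGN (U+212B) decomposes canonically to
            // A + COMBINING RING ABOVE, and composes to U+00C5 (A-ring).
            String angstrom = "\u212b";
            System.out.println(hex(Normalizer.normalize(angstrom, Form.NFD))); // 0041 030a
            System.out.println(hex(Normalizer.normalize(angstrom, Form.NFC))); // 00c5

            // compatTests row: ROMAN NUMERAL FOUR (U+2163) is a compatibility
            // character, so NFKD (but not NFD) maps it to the letters "IV".
            String roman = "Henry \u2163";
            System.out.println(Normalizer.normalize(roman, Form.NFD).equals(roman)); // true
            System.out.println(Normalizer.normalize(roman, Form.NFKD));              // Henry IV
        }

        // Prints each UTF-16 code unit as four hex digits, comparable to the
        // toHexString helper the test inherits from IntlTest.
        private static String hex(String s) {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < s.length(); i++) {
                if (i > 0) sb.append(' ');
                sb.append(String.format("%04x", (int) s.charAt(i)));
            }
            return sb.toString();
        }
    }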