/*
 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 * @test
 * @bug 4221795 8032446 8174270
 * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's
 * src/com/ibm/icu/dev/test and modified.
 * @modules java.base/sun.text java.base/jdk.internal.icu.text
 * @library /java/text/testlib
 * @compile -XDignore.symbol.file ICUBasicTest.java
 * @run main/timeout=30 ICUBasicTest
 */

/*
 *******************************************************************************
 * Copyright (C) 1996-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

import sun.text.Normalizer;
import jdk.internal.icu.text.NormalizerBase;

import static java.text.Normalizer.Form.*;

/*
 * Basic sanity tests for the JDK-internal Unicode normalizer. Each public
 * TestXxx method is discovered and run reflectively by the IntlTest harness
 * (from /java/text/testlib); failures are reported via errln().
 */
public class ICUBasicTest extends IntlTest {

    public static void main(String[] args) throws Exception {
        new ICUBasicTest().run(args);
    }

    /*
     * Normalization modes (NormalizerBase API).
     * Note: these constants are kept for reference; the tests below use the
     * java.text.Normalizer.Form values (statically imported above) instead.
     */
    private static final NormalizerBase.Mode NFCmode  = NormalizerBase.NFC;
    private static final NormalizerBase.Mode NFDmode  = NormalizerBase.NFD;
    private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
    private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
    private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;

    /*
     * Normalization options
     */

    /* Normal Unicode versions: normalization may be pinned to Unicode 3.2.0
     * data, or use the latest Unicode version supported by the JDK. */
    private static final int UNICODE_3_2_0  = Normalizer.UNICODE_3_2;
    private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;

    /*
     * Special cases for UAX #15 bug
     * see Unicode Public Review Issue #29
     * at http://www.unicode.org/review/resolved-pri.html#pri29
     *
     * Note:
     * PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
     * different for earlier Unicode versions.
     */
    public void TestComposition() {

        // Each case: (form, Unicode version option, input, expected output).
        // The inputs are the PRI #29 discontiguous-composition cases (Hangul
        // jamo / Oriya vowel sequences interrupted by a combining mark);
        // under the PRI #29 fix they must NOT compose, so input == expected
        // in most rows below.
        final TestCompositionCase cases[] = new TestCompositionCase[] {
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),

            // Here only canonical reordering of the combining marks
            // (U+0300/U+0327) is expected, not composition.
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
        };

        String output;
        int i, length;

        for (i=0; i<cases.length; ++i) {
            // sun.text.Normalizer.normalize takes the extra "options" int,
            // which carries the Unicode-version selector.
            output = Normalizer.normalize(cases[i].input,
                                          cases[i].form, cases[i].options);
            if (!output.equals(cases[i].expect)) {
                errln("unexpected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            } else if (verbose) {
                logln("expected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            }
        }
    }

    /*
     * Simple value holder for one TestComposition row:
     * normalize(input, form, options) is expected to yield "expect".
     */
    private final static class TestCompositionCase {
        public java.text.Normalizer.Form form;
        public int options;
        public String input, expect;

        TestCompositionCase(java.text.Normalizer.Form form,
                            int options,
                            String input,
                            String expect) {
            this.form    = form;
            this.options = options;
            this.input   = input;
            this.expect  = expect;
        }
    }

    /*
     * Added in order to detect a regression.
     *
     * Checks NFD of Tibetan vowel signs: U+0F73 and U+0F75 decompose
     * (U+0F73 -> U+0F71 U+0F72, U+0F75 -> U+0F71 U+0F74 -- see expected
     * string below), and the resulting marks must be put in canonical order.
     */
    public void TestCombiningMarks() {
        String src      = "\u0f71\u0f72\u0f73\u0f74\u0f75";
        String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
        String result   = NormalizerBase.normalize(src, NFD);

        if (!expected.equals(result)) {
            errln("Reordering of combining marks failed. Expected: " +
                  toHexString(expected) + " Got: "+ toHexString(result));
        }
    }

    /*
     * Added in order to detect a regression.
     *
     * The Bengali sequence below is expected to be unchanged by NFC
     * (it is already in composed form).
     */
    public void TestBengali() throws Exception {
        String input = "\u09bc\u09be\u09cd\u09be";
        String output=NormalizerBase.normalize(input, NFC);

        if (!input.equals(output)) {
            errln("ERROR in NFC of string");
        }
        return;
    }


    /*
     * Added in order to detect a regression.
     */
    /**
     * Test for a problem found by Verisign.  Problem is that
     * characters at the start of a string are not put in canonical
     * order correctly by compose() if there is no starter.
     */
    public void TestVerisign() throws Exception {
        // Hebrew combining marks only -- no starter anywhere in the string.
        String[] inputs = {
            "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
            "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
        };
        // With no starter, both NFD and NFC reduce to canonical reordering,
        // so the same expected string is used for both checks below.
        String[] outputs = {
            "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
            "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
        };

        for (int i = 0; i < inputs.length; ++i) {
            String input  = inputs[i];
            String output = outputs[i];

            String result = NormalizerBase.normalize(input, NFD);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      " decompose: " + toHexString(result) + "\n" +
                      "  expected: " + toHexString(output));
            }

            result = NormalizerBase.normalize(input, NFC);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      "   compose: " + toHexString(result) + "\n" +
                      "  expected: " + toHexString(output));
            }
        }
    }

    /**
     * Test for a problem that showed up just before ICU 1.6 release
     * having to do with combining characters with an index of zero.
     * Such characters do not participate in any canonical
     * decompositions.  However, having an index of zero means that
     * they all share one typeMask[] entry, that is, they all have to
     * map to the same canonical class, which is not the case, in
     * reality.
     */
    public void TestZeroIndex() throws Exception {
        String[] DATA = {
            // Expect col1 x COMPOSE_COMPAT => col2
            // Expect col2 x DECOMP => col3
            "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
            "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
            "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
        };

        // Rows are triples; step by 3.
        for (int i=0; i<DATA.length; i+=3) {
            String a = DATA[i];
            String b = NormalizerBase.normalize(a, NFKC);
            String exp = DATA[i+1];

            if (b.equals(exp)) {
                logln("Ok: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b));
            } else {
                errln("FAIL: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b) + ", expect " + toHexString(exp));
            }

            a = NormalizerBase.normalize(b, NFD);
            exp = DATA[i+2];
            if (a.equals(exp)) {
                logln("Ok: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a));
            } else {
                errln("FAIL: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a) + ", expect " + toHexString(exp));
            }
        }
    }

    /**
     * Make sure characters in the CompositionExclusion.txt list do not get
     * composed to.
     */
    public void TestCompositionExclusion() throws Exception {
        // This list is generated from CompositionExclusion.txt.
        // Update whenever the normalizer tables are updated.  Note
        // that we test all characters listed, even those that can be
        // derived from the Unicode DB and are therefore commented
        // out.

        /*
         * kyuka's note:
         * Original data seemed to be based on Unicode 3.0.0(the initial
         * Composition Exclusions list) and seemed to have some mistakes.
         * Updated in order to correct mistakes and to support Unicode 4.0.0.
         * And, this table can be used also for Unicode 3.2.0.
         */
        // Each entry is either a single code point {cp} or an inclusive
        // range {from, to}; supplementary code points are written as
        // surrogate pairs.
        String[][] EXCLUDED_UNICODE_3_2_0 = {
            {"\u0340"},
            {"\u0341"},
            {"\u0343"},
            {"\u0344"},
            {"\u0374"},
            {"\u037E"},
            {"\u0387"},
            {"\u0958"},
            {"\u0959", "\u095F"},
            {"\u09DC"},
            {"\u09DD"},
            {"\u09DF"},
            {"\u0A33"},
            {"\u0A36"},
            {"\u0A59", "\u0A5B"},
            {"\u0A5E"},
            {"\u0B5C"},
            {"\u0B5D"},
            {"\u0F43"},
            {"\u0F4D"},
            {"\u0F52"},
            {"\u0F57"},
            {"\u0F5C"},
            {"\u0F69"},
            {"\u0F73"},
            {"\u0F75"},
            {"\u0F76"},
            {"\u0F78"},
            {"\u0F81"},
            {"\u0F93"},
            {"\u0F9D"},
            {"\u0FA2"},
            {"\u0FA7"},
            {"\u0FAC"},
            {"\u0FB9"},
            {"\u1F71"},
            {"\u1F73"},
            {"\u1F75"},
            {"\u1F77"},
            {"\u1F79"},
            {"\u1F7B"},
            {"\u1F7D"},
            {"\u1FBB"},
            {"\u1FBE"},
            {"\u1FC9"},
            {"\u1FCB"},
            {"\u1FD3"},
            {"\u1FDB"},
            {"\u1FE3"},
            {"\u1FEB"},
            {"\u1FEE"},
            {"\u1FEF"},
            {"\u1FF9"},
            {"\u1FFB"},
            {"\u1FFD"},
            {"\u2000"},
            {"\u2001"},
            {"\u2126"},
            {"\u212A"},
            {"\u212B"},
            {"\u2329"},
            {"\u232A"},
            {"\u2ADC"},
            {"\uF900", "\uFA0D"},
            {"\uFA10"},
            {"\uFA12"},
            {"\uFA15", "\uFA1E"},
            {"\uFA20"},
            {"\uFA22"},
            {"\uFA25"},
            {"\uFA26"},
            {"\uFA2A", "\uFA2D"},
            {"\uFA30", "\uFA6A"},
            {"\uFB1D"},
            {"\uFB1F"},
            {"\uFB2A", "\uFB36"},
            {"\uFB38", "\uFB3C"},
            {"\uFB3E"},
            {"\uFB40"},
            {"\uFB41"},
            {"\uFB43"},
            {"\uFB44"},
            {"\uFB46", "\uFB4E"},
            {"\uD834\uDD5E", "\uD834\uDD64"},
            {"\uD834\uDDBB", "\uD834\uDDC0"},
            {"\uD87E\uDC00", "\uD87E\uDE1D"}
        };

        // NOTE(review): declared but currently empty and never iterated --
        // additions for post-3.2.0 Unicode versions would go here.
        String[][] EXCLUDED_LATEST = {

        };

        for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
            if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
                // Single code point entry.
                checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
            } else {
                // Range entry: expand {from, to} and check every code point.
                int from, to;
                from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
                to   = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);

                for (int j = from; j <= to; j++) {
                    checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
                }
            }
        }
    }

    /*
     * Verifies that the given excluded character does NOT survive a
     * NFKD -> NFC round trip: if c equals the original, the character was
     * (re)composed to, which violates the composition exclusion.
     * Despite the "_320" name, this checks both the latest Unicode version
     * (default normalize overloads) and Unicode 3.2.0 (explicit option).
     */
    private void checkCompositionExclusion_320(String s) throws Exception {
        String a = String.valueOf(s);
        String b = NormalizerBase.normalize(a, NFKD);
        String c = NormalizerBase.normalize(b, NFC);

        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        }

        b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
        c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        }
    }

    /*
     * Tibetan vowel sign U+0F77: excluded from composition; NFD keeps it,
     * while NFKD decomposes it (column 3 is the NFKD expectation).
     */
    public void TestTibetan() throws Exception {
        String[][] decomp = {
            { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
        };
        String[][] compose = {
            { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
        };

        staticTest(NFD, decomp,  1);
        staticTest(NFKD,decomp,  2);
        staticTest(NFC, compose, 1);
        staticTest(NFKC,compose, 2);
    }

    public void TestExplodingBase() throws Exception{
        // \u017f - Latin small letter long s
        // \u0307 - combining dot above
        // \u1e61 - Latin small letter s with dot above
        // \u1e9b - Latin small letter long s with dot above
        String[][] canon = {
            // Input                Decomposed              Composed
            { "Tschu\u017f",        "Tschu\u017f",          "Tschu\u017f"    },
            { "Tschu\u1e9b",        "Tschu\u017f\u0307",    "Tschu\u1e9b"    },
        };
        String[][] compat = {
            // Input                Decomposed              Composed
            { "\u017f",             "s",                    "s"              },
            { "\u1e9b",             "s\u0307",              "\u1e61"         },
        };

        staticTest(NFD,  canon,  1);
        staticTest(NFC,  canon,  2);
        staticTest(NFKD, compat, 1);
        staticTest(NFKC, compat, 2);
    }

    // Shared canonical (NFD/NFC) test data: column 1 = input,
    // column 2 = expected NFD, column 3 = expected NFC.
    private String[][] canonTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },
        { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },

        // D-dot_above
        { "\u1e0a",             "D\u0307",              "\u1e0a"            },

        // D dot_above
        { "D\u0307",            "D\u0307",              "\u1e0a"            },

        // D-dot_below dot_above
        { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D-dot_above dot_below
        { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below dot_above
        { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below cedilla dot_above
        { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"},

        // D dot_above ogonek dot_below
        { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"},

        // E-macron-grave
        { "\u1E14",             "E\u0304\u0300",        "\u1E14"            },

        // E-macron + grave
        { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            },

        // E-grave + macron
        { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      },

        // angstrom_sign
        { "\u212b",             "A\u030a",              "\u00c5"            },

        // A-ring
        { "\u00c5",             "A\u030a",              "\u00c5"            },
        { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },
        { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },

        //updated with 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },
        { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
        { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      },
        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      },

        { "A\u0300\u0316",      "A\u0316\u0300",        "\u00C0\u0316"      },

        // Supplementary characters (surrogate pairs): musical symbols.
        { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
    };

    // Shared compatibility (NFKD/NFKC) test data: column 1 = input,
    // column 2 = expected NFKD, column 3 = expected NFKC.
    private String[][] compatTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },

        // Alef-Lamed vs. Alef, Lamed
        { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     },

        { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },

        // ffi ligature -> f + f + i
        { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        },

        //updated for 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },

        // ffi ligature -> f + f + i
        { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry IV",             "Henry IV"          },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            },

        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
        // ka(Hankaku-Katakana) + ten(Hankaku)
        { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            },
    };

    public void TestNFD() throws Exception{
        staticTest(NFD, canonTests, 1);
    }

    public void TestNFC() throws Exception{
        staticTest(NFC, canonTests, 2);
    }

    public void TestNFKD() throws Exception{
        staticTest(NFKD, compatTests, 1);
    }

    public void TestNFKC() throws Exception{
        staticTest(NFKC, compatTests, 2);
    }

    /*
     * Runs java.text.Normalizer.normalize(input, form) over every row of
     * "tests", comparing against the expected column "outCol"
     * (1 = decomposed, 2 = composed in the tables above).
     */
    private void staticTest(java.text.Normalizer.Form form,
                            String[][] tests,
                            int outCol) throws Exception {
        for (int i = 0; i < tests.length; i++) {
            String input = tests[i][0];
            logln("Normalizing '" + input + "' (" + toHexString(input) + ")" );

            String expect =tests[i][outCol];
            String output = java.text.Normalizer.normalize(input, form);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + toHexString(expect) + ")"
                    + " but got '" + output + "' (" + toHexString(output) + ")"
                );
            }
        }
    }

    // With Canonical decomposition, Hangul syllables should get decomposed
    // into Jamo, but Jamo characters should not be decomposed into
    // conjoining Jamo
    private String[][] hangulCanon = {
        // Input                Decomposed              Composed
        { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"            },
        { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"            },
    };

    public void TestHangulCompose() throws Exception{
        logln("Canonical composition...");
        staticTest(NFC, hangulCanon, 2);
    }

    public void TestHangulDecomp() throws Exception{
        logln("Canonical decomposition...");
        staticTest(NFD, hangulCanon, 1);
    }

}