1 /* 2 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 /* 24 * @test 25 * @bug 4221795 8032446 26 * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's 27 * src/com/ibm/icu/dev/test and modified. 28 * @modules java.base/sun.text java.base/sun.text.normalizer 29 * @library /java/text/testlib 30 * @compile -XDignore.symbol.file ICUBasicTest.java 31 * @run main/timeout=30 ICUBasicTest 32 */ 33 34 /* 35 ******************************************************************************* 36 * Copyright (C) 1996-2004, International Business Machines Corporation and * 37 * others. All Rights Reserved. 
 *******************************************************************************
 */

import sun.text.Normalizer;
import sun.text.normalizer.NormalizerBase;
import sun.text.normalizer.NormalizerImpl;

import static java.text.Normalizer.Form.*;
import static sun.text.normalizer.NormalizerBase.Mode.*;

/*
 * Basic sanity tests for the JDK-internal Unicode normalizer, ported from
 * ICU4J's test suite. Each TestXxx method is discovered and invoked by the
 * IntlTest harness (from /java/text/testlib); errln() records a failure,
 * logln() records verbose-only output.
 * NOTE(review): IntlTest and the 'verbose' flag are defined outside this
 * file — behavior of those helpers is assumed, not verified here.
 */
public class ICUBasicTest extends IntlTest {

    public static void main(String[] args) throws Exception {
        new ICUBasicTest().run(args);
    }

    /*
     * Normalization modes
     */
    private static final NormalizerBase.Mode NFCmode  = NormalizerBase.NFC;
    private static final NormalizerBase.Mode NFDmode  = NormalizerBase.NFD;
    private static final NormalizerBase.Mode NFKCmode = NormalizerBase.NFKC;
    private static final NormalizerBase.Mode NFKDmode = NormalizerBase.NFKD;
    private static final NormalizerBase.Mode NONEmode = NormalizerBase.NONE;

    /*
     * Normalization options
     */

    /* Normal Unicode versions */
    private static final int UNICODE_3_2_0  = Normalizer.UNICODE_3_2;
    private static final int UNICODE_LATEST = NormalizerBase.UNICODE_LATEST;

    /*
     * Special cases for UAX #15 bug
     * see Unicode Public Review Issue #29
     * at http://www.unicode.org/review/resolved-pri.html#pri29
     *
     * Note:
     * PRI #29 is supported in Unicode 4.1.0. Therefore, expected results are
     * different for earlier Unicode versions.
     */
    public void TestComposition() {

        // Each case: (form, Unicode version option, input, expected output).
        // The same input is checked under both Unicode 3.2.0 and the latest
        // supported version; for these PRI #29 sequences the expected results
        // happen to coincide.
        final TestCompositionCase cases[] = new TestCompositionCase[] {
            // L, V/T with intervening combining marks: must NOT compose
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327",
                "\u1100\u0300\u1161\u0327"),

            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u1100\u0300\u1161\u0327\u11a8",
                "\u1100\u0300\u1161\u0327\u11a8"),

            // Combining marks after a precomposed Hangul syllable get
            // canonically reordered but do not block each other
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\uac00\u0300\u0327\u11a8",
                "\uac00\u0327\u0300\u11a8"),

            // Oriya vowel signs separated by a combining mark: no composition
            new TestCompositionCase(NFC, UNICODE_3_2_0,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
            new TestCompositionCase(NFC, UNICODE_LATEST,
                "\u0b47\u0300\u0b3e",
                "\u0b47\u0300\u0b3e"),
        };

        String output;
        int i, length;

        for (i=0; i<cases.length; ++i) {
            output = Normalizer.normalize(cases[i].input,
                                          cases[i].form, cases[i].options);
            if (!output.equals(cases[i].expect)) {
                errln("unexpected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            } else if (verbose) {
                logln("expected result for case " + i + ". Expected="
                      + cases[i].expect + ", Actual=" + output);
            }
        }
    }

    // Simple value holder for one TestComposition case: a normalization
    // form, a Unicode-version option, an input string and its expectation.
    private final static class TestCompositionCase {
        public java.text.Normalizer.Form form;
        public int options;
        public String input, expect;

        TestCompositionCase(java.text.Normalizer.Form form,
                            int options,
                            String input,
                            String expect) {
            this.form    = form;
            this.options = options;
            this.input   = input;
            this.expect  = expect;
        }
    }

    /*
     * Added in order to detect a regression.
     * NFD of Tibetan vowel signs: U+0F73 and U+0F75 decompose, and the
     * resulting combining marks must end up in canonical order.
     */
    public void TestCombiningMarks() {
        String src      = "\u0f71\u0f72\u0f73\u0f74\u0f75";
        String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
        String result   = NormalizerBase.normalize(src, NFD);

        if (!expected.equals(result)) {
            errln("Reordering of combining marks failed. Expected: " +
                  toHexString(expected) + " Got: "+ toHexString(result));
        }
    }

    /*
     * Added in order to detect a regression.
     * This Bengali sequence is already in NFC; normalizing must be a no-op.
     */
    public void TestBengali() throws Exception {
        String input  = "\u09bc\u09be\u09cd\u09be";
        String output = NormalizerBase.normalize(input, NFC);

        if (!input.equals(output)) {
            errln("ERROR in NFC of string");
        }
        return;
    }


    /*
     * Added in order to detect a regression.
     */
    /**
     * Test for a problem found by Verisign.  Problem is that
     * characters at the start of a string are not put in canonical
     * order correctly by compose() if there is no starter.
     * (The inputs are all-combining-mark Hebrew point sequences; both NFD
     * and NFC should produce the same canonically reordered output.)
     */
    public void TestVerisign() throws Exception {
        String[] inputs = {
            "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
            "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
        };
        String[] outputs = {
            "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
            "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
        };

        for (int i = 0; i < inputs.length; ++i) {
            String input  = inputs[i];
            String output = outputs[i];

            // With no starter present, decomposition reduces to reordering.
            String result = NormalizerBase.normalize(input, NFD);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      " decompose: " + toHexString(result) + "\n" +
                      " expected: " + toHexString(output));
            }

            // Composition cannot compose anything here either; it must still
            // produce the canonically ordered sequence.
            result = NormalizerBase.normalize(input, NFC);
            if (!result.equals(output)) {
                errln("FAIL input: " + toHexString(input) + "\n" +
                      " compose: " + toHexString(result) + "\n" +
                      " expected: " + toHexString(output));
            }
        }
    }

    /**
     * Test for a problem that showed up just before ICU 1.6 release
     * having to do with combining characters with an index of zero.
     * Such characters do not participate in any canonical
     * decompositions.  However, having an index of zero means that
     * they all share one typeMask[] entry, that is, they all have to
     * map to the same canonical class, which is not the case, in
     * reality.
     */
    public void TestZeroIndex() throws Exception {
        String[] DATA = {
            // Expect col1 x COMPOSE_COMPAT => col2
            // Expect col2 x DECOMP => col3
            "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
            "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
            "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
            "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
        };

        for (int i=0; i<DATA.length; i+=3) {
            String a = DATA[i];
            String b = NormalizerBase.normalize(a, NFKC);
            String exp = DATA[i+1];

            if (b.equals(exp)) {
                logln("Ok: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b));
            } else {
                errln("FAIL: " + toHexString(a) + " x COMPOSE_COMPAT => " +
                      toHexString(b) + ", expect " + toHexString(exp));
            }

            // Round-trip the composed result back through NFD.
            a = NormalizerBase.normalize(b, NFD);
            exp = DATA[i+2];
            if (a.equals(exp)) {
                logln("Ok: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a));
            } else {
                errln("FAIL: " + toHexString(b) + " x DECOMP => " +
                      toHexString(a) + ", expect " + toHexString(exp));
            }
        }
    }

    /**
     * Make sure characters in the CompositionExclusion.txt list do not get
     * composed to.
     */
    public void TestCompositionExclusion() throws Exception {
        // This list is generated from CompositionExclusion.txt.
        // Update whenever the normalizer tables are updated.  Note
        // that we test all characters listed, even those that can be
        // derived from the Unicode DB and are therefore commented
        // out.

        /*
         * kyuka's note:
         * Original data seemed to be based on Unicode 3.0.0(the initial
         * Composition Exclusions list) and seemed to have some mistakes.
         * Updated in order to correct mistakes and to support Unicode 4.0.0.
         * And, this table can be used also for Unicode 3.2.0.
         */
        // Each entry is either {single character} or {rangeStart, rangeEnd}
        // (inclusive); supplementary characters appear as surrogate pairs.
        String[][] EXCLUDED_UNICODE_3_2_0 = {
            {"\u0340"},
            {"\u0341"},
            {"\u0343"},
            {"\u0344"},
            {"\u0374"},
            {"\u037E"},
            {"\u0387"},
            {"\u0958"},
            {"\u0959", "\u095F"},
            {"\u09DC"},
            {"\u09DD"},
            {"\u09DF"},
            {"\u0A33"},
            {"\u0A36"},
            {"\u0A59", "\u0A5B"},
            {"\u0A5E"},
            {"\u0B5C"},
            {"\u0B5D"},
            {"\u0F43"},
            {"\u0F4D"},
            {"\u0F52"},
            {"\u0F57"},
            {"\u0F5C"},
            {"\u0F69"},
            {"\u0F73"},
            {"\u0F75"},
            {"\u0F76"},
            {"\u0F78"},
            {"\u0F81"},
            {"\u0F93"},
            {"\u0F9D"},
            {"\u0FA2"},
            {"\u0FA7"},
            {"\u0FAC"},
            {"\u0FB9"},
            {"\u1F71"},
            {"\u1F73"},
            {"\u1F75"},
            {"\u1F77"},
            {"\u1F79"},
            {"\u1F7B"},
            {"\u1F7D"},
            {"\u1FBB"},
            {"\u1FBE"},
            {"\u1FC9"},
            {"\u1FCB"},
            {"\u1FD3"},
            {"\u1FDB"},
            {"\u1FE3"},
            {"\u1FEB"},
            {"\u1FEE"},
            {"\u1FEF"},
            {"\u1FF9"},
            {"\u1FFB"},
            {"\u1FFD"},
            {"\u2000"},
            {"\u2001"},
            {"\u2126"},
            {"\u212A"},
            {"\u212B"},
            {"\u2329"},
            {"\u232A"},
            {"\u2ADC"},
            {"\uF900", "\uFA0D"},
            {"\uFA10"},
            {"\uFA12"},
            {"\uFA15", "\uFA1E"},
            {"\uFA20"},
            {"\uFA22"},
            {"\uFA25"},
            {"\uFA26"},
            {"\uFA2A", "\uFA2D"},
            {"\uFA30", "\uFA6A"},
            {"\uFB1D"},
            {"\uFB1F"},
            {"\uFB2A", "\uFB36"},
            {"\uFB38", "\uFB3C"},
            {"\uFB3E"},
            {"\uFB40"},
            {"\uFB41"},
            {"\uFB43"},
            {"\uFB44"},
            {"\uFB46", "\uFB4E"},
            {"\uD834\uDD5E", "\uD834\uDD64"},
            {"\uD834\uDDBB", "\uD834\uDDC0"},
            {"\uD87E\uDC00", "\uD87E\uDE1D"}
        };

        // NOTE(review): declared but currently unused — presumably a
        // placeholder for exclusions added after Unicode 3.2.0.
        String[][] EXCLUDED_LATEST = {

        };

        for (int i = 0; i < EXCLUDED_UNICODE_3_2_0.length; ++i) {
            if (EXCLUDED_UNICODE_3_2_0[i].length == 1) {
                checkCompositionExclusion_320(EXCLUDED_UNICODE_3_2_0[i][0]);
            } else {
                // Expand an inclusive code-point range and test each member.
                int from, to;
                from = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][0], 0);
                to   = Character.codePointAt(EXCLUDED_UNICODE_3_2_0[i][1], 0);

                for (int j = from; j <= to; j++) {
                    checkCompositionExclusion_320(String.valueOf(Character.toChars(j)));
                }
            }
        }
    }

    /*
     * Verifies that a composition-excluded character does NOT come back
     * after a compatibility decomposition followed by composition
     * (NFKD then NFC must not reproduce the original character),
     * both for the latest Unicode version and for Unicode 3.2.0.
     */
    private void checkCompositionExclusion_320(String s) throws Exception {
        String a = String.valueOf(s);
        String b = NormalizerBase.normalize(a, NFKD);
        String c = NormalizerBase.normalize(b, NFC);

        // c.equals(a) means the excluded character was composed to — a bug.
        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for the latest Unicode");
        }

        // Same check against the Unicode 3.2.0 normalization tables.
        b = NormalizerBase.normalize(a, NFKD, Normalizer.UNICODE_3_2);
        c = NormalizerBase.normalize(b, NFC, Normalizer.UNICODE_3_2);
        if (c.equals(a)) {
            errln("FAIL: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        } else if (verbose) {
            logln("Ok: " + toHexString(a) + " x DECOMP_COMPAT => " +
                  toHexString(b) + " x COMPOSE => " +
                  toHexString(c) + " for Unicode 3.2.0");
        }
    }

    // U+0F77 decomposes (compatibly) but must never be re-composed;
    // the decomposed Tibetan sequence is stable under all four forms.
    public void TestTibetan() throws Exception {
        String[][] decomp = {
            { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
        };
        String[][] compose = {
            { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
        };

        staticTest(NFD, decomp, 1);
        staticTest(NFKD,decomp, 2);
        staticTest(NFC, compose, 1);
        staticTest(NFKC,compose, 2);
    }

    public void TestExplodingBase() throws Exception{
        // \u017f - Latin small letter long s
        // \u0307 - combining dot above
        // \u1e61 - Latin small letter s with dot above
        // \u1e9b - Latin small letter long s with dot above
        String[][] canon = {
            // Input               Decomposed           Composed
            { "Tschu\u017f",       "Tschu\u017f",       "Tschu\u017f"    },
            { "Tschu\u1e9b",       "Tschu\u017f\u0307", "Tschu\u1e9b"    },
        };
        String[][] compat = {
            // Input               Decomposed           Composed
            { "\u017f",            "s",                 "s"              },
            { "\u1e9b",            "s\u0307",           "\u1e61"         },
        };

        staticTest(NFD, canon,  1);
        staticTest(NFC, canon,  2);
        staticTest(NFKD, compat, 1);
        staticTest(NFKC, compat, 2);
    }

    // Canonical test data: { input, NFD expectation, NFC expectation }.
    private String[][] canonTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },
        { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },

        // D-dot_above
        { "\u1e0a",             "D\u0307",              "\u1e0a"            },

        // D dot_above
        { "D\u0307",            "D\u0307",              "\u1e0a"            },

        // D-dot_below dot_above
        { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D-dot_above dot_below
        { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below dot_above
        { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      },

        // D dot_below cedilla dot_above
        { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"},

        // D dot_above ogonek dot_below
        { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"},

        // E-macron-grave
        { "\u1E14",             "E\u0304\u0300",        "\u1E14"            },

        // E-macron + grave
        { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            },

        // E-grave + macron
        { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      },

        // angstrom_sign
        { "\u212b",             "A\u030a",              "\u00c5"            },

        // A-ring
        { "\u00c5",             "A\u030a",              "\u00c5"            },
        { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },
        { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },

        //updated with 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },
        { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Hankaku-Katakana)
        { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      },
        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      },

        { "A\u0300\u0316",      "A\u0316\u0300",        "\u00C0\u0316"      },

        // Supplementary (musical symbol) characters as surrogate pairs
        { "\ud834\udd5e\ud834\udd57\ud834\udd65\ud834\udd5e",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65",
          "\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65\ud834\udd57\ud834\udd65" },
    };

    // Compatibility test data: { input, NFKD expectation, NFKC expectation }.
    private String[][] compatTests = {
        // Input                Decomposed              Composed

        { "cat",                "cat",                  "cat"               },

        // Alef-Lamed vs. Alef, Lamed
        { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     },

        { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },

        // ffi ligature -> f + f + i
        { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        },

        //updated for 3.0
        { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },

        // ffi ligature -> f + f + i
        { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        },

        { "Henry IV",           "Henry IV",             "Henry IV"          },
        { "Henry \u2163",       "Henry IV",             "Henry IV"          },

        // ga(Zenkaku-Katakana)
        { "\u30AC",             "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Zenkaku)
        { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Hankaku-Katakana) + ten(Zenkaku)
        { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            },

        /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
        // ka(Hankaku-Katakana) + ten(Hankaku)
        { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            },

        // ka(Zenkaku-Katakana) + ten(Hankaku)
        { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            },
    };

    public void TestNFD() throws Exception{
        staticTest(NFD, canonTests, 1);
    }

    public void TestNFC() throws Exception{
        staticTest(NFC, canonTests, 2);
    }

    public void TestNFKD() throws Exception{
        staticTest(NFKD, compatTests, 1);
    }

    public void TestNFKC() throws Exception{
        staticTest(NFKC, compatTests, 2);
    }

    /*
     * Runs every row of a test table through the public
     * java.text.Normalizer API with the given form, comparing against
     * column outCol (1 = decomposed expectation, 2 = composed expectation).
     */
    private void staticTest(java.text.Normalizer.Form form,
                            String[][] tests,
                            int outCol) throws Exception {
        for (int i = 0; i < tests.length; i++) {
            String input = tests[i][0];
            logln("Normalizing '" + input + "' (" + toHexString(input) + ")" );

            String expect =tests[i][outCol];
            String output = java.text.Normalizer.normalize(input, form);

            if (!output.equals(expect)) {
                errln("FAIL: case " + i
                    + " expected '" + expect + "' (" + toHexString(expect) + ")"
                    + " but got '" + output + "' (" + toHexString(output) + ")"
                );
            }
        }
    }

    // With Canonical decomposition, Hangul syllables should get decomposed
    // into Jamo, but Jamo characters should not be decomposed into
    // conjoining Jamo
    private String[][] hangulCanon = {
        // Input                Decomposed              Composed
        { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"        },
        { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"        },
    };

    public void TestHangulCompose() throws Exception{
        logln("Canonical composition...");
        staticTest(NFC, hangulCanon,  2);
    }

    public void TestHangulDecomp() throws Exception{
        logln("Canonical decomposition...");
        staticTest(NFD, hangulCanon, 1);
    }

}