/*
 * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/* @test
 * @bug 8186801 8186751
 * @summary Test the charset mappings
 */

import java.io.*;
import java.nio.*;
import java.nio.file.*;
import java.nio.charset.*;
import java.util.*;
import java.util.function.*;
import java.util.regex.*;
import java.util.stream.*;

/**
 * Checks the decoders/encoders of the charsets listed in the
 * {@code charsets} file against the reference data found in the
 * accompanying {@code .map}, {@code .nr} and {@code .c2b} files.
 *
 * <p>For every charset that is not marked internal, the test checks the
 * charset name and aliases, then (unless the charset is of type
 * {@code source}) loads its mappings and runs three groups of checks:
 * {@code new String()}/{@code String.getBytes()}, buffer decoding and
 * buffer encoding. The buffer checks are run once with heap buffers and
 * once with direct buffers.
 */
public class TestCharsetMapping {

    private static final int BUFSIZ = 8192;     // Initial buffer size
    private static final int MAXERRS = 10;      // Errors reported per test

    private static final PrintStream log = System.out;

    // Set by -v on the command line
    private static boolean verbose = false;

    // Test modes
    private static final int ENCODE = 1;
    private static final int DECODE = 2;

    // Utilities

    /**
     * Returns a new heap buffer of twice the capacity of {@code bb} that
     * holds everything written to {@code bb} so far. {@code bb} is flipped
     * as a side effect and should not be used for writing afterwards.
     */
    private static ByteBuffer expand(ByteBuffer bb) {
        ByteBuffer nbb = ByteBuffer.allocate(bb.capacity() * 2);
        bb.flip();
        nbb.put(bb);
        return nbb;
    }

    /**
     * Char counterpart of {@link #expand(ByteBuffer)}: returns a new heap
     * buffer of twice the capacity holding the chars written so far.
     */
    private static CharBuffer expand(CharBuffer cb) {
        CharBuffer ncb = CharBuffer.allocate(cb.capacity() * 2);
        cb.flip();
        ncb.put(cb);
        return ncb;
    }

    /**
     * Converts a string of hex digits (two digits per byte, no prefix)
     * into the corresponding byte array.
     *
     * @throws RuntimeException if the string has an odd number of digits
     * @throws NumberFormatException if a digit pair is not valid hex
     */
    private static byte[] parseBytes(String s) {
        // An odd number of digits cannot be split into whole bytes; reject
        // it up front instead of silently dropping the trailing digit.
        if (s.length() % 2 != 0)
            throw new RuntimeException("Malformed byte string: " + s);
        int nb = s.length() / 2;
        byte[] bs = new byte[nb];
        for (int i = 0; i < nb; i++) {
            int j = i * 2;
            bs[i] = (byte)Integer.parseInt(s.substring(j, j + 2), 16);
        }
        return bs;
    }

    /**
     * Formats the bytes as lower-case hex, two digits per byte, without
     * any separator.
     */
    private static String printBytes(byte[] bs) {
        StringBuilder sb = new StringBuilder(bs.length * 2);
        for (byte b : bs) {
            sb.append(Integer.toHexString((b >> 4) & 0xf));
            sb.append(Integer.toHexString(b & 0xf));
        }
        return sb.toString();
    }

    /**
     * Formats a code point as {@code U+} followed by its lower-case hex
     * value, zero-padded to at least four digits. All digits are kept,
     * including the sixth one of code points at or above U+100000.
     */
    private static String printCodePoint(int cp) {
        String hex = Integer.toHexString(cp);
        StringBuilder sb = new StringBuilder("U+");
        for (int i = hex.length(); i < 4; i++)
            sb.append('0');
        return sb.append(hex).toString();
    }

    /**
     * Reads the next code point from the buffer: one char, or two chars
     * when the first one is a high surrogate.
     */
    private static int getCodePoint(CharBuffer cb) {
        char c = cb.get();
        if (Character.isHighSurrogate(c))
            return Character.toCodePoint(c, cb.get());
        else
            return c;
    }

    /** Returns the plural suffix ("" or "s") matching the count. */
    private static String plural(int n) {
        return (n == 1 ? "" : "s");
    }

    // TestCharsetMapping
    private final CharsetInfo csinfo;
    private final CharsetDecoder decoder;
    private final CharsetEncoder encoder;

    // Stateful dbcs encoding has leading shift byte '0x0e'
    // and trailing shift byte '0x0f'.
    // The flag variable shiftHackDBCS is 'true' for stateful
    // EBCDIC encodings, which indicates the need of adding/
    // removing the shift bytes.
    private boolean shiftHackDBCS = false;

    /**
     * Creates the tester for one charset. Both coders are configured to
     * REPLACE on malformed input and unmappable characters, so a wrong
     * mapping shows up as a mismatch rather than as an exception.
     */
    private TestCharsetMapping(CharsetInfo csinfo) throws Exception {
        this.csinfo = csinfo;
        this.encoder = csinfo.cs.newEncoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
        this.decoder = csinfo.cs.newDecoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
    }

    private class Test {
        // An instance of this class tests all mappings for
        // a particular bytesPerChar value
        private final int bytesPerChar;

        // Reference data from .map/nr/c2b files
        private ByteBuffer refBytes = ByteBuffer.allocate(BUFSIZ);
        private CharBuffer refChars = CharBuffer.allocate(BUFSIZ);

        // Direct-buffer copies of the reference data, filled in run()
        private ByteBuffer dRefBytes = ByteBuffer.allocateDirect(BUFSIZ);
        private CharBuffer dRefChars = ByteBuffer.allocateDirect(BUFSIZ*2).asCharBuffer();

        private Test(int bpc) {
            bytesPerChar = bpc;
        }

        // Appends raw bytes without a matching char; used by
        // shiftHackDBCS to add the leading/trailing shift bytes
        private void put(byte[] bs) {
            if (refBytes.remaining() < bytesPerChar)
                refBytes = expand(refBytes);
            refBytes.put(bs);
        }

        // Appends one mapping: bs must be exactly bytesPerChar long,
        // cc holds the one or two chars of the mapped code point
        private void put(byte[] bs, char[] cc) {
            if (bs.length != bytesPerChar)
                throw new IllegalArgumentException(bs.length
                                                   + " != "
                                                   + bytesPerChar);
            if (refBytes.remaining() < bytesPerChar)
                refBytes = expand(refBytes);
            refBytes.put(bs);
            if (refChars.remaining() < cc.length)
                refChars = expand(refChars);
            refChars.put(cc);
        }

        /**
         * Decodes all of refBytes in one call and compares the result,
         * code point by code point, with refChars. At most MAXERRS
         * mismatches are reported. Both reference buffers are rewound
         * before returning.
         *
         * @return true if no mismatch was found
         * @throws IllegalStateException if there was no mismatch but the
         *         expected or the decoded chars were not fully consumed
         */
        private boolean decode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println(" decode" + (refBytes.isDirect()?" (direct)":""));
            CharBuffer out = decoder.decode(refBytes);

            refBytes.rewind();
            byte[] bs = new byte[bytesPerChar];
            int e = 0;

            // Consume the leading shift byte so that the loop below stays
            // aligned with the 2-byte sequences
            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0e) {
                log.println("Missing leading byte");
            }

            while (refChars.hasRemaining()) {
                refBytes.get(bs);
                int rcp = getCodePoint(refChars);
                int ocp = getCodePoint(out);
                if (rcp != ocp) {
                    log.println(" Error: "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(ocp)
                                + ", expected "
                                + printCodePoint(rcp));
                    if (++e >= MAXERRS) {
                        log.println(" Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println(" "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(rcp));
                }
            }

            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0f) {
                log.println("Missing trailing byte");
            }

            if (e == 0 && (refChars.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }
            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        /**
         * Encodes all of refChars in one call and compares the result,
         * bytesPerChar bytes at a time, with refBytes. At most MAXERRS
         * mismatches are reported. On the normal path both reference
         * buffers are rewound before returning.
         *
         * @return true if no mismatch was found; false also when the
         *         leading/trailing shift byte differs (shiftHackDBCS)
         * @throws IllegalStateException if there was no mismatch but the
         *         expected or the encoded bytes were not fully consumed
         */
        private boolean encode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println(" encode" + (refBytes.isDirect()?" (direct)":""));
            ByteBuffer out = encoder.encode(refChars);
            refChars.rewind();

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing leading byte");
                return false;
            }

            byte[] rbs = new byte[bytesPerChar];
            byte[] obs = new byte[bytesPerChar];
            int e = 0;
            while (refChars.hasRemaining()) {
                int cp = getCodePoint(refChars);
                refBytes.get(rbs);
                out.get(obs);
                if (!Arrays.equals(rbs, obs)) {
                    log.println(" Error: "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(obs)
                                + ", expected "
                                + printBytes(rbs));
                    if (++e >= MAXERRS) {
                        log.println(" Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println(" "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(rbs));
                }
            }

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing trailing byte");
                return false;
            }

            if (e == 0 && (refBytes.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }

            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        /**
         * Copies the collected reference data into direct buffers and
         * runs the decode and/or encode checks on both the heap and the
         * direct buffers.
         *
         * @param mode DECODE runs only the decode checks, ENCODE only the
         *        encode checks; any other value runs both
         * @return true if every check that ran succeeded
         */
        private boolean run(int mode) throws Exception {
            log.println(" " + bytesPerChar
                        + " byte" + plural(bytesPerChar) + "/char");

            // The heap buffers may have been expanded past BUFSIZ
            if (dRefBytes.capacity() < refBytes.capacity()) {
                dRefBytes = ByteBuffer.allocateDirect(refBytes.capacity());
            }
            if (dRefChars.capacity() < refChars.capacity()) {
                dRefChars = ByteBuffer.allocateDirect(refChars.capacity()*2)
                                      .asCharBuffer();
            }
            refBytes.flip();
            refChars.flip();
            dRefBytes.clear();
            dRefChars.clear();

            dRefBytes.put(refBytes).flip();
            dRefChars.put(refChars).flip();
            // The bulk put left position == limit; flipping again makes
            // the heap buffers readable from the start
            refBytes.flip();
            refChars.flip();

            boolean rv = true;
            if (mode != ENCODE) {
                rv &= decode(refBytes, refChars);
                rv &= decode(dRefBytes, dRefChars);
            }
            if (mode != DECODE) {
                rv &= encode(refBytes, refChars);
                rv &= encode(dRefBytes, dRefChars);
            }
            return rv;
        }
    }

    // Maximum bytes/char being tested
    private int maxBytesPerChar = 0;

    // Tests, indexed by bytesPerChar - 1
    private Test[] tests;

    private void clearTests() {
        maxBytesPerChar = 0;
        tests = new Test[0];
    }

    // Find the test for the given bytes/char value,
    // expanding the test array if needed
    //
    private Test testFor(int bpc) {
        if (bpc > maxBytesPerChar) {
            Test[] ts = new Test[bpc];
            System.arraycopy(tests, 0, ts, 0, maxBytesPerChar);
            for (int i = maxBytesPerChar; i < bpc; i++)
                ts[i] = new Test(i + 1);
            tests = ts;
            maxBytesPerChar = bpc;
        }
        return tests[bpc - 1];
    }

    /**
     * Checks {@code new String(byte[], csName)} and
     * {@code String.getBytes(csName)} against the concatenation of all
     * loaded mappings. Skipped (reported as success) for ebcdic charsets.
     *
     * @return true if both conversions produced the expected result
     */
    private boolean testStringConv() throws Exception {
        if (shiftHackDBCS) {
            log.println(" string de/encoding skipped for ebcdic");
            return true;
        }
        boolean rv = true;
        log.println(" string de/encoding");
        // for new String()
        ByteArrayOutputStream baosDec = new ByteArrayOutputStream();
        StringBuilder sbDec = new StringBuilder();
        // for String.getBytes()
        ByteArrayOutputStream baosEnc = new ByteArrayOutputStream();
        StringBuilder sbEnc = new StringBuilder();

        for (Entry e : csinfo.mappings) {
            baosDec.write(e.bs);
            sbDec.appendCodePoint(e.cp);
            // cp2 is a code point; append it as a character, not as the
            // decimal digits StringBuilder.append(int) would produce
            if (e.cp2 != 0)
                sbDec.appendCodePoint(e.cp2);

            // non-roundtrip b2c, and c2b
            // NOTE(review): this skips entries whose code point is NOT in
            // c2b, while the ENCODE setup in run() skips entries whose
            // code point IS in c2b -- confirm which one is intended.
            if (csinfo.nr != null && csinfo.nr.containsKey(e.bb) ||
                csinfo.c2b != null && !csinfo.c2b.containsKey(e.cp))
                continue;
            baosEnc.write(e.bs);
            sbEnc.appendCodePoint(e.cp);
            if (e.cp2 != 0)
                sbEnc.appendCodePoint(e.cp2);
        }
        log.println(" new String()");
        if (!new String(baosDec.toByteArray(), csinfo.csName).equals(sbDec.toString())) {
            log.println(" Error: new String() failed");
            rv = false;
        }
        log.println(" String.getBytes()");
        if (!Arrays.equals(baosEnc.toByteArray(), sbEnc.toString().getBytes(csinfo.csName))) {
            log.println(" Error: String().getBytes() failed");
            rv = false;
        }
        return rv;
    }

    /**
     * Runs all checks for the charset: string conversion, then buffer
     * decoding of every mapping, then buffer encoding of every mapping
     * that is expected to round-trip plus the c2b-only mappings.
     *
     * @return true if every check succeeded
     */
    private boolean run() throws Exception {
        boolean rv = true;
        // Constant-first comparison: a charset without a "type" line is
        // simply not ebcdic
        shiftHackDBCS = "ebcdic".equals(csinfo.type);    // isStateful;

        // (1) new String()/String.getBytes()
        rv &= testStringConv();

        // (2) DECODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;  // skip composite (base+cc) for now
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(DECODE);
        }

        // (3) ENCODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;  // skip composite (base+cc) for now
                if (csinfo.nr != null && csinfo.nr.containsKey(e.bb))
                    return;  // non-roundtrip b2c
                if (csinfo.c2b != null && csinfo.c2b.containsKey(e.cp))
                    return;  // c2b only mapping
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        if (csinfo.c2b != null)
            csinfo.c2b.values().forEach(e -> {
                    byte[] bs = e.bs;
                    char[] cc = Character.toChars(e.cp);
                    testFor(bs.length).put(bs, cc);
                });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(ENCODE);
        }
        return rv;
    }

    private static class Entry {
        byte[] bs;   // byte sequence reps
        int cp;      // Unicode codepoint
        int cp2;     // CC of composite
        long bb;     // bs in "long" form for nr lookup;
    }

    private static final int UNMAPPABLE = 0xFFFD;
    // <hex bytes> <whitespace> <hex code point> [<whitespace> # comment],
    // with optional "0x" / "U+" prefixes
    private static final Pattern ptn = Pattern.compile("(?:0x)?(\\p{XDigit}++)\\s++(?:U\\+|0x)?(\\p{XDigit}++)(?:\\s++#.*)?");
    private static final int G_BS  = 1;
    private static final int G_CP  = 2;
    // The pattern above defines two groups only, so parse() currently
    // never finds a third group and always leaves cp2 at 0
    private static final int G_CP2 = 3;

    private static class CharsetInfo {
        Charset  cs;
        String   pkgName;
        String   clzName;
        String   csName;
        String   hisName;
        String   type;
        boolean  isInternal;
        Set<String> aliases = new HashSet<>();

        // mapping entries
        List<Entry> mappings;
        Map<Long, Entry> nr;       // bytes -> entry
        Map<Integer, Entry> c2b;   // cp -> entry

        CharsetInfo(String csName, String clzName) {
            this.csName = csName;
            this.clzName = clzName;
        }

        // Builds an Entry from a matcher that has just matched ptn
        private Entry parse(Matcher m) {
            Entry e = new Entry();
            e.bb = Long.parseLong(m.group(G_BS), 16);
            if (e.bb < 0x100)
                e.bs = new byte[] { (byte)e.bb };
            else
                e.bs = parseBytes(m.group(G_BS));
            e.cp = Integer.parseInt(m.group(G_CP), 16);
            if (G_CP2 <= m.groupCount() && m.group(G_CP2) != null)
                e.cp2 = Integer.parseInt(m.group(G_CP2), 16);
            else
                e.cp2 = 0;
            return e;
        }

        // Reads every non-comment line of the file that starts with a
        // mapping and returns the parsed entries in file order. The
        // stream returned by Files.lines holds the file open, so it is
        // closed here via try-with-resources.
        private List<Entry> readEntries(Path path) throws IOException {
            Matcher m = ptn.matcher("");
            try (Stream<String> lines = Files.lines(path)) {
                return lines
                    .filter(ln -> !ln.startsWith("#") && m.reset(ln).lookingAt())
                    .map(ln -> parse(m))
                    .collect(Collectors.toList());
            }
        }

        /**
         * Loads {@code <clzName>.map} and, when present,
         * {@code <clzName>.nr} and {@code <clzName>.c2b} from the given
         * directory. Entries of the .map file that map to U+FFFD are
         * dropped.
         *
         * @return false if the .map file does not exist, true otherwise
         * @throws IllegalStateException if the .nr or .c2b file contains
         *         a duplicate key (from Collectors.toMap)
         */
        boolean loadMappings(Path dir) throws IOException {
            // xxx.map
            Path path = dir.resolve(clzName + ".map");
            if (!Files.exists(path)) {
                return false;
            }
            mappings = readEntries(path).stream()
                .filter(e -> e.cp != UNMAPPABLE)  // non-mapping
                .collect(Collectors.toList());
            // xxx.nr
            path = dir.resolve(clzName + ".nr");
            if (Files.exists(path)) {
                nr = readEntries(path).stream()
                    .collect(Collectors.toMap(e -> e.bb, Function.identity()));
            }
            // xxx.c2b
            path = dir.resolve(clzName + ".c2b");
            if (Files.exists(path)) {
                c2b = readEntries(path).stream()
                    .collect(Collectors.toMap(e -> e.cp, Function.identity()));
            }
            return true;
        }
    }

    // Returns the value token of a charset attribute line, failing with a
    // descriptive message when the value is missing or when the line
    // appears before any "charset" line
    private static String attributeValue(CharsetInfo cs, String[] tokens, String line) {
        if (cs == null || tokens.length < 3) {
            throw new RuntimeException("Error: incorrect " + tokens[1] + " line [" + line + "]");
        }
        return tokens[2];
    }

    /**
     * Parses the charsets list file. A line starting with {@code charset}
     * opens a new entry ({@code charset <name> <class>}); the following
     * lines whose second token is {@code alias}, {@code package},
     * {@code type}, {@code hisname} or {@code internal} set attributes of
     * that entry. Lines starting with '#', empty lines, lines with fewer
     * than two tokens and unknown attributes are ignored.
     *
     * @return the charsets in file order
     * @throws RuntimeException on a charset or attribute line without
     *         the required value
     */
    private static Set<CharsetInfo> charsets(Path cslist) throws IOException {
        Set<CharsetInfo> charsets = new LinkedHashSet<>();
        CharsetInfo cs = null;

        for (String line : Files.readAllLines(cslist)) {
            if (line.startsWith("#") || line.length() == 0) {
                continue;
            }
            String[] tokens = line.split("\\s+");
            if (tokens.length < 2) {
                continue;
            }
            if ("charset".equals(tokens[0])) {
                if (cs != null) {
                    charsets.add(cs);
                    cs = null;
                }
                if (tokens.length < 3) {
                    throw new RuntimeException("Error: incorrect charset line [" + line + "]");
                }
                cs = new CharsetInfo(tokens[1], tokens[2]);
            } else {
                String key = tokens[1];         // leading empty str
                switch (key) {
                    case "alias":
                        cs.aliases.add(attributeValue(cs, tokens, line));  // ALIAS_NAME
                        break;
                    case "package":
                        cs.pkgName = attributeValue(cs, tokens, line);
                        break;
                    case "type":
                        cs.type = attributeValue(cs, tokens, line);
                        break;
                    case "hisname":
                        cs.hisName = attributeValue(cs, tokens, line);
                        break;
                    case "internal":
                        cs.isInternal = Boolean.parseBoolean(attributeValue(cs, tokens, line));
                        break;
                    default:  // ignore
                }
            }
        }
        if (cs != null) {
            charsets.add(cs);
        }
        return charsets;
    }

    /**
     * Entry point. Does nothing when the charsetmapping data directory
     * cannot be found relative to the {@code test.src} property.
     *
     * @param args {@code -v} as first argument turns on verbose output
     * @throws Exception if a check failed for at least one charset that
     *         is not on the known-issues list
     */
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get(System.getProperty("test.src", ".") +
                             "/../../../../make/data/charsetmapping");
        if (!Files.exists(dir)) {
            // not inside jdk repo, no mappings, exit silently
            log.println("Nothing done, not in a jdk repo: ");
            return;
        }
        if (args.length > 0 && "-v".equals(args[0])) {
            // For debugging: java TestCharsetMapping [-v]
            verbose = true;
        }

        int errors = 0;
        int tested = 0;
        int skipped = 0;
        int known = 0;

        for (CharsetInfo csinfo : charsets(dir.resolve("charsets"))) {
            String csname = csinfo.csName;

            if (csinfo.isInternal) {
                continue;
            }

            log.printf("%ntesting: %-16s", csname);

            if (!Charset.isSupported(csname)) {
                errors++;
                log.println(" [error: charset is not supported]");
                continue;
            }

            Charset cs = csinfo.cs = Charset.forName(csinfo.csName);
            // test name()
            if (!cs.name().equals(csinfo.csName)) {
                errors++;
                // Names are passed as arguments so that a '%' in a name
                // is not interpreted as a format specifier
                log.printf(" [error: wrong csname: %s vs %s]",
                           csinfo.csName, cs.name());
            }
            // test aliases()
            if (!cs.aliases().equals(csinfo.aliases)) {
                errors++;
                log.print(" [error wrong aliases]");
                if (verbose) {
                    log.println();
                    log.println(" expected: " + csinfo.aliases);
                    log.println(" got: " + cs.aliases());
                }
            }

            if ("source".equals(csinfo.type)) {
                log.println(" [skipped: source based]");
                skipped++;
                continue;
            }

            if (!csinfo.loadMappings(dir)) {
                log.println(" [error loading mappings failed]");
                errors++;
                continue;
            }

            tested++;
            log.println();
            if (!new TestCharsetMapping(csinfo).run()) {

                /////////////// known nr/c2b issues ////////////////
                if (csinfo.csName.equals("x-IBM948") ||
                    csinfo.csName.equals("x-IBM950") ||
                    csinfo.csName.equals("x-IBM937") ||
                    csinfo.csName.equals("x-IBM1383"))
                {
                    log.println(" [**** skipped, KNOWN nr/c2b mapping issue]");
                    known++;
                    continue;
                }

                errors++;
            }
        }

        log.println();
        log.println(tested + " charset" + plural(tested) + " tested, "
                    + skipped + " skipped, " + known + " known issue(s)");
        log.println();
        if (errors > 0)
            throw new Exception("Errors detected in "
                                + errors + " charset" + plural(errors));
    }
}