1 /* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /** 27 * This is a tool to generate categoryNames and categoryMap which are used in 28 * CharSet.java. 29 */ 30 31 package org.openjdk.buildtools.generatebreakiteratordata; 32 33 import java.io.BufferedReader; 34 import java.io.BufferedWriter; 35 import java.io.FileReader; 36 import java.io.FileWriter; 37 import java.util.StringTokenizer; 38 39 class CharacterCategory { 40 41 /** 42 * A list of Unicode category names. 43 */ 44 static final String[] categoryNames = { 45 "Ll", /* Letter, Lowercase */ 46 "Lu", /* Letter, Uppercase */ 47 "Lt", /* Letter, Titlecase */ 48 "Lo", /* Letter, Other */ 49 "Lm", /* Letter, Modifier */ 50 "Nd", /* Number, Decimal Digit */ 51 "Nl", /* Number, Letter */ 52 "No", /* Number, Other */ 53 "Ps", /* Punctuation, Open */ 54 "Pe", /* Punctuation, Close */ 55 "Pi", /* Punctuation, Initial quote */ 56 "Pf", /* Punctuation, Final quote */ 57 "Pd", /* Punctuation, Dash */ 58 "Pc", /* Punctuation, Connector */ 59 "Po", /* Punctuation, Other */ 60 "Sc", /* Symbol, Currency */ 61 "Sm", /* Symbol, Math */ 62 "So", /* Symbol, Other */ 63 "Mn", /* Mark, Non-Spacing */ 64 "Mc", /* Mark, Spacing Combining */ 65 "Me", /* Mark, Enclosing */ 66 "Zl", /* Separator, Line */ 67 "Zp", /* Separator, Paragraph */ 68 "Zs", /* Separator, Space */ 69 "Cc", /* Other, Control */ 70 "Cf", /* Other, Format */ 71 "--", /* Dummy, ignored */ 72 // Don't add anything after the Dummy entry!! 73 }; 74 75 /** 76 * A array of Unicode code points for each category. 77 */ 78 private static int[][] categoryMap; 79 80 81 /** 82 * Generates CategoryMap for GenerateBreakIteratorData. 83 */ 84 static void makeCategoryMap(String filename) { 85 /* Overwrite specfile name */ 86 specfile = filename; 87 88 /* Generate data in current format (1.5.0) */ 89 generateNewData(); 90 91 /* Copy generated data to cateogyMap */ 92 categoryMap = new int[categoryNames.length-1][]; 93 for (int i = 0; i < categoryNames.length-1; i++) { 94 int len = newListCount[BMP][i] + newListCount[nonBMP][i]; 95 categoryMap[i] = new int[len]; 96 System.arraycopy(newList[i], 0, categoryMap[i], 0, len); 97 } 98 } 99 100 /** 101 * Returns categoryMap for the given category. 102 */ 103 static int[] getCategoryMap(int category) { 104 return categoryMap[category]; 105 } 106 107 108 /** 109 * Only used for debugging and generating a test program. 110 */ 111 public static void main(String[] args) { 112 /* Parses command-line options */ 113 processArgs(args); 114 115 /* Generates data in current format (1.5.0) */ 116 generateNewData(); 117 118 /* 119 * Generates data in older format (1.4.X and earlier) and creates 120 * the old CategoryMap if "oldFilename" is not null. 121 */ 122 if (!oldDatafile.equals("")) { 123 generateOldData(); 124 generateOldDatafile(); 125 } 126 127 /* Displays summary of generated data */ 128 showSummary(); 129 130 /* 131 * Generates a test program which compares the new data and the return 132 * values of Character.getType(). 133 * and the old data and the new data. 134 */ 135 generateTestProgram(); 136 } 137 138 139 /** 140 * Spec (Unicode data file) 141 */ 142 private static String specfile = "UnicodeData.txt"; 143 144 /** 145 * Output directory 146 */ 147 private static String outputDir = ""; 148 149 /** 150 * Old data filename 151 */ 152 private static String oldDatafile = ""; 153 154 /** 155 * Parses the specified arguments and sets up the variables. 156 */ 157 private static void processArgs(String[] args) { 158 for (int i = 0; i < args.length; i++) { 159 String arg =args[i]; 160 if (arg.equals("-spec")) { 161 specfile = args[++i]; 162 } else if (arg.equals("-old")) { 163 oldDatafile = args[++i]; 164 } else if (arg.equals("-o")) { 165 outputDir = args[++i]; 166 } else { 167 System.err.println("Usage: java CharacterCategory [-spec specfile]"); 168 System.exit(1); 169 } 170 } 171 } 172 173 174 /** 175 * Displays summary of generated data 176 */ 177 private static void showSummary() { 178 int oldSum = 0; 179 int newSum = 0; 180 int oldSuppSum = 0; 181 int newSuppSum = 0; 182 183 for (int i = 0; i < categoryNames.length-1; i++) { 184 int newNum = newListCount[BMP][i] + newListCount[nonBMP][i]; 185 186 if (oldTotalCount[i] != newNum) { 187 System.err.println("Error: The number of generated data is different between the new approach and the old approach."); 188 } 189 if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) { 190 System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach."); 191 } 192 193 System.out.println(" " + categoryNames[i] + ": " + 194 oldTotalCount[i] + 195 "(" + oldListCount[BEFORE][i] + 196 " + " + oldListCount[SURROGATE][i] + 197 " + " + oldListCount[AFTER][i] + ")" + 198 " --- " + newNum + 199 "(" + newListCount[BMP][i] + 200 " + " + newListCount[nonBMP][i] + ")"); 201 202 oldSum += oldListCount[BEFORE][i] * 2 + 203 oldListCount[SURROGATE][i] * 4 + 204 oldListCount[AFTER][i] * 2; 205 newSum += newNum * 4 ; 206 oldSuppSum += oldListCount[SURROGATE][i] * 4; 207 newSuppSum += newListCount[nonBMP][i] * 4; 208 } 209 210 System.out.println("\nTotal buffer sizes are:\n " + 211 oldSum + "bytes(Including " + oldSuppSum + 212 "bytes for supplementary characters)\n " + 213 newSum + "bytes(Including " + newSuppSum + 214 "bytes for supplementary characters)"); 215 216 if (!ignoredOld.toString().equals(ignoredNew.toString())) { 217 System.err.println("Ignored categories: Error: List mismatch: " + 218 ignoredOld + " vs. " + ignoredNew); 219 } else { 220 System.out.println("\nIgnored categories: " + ignoredOld); 221 System.out.println("Please confirm that they aren't used in BreakIteratorRules."); 222 } 223 } 224 225 226 private static final int HighSurrogate_CodeUnit_Start = 0xD800; 227 private static final int LowSurrogate_CodeUnit_Start = 0xDC00; 228 private static final int Supplementary_CodePoint_Start = 0x10000; 229 230 231 private static StringBuffer ignoredOld = new StringBuffer(); 232 private static int[] oldTotalCount = new int[categoryNames.length]; 233 private static int[][] oldListCount = new int[3][categoryNames.length]; 234 private static int[][] oldListLen = new int[3][categoryNames.length]; 235 private static StringBuffer[][] oldList = new StringBuffer[3][categoryNames.length]; 236 237 private static final int BEFORE = 0; 238 private static final int SURROGATE = 1; 239 private static final int AFTER = 2; 240 241 /** 242 * Makes CategoryMap in ordler format which had been used by JDK 1.4.X and 243 * earlier versions. 244 */ 245 private static void generateOldData() { 246 /* Initialize arrays. */ 247 for (int i = 0; i<categoryNames.length; i++) { 248 for (int j = BEFORE; j <= AFTER; j++) { 249 oldListCount[j][i] = 0; 250 oldList[j][i] = new StringBuffer(); 251 oldListLen[j][i] = 17; 252 } 253 } 254 255 storeOldData(); 256 257 if (oldTotalCount[categoryNames.length-1] != 1) { 258 System.err.println("This should not happen. Unicode data which belongs to an undefined category exists"); 259 System.exit(1); 260 } 261 } 262 263 private static void storeOldData() { 264 try { 265 FileReader fin = new FileReader(specfile); 266 BufferedReader bin = new BufferedReader(fin); 267 268 String prevCode = "????"; 269 String line; 270 int prevIndex = categoryNames.length - 1; 271 int prevCodeValue = -1; 272 int curCodeValue = 0; 273 boolean setFirst = false; 274 275 while ((line = bin.readLine()) != null) { 276 if (line.length() == 0) { 277 continue; 278 } 279 280 StringTokenizer st = new StringTokenizer(line, ";"); 281 String code = st.nextToken(); 282 283 char c = code.charAt(0); 284 if (c == '#' || c == '/') { 285 continue; 286 } 287 288 int i = Integer.valueOf(code, 16).intValue(); 289 290 String characterName = st.nextToken(); 291 String category = st.nextToken(); 292 293 int index; 294 for (index = 0; index < categoryNames.length; index++) { 295 if (category.equals(categoryNames[index])) { 296 break; 297 } 298 } 299 300 if (index != categoryNames.length) { 301 curCodeValue = Integer.parseInt(code, 16); 302 if (prevIndex != index) { 303 appendOldChar(prevIndex, prevCodeValue, prevCode); 304 appendOldChar(index, curCodeValue, code); 305 prevIndex = index; 306 } else if (prevCodeValue != curCodeValue - 1) { 307 if (setFirst && characterName.endsWith(" Last>")) { 308 setFirst = false; 309 } else { 310 appendOldChar(prevIndex, prevCodeValue, prevCode); 311 appendOldChar(index, curCodeValue, code); 312 } 313 } 314 prevCodeValue = curCodeValue; 315 prevCode = code; 316 if (characterName.endsWith(" First>")) { 317 setFirst = true; 318 } 319 } else { 320 if (ignoredOld.indexOf(category) == -1) { 321 ignoredOld.append(category); 322 ignoredOld.append(' '); 323 } 324 } 325 } 326 appendOldChar(prevIndex, prevCodeValue, prevCode); 327 328 bin.close(); 329 fin.close(); 330 } 331 catch (Exception e) { 332 throw new InternalError(e.toString()); 333 } 334 } 335 336 private static void appendOldChar(int index, int code, String s) { 337 int range; 338 if (code < HighSurrogate_CodeUnit_Start) { 339 range = BEFORE; 340 } else if (code < Supplementary_CodePoint_Start) { 341 range = AFTER; 342 } else { 343 range = SURROGATE; 344 } 345 346 if (oldListLen[range][index] > 64) { 347 oldList[range][index].append("\"\n + \""); 348 oldListLen[range][index] = 19; 349 } 350 351 if (code == 0x22 || code == 0x5c) { 352 oldList[range][index].append('\\'); 353 oldList[range][index].append((char)code); 354 oldListLen[range][index] += 2; 355 } else if (code > 0x20 && code < 0x7F) { 356 oldList[range][index].append((char)code); 357 oldListLen[range][index] ++; 358 } else { 359 if (range == SURROGATE) {// Need to convert code point to code unit 360 oldList[range][index].append(toCodeUnit(code)); 361 oldListLen[range][index] += 12; 362 } else { 363 oldList[range][index].append("\\u"); 364 oldList[range][index].append(s); 365 oldListLen[range][index] += 6; 366 } 367 } 368 oldListCount[range][index] ++; 369 oldTotalCount[index]++; 370 } 371 372 private static String toCodeUnit(int i) { 373 StringBuffer sb = new StringBuffer(); 374 sb.append("\\u"); 375 sb.append(Integer.toString((i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start, 16).toUpperCase()); 376 sb.append("\\u"); 377 sb.append(Integer.toString(i % 0x400 + LowSurrogate_CodeUnit_Start, 16).toUpperCase()); 378 return sb.toString(); 379 } 380 381 private static int toCodePoint(String s) { 382 char c1 = s.charAt(0); 383 384 if (s.length() == 1 || !Character.isHighSurrogate(c1)) { 385 return (int)c1; 386 } else { 387 char c2 = s.charAt(1); 388 if (s.length() != 2 || !Character.isLowSurrogate(c2)) { 389 return -1; 390 } 391 return Character.toCodePoint(c1, c2); 392 } 393 } 394 395 396 private static StringBuffer ignoredNew = new StringBuffer(); 397 private static int[] newTotalCount = new int[categoryNames.length]; 398 private static int[][] newListCount = new int[2][categoryNames.length]; 399 private static int[][] newList = new int[categoryNames.length][]; 400 401 private static final int BMP = 0; 402 private static final int nonBMP = 1; 403 404 /** 405 * Makes CategoryMap in newer format which is used by JDK 1.5.0. 406 */ 407 private static void generateNewData() { 408 /* Initialize arrays. */ 409 for (int i = 0; i<categoryNames.length; i++) { 410 newList[i] = new int[10]; 411 } 412 413 storeNewData(); 414 415 if (newListCount[BMP][categoryNames.length-1] != 1) { 416 System.err.println("This should not happen. Unicode data which belongs to an undefined category exists"); 417 System.exit(1); 418 } 419 } 420 421 private static void storeNewData() { 422 try { 423 FileReader fin = new FileReader(specfile); 424 BufferedReader bin = new BufferedReader(fin); 425 426 String line; 427 int prevIndex = categoryNames.length - 1; 428 int prevCodeValue = -1; 429 int curCodeValue = 0; 430 boolean setFirst = false; 431 432 while ((line = bin.readLine()) != null) { 433 if (line.length() == 0) { 434 continue; 435 } 436 437 StringTokenizer st = new StringTokenizer(line, ";"); 438 String code = st.nextToken(); 439 440 char c = code.charAt(0); 441 if (c == '#' || c == '/') { 442 continue; 443 } 444 445 int i = Integer.valueOf(code, 16).intValue(); 446 447 String characterName = st.nextToken(); 448 String category = st.nextToken(); 449 450 int index; 451 for (index = 0; index < categoryNames.length; index++) { 452 if (category.equals(categoryNames[index])) { 453 break; 454 } 455 } 456 457 if (index != categoryNames.length) { 458 curCodeValue = Integer.parseInt(code, 16); 459 if (prevIndex == index) { 460 if (setFirst) { 461 if (characterName.endsWith(" Last>")) { 462 setFirst = false; 463 } else { 464 System.err.println("*** Error 1 at " + code); 465 } 466 } else { 467 if (characterName.endsWith(" First>")) { 468 setFirst = true; 469 } else if (characterName.endsWith(" Last>")) { 470 System.err.println("*** Error 2 at " + code); 471 } else { 472 if (prevCodeValue != curCodeValue - 1) { 473 appendNewChar(prevIndex, prevCodeValue); 474 appendNewChar(index, curCodeValue); 475 } 476 } 477 } 478 } else { 479 if (setFirst) { 480 System.err.println("*** Error 3 at " + code); 481 } else if (characterName.endsWith(" First>")) { 482 setFirst = true; 483 } else if (characterName.endsWith(" Last>")) { 484 System.err.println("*** Error 4 at " + code); 485 } 486 appendNewChar(prevIndex, prevCodeValue); 487 appendNewChar(index, curCodeValue); 488 prevIndex = index; 489 } 490 prevCodeValue = curCodeValue; 491 } else { 492 if (ignoredNew.indexOf(category) == -1) { 493 ignoredNew.append(category); 494 ignoredNew.append(' '); 495 } 496 } 497 } 498 appendNewChar(prevIndex, prevCodeValue); 499 500 bin.close(); 501 fin.close(); 502 } 503 catch (Exception e) { 504 System.err.println("Error occurred on accessing " + specfile); 505 e.printStackTrace(); 506 System.exit(1); 507 } 508 } 509 510 private static void appendNewChar(int index, int code) { 511 int bufLen = newList[index].length; 512 if (newTotalCount[index] == bufLen) { 513 int[] tmpBuf = new int[bufLen + 10]; 514 System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen); 515 newList[index] = tmpBuf; 516 } 517 518 newList[index][newTotalCount[index]++] = code; 519 if (code < 0x10000) { 520 newListCount[BMP][index]++; 521 } else { 522 newListCount[nonBMP][index]++; 523 } 524 } 525 526 527 /* Generates the old CategoryMap. */ 528 private static void generateOldDatafile() { 529 try { 530 FileWriter fout = new FileWriter(oldDatafile); 531 BufferedWriter bout = new BufferedWriter(fout); 532 533 bout.write("\n //\n // The following String[][] can be used in CharSet.java as is.\n //\n\n private static final String[][] categoryMap = {\n"); 534 for (int i = 0; i < categoryNames.length - 1; i++) { 535 if (oldTotalCount[i] != 0) { 536 bout.write(" { \"" + categoryNames[i] + "\","); 537 538 /* 0x0000-0xD7FF */ 539 if (oldListCount[BEFORE][i] != 0) { 540 bout.write(" \""); 541 542 bout.write(oldList[BEFORE][i].toString() + "\"\n"); 543 } 544 545 /* 0xD800-0xFFFF */ 546 if (oldListCount[AFTER][i] != 0) { 547 if (oldListCount[BEFORE][i] != 0) { 548 bout.write(" + \""); 549 } else { 550 bout.write(" \""); 551 } 552 bout.write(oldList[AFTER][i].toString() + "\"\n"); 553 } 554 555 /* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */ 556 if (oldListCount[SURROGATE][i] != 0) { 557 if (oldListCount[BEFORE][i] != 0 || oldListCount[AFTER][i] != 0) { 558 bout.write(" + \""); 559 } else { 560 bout.write(" \""); 561 } 562 bout.write(oldList[SURROGATE][i].toString() + "\"\n"); 563 } 564 bout.write(" },\n"); 565 566 } 567 } 568 bout.write(" };\n\n"); 569 bout.close(); 570 fout.close(); 571 } 572 catch (Exception e) { 573 System.err.println("Error occurred on accessing " + oldDatafile); 574 e.printStackTrace(); 575 System.exit(1); 576 } 577 578 System.out.println("\n" + oldDatafile + " has been generated."); 579 } 580 581 582 /** 583 * Test program to be generated 584 */ 585 private static final String outfile = "CharacterCategoryTest.java"; 586 587 /* 588 * Generates a test program which compare the generated date (newer one) 589 * with the return values of Characger.getType(). 590 */ 591 private static void generateTestProgram() { 592 try { 593 FileWriter fout = new FileWriter(outfile); 594 BufferedWriter bout = new BufferedWriter(fout); 595 596 bout.write(collationMethod); 597 bout.write("\n //\n // The following arrays can be used in CharSet.java as is.\n //\n\n"); 598 599 bout.write(" private static final String[] categoryNames = {"); 600 for (int i = 0; i < categoryNames.length - 1; i++) { 601 if (i % 10 == 0) { 602 bout.write("\n "); 603 } 604 bout.write("\"" + categoryNames[i] + "\", "); 605 } 606 bout.write("\n };\n\n"); 607 608 bout.write(" private static final int[][] categoryMap = {\n"); 609 610 for (int i = 0; i < categoryNames.length - 1; i++) { 611 StringBuffer sb = new StringBuffer(" { /* Data for \"" + categoryNames[i] + "\" category */"); 612 613 for (int j = 0; j < newTotalCount[i]; j++) { 614 if (j % 8 == 0) { 615 sb.append("\n "); 616 } 617 sb.append(" 0x"); 618 sb.append(Integer.toString(newList[i][j], 16).toUpperCase()); 619 sb.append(','); 620 } 621 sb.append("\n },\n"); 622 bout.write(sb.toString()); 623 } 624 625 bout.write(" };\n"); 626 627 bout.write("\n}\n"); 628 629 bout.close(); 630 fout.close(); 631 } 632 catch (Exception e) { 633 System.err.println("Error occurred on accessing " + outfile); 634 e.printStackTrace(); 635 System.exit(1); 636 } 637 638 System.out.println("\n" + outfile + " has been generated."); 639 } 640 641 static String collationMethod = 642 "public class CharacterCategoryTest {\n\n" + 643 " static final int SIZE = 0x110000;\n" + 644 " static final String[] category = {\n" + 645 " \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" + 646 " \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" + 647 " \"Cf\", \"\", \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" + 648 " \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" + 649 " };\n\n" + 650 " public static void main(String[] args) {\n" + 651 " boolean err = false;\n" + 652 " byte[] b = new byte[SIZE];\n" + 653 " for (int i = 0; i < SIZE; i++) {\n" + 654 " b[i] = 0;\n" + 655 " }\n" + 656 " for (int i = 0; i < categoryMap.length; i++) {\n" + 657 " byte categoryNum = 0;\n" + 658 " String categoryName = categoryNames[i];\n" + 659 " for (int j = 0; j < category.length; j++) {\n" + 660 " if (categoryName.equals(category[j])) {\n" + 661 " categoryNum = (byte)j;\n" + 662 " break;\n" + 663 " }\n" + 664 " }\n" + 665 " int[] values = categoryMap[i];\n" + 666 " for (int j = 0; j < values.length;) {\n" + 667 " int firstChar = values[j++];\n" + 668 " int lastChar = values[j++];\n" + 669 " for (int k = firstChar; k <= lastChar; k++) {\n" + 670 " b[k] = categoryNum;\n" + 671 " }\n" + 672 " }\n" + 673 " }\n" + 674 " for (int i = 0; i < SIZE; i++) {\n" + 675 " int characterType = Character.getType(i);\n" + 676 " if (b[i] != characterType) {\n" + 677 " /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" + 678 " if (characterType == Character.PRIVATE_USE ||\n" + 679 " characterType == Character.SURROGATE ||\n" + 680 " characterType == Character.MODIFIER_SYMBOL) {\n" + 681 " continue;\n" + 682 " }\n" + 683 " err = true;\n" + 684 " System.err.println(\"Category conflict for a character(0x\" +\n" + 685 " Integer.toHexString(i) +\n" + 686 " \"). CharSet.categoryMap:\" +\n" + 687 " category[b[i]] +\n" + 688 " \" Character.getType():\" +\n" + 689 " category[characterType]);\n" + 690 " }\n" + 691 " }\n\n" + 692 " if (err) {\n" + 693 " throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" + 694 " }\n" + 695 " }\n"; 696 697 }