1 /* 2 * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package build.tools.generatecharacter; 27 28 import java.io.IOException; 29 import java.io.FileNotFoundException; 30 import java.io.BufferedReader; 31 import java.io.FileReader; 32 import java.io.PrintWriter; 33 import java.io.BufferedWriter; 34 import java.io.FileWriter; 35 import java.io.File; 36 import java.util.List; 37 38 import build.tools.generatecharacter.CharacterName; 39 40 /** 41 * This program generates the source code for the class java.lang.Character. 42 * It also generates native C code that can perform the same operations. 
43 * It requires two external input data files: 44 * <ul> 45 * <li> Unicode specification file 46 * <li> Character class template file 47 * </ul> 48 * The Unicode specification file is available from the Unicode consortium. 49 * It has character specification lines that look like this: 50 * <listing> 51 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 52 * </listing> 53 * The Character class template file is filled in with additional 54 * information to produce the file Character.java, which can then be 55 * compiled by a Java compiler. The template file contains certain 56 * markers consisting of an alphabetic name string preceded by "$$". 57 * Such markers are replaced with generated program text. As a special 58 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of 59 * alphabetic characters constituting a variable name. The character "_" 60 * is considered alphabetic for these purposes. 61 * 62 * @author Guy Steele 63 * @author Alan Liu 64 * @author John O'Conner 65 */ 66 67 public class GenerateCharacter { 68 69 final static boolean DEBUG = false; 70 71 final static String commandMarker = "$$"; 72 static String ROOT = ""; 73 static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt"; 74 static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt"; 75 static String DefaultPropListFileName = ROOT + "PropList.txt"; 76 static String DefaultJavaTemplateFileName = ROOT + "Character.java.template"; 77 static String DefaultJavaOutputFileName = ROOT + "Character.java"; 78 static String DefaultCTemplateFileName = ROOT + "Character.c.template"; 79 static String DefaultCOutputFileName = ROOT + "Character.c"; 80 81 static int plane = 0; 82 83 /* The overall idea is that, in the generated Character class source code, 84 most character property data is stored in a special multi-level table whose 85 structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn]. 
The integers must sum to 16 (the number of bits in a character).
     The first table is indexed by the k1 high-order bits of the character code.
     The result is concatenated to the next k2 bits of the character code to index
     the second table, and so on.  Eventually the kn low-order bits of the character
     code are concatenated and used to index one of two tables A and B; A contains
     32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
     can be thus obtained encode the properties for the character.

     The default specification is [9, 4, 3, 0].  This particular table format was
     designed by conducting an exhaustive search of table formats to minimize the
     space consumed by the tables: the first and third tables need have only byte
     values (the second table must have short values).  Another good choice is
     [10, 6, 0], which produces a larger table but allows particularly fast table
     lookup code.

     In each case, where the word "concatenated" is used, this may imply
     first a << and then a | operation, or perhaps just a | operation if
     the values in the table can be preshifted (generally possible if the table
     entries are short rather than byte).
    */

    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).

    A: the low 32 bits are defined in the following manner:

    1 bit       Mirrored property.
    4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
    9 bits      A signed offset used for converting case.
    1 bit       If 1, adding the signed offset converts the character to lowercase.
    1 bit       If 1, subtracting the signed offset converts the character to uppercase.
                Note: for a titlecase character, both of the preceding bits will be 1
                and the signed offset will be 1.
1 bit       If 1, this character has a titlecase equivalent (possibly itself);
                in this case, the two bits before this bit can be used to decide
                whether this character is in fact uppercase, lowercase, or titlecase.
    3 bits      This field provides a quick way to lex identifiers.
                The eight possible values for this field are as follows:
                0  May not be part of an identifier
                1  Ignorable control; may continue a Unicode identifier or Java identifier
                2  May continue a Java identifier but not a Unicode identifier (unused)
                3  May continue a Unicode identifier or Java identifier
                4  Is a Java whitespace character
                5  May start or continue a Java identifier;
                   may continue but not start a Unicode identifier
                   (this value is used for connector punctuation such as _)
                6  May start or continue a Java identifier;
                   may not occur in a Unicode identifier
                   (this value is used for currency symbols such as $)
                7  May start or continue a Unicode identifier or Java identifier
                Thus:
                   5, 6, 7 may start a Java identifier
                   1, 2, 3, 5, 6, 7 may continue a Java identifier
                   7 may start a Unicode identifier
                   1, 3, 5, 7 may continue a Unicode identifier
                   1 is ignorable within an identifier
                   4 is Java whitespace
    2 bits      This field indicates whether the character has a numeric property.
                The four possible values for this field are as follows:
                0  This character has no numeric property.
                1  Adding the digit offset to the character code and then
                   masking with 0x1F will produce the desired numeric value.
                2  This character has a "strange" numeric value.
                3  A Java supradecimal digit: adding the digit offset to the
                   character code, then masking with 0x1F, then adding 10
                   will produce the desired numeric value.
5 bits      The digit offset (see description of previous field)
    5 bits      Character type (see below)

    B: the high 16 bits are defined as:
    1 bit       Other_Lowercase property
    1 bit       Other_Uppercase property
    1 bit       Other_Alphabetic property
    1 bit       Other_Math property
    1 bit       Ideographic property
    1 bit       Noncharacter codepoint property
    */


    // Bit masks identifying each component of the 32-bit property field (part A)
    // described above.
    // shift* indicates how many shifts right must happen to get the
    // indicated property value in the lowest bits of the 32-bit space.
    private static final int
        shiftType           = 0,        maskType            =       0x001F,
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
                                        maskUnicodePart     =       0x1000,
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
                                        maskLowerCase       =      0x20000,
                                        maskUpperCase       =      0x10000,
                                        maskTitleCase       =      0x08000,
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
        shiftCaseOffsetSign = 5,
                                        // used only when calculating and
                                        // storing digit offsets from char values
                                        maskDigit           =   0x001F,
                                        // case offsets are 9 bits wide
                                        maskCase            =   0x01FF,
        shiftBidi           = 27,       maskBidi            =   0x78000000,
        shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
        shiftPlane          = 16,       maskPlane = 0xFF0000;

    // maskMirrored is declared long rather than int: as an int literal,
    // 0x80000000 is negative and would sign-extend into the upper 32 bits
    // when OR-ed into a long property word.
    private static final long maskMirrored          = 0x80000000L;

    // Bit masks identifying the 16-bit property field (part B) described above.
    // They occupy bit 32 and up of the long property word.
    private static final long
        maskOtherLowercase  = 0x100000000L,
        maskOtherUppercase  = 0x200000000L,
        maskOtherAlphabetic = 0x400000000L,
        maskOtherMath       = 0x800000000L,
        maskIdeographic     = 0x1000000000L,
        maskNoncharacterCP  = 0x2000000000L;

    // Can compare masked values with these to determine
    // numeric or lexical types.
    // The first four are values of the numeric-type field (maskNumericType);
    // the remaining ones are values of the identifier-info field
    // (maskIdentifierInfo), already shifted into position.
    public static int
        valueNotNumeric             = 0x0000,
        valueDigit                  = 0x0400,
        valueStrangeNumeric         = 0x0800,
        valueJavaSupradecimal       = 0x0C00,
        valueIgnorable              = 0x1000,
        valueJavaOnlyPart           = 0x2000,
        valueJavaUnicodePart        = 0x3000,
        valueJavaWhitespace         = 0x4000,
        valueJavaStartUnicodePart   = 0x5000,
        valueJavaOnlyStart          = 0x6000,
        valueJavaUnicodeStart       = 0x7000,
        // identifier-info values at or above lowJavaStart may start a Java identifier
        lowJavaStart                = 0x5000,
        // bits tested (together with "not ignorable") to decide whether a
        // character may be part of a Java identifier
        nonzeroJavaPart             = 0x3000,
        valueUnicodeStart           = 0x7000;

    // these values are used when only identifier properties are generated
    // for use in verifier code. Shortens the property down to a single byte.
    private static final int
        bitJavaStart            = 0x02,
        bitJavaPart             = 0x01,
        maskIsJavaIdentifierPart = bitJavaPart,
        maskIsJavaIdentifierStart = bitJavaStart;

    // Largest and smallest case offsets that fit the 9-bit signed offset field
    // (maskCase/2 == 255).
    static int maxOffset = maskCase/2 ;
    static int minOffset = -maxOffset;

    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  If the number of digits required to
     represent the integer value is greater than the desired number, then all the digits
     that are required are actually produced (note that hex2, hex4 and hex8 first
     mask their argument to 8, 16 and 32 bits respectively).
239 */ 240 241 static String hex(long n) { return Long.toHexString(n).toUpperCase(); } 242 243 static String hex2(long n) { 244 String q = Long.toHexString(n & 0xFF).toUpperCase(); 245 return "00".substring(Math.min(2, q.length())) + q; 246 } 247 248 static String hex4(long n) { 249 String q = Long.toHexString(n & 0xFFFF).toUpperCase(); 250 return "0000".substring(Math.min(4, q.length())) + q; 251 } 252 253 static String hex8(long n) { 254 String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase(); 255 return "00000000".substring(Math.min(8, q.length())) + q; 256 } 257 258 static String hex16(long n) { 259 String q = Long.toHexString(n).toUpperCase(); 260 return "0000000000000000".substring(Math.min(16, q.length())) + q; 261 } 262 263 static String dec3(long n) { 264 String q = Long.toString(n); 265 return " ".substring(Math.min(3, q.length())) + q; 266 } 267 268 static String dec5(long n) { 269 String q = Long.toString(n); 270 return " ".substring(Math.min(5, q.length())) + q; 271 } 272 273 /* This routine is called when some failure occurs. */ 274 275 static void FAIL(String s) { 276 System.out.println("** " + s); 277 } 278 279 /** 280 * Given the data from the Unicode specification file, this routine builds a map. 281 * 282 * The specification file is assumed to contain its data in sorted order by 283 * character code; as a result, the array passed as an argument to this method 284 * has its components in the same sorted order, with one entry for each defined 285 * Unicode character or character range. (A range is indicated by two consecutive 286 * entries, such that the name of the first entry begins with "<" and ends with 287 * "First>" and the second entry begins with "<" and ends with "Last>".) This is 288 * therefore a sparse representation of the character property data. 289 * 290 * The resulting map is dense representation of the character data. It contains 291 * 2^16 = 65536 entries, each of which is a long integer. 
(Right now only 32 bits 292 * of this long value are used, but type long is used rather than int to facilitate 293 * future extensions of this source code generator that might require more than 294 * 32 bits to encode relevant character properties.) Entry k holds the encoded 295 * properties for character k. 296 * 297 * Method buildMap manages the transformation from the sparse representation to 298 * the dense representation. It calls method buildOne to handle the encoding 299 * of character property data from a single UnicodeSpec object into 32 bits. 300 * For undefined characters, method buildOne is not called and the map entry for 301 * that character is set to UnicodeSpec.UNASSIGNED. 302 * 303 * @param data character property data from the Unicode specification file 304 * @return an array of length 65536 with one entry for every possible char value 305 * 306 * @see GenerateCharacter#buildOne 307 */ 308 309 static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList) 310 { 311 long[] result; 312 if (bLatin1 == true) { 313 result = new long[256]; 314 } else { 315 result = new long[1<<16]; 316 } 317 int k=0; 318 int codePoint = plane<<16; 319 UnicodeSpec nonCharSpec = new UnicodeSpec(); 320 for (int j = 0; j < data.length && k < result.length; j++) { 321 if (data[j].codePoint == codePoint) { 322 result[k] = buildOne(codePoint, data[j], specialMaps); 323 ++k; 324 ++codePoint; 325 } 326 else if(data[j].codePoint > codePoint) { 327 if (data[j].name.endsWith("Last>")) { 328 // build map data for all chars except last in range 329 while (codePoint < data[j].codePoint && k < result.length) { 330 result[k] = buildOne(codePoint, data[j], specialMaps); 331 ++k; 332 ++codePoint; 333 } 334 } 335 else { 336 // we have a few unassigned chars before data[j].codePoint 337 while (codePoint < data[j].codePoint && k < result.length) { 338 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 339 ++k; 340 ++codePoint; 341 } 342 } 343 k = 
data[j].codePoint & 0xFFFF; 344 codePoint = data[j].codePoint; 345 result[k] = buildOne(codePoint, data[j], specialMaps); 346 ++k; 347 ++codePoint; 348 } 349 else { 350 System.out.println("An error has occured during spec mapping."); 351 System.exit(0); 352 } 353 } 354 // if there are still unprocessed chars, process them 355 // as unassigned/undefined. 356 codePoint = (plane<<16) | k; 357 while (k < result.length) { 358 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 359 ++k; 360 ++codePoint; 361 } 362 // now add all extra supported properties from PropList, to the 363 // upper 16-bit 364 addExProp(result, propList, "Other_Lowercase", maskOtherLowercase); 365 addExProp(result, propList, "Other_Uppercase", maskOtherUppercase); 366 addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic); 367 addExProp(result, propList, "Ideographic", maskIdeographic); 368 //addExProp(result, propList, "Other_Math", maskOtherMath); 369 //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP); 370 371 return result; 372 } 373 374 // The maximum and minimum offsets found while scanning the database 375 static int maxOffsetSeen = 0; 376 static int minOffsetSeen = 0; 377 378 /** 379 * Some Unicode separator characters are not considered Java whitespace. 380 * @param c character to test 381 * @return true if c in an invalid Java whitespace character, false otherwise. 382 */ 383 static boolean isInvalidJavaWhiteSpace(int c) { 384 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF}; 385 boolean retValue = false; 386 for(int x=0;x<exceptions.length;x++) { 387 if(c == exceptions[x]) { 388 retValue = true; 389 break; 390 } 391 } 392 return retValue; 393 394 } 395 396 /** 397 * Given the character property data for one Unicode character, encode the data 398 * of interest into a single long integer value. 
(Right now only 32 bits 399 * of this long value are used, but type long is used rather than int to facilitate 400 * future extensions of this source code generator that might require more than 401 * 32 bits to encode relevant character properties.) 402 * 403 * @param c the character code for which to encode property data 404 * @param us property data record from the Unicode specification file 405 * (its character code might not be equal to c if it specifies data 406 * for a range of characters) 407 * @return an encoded long value that contains the properties for a single char 408 * 409 * @see GenerateCharacter#buildMap 410 */ 411 412 static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) { 413 long resultA = 0; 414 // record the general category 415 resultA |= us.generalCategory; 416 417 // record the numeric properties 418 NUMERIC: { 419 STRANGE: { 420 int val = 0; 421 // c is A-Z 422 if ((c >= 0x0041) && (c <= 0x005A)) { 423 val = c - 0x0041; 424 resultA |= valueJavaSupradecimal; 425 // c is a-z 426 } else if ((c >= 0x0061) && (c <= 0x007A)) { 427 val = c - 0x0061; 428 resultA |= valueJavaSupradecimal; 429 // c is a full-width A-Z 430 } else if ((c >= 0xFF21) && (c <= 0xFF3A)) { 431 val = c - 0xFF21; 432 resultA |= valueJavaSupradecimal; 433 // c is a full-width a-z 434 } else if ((c >= 0xFF41) && (c <= 0xFF5A)) { 435 val = c - 0xFF41; 436 resultA |= valueJavaSupradecimal; 437 } else if (us.isDecimalValue()) { 438 val = us.decimalValue; 439 resultA |= valueDigit; 440 } else if (us.isDigitValue()) { 441 val = us.digitValue; 442 resultA |= valueDigit; 443 } else { 444 if (us.numericValue.length() == 0) { 445 break NUMERIC; // no numeric value at all 446 } else { 447 try { 448 val = Integer.parseInt(us.numericValue); 449 if (val >= 32 || val < 0) break STRANGE; 450 if (c == 0x215F) break STRANGE; 451 } catch(NumberFormatException e) { 452 break STRANGE; 453 } 454 resultA |= valueDigit; 455 } 456 } 457 if (val >= 32 || val < 0) break STRANGE; 458 
resultA |= ((val - c & maskDigit) << shiftDigitOffset); 459 break NUMERIC; 460 } // end STRANGE 461 resultA |= valueStrangeNumeric; 462 } // end NUMERIC 463 464 // record case mapping 465 int offset = 0; 466 // might have a 1:M mapping 467 int specialMap = SpecialCaseMap.find(c, specialCaseMaps); 468 boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1); 469 if (bHasUpper) { 470 resultA |= maskUpperCase; 471 } 472 if (specialMap != -1) { 473 // has mapping, but cannot record the 474 // proper offset; can only flag it and provide special case 475 // code in Character.java 476 offset = -1; 477 } 478 else if (us.hasUpperMap()) { 479 offset = c - us.upperMap; 480 } 481 482 if (us.hasLowerMap()) { 483 resultA |= maskLowerCase; 484 if (offset == 0) 485 offset = us.lowerMap - c; 486 else if (offset != (us.lowerMap - c)) { 487 if (DEBUG) { 488 FAIL("Character " + hex(c) + 489 " has incompatible lowercase and uppercase mappings"); 490 } 491 } 492 } 493 if ((us.hasTitleMap() && us.titleMap != us.upperMap) || 494 (bHasUpper && us.hasLowerMap())) { 495 resultA |= maskTitleCase; 496 } 497 if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) { 498 System.out.println("Warning: Character " + hex4(c) + " has upper but " + 499 "no title case; Java won't know this"); 500 } 501 if (offset < minOffsetSeen) minOffsetSeen = offset; 502 if (offset > maxOffsetSeen) maxOffsetSeen = offset; 503 if (offset > maxOffset || offset < minOffset) { 504 if (DEBUG) { 505 FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case"); 506 } 507 offset = maskCase; 508 } 509 resultA |= ((offset & maskCase) << shiftCaseOffset); 510 511 // record lexical info about this character 512 if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER 513 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER 514 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER 515 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER 516 || us.generalCategory == 
UnicodeSpec.OTHER_LETTER 517 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) { 518 resultA |= valueJavaUnicodeStart; 519 } 520 else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK 521 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK 522 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) { 523 resultA |= valueJavaUnicodePart; 524 } 525 else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) { 526 resultA |= valueJavaStartUnicodePart; 527 } 528 else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) { 529 resultA |= valueJavaOnlyStart; 530 } 531 else if (((c >= 0x0000) && (c <= 0x0008)) 532 || ((c >= 0x000E) && (c <= 0x001B)) 533 || ((c >= 0x007F) && (c <= 0x009F)) 534 || us.generalCategory == UnicodeSpec.FORMAT) { 535 resultA |= valueIgnorable; 536 } 537 else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR 538 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR 539 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) { 540 if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace; 541 } 542 else if (((c >= 0x0009) && (c <= 0x000D)) 543 || ((c >= 0x001C) && (c <= 0x001F))) { 544 resultA |= valueJavaWhitespace; 545 } 546 547 // record bidi category 548 if (!nobidi) { 549 int tmpBidi = 550 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS || 551 us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi); 552 resultA |= tmpBidi; 553 } 554 555 // record mirrored property 556 if (!nomirror) { 557 resultA |= us.mirrored ? 
maskMirrored : 0; 558 } 559 560 if (identifiers) { 561 long replacement = 0; 562 if ((resultA & maskIdentifierInfo) >= lowJavaStart) { 563 replacement |= bitJavaStart; 564 } 565 if ( ((resultA & nonzeroJavaPart) != 0) 566 && ((resultA & maskIdentifierInfo) != valueIgnorable)) { 567 replacement |= bitJavaPart; 568 } 569 resultA = replacement; 570 } 571 return resultA; 572 } 573 574 static void addExProp(long[] map, PropList propList, String prop, long mask) { 575 List<Integer> cps = propList.codepoints(prop); 576 if (cps != null) { 577 for (Integer cp : cps) { 578 if (cp < map.length) 579 map[cp] |= mask; 580 } 581 } 582 } 583 584 /** 585 * This is the heart of the table compression strategy. The inputs are a map 586 * and a number of bits (size). The map is simply an array of long integer values; 587 * the number of bits indicates how index values for that map are to be split. 588 * The length of the given map must be a multiple of (1 << size). The result is 589 * a new map z and a compressed table t such that for every valid index value k 590 * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k]. 591 * 592 * In other words, the index k can be split into two parts, namely the "size" 593 * low-order bits and all the remaining high-order bits; the high-order bits are then 594 * remapped by map z to produce an index into table t. In effect, the data of the 595 * original map m is broken up into blocks of size (1<<size); the compression relies 596 * on the expectation that many of these blocks will be identical and therefore need 597 * be represented only once in the compressed table t. 598 * 599 * This method is intended to be used iteratively. The first map to be handed 600 * to it is the one constructed by method buildMap. After that, the first of the 601 * two arrays returned by this method is fed back into it for further compression. 602 * At the end of the iteration, one has a starter map and a sequence of tables. 
603 * 604 * The algorithm used to implement this computation is straightforward and not 605 * especially clever. It uses brute-force linear search (the loop labeled MIDDLE) 606 * to locate identical blocks, so overall the time complexity of the algorithm 607 * is quadratic in the length of the input map. Fortunately, speed is not crucial 608 * to this application. 609 * 610 * @param map a map to be compressed 611 * @param size the number of index bits to be split off by the compression 612 * @return an array of length 2 containing two arrays; the first is a new map 613 * and the second is a compressed data table 614 * 615 * @see GenerateCharacter#buildMap 616 */ 617 618 static long[][] buildTable(long[] map, int size) { 619 int n = map.length; 620 if (((n >> size) << size) != n) { 621 FAIL("Length " + n + " is not a multiple of " + (1 << size)); 622 } 623 int m = 1 << size; 624 // We know the final length of the new map up front. 625 long[] newmap = new long[n >> size]; 626 // The buffer is used temporarily to hold data for the compressed table 627 // because we don't know its final length yet. 628 long[] buffer = new long[n]; 629 int ptr = 0; 630 OUTER: for (int i = 0; i < n; i += m) { 631 // For every block of size m in the original map... 632 MIDDLE: for (int j = 0; j < ptr; j += m) { 633 // Find out whether there is already a block just like it in the buffer. 634 for (int k = 0; k < m; k++) { 635 if (buffer[j+k] != map[i+k]) 636 continue MIDDLE; 637 } 638 // There is a block just like it at position j, so just 639 // put its index into the new map (thereby sharing it). 640 newmap[i >> size] = (j >> size); 641 continue OUTER; 642 } // end MIDDLE 643 // There is no block just like it already, so add it to 644 // the buffer and put its index into the new map. 
645 for (int k = 0; k < m; k++) { 646 buffer[ptr+k] = map[i+k]; 647 } 648 newmap[i >> size] = (ptr >> size); 649 ptr += m; 650 } // end OUTER 651 // Now we know how long the compressed table should be, 652 // so create a new array and copy data from the temporary buffer. 653 long[] newdata = new long[ptr]; 654 for (int j = 0; j < ptr; j++) { 655 newdata[j] = buffer[j]; 656 } 657 // Return the new map and the new data table. 658 long[][] result = { newmap, newdata }; 659 return result; 660 } 661 662 /** 663 * Once the compressed tables have been computed, this method reads in a 664 * template file for the source code to be generated and writes out the final 665 * source code by acting as a sort of specialized macro processor. 666 * 667 * The first output line is a comment saying that the file was automatically 668 * generated; it includes a timestamp. All other output is generated by 669 * reading a line from the template file, performing macro replacements, 670 * and then writing the resulting line or lines of code to the output file. 671 * 672 * This method handles the I/O, the timestamp comment, and the locating of 673 * macro calls within each input line. The method replaceCommand is called 674 * to generate replacement text for each macro call. 675 * 676 * Macro calls to be replaced are indicated in the template file by 677 * occurrences of the commandMarker "$$". The rest of the call may consist 678 * of Java letters (including the underscore "_") and also of balanced 679 * parentheses. 
680 * 681 * @param theTemplateFileName 682 * the file name for the template input file 683 * @param theOutputFileName 684 * the file name for the source code output file 685 * 686 * @see GenerateCharacter#replaceCommand 687 */ 688 689 static void generateCharacterClass(String theTemplateFileName, 690 String theOutputFileName) 691 throws FileNotFoundException, IOException { 692 BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName)); 693 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName))); 694 out.println(commentStart + 695 " This file was generated AUTOMATICALLY from a template file " + 696 new java.util.Date() + commentEnd); 697 int marklen = commandMarker.length(); 698 LOOP: while(true) { 699 try { 700 String line = in.readLine(); 701 if (line == null) break LOOP; 702 int pos = 0; 703 int depth = 0; 704 while ((pos = line.indexOf(commandMarker, pos)) >= 0) { 705 int newpos = pos + marklen; 706 char ch = 'x'; 707 SCAN: while (newpos < line.length() && 708 (Character.isJavaIdentifierStart(ch = line.charAt(newpos)) 709 || ch == '(' || (ch == ')' && depth > 0))) { 710 ++newpos; 711 if (ch == '(') { 712 ++depth; 713 } 714 else if (ch == ')') { 715 --depth; 716 if (depth == 0) 717 break SCAN; 718 } 719 } 720 String replacement = replaceCommand(line.substring(pos + marklen, newpos)); 721 line = line.substring(0, pos) + replacement + line.substring(newpos); 722 pos += replacement.length(); 723 } 724 out.println(line); 725 } 726 catch (IOException e) { 727 break LOOP; 728 } 729 } 730 in.close(); 731 out.close(); 732 } 733 734 /** 735 * The replaceCommand method takes a command (a macro call without the 736 * leading marker "$$") and computes replacement text for it. 737 * 738 * Most of the commands are simply names of integer constants that are defined 739 * in the source code of this GenerateCharacter class. 
The replacement text is 740 * simply the value of the constant as an appropriately formatted integer literal. 741 * 742 * Two cases are more complicated, however. The command "Tables" causes the 743 * final map and compressed tables to be emitted, with elaborate comments 744 * describing their contents. (This is actually handled by method genTables.) 745 * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates 746 * an expression that will return the character property data for the character 747 * whose code is the value of the variable "xxx". (this is handled by method 748 * "genAccess".) 749 * 750 * @param x a command from the template file to be replaced 751 * @return the replacement text, as a String 752 * 753 * @see GenerateCharacter#genTables 754 * @see GenerateCharacter#genAccess 755 * @see GenerateCharacter#generateCharacterClass 756 */ 757 758 static String replaceCommand(String x) { 759 if (x.equals("Tables")) return genTables(); 760 if (x.equals("Initializers")) return genInitializers(); 761 if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") && 762 x.substring(x.length()-1).equals(")") ) 763 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 
2 : 32)); 764 if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") && 765 x.substring(x.length()-1).equals(")") ) 766 return genAccess("B", x.substring(9, x.length()-1), 16); 767 if (x.equals("shiftType")) return Long.toString(shiftType); 768 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo); 769 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo); 770 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart); 771 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset); 772 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo); 773 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign); 774 if (x.equals("maskCase")) return "0x" + hex8(maskCase); 775 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset); 776 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase); 777 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase); 778 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase); 779 if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32); 780 if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32); 781 if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32); 782 if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32); 783 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable); 784 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart); 785 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart); 786 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart); 787 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart); 788 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace); 789 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart); 790 if (x.equals("nonzeroJavaPart")) return "0x" 
+ hex8(nonzeroJavaPart); 791 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart); 792 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart); 793 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart); 794 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart); 795 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart); 796 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset); 797 if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset); 798 if (x.equals("maskDigit")) return "0x" + hex(maskDigit); 799 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType); 800 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType); 801 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric); 802 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 803 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 804 if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal); 805 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 806 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 807 if (x.equals("maskType")) return "0x" + hex(maskType); 808 if (x.equals("shiftBidi")) return Long.toString(shiftBidi); 809 if (x.equals("maskBidi")) return "0x" + hex(maskBidi); 810 if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored); 811 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG])) 812 return Integer.toString(UnicodeSpec.UNASSIGNED); 813 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG])) 814 return Integer.toString(UnicodeSpec.UPPERCASE_LETTER); 815 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG])) 816 return Integer.toString(UnicodeSpec.LOWERCASE_LETTER); 817 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG])) 818 return Integer.toString(UnicodeSpec.TITLECASE_LETTER); 819 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG])) 820 return Integer.toString(UnicodeSpec.MODIFIER_LETTER); 821 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG])) 822 return Integer.toString(UnicodeSpec.OTHER_LETTER); 823 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG])) 824 return Integer.toString(UnicodeSpec.NON_SPACING_MARK); 825 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG])) 826 return Integer.toString(UnicodeSpec.ENCLOSING_MARK); 827 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG])) 828 return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK); 829 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG])) 830 return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER); 831 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG])) 832 return Integer.toString(UnicodeSpec.OTHER_NUMBER); 833 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG])) 834 return Integer.toString(UnicodeSpec.SPACE_SEPARATOR); 835 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG])) 836 return Integer.toString(UnicodeSpec.LINE_SEPARATOR); 837 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 838 return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR); 839 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG])) 840 return Integer.toString(UnicodeSpec.CONTROL); 841 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG])) 842 return 
Integer.toString(UnicodeSpec.FORMAT); 843 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG])) 844 return Integer.toString(UnicodeSpec.PRIVATE_USE); 845 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG])) 846 return Integer.toString(UnicodeSpec.SURROGATE); 847 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG])) 848 return Integer.toString(UnicodeSpec.DASH_PUNCTUATION); 849 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG])) 850 return Integer.toString(UnicodeSpec.START_PUNCTUATION); 851 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG])) 852 return Integer.toString(UnicodeSpec.END_PUNCTUATION); 853 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 854 return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION); 855 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 856 return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION); 857 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG])) 858 return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION); 859 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG])) 860 return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION); 861 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG])) 862 return Integer.toString(UnicodeSpec.LETTER_NUMBER); 863 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG])) 864 return Integer.toString(UnicodeSpec.MATH_SYMBOL); 865 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG])) 866 return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL); 867 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG])) 868 return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL); 869 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG])) 870 return Integer.toString(UnicodeSpec.OTHER_SYMBOL); 871 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG])) 872 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT); 873 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG])) 874 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING); 875 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG])) 876 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE); 877 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG])) 878 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT); 879 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG])) 880 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC); 881 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG])) 882 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING); 883 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG])) 884 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE); 885 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG])) 886 return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT); 887 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG])) 888 return 
Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER); 889 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 890 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR); 891 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG])) 892 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR); 893 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG])) 894 return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER); 895 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 896 return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR); 897 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG])) 898 return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK); 899 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG])) 900 return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL); 901 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 902 return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR); 903 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG])) 904 return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR); 905 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG])) 906 return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE); 907 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG])) 908 return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS); 909 if 
(x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG])) 910 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE); 911 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG])) 912 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE); 913 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG])) 914 return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE); 915 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG])) 916 return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE); 917 FAIL("Unknown text substitution marker " + commandMarker + x); 918 return commandMarker + x; 919 } 920 921 /** 922 * The genTables method generates source code for all the lookup tables 923 * needed to represent the various Unicode character properties. 924 * It simply calls the method genTable once for each table to be generated 925 * and then generates a summary comment. 
926 * 927 * @return the replacement text for the "Tables" command, as a String 928 * 929 * @see GenerateCharacter#genTable 930 * @see GenerateCharacter#replaceCommand 931 */ 932 static String genTables() { 933 int n = sizes.length; 934 StringBuffer result = new StringBuffer(); 935 // liu : Add a comment showing the source of this table 936 result.append(commentStart + " The following tables and code generated using:" + 937 commentEnd + "\n "); 938 result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n "); 939 940 if (plane == 0 && bLatin1 == false) { 941 genCaseMapTableDeclaration(result); 942 genCaseMapTable(initializers, specialCaseMaps); 943 } 944 int totalBytes = 0; 945 for (int k = 0; k < n - 1; k++) { 946 genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k], 947 sizes[k+1], false, false, k==0); 948 int s = bytes[k]; 949 if (s == 1 && useCharForByte) { 950 s = 2; 951 } 952 totalBytes += tables[k].length * s; 953 } 954 genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32), 955 sizes[n - 1], false, 0, true, !(identifiers), false); 956 957 // If we ever need more than 32 bits to represent the character properties, 958 // then a table "B" may be needed as well. 959 genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false); 960 961 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2); 962 result.append(commentStart); 963 result.append(" In all, the character property tables require "); 964 result.append(totalBytes).append(" bytes.").append(commentEnd); 965 if (verbose) { 966 System.out.println("The character property tables require " 967 + totalBytes + " bytes."); 968 } 969 return result.toString(); 970 } 971 972 /** 973 * The genInitializers method generates the body of the 974 * ensureInitted() method, which enables lazy initialization of 975 * the case map table and other tables. 
976 */ 977 static String genInitializers() { 978 return initializers.toString(); 979 } 980 981 /** 982 * Return the total number of bytes needed by all tables. This is a stripped- 983 * down copy of genTables(). 984 */ 985 static int getTotalBytes() { 986 int n = sizes.length; 987 int totalBytes = 0; 988 for (int k = 0; k < n - 1; k++) { 989 totalBytes += tables[k].length * bytes[k]; 990 } 991 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) 992 + 31) >> 5) << 2); 993 return totalBytes; 994 } 995 996 static void appendEscapedStringFragment(StringBuffer result, 997 char[] line, 998 int length, 999 boolean lastFragment) { 1000 result.append(" \""); 1001 for (int k=0; k<length; ++k) { 1002 result.append("\\u"); 1003 result.append(hex4(line[k])); 1004 } 1005 result.append("\""); 1006 result.append(lastFragment ? ";" : "+"); 1007 result.append("\n"); 1008 } 1009 1010 static String SMALL_INITIALIZER = 1011 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1012 // " $$name = new $$type[$$size];\n"+ 1013 " int len = $$name_DATA.length();\n"+ 1014 " int j=0;\n"+ 1015 " for (int i=0; i<len; ++i) {\n"+ 1016 " int c = $$name_DATA.charAt(i);\n"+ 1017 " for (int k=0; k<$$entriesPerChar; ++k) {\n"+ 1018 " $$name[j++] = ($$type)c;\n"+ 1019 " c >>= $$bits;\n"+ 1020 " }\n"+ 1021 " }\n"+ 1022 " assert (j == $$size);\n"+ 1023 " }\n"; 1024 1025 static String SAME_SIZE_INITIALIZER = 1026 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1027 " assert ($$name_DATA.length() == $$size);\n"+ 1028 // " $$name = new $$type[$$size];\n"+ 1029 " for (int i=0; i<$$size; ++i)\n"+ 1030 " $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+ 1031 " }\n"; 1032 1033 static String BIG_INITIALIZER = 1034 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1035 // " $$name = new $$type[$$size];\n"+ 1036 " int len = $$name_DATA.length();\n"+ 1037 " int j=0;\n"+ 1038 " int charsInEntry=0;\n"+ 1039 " $$type entry=0;\n"+ 1040 " for (int i=0; 
i<len; ++i) {\n"+ 1041 " entry |= $$name_DATA.charAt(i);\n"+ 1042 " if (++charsInEntry == $$charsPerEntry) {\n"+ 1043 " $$name[j++] = entry;\n"+ 1044 " entry = 0;\n"+ 1045 " charsInEntry = 0;\n"+ 1046 " }\n"+ 1047 " else {\n"+ 1048 " entry <<= 16;\n"+ 1049 " }\n"+ 1050 " }\n"+ 1051 " assert (j == $$size);\n"+ 1052 " }\n"; 1053 1054 static String INT32_INITIALIZER = 1055 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1056 " char[] data = $$name_DATA.toCharArray();\n"+ 1057 " assert (data.length == ($$size * 2));\n"+ 1058 " int i = 0, j = 0;\n"+ 1059 " while (i < ($$size * 2)) {\n"+ 1060 " int entry = data[i++] << 16;\n"+ 1061 " $$name[j++] = entry | data[i++];\n"+ 1062 " }\n"+ 1063 " }\n"; 1064 1065 static void addInitializer(String name, String type, int entriesPerChar, 1066 int bits, int size) { 1067 1068 String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER : 1069 ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER); 1070 if (entriesPerChar == -2) { 1071 template = INT32_INITIALIZER; 1072 } 1073 int marklen = commandMarker.length(); 1074 int pos = 0; 1075 while ((pos = template.indexOf(commandMarker, pos)) >= 0) { 1076 int newpos = pos + marklen; 1077 char ch = 'x'; 1078 while (newpos < template.length() && 1079 Character.isJavaIdentifierStart(ch = template.charAt(newpos)) && 1080 ch != '_') // Don't allow this in token names 1081 ++newpos; 1082 String token = template.substring(pos+marklen, newpos); 1083 String replacement = "ERROR"; 1084 1085 if (token.equals("name")) replacement = name; 1086 else if (token.equals("type")) replacement = type; 1087 else if (token.equals("bits")) replacement = ""+bits; 1088 else if (token.equals("size")) replacement = ""+size; 1089 else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar; 1090 else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar); 1091 else FAIL("Unrecognized token: " + token); 1092 1093 template = template.substring(0, pos) + 
replacement + template.substring(newpos); 1094 pos += replacement.length(); 1095 } 1096 initializers.append(template); 1097 } 1098 1099 /** 1100 * The genTable method generates source code for one lookup table. 1101 * Most of the complexity stems from handling various options as to 1102 * the type of the array components, the precise representation of the 1103 * values, the format in which to render each value, the number of values 1104 * to emit on each line of source code, and the kinds of useful comments 1105 * to be generated. 1106 * 1107 * @param result a StringBuffer, to which the generated source code 1108 * text is to be appended 1109 * @param name the name of the table 1110 * @param table the table data (an array of long values) 1111 * @param extract a distance, in bits, by which each entry of the table 1112 * is to be right-shifted before it is processed 1113 * @param bits the number of bits (not bytes) to be used to represent 1114 * each table entry 1115 * @param size the table data is divided up into blocks of size (1<<size); 1116 * in this method, this information is used only to affect 1117 * how many table values are to be generated per line 1118 * @param preshifted if this flag is true, then the table entries are to be 1119 * emitted in a preshifted form; that is, each value should 1120 * be left-shifted by the amount "shift", so that this work 1121 * is built into the table and need not be performed by an 1122 * explicit shift operator at run time 1123 * @param shift this is the shift amount for preshifting of table entries 1124 * @param hexFormat if this flag is true, table entries should be emitted as 1125 * hexadecimal literals; otherwise decimal literals are used 1126 * @param properties if this flag is true, the table entries are encoded 1127 * character properties rather than indexes into yet other tables; 1128 * therefore comments describing the encoded properties should 1129 * be generated 1130 * @param hexComment if this flag is true, each 
*               line of output is labelled with
*               a hexadecimal comment indicating the character values to
*               which that line applies; otherwise, decimal values indicating
*               table indices are generated
*
* @see GenerateCharacter#genTables
* @see GenerateCharacter#replaceCommand
*/

static void genTable(StringBuffer result, String name,
                     long[] table, int extract, int bits, int size,
                     boolean preshifted, int shift, boolean hexFormat,
                     boolean properties, boolean hexComment) {

    // Element type of the emitted array. Widths below 8 bits use a 32-bit
    // word type because several entries are packed into each emitted word
    // (see packMask in the array-literal branch below).
    String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
        bits == 2 ? (Csyntax ? "unsigned long" : "int") :
        bits == 4 ? (Csyntax ? "unsigned long" : "int") :
        bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
        bits == 16 ? (Csyntax ? "unsigned short" : "char") :
        bits == 32 ? (Csyntax ? "unsigned long" : "int") :
        (Csyntax ? "int64" : "long");
    // Threshold above which a value is written as a negated literal when
    // generating Java (see the '-' handling in the array-literal branch).
    long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
        bits == 2 ? Integer.MAX_VALUE :
        bits == 4 ? Integer.MAX_VALUE :
        bits == 8 ? Byte.MAX_VALUE :
        bits == 16 ? Short.MAX_VALUE :
        bits == 32 ? Integer.MAX_VALUE :
        Long.MAX_VALUE;
    // Positive: number of entries stored in one 16-bit char of the string
    // form. Negative: minus the number of chars used by one entry.
    int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
    boolean shiftEntries = preshifted && shift != 0;
    if (bits == 8 && tableAsString && useCharForByte) {
        // Byte-wide entries are emitted one per char, as a char table.
        atype = "char";
        maxPosEntry = Character.MAX_VALUE;
        entriesPerChar = 1;
    }
    // A char table is produced directly from the string via toCharArray();
    // every other type needs a separate _DATA string plus an initializer.
    boolean noConversion = atype.equals("char");

    // Header comment: entry count and size in bytes, where the size is
    // table.length * bits rounded up to a whole number of 32-bit words.
    result.append(commentStart);
    result.append(" The ").append(name).append(" table has ").append(table.length);
    result.append(" entries for a total of ");
    int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
    if (bits == 8 && useCharForByte) {
        sizeOfTable *= 2;
    }
    result.append(sizeOfTable);
    result.append(" bytes.").append(commentEnd).append("\n\n");
    if (Csyntax)
        result.append(" static ");
    else
        result.append(" static final ");
    result.append(atype);
    result.append(" ").append(name).append("[");
    // C output carries an explicit array length; for sub-byte widths the
    // length is the number of packed 32-bit words.
    if (Csyntax)
        result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
    if (tableAsString) {
        // String form: the table data is emitted as a string constant.
        if (noConversion) {
            result.append("] = (\n");
        } else {
            result.append("] = new ").append(atype).append("["+table.length+"];\n ");
            result.append("static final String ").append(name).append("_DATA =\n");
        }
        // NOTE(review): CHARS_PER_LINE is not referenced anywhere in this method.
        int CHARS_PER_LINE = 8;
        StringBuffer theString = new StringBuffer();
        int entriesInCharSoFar = 0;
        char ch = '\u0000';
        int charsPerEntry = -entriesPerChar;
        for (int j=0; j<table.length; ++j) {
            //long entry = table[j] >> extract;
            long entry;
            // Table "A" keeps only the low 32 bits of each entry before extracting.
            if ("A".equals(name))
                entry = (table[j] & 0xffffffffL) >> extract;
            else
                entry = (table[j] >> extract);
            if (shiftEntries) entry <<= shift;
            // Report entries that do not fit in "bits" bits.
            if (entry >= (1L << bits)) {
                FAIL("Entry too big");
            }
            if (entriesPerChar > 0) {
                // Pack multiple entries into a character
                // (the new entry goes into the top field and earlier entries
                // move down, so the first entry ends up in the low-order bits)
                ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
                ++entriesInCharSoFar;
                if (entriesInCharSoFar == entriesPerChar) {
                    // Character is full
                    theString.append(ch);
                    entriesInCharSoFar = 0;
                    ch = '\u0000';
                }
            }
            else {
                // Use multiple characters per entry
                // (most significant 16 bits first)
                for (int k=0; k<charsPerEntry; ++k) {
                    ch = (char)(entry >> ((charsPerEntry-1)*16));
                    entry <<= 16;
                    theString.append(ch);
                }
            }
        }
        // Flush a final, partially filled char: shift the entries already
        // packed down to the positions they would occupy in a full char.
        if (entriesInCharSoFar > 0) {
            while (entriesInCharSoFar < entriesPerChar) {
                ch = (char)((int)ch >> bits);
                ++entriesInCharSoFar;
            }
            theString.append(ch);
            entriesInCharSoFar = 0;
        }
        result.append(Utility.formatForSource(theString.toString(), " "));
        if (noConversion) {
            result.append(").toCharArray()");
        }
        result.append(";\n\n ");

        // Register the code that unpacks the _DATA string into the array.
        if (!noConversion) {
            addInitializer(name, atype, entriesPerChar, bits, table.length);
        }
    }
    else {
        // Array-literal form: the entries are emitted as numeric literals.
        result.append("] = {");
        boolean castEntries = shiftEntries && (bits < 32);
        int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
                                        bits == 2 ? 16*4 :
                                        bits == 4 ? 8*4 :
                                        bits == 8 ? 8 :
                                        bits == 16 ? 8 :
                                        bits == 32 ? 4 : 2) :
            (bits == 8 ? 8 :
             bits == 16 ? 8 : 4);
        // A new output line starts whenever (j & printMask) == 0. With
        // "properties" set the mask is 0, so every entry gets its own line
        // and its own trailing comment.
        int printMask = properties ? 0 :
            Math.min(1 << size,
                     printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
        int commentShift = ((1 << size) == table.length) ? 0 : size;
        int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
        // val persists across iterations: for widths below 8 bits it
        // accumulates the packed word currently being assembled.
        long val = 0;
        for (int j = 0; j < table.length; j++) {
            if ((j & printMask) == 0) {
                // Trim trailing blanks before starting the next line.
                while (result.charAt(result.length() - 1) == ' ')
                    result.setLength(result.length() - 1);
                result.append("\n ");
            }
            // "break PRINT" leaves this block without emitting anything,
            // which is used while a packed word is still incomplete.
            PRINT: {
                // NOTE(review): this prefix is appended before the packing
                // checks below, so it would be left dangling if castEntries
                // were ever true for bits < 8; presumably callers never
                // combine the two -- confirm.
                if (castEntries)
                    result.append("(").append(atype).append(")(");
                long entry = table[j] >> extract;
                // Number of entries per packed word, minus one (32, 16 or 8
                // entries for 1-, 2- or 4-bit widths).
                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
                int k = j & packMask;
                if (bits >= 8)
                    val = entry;
                else if (k == 0) {
                    // First entry of a new packed word.
                    val = entry;
                    break PRINT;
                }
                else {
                    val |= (entry << (k*bits));
                    if (k != packMask)
                        break PRINT;
                }
                if (val > maxPosEntry && !Csyntax) { // liu
                    // For values that are out of range, convert them to in-range negative values.
                    // Actually, output the '-' and convert them to the negative of the corresponding
                    // in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126.
                    result.append('-');
                    val = maxPosEntry + maxPosEntry + 2 - val;
                }
                if (hexFormat) {
                    result.append("0x");
                    if (bits == 8)
                        result.append(hex2((byte)val));
                    else if (bits == 16)
                        result.append(hex4((short)val));
                    else if (bits == 32 || bits < 8)
                        result.append(hex8((int)val));
                    else {
                        result.append(hex16(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                }
                else {
                    if (bits == 8)
                        result.append(dec3(val));
                    else if (bits == 64) {
                        result.append(dec5(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                    else
                        result.append(dec5(val));
                }
                // Preshifted entries are written as "value<<shift".
                if (shiftEntries)
                    result.append("<<").append(shift);
                if (castEntries) result.append(")");
                if (j < (table.length - 1))
                    result.append(", ");
                else
                    result.append(" ");
                // End of an output line: append the labelling comment.
                if ((j & printMask) == printMask) {
                    result.append(" ").append(commentStart).append(" ");
                    if (hexComment)
                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
                    else
                        result.append(dec3((j & ~commentMask) >> commentShift));
                    if (properties) propertiesComments(result, val);
                    result.append(commentEnd);
                }
            } // end PRINT
        }
        result.append("\n };\n\n ");
    }
}

/**
 * Appends the declaration of the charMap field to result.
 *
 * @param result a StringBuffer, to which the declaration is appended
 */
static void genCaseMapTableDeclaration(StringBuffer result) {
    String myTab = " ";
    result.append(myTab + "static final char[][][] charMap;\n");
}

static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){ 1345 String myTab = " "; 1346 int ch; 1347 char[] map; 1348 result.append(myTab + "charMap = new char[][][] {\n"); 1349 for (int x = 0; x < specialCaseMaps.length; x++) { 1350 ch = specialCaseMaps[x].getCharSource(); 1351 map = specialCaseMaps[x].getUpperCaseMap(); 1352 result.append(myTab + myTab); 1353 result.append("{ "); 1354 result.append("{\'\\u"+hex4(ch)+"\'}, {"); 1355 for (int y = 0; y < map.length; y++) { 1356 result.append("\'\\u"+hex4(map[y])+"\', "); 1357 } 1358 result.append("} },\n"); 1359 } 1360 result.append(myTab + "};\n"); 1361 1362 } 1363 1364 /** 1365 * The propertiesComments method generates comments describing encoded 1366 * character properties. 1367 * 1368 * @param result a StringBuffer, to which the generated source code 1369 * text is to be appended 1370 * @param val encoded character properties 1371 * 1372 * @see GenerateCharacter#genTable 1373 */ 1374 1375 static void propertiesComments(StringBuffer result, long val) { 1376 result.append(" "); 1377 switch ((int)(val & maskType)) { 1378 case UnicodeSpec.CONTROL: 1379 result.append("Cc"); 1380 break; 1381 case UnicodeSpec.FORMAT: 1382 result.append("Cf"); 1383 break; 1384 case UnicodeSpec.PRIVATE_USE: 1385 result.append("Co"); 1386 break; 1387 case UnicodeSpec.SURROGATE: 1388 result.append("Cs"); 1389 break; 1390 case UnicodeSpec.LOWERCASE_LETTER: 1391 result.append("Ll"); 1392 break; 1393 case UnicodeSpec.MODIFIER_LETTER: 1394 result.append("Lm"); 1395 break; 1396 case UnicodeSpec.OTHER_LETTER: 1397 result.append("Lo"); 1398 break; 1399 case UnicodeSpec.TITLECASE_LETTER: 1400 result.append("Lt"); 1401 break; 1402 case UnicodeSpec.UPPERCASE_LETTER: 1403 result.append("Lu"); 1404 break; 1405 case UnicodeSpec.COMBINING_SPACING_MARK: 1406 result.append("Mc"); 1407 break; 1408 case UnicodeSpec.ENCLOSING_MARK: 1409 result.append("Me"); 1410 break; 1411 case UnicodeSpec.NON_SPACING_MARK: 1412 
result.append("Mn"); 1413 break; 1414 case UnicodeSpec.DECIMAL_DIGIT_NUMBER: 1415 result.append("Nd"); 1416 break; 1417 case UnicodeSpec.LETTER_NUMBER: 1418 result.append("Nl"); 1419 break; 1420 case UnicodeSpec.OTHER_NUMBER: 1421 result.append("No"); 1422 break; 1423 case UnicodeSpec.CONNECTOR_PUNCTUATION: 1424 result.append("Pc"); 1425 break; 1426 case UnicodeSpec.DASH_PUNCTUATION: 1427 result.append("Pd"); 1428 break; 1429 case UnicodeSpec.END_PUNCTUATION: 1430 result.append("Pe"); 1431 break; 1432 case UnicodeSpec.OTHER_PUNCTUATION: 1433 result.append("Po"); 1434 break; 1435 case UnicodeSpec.START_PUNCTUATION: 1436 result.append("Ps"); 1437 break; 1438 case UnicodeSpec.CURRENCY_SYMBOL: 1439 result.append("Sc"); 1440 break; 1441 case UnicodeSpec.MODIFIER_SYMBOL: 1442 result.append("Sk"); 1443 break; 1444 case UnicodeSpec.MATH_SYMBOL: 1445 result.append("Sm"); 1446 break; 1447 case UnicodeSpec.OTHER_SYMBOL: 1448 result.append("So"); 1449 break; 1450 case UnicodeSpec.LINE_SEPARATOR: 1451 result.append("Zl"); break; 1452 case UnicodeSpec.PARAGRAPH_SEPARATOR: 1453 result.append("Zp"); 1454 break; 1455 case UnicodeSpec.SPACE_SEPARATOR: 1456 result.append("Zs"); 1457 break; 1458 case UnicodeSpec.UNASSIGNED: 1459 result.append("unassigned"); 1460 break; 1461 } 1462 1463 switch ((int)((val & maskBidi) >> shiftBidi)) { 1464 case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT: 1465 result.append(", L"); 1466 break; 1467 case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT: 1468 result.append(", R"); 1469 break; 1470 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER: 1471 result.append(", EN"); 1472 break; 1473 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR: 1474 result.append(", ES"); 1475 break; 1476 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR: 1477 result.append(", ET"); 1478 break; 1479 case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER: 1480 result.append(", AN"); 1481 break; 1482 case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR: 1483 result.append(", 
CS"); 1484 break; 1485 case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR: 1486 result.append(", B"); 1487 break; 1488 case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR: 1489 result.append(", S"); 1490 break; 1491 case UnicodeSpec.DIRECTIONALITY_WHITESPACE: 1492 result.append(", WS"); 1493 break; 1494 case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS: 1495 result.append(", ON"); 1496 break; 1497 } 1498 if ((val & maskUpperCase) != 0) { 1499 result.append(", hasUpper (subtract "); 1500 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1501 } 1502 if ((val & maskLowerCase) != 0) { 1503 result.append(", hasLower (add "); 1504 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1505 } 1506 if ((val & maskTitleCase) != 0) { 1507 result.append(", hasTitle"); 1508 } 1509 if ((val & maskIdentifierInfo) == valueIgnorable) { 1510 result.append(", ignorable"); 1511 } 1512 if ((val & maskIdentifierInfo) == valueJavaUnicodePart) { 1513 result.append(", identifier part"); 1514 } 1515 if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) { 1516 result.append(", underscore"); 1517 } 1518 if ((val & maskIdentifierInfo) == valueJavaWhitespace) { 1519 result.append(", whitespace"); 1520 } 1521 if ((val & maskIdentifierInfo) == valueJavaOnlyStart) { 1522 result.append(", currency"); 1523 } 1524 if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) { 1525 result.append(", identifier start"); 1526 } 1527 if ((val & maskNumericType) == valueDigit) { 1528 result.append(", decimal "); 1529 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1530 } 1531 if ((val & maskNumericType) == valueStrangeNumeric) { 1532 result.append(", strange"); 1533 } 1534 if ((val & maskNumericType) == valueJavaSupradecimal) { 1535 result.append(", supradecimal "); 1536 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1537 } 1538 } 1539 1540 static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" }; 1541 1542 static 
String tableName(int j) { return tableNames[j]; } 1543 1544 /** 1545 * The genAccess method generates source code for one table access expression. 1546 * 1547 * Most of the complexity stems from handling various options as to 1548 * table representation, such as whether it contains values so large that 1549 * they are represented as negative values and whether the table values are 1550 * preshifted. This method also avoids such "ugly" expressions as shifting 1551 * by distance zero, masking when no masking is necessary, and so on. 1552 * For clarity, it generates expressions that do not rely on operator 1553 * precedence, but otherwise it avoids generating redundant parentheses. 1554 * 1555 * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]] 1556 * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example. 1557 * 1558 * @param tbl the name of the final table to be accessed 1559 * @param var the variable name that appeared in parentheses in the 1560 * "Lookup" command 1561 * @param bits the number of bits (not bytes) to be used to represent 1562 * the final table entry 1563 * @return the replacement text for the "Lookup(xxx)" command, as a String 1564 * 1565 * @see GenerateCharacter#replaceCommand 1566 */ 1567 1568 static String genAccess(String tbl, String var, int bits) { 1569 String access = null; 1570 int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0; 1571 for (int k = 0; k < sizes.length; k++) { 1572 int offset = ((k < sizes.length - 1) ? 0 : bitoffset); 1573 int shift = shifts[k] + offset; 1574 String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")"; 1575 int mask = (1 << (sizes[k] - offset)) - 1; 1576 String masked = (k == 0) ? shifted : 1577 "(" + shifted + "&0x" + hex(mask) + ")"; 1578 String index = (k == 0) ? masked : 1579 (mask == 0) ? access : "(" + access + "|" + masked + ")"; 1580 String indexNoParens = (index.charAt(0) != '(') ? 
index : 1581 index.substring(1, index.length() - 1); 1582 String tblname = (k == sizes.length - 1) ? tbl : tableName(k); 1583 String fetched = tblname + "[" + indexNoParens + "]"; 1584 String zeroextended = (zeroextend[k] == 0) ? fetched : 1585 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")"; 1586 int adjustment = preshifted[k] ? 0 : 1587 sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0); 1588 String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended : 1589 "(" + zeroextended + "<<" + adjustment + ")"; 1590 String bitshift = (bits == 1) ? "(" + var + "&0x1F)" : 1591 (bits == 2) ? "((" + var + "&0xF)<<1)" : 1592 (bits == 4) ? "((" + var + "&7)<<2)" : null; 1593 String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted : 1594 "((" + adjusted + ">>" + bitshift + ")&" + 1595 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")"; 1596 access = extracted; 1597 } 1598 return access; 1599 } 1600 1601 /* The command line arguments are decoded and used to set the following 1602 global variables. 1603 */ 1604 1605 static boolean verbose = false; 1606 static boolean nobidi = false; 1607 static boolean nomirror = false; 1608 static boolean identifiers = false; 1609 static boolean Csyntax = false; 1610 static String TemplateFileName = null; 1611 static String OutputFileName = null; 1612 static String UnicodeSpecFileName = null; // liu 1613 static String SpecialCasingFileName = null; 1614 static String PropListFileName = null; 1615 static boolean useCharForByte = false; 1616 static int[] sizes; 1617 static int bins = 0; // liu; if > 0, then perform search 1618 static boolean tableAsString = false; 1619 static boolean bLatin1 = false; 1620 1621 static String commandLineDescription; 1622 1623 /* Other global variables, equal in length to the "sizes" array. 
*/ 1624 1625 static int[] shifts; 1626 static int[] zeroextend; 1627 static int[] bytes; 1628 static boolean[] preshifted; 1629 static long[][] tables; 1630 1631 1632 /* Other global variables */ 1633 static String commentStart; 1634 static String commentEnd; 1635 1636 static StringBuffer initializers = new StringBuffer(); 1637 1638 /* special casing rules for 1:M toUpperCase mappings */ 1639 static SpecialCaseMap[] specialCaseMaps; 1640 1641 /** 1642 * Process the command line arguments. 1643 * 1644 * The allowed flags in command line are: 1645 * <dl> 1646 * <dt> -verbose <dd> Emit comments to standard output describing 1647 * what's going on during the processing. 1648 * <dt> -nobidi <dd> Do not include bidi categories in the 1649 * encoded character properties. 1650 * <dt> -nomirror <dd> Do no include mirror property in the encoded 1651 * character properties. 1652 * <dt> -identifiers <dd> Generate tables for scanning identifiers only. 1653 * <dt> -c <dd> Output code in C syntax instead of Java syntax. 1654 * <dt> -o filename <dd> Specify output file name. 1655 * <dt> -template filename <dd> Specify template input file name. 1656 * <dt> -spec filename <dd> Specify Unicode spec file name. 1657 * <dt> -specialcasing filename <dd> Specify Unicode special casing file name. 1658 * <dt> -search bins <dd> Try different partitions into the specified 1659 * number of bins. E.g., for 2 bins, try 1660 * 16 0, 15 1,..., 0 16. 1661 * <dt> -string <dd> Create table as string. Only valid with Java 1662 * syntax. 1663 * <dt> -latin1 <dd> Create a latin 1 only property table. 1664 * </dl> 1665 * In addition, decimal literals may appear as command line arguments; 1666 * each one represents the number of bits of the character to be broken 1667 * off at each lookup step. If present, they must add up to 16 (the number 1668 * of bits in a char value). For smaller tables, the last value should 1669 * be 0; values other than the last one may not be zero. 
* If no such numeric values are provided, default values are used.
     *
     * A non-numeric value after -search or -plane is reported through FAIL,
     * like every other malformed argument, and leaves the corresponding
     * setting unchanged.
     *
     * @param args the command line arguments, as an array of String
     *
     * @see GenerateCharacter#main
     */

    static void processArgs(String[] args) {
        // Echo of the command line; every default applied below is appended
        // to it in square brackets.
        StringBuilder desc = new StringBuilder("java GenerateCharacter");
        for (String arg : args) {
            desc.append(' ').append(arg);
        }
        for (int j = 0; j < args.length; j++) {
            if (args[j].equals("-verbose") || args[j].equals("-v"))
                verbose = true;
            else if (args[j].equals("-nobidi"))
                nobidi = true;
            else if (args[j].equals("-nomirror"))
                nomirror = true;
            else if (args[j].equals("-identifiers"))
                identifiers = true;
            else if (args[j].equals("-c"))
                Csyntax = true;
            else if (args[j].equals("-string"))
                tableAsString = true;
            else if (args[j].equals("-o")) {
                if (j == args.length - 1) {
                    FAIL("File name missing after -o");
                }
                else {
                    OutputFileName = args[++j];
                }
            }
            else if (args[j].equals("-search")) {
                if (j == args.length - 1)
                    FAIL("Bin count missing after -search");
                else {
                    String text = args[++j];
                    try {
                        bins = Integer.parseInt(text);
                        if (bins < 1 || bins > 10)
                            FAIL("Bin count must be >= 1 and <= 10");
                    }
                    catch (NumberFormatException e) {
                        // Report through FAIL like any other bad argument
                        // instead of letting the exception escape.
                        FAIL("Bin count after -search is not a number: " + text);
                    }
                }
            }
            else if (args[j].equals("-template")) {
                if (j == args.length - 1)
                    FAIL("File name missing after -template");
                else
                    TemplateFileName = args[++j];
            }
            else if (args[j].equals("-spec")) { // liu
                if (j == args.length - 1) {
                    FAIL("File name missing after -spec");
                }
                else {
                    UnicodeSpecFileName = args[++j];
                }
            }
            else if (args[j].equals("-specialcasing")) {
                if (j == args.length -1) {
                    FAIL("File name missing after -specialcasing");
                }
                else {
                    SpecialCasingFileName = args[++j];
                }
            }
            else if (args[j].equals("-proplist")) {
                if (j == args.length -1) {
                    FAIL("File name missing after -proplist");
                }
                else {
                    PropListFileName = args[++j];
                }
            }
            else if (args[j].equals("-plane")) {
                if (j == args.length -1) {
                    FAIL("Plane number missing after -plane");
                }
                else {
                    String text = args[++j];
                    try {
                        plane = Integer.parseInt(text);
                    }
                    catch (NumberFormatException e) {
                        FAIL("Plane number after -plane is not a number: " + text);
                    }
                }
                // A supplementary plane and the latin-1 table exclude each other.
                if (plane > 0) {
                    bLatin1 = false;
                }
            }
            else if ("-usecharforbyte".equals(args[j])) {
                useCharForByte = true;
            }
            else if (args[j].equals("-latin1")) {
                bLatin1 = true;
                plane = 0;
            }
            else {
                // Anything else must be a bit field width; it is appended to sizes.
                try {
                    int val = Integer.parseInt(args[j]);
                    if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
                    if (sizes == null)
                        sizes = new int[1];
                    else {
                        int[] newsizes = new int[sizes.length + 1];
                        System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
                        sizes = newsizes;
                    }
                    sizes[sizes.length - 1] = val;
                }
                catch(NumberFormatException e) {
                    FAIL("Unknown switch: " + args[j]);
                }
            }
        }
        if (Csyntax && tableAsString) {
            FAIL("Can't specify table as string with C syntax");
        }
        if (sizes == null) {
            desc.append(" [");
            if (identifiers) {
                int[] newsizes = { 8, 4, 4 };           // Good default values
                desc.append("8 4 4]");
                sizes = newsizes;
            }
            else {
                int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
                desc.append("10 5 1]");
                sizes = newsizes;
            }
        }
        if (UnicodeSpecFileName == null) { // liu
            UnicodeSpecFileName = DefaultUnicodeSpecFileName;
            desc.append(" [-spec " + UnicodeSpecFileName + ']');
        }
        if (SpecialCasingFileName == null) {
            SpecialCasingFileName = DefaultSpecialCasingFileName;
            desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
        }
        if (PropListFileName == null) {
            PropListFileName = DefaultPropListFileName;
            desc.append(" [-proplist " + PropListFileName + ']');
        }
        if (TemplateFileName == null) {
            TemplateFileName = (Csyntax ? DefaultCTemplateFileName
                                : DefaultJavaTemplateFileName);
            desc.append(" [-template " + TemplateFileName + ']');
        }
        if (OutputFileName == null) {
            OutputFileName = (Csyntax ? DefaultCOutputFileName
                              : DefaultJavaOutputFileName);
            desc.append(" [-o " + OutputFileName + ']');
        }
        commentStart = (Csyntax ? "/*" : "//");
        commentEnd = (Csyntax ? " */" : "");
        commandLineDescription = desc.toString();
    }

    /**
     * Recursively enumerates partitions of 16 bits over {@code bins} bit
     * fields and calls generateForSizes for each complete partition.
     *
     * The first {@code binsOccupied} entries of {@code sizes} are taken as
     * already chosen.  If only the last bin remains, it receives all bits
     * that are still free; otherwise every width from 1 up to one less than
     * the number of free bits is tried for the next bin.
     *
     * @param map           the property map, passed on to generateForSizes
     * @param binsOccupied  how many leading entries of {@code sizes} are fixed
     */
    private static void searchBins(long[] map, int binsOccupied) throws Exception {
        int bitsFree = 16;
        for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
        if (binsOccupied == (bins-1)) {
            sizes[binsOccupied] = bitsFree;
            generateForSizes(map);
        }
        else {
            for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
                sizes[binsOccupied] = i;
                searchBins(map, binsOccupied+1);
            }
        }
    }

    /**
     * Builds the lookup tables for the current {@code sizes} and either
     * reports statistics (search mode) or generates the output file.
     *
     * The method fills in {@code shifts}, {@code tables}, {@code preshifted},
     * {@code zeroextend} and {@code bytes}.  When {@code bins > 0} it prints
     * one line with the sizes, the total table size in bytes, a complexity
     * count for the access expression and the expression itself, and then
     * returns without generating any file.  Otherwise it calls
     * generateCharacterClass with the template and output file names.
     *
     * @param map  the property map, one long value per character code
     */
    private static void generateForSizes(long[] map) throws Exception {
        // shifts[k] is the total width of all bit fields to the right of field k.
        int sum = 0;
        shifts = new int[sizes.length];
        for (int k = sizes.length - 1; k >= 0; k--) {
            shifts[k] = sum;
            sum += sizes[k];
        }
        // sum must be the smallest bit count that can index the whole map.
        if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
            FAIL("Bit field widths total to " + sum +
             ": wrong total for map of size " + map.length);
        }
        // need a table for each set of lookup bits in char
        tables = new long[sizes.length][];
        // the last table is the map
        tables[sizes.length - 1] = map;
        // Work from the last level towards the first: buildTable returns the
        // table for the previous level in temp[0] and the replacement for the
        // current level in temp[1].
        for (int j = sizes.length - 1; j > 0; j--) {
            if (verbose && bins==0)
                System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
            long[][] temp = buildTable(tables[j], sizes[j]);
            tables[j-1] = temp[0];
            tables[j] = temp[1];
        }
        preshifted = new boolean[sizes.length];
        zeroextend = new int[sizes.length];
        bytes = new int[sizes.length];
        for (int j = 0; j < sizes.length - 1; j++) {
            int len = tables[j+1].length;
            int size = sizes[j+1];
            // If the length of the next table crosses a byte or char limit
            // but falls within it once the low "size" bits are dropped, use
            // the shifted length and leave the shift to the access expression.
            if (len > 0x100 && (len >> size) <= 0x100) {
                len >>= size;
                preshifted[j] = false;
            }
            else if (len > 0x10000 && (len >> size) <= 0x10000) {
                len >>= size;
                preshifted[j] = false;
            }
            else preshifted[j] = true;
            // Outside C syntax, a length in the upper half of the byte or
            // char range requires masking the fetched value.
            if (Csyntax)
                zeroextend[j] = 0;
            else if (len > 0x7F && len <= 0xFF) {
                if (!useCharForByte) {
                    zeroextend[j] = 0xFF;
                }
            } else if (len > 0x7FFF && len <= 0xFFFF)
                zeroextend[j] = 0xFFFF;
            else zeroextend[j] = 0;
            if (len <= 0x100) bytes[j] = 1;
            else if (len <= 0x10000) bytes[j] = 2;
            else bytes[j] = 4;
        }
        preshifted[sizes.length - 1] = true;
        zeroextend[sizes.length - 1] = 0;
        bytes[sizes.length - 1] = 0;
        if (bins > 0) {
            // Search mode: report the cost of this partition only.
            int totalBytes = getTotalBytes();
            String access = genAccess("A", "ch", (identifiers ? 2 : 32));
            int accessComplexity = 0;
            for (int j=0; j<access.length(); ++j) {
                char ch = access.charAt(j);
                if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
                // Skip the second character of "<<" and ">>" so that each
                // shift operator is counted once.
                if (ch == '<' || ch == '>') ++j;
            }
            System.out.print("(");
            for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
            System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
            return;
        }
        if (verbose) {
            System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
            for (int j = 0; j < sizes.length; j++) {
                System.out.println(dec5(j) + "\t" +
                                   dec5(sizes[j]) + "\t" +
                                   dec5(tables[j].length) + "\t" +
                                   dec5(shifts[j]) + "\t" +
                                   dec5(zeroextend[j]) + "\t" +
                                   dec5(bytes[j]) + "\t " +
                                   preshifted[j]);
            }
            System.out.println("Generating source code for class Character");
            System.out.println("A table access looks like " +
                         genAccess("A", "ch", (identifiers ? 2 : 32)));
        }
        generateCharacterClass(TemplateFileName, OutputFileName);
    }

    /**
     * The main program for generating source code for the Character class.
     * The basic outline of its operation is:
     * <ol>
     * <li> Process the command line arguments.  One result of this process
     *      is a list of sizes (measured in bits and summing to 16).
     * <li> Get the Unicode character property data from the specification file.
     * <li> From that, build a map that has, for each character code, its
     *      relevant properties encoded as a long integer value.
     * <li> Repeatedly compress the map, producing a compressed table and a
     *      new map.  This is done once for each size value in the list.
     *      When this is done, we have a set of tables.
     * <li> Make some decisions about table representation; record these
     *      decisions in arrays named preshifted, zeroextend, and bytes.
     * <li> Generate the source code for the class Character by performing
     *      macro processing on a template file.
1939 * </ol> 1940 * 1941 * @param args the command line arguments, as an array of String 1942 * 1943 * @see GenerateCharacter#processArgs 1944 * @see UnicodeSpec@readSpecFile 1945 * @see GenerateCharacter#buildMap 1946 * @see GenerateCharacter#buildTable 1947 * @see GenerateCharacter#generateCharacterClass 1948 */ 1949 1950 public static void main(String[] args) { 1951 processArgs(args); 1952 try { 1953 1954 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane); 1955 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane); 1956 PropList propList = PropList.readSpecFile(new File(PropListFileName), plane); 1957 1958 if (verbose) { 1959 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu 1960 } 1961 long[] map = buildMap(data, specialCaseMaps, propList); 1962 if (verbose) { 1963 System.err.println("Completed building of initial map"); 1964 } 1965 1966 if (bins == 0) { 1967 generateForSizes(map); 1968 } 1969 else { 1970 while (bins > 0) { 1971 sizes = new int[bins]; 1972 searchBins(map, 0); 1973 --bins; 1974 } 1975 } 1976 if (verbose && false) { 1977 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" + 1978 hex8(maxOffsetSeen)); 1979 System.out.println(" allowed: -" + hex8(-minOffset) + "..+" + 1980 hex8(maxOffset)); 1981 } 1982 } 1983 catch (FileNotFoundException e) { FAIL(e.toString()); } 1984 catch (IOException e) { FAIL(e.toString()); } 1985 catch (Throwable e) { 1986 System.out.println("Unexpected exception:"); 1987 e.printStackTrace(); 1988 FAIL("Unexpected exception!"); 1989 } 1990 if (verbose) { System.out.println("Done!");} 1991 } 1992 1993 } // end class