1 /* 2 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package build.tools.generatecharacter; 27 28 import java.io.IOException; 29 import java.io.FileNotFoundException; 30 import java.io.BufferedReader; 31 import java.io.FileReader; 32 import java.io.PrintWriter; 33 import java.io.BufferedWriter; 34 import java.io.FileWriter; 35 import java.io.File; 36 import java.util.List; 37 38 import build.tools.generatecharacter.CharacterName; 39 40 /** 41 * This program generates the source code for the class java.lang.Character. 42 * It also generates native C code that can perform the same operations. 
* It requires two external input data files:
 * <ul>
 * <li> Unicode specification file
 * <li> Character class template file
 * </ul>
 * The Unicode specification file is available from the Unicode consortium.
 * It has character specification lines that look like this:
 * <listing>
 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
 * </listing>
 * The Character class template file is filled in with additional
 * information to produce the file Character.java, which can then be
 * compiled by a Java compiler.  The template file contains certain
 * markers consisting of an alphabetic name string preceded by "$$".
 * Such markers are replaced with generated program text.  As a special
 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
 * alphabetic characters constituting a variable name.  The character "_"
 * is considered alphabetic for these purposes.
 *
 * @author  Guy Steele
 * @author  Alan Liu
 * @author  John O'Conner
 */

public class GenerateCharacter {

    // When true, buildOne reports incompatible case mappings and out-of-range
    // case offsets through FAIL; when false those conditions are silent.
    final static boolean DEBUG = false;

    // Prefix that introduces a macro call in a template file; it is searched
    // for in every template line by generateCharacterClass.
    final static String commandMarker = "$$";
    // Prefix prepended to every default file name below (empty by default).
    static String ROOT = "";
    // Default names of the input data files, templates and generated outputs.
    static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
    static String DefaultPropListFileName = ROOT + "PropList.txt";
    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
    static String DefaultJavaOutputFileName = ROOT + "Character.java";
    static String DefaultCTemplateFileName = ROOT + "Character.c.template";
    static String DefaultCOutputFileName = ROOT + "Character.c";

    // Unicode plane being processed; buildMap starts at code point (plane << 16).
    static int plane = 0;

    /* The overall idea is that, in the generated Character class source code,
    most character property data is stored in a special multi-level table whose
    structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
    The integers must sum to 16 (the number of bits in a character).
    The first table is indexed by the k1 high-order bits of the character code.
    The result is concatenated to the next k2 bits of the character code to index
    the second table, and so on.  Eventually the kn low-order bits of the character
    code are concatenated and used to index one of two tables A and B; A contains
    32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
    can be thus obtained encode the properties for the character.

    The default specification is [9, 4, 3, 0].  This particular table format was
    designed by conducting an exhaustive search of table formats to minimize the
    space consumed by the tables: the first and third tables need have only byte
    values (the second table must have short values).  Another good choice is
    [10, 6, 0], which produces a larger table but allows particularly fast table
    lookup code.

    In each case, where the word "concatenated" is used, this may imply
    first a << and then a | operation, or perhaps just a | operation if
    the values in the table can be preshifted (generally possible if the table
    entries are short rather than byte).
    */

    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).

        A: the low 32 bits are defined in the following manner:

        1 bit       Mirrored property.
        4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
        9 bits      A signed offset used for converting case.
        1 bit       If 1, adding the signed offset converts the character to lowercase.
        1 bit       If 1, subtracting the signed offset converts the character to uppercase.
                    Note: for a titlecase character, both of the preceding bits will be 1
                    and the signed offset will be 1.
119 1 bit If 1, this character has a titlecase equivalent (possibly itself); 120 in this case, the two bits before this bit can be used to decide 121 whether this character is in fact uppercase, lowercase, or titlecase. 122 3 bits This field provides a quick way to lex identifiers. 123 The eight possible values for this field are as follows: 124 0 May not be part of an identifier 125 1 Ignorable control; may continue a Unicode identifier or Java identifier 126 2 May continue a Java identifier but not a Unicode identifier (unused) 127 3 May continue a Unicode identifier or Java identifier 128 4 Is a Java whitespace character 129 5 May start or continue a Java identifier; 130 may continue but not start a Unicode identifier 131 (this value is used for connector punctuation such as _) 132 6 May start or continue a Java identifier; 133 may not occur in a Unicode identifier 134 (this value is used for currency symbols such as $) 135 7 May start or continue a Unicode identifier or Java identifier 136 Thus: 137 5, 6, 7 may start a Java identifier 138 1, 2, 3, 5, 6, 7 may continue a Java identifier 139 7 may start a Unicode identifier 140 1, 3, 5, 7 may continue a Unicode identifier 141 1 is ignorable within an identifier 142 4 is Java whitespace 143 2 bits This field indicates whether the character has a numeric property. 144 The four possible values for this field are as follows: 145 0 This character has no numeric property. 146 1 Adding the digit offset to the character code and then 147 masking with 0x1F will produce the desired numeric value. 148 2 This character has a "strange" numeric value. 149 3 A Java supradecimal digit: adding the digit offset to the 150 character code, then masking with 0x1F, then adding 10 151 will produce the desired numeric value. 
5 bits      The digit offset (see description of previous field)
        5 bits      Character type (see below)

        B: the high 16 bits are defined as (these are bits 32 and up of the
           long value built for each character):
        1 bit       Other_Lowercase property
        1 bit       Other_Uppercase property
        1 bit       Other_Alphabetic property
        1 bit       Other_Math property
        1 bit       Ideographic property
        1 bit       Noncharacter codepoint property
    */


    // Bit masks identify each component of the 32-bit property field (part A)
    // described above.
    // shift* indicates how many shifts right must happen to get the
    // indicated property value in the lowest bits of the 32-bit space.
    private static final int
        shiftType           = 0,        maskType            =       0x001F,
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
                                        maskUnicodePart     =       0x1000,
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
                                        maskLowerCase       =      0x20000,
                                        maskUpperCase       =      0x10000,
                                        maskTitleCase       =      0x08000,
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
        shiftCaseOffsetSign = 5,
                                        // used only when calculating and
                                        // storing digit offsets from char values
                                        maskDigit           =   0x001F,
                                        // case offsets are 9 bits wide
                                        maskCase            =   0x01FF,
        shiftBidi           = 27,       maskBidi              = 0x78000000,
        shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
        shiftPlane          = 16,       maskPlane = 0xFF0000;

    // maskMirrored is declared as a long: 0x80000000 written as an int literal
    // would be negative and would sign-extend into the upper bits when ORed
    // into the long property value.
    private static final long maskMirrored = 0x80000000L;

    // Bit masks identifying the 16-bit property field (part B) described
    // above; they occupy bits 32 and up of the long property value.
    // NOTE: buildMap currently records only Other_Lowercase, Other_Uppercase,
    // Other_Alphabetic and Ideographic; its calls for Other_Math and
    // Noncharacter_CodePoint are commented out.
    private static final long
        maskOtherLowercase  = 0x100000000L,
        maskOtherUppercase  = 0x200000000L,
        maskOtherAlphabetic = 0x400000000L,
        maskOtherMath       = 0x800000000L,
        maskIdeographic     = 0x1000000000L,
        maskNoncharacterCP  = 0x2000000000L;

    // Can compare masked values with these to determine
    // numeric or lexical types.
    // value{NotNumeric,Digit,StrangeNumeric,JavaSupradecimal} are the four
    // values of the numeric-type field (maskNumericType); the remaining
    // value* constants are values of the identifier-info field
    // (maskIdentifierInfo).
    public static int
        valueNotNumeric             = 0x0000,
        valueDigit                  = 0x0400,
        valueStrangeNumeric         = 0x0800,
        valueJavaSupradecimal       = 0x0C00,
        valueIgnorable              = 0x1000,
        valueJavaOnlyPart           = 0x2000,
        valueJavaUnicodePart        = 0x3000,
        valueJavaWhitespace         = 0x4000,
        valueJavaStartUnicodePart   = 0x5000,
        valueJavaOnlyStart          = 0x6000,
        valueJavaUnicodeStart       = 0x7000,
        // identifier-info values >= lowJavaStart may start a Java identifier
        lowJavaStart                = 0x5000,
        // bits tested by buildOne to decide "may continue a Java identifier"
        nonzeroJavaPart             = 0x3000,
        valueUnicodeStart           = 0x7000;

    // these values are used when only identifier properties are generated
    // for use in verifier code. Shortens the property down to a single byte.
    private static final int
        bitJavaStart            = 0x02,
        bitJavaPart             = 0x01,
        maskIsJavaIdentifierPart = bitJavaPart,
        maskIsJavaIdentifierStart = bitJavaStart;

    // Largest and smallest case offsets that fit in the 9-bit signed case
    // offset field: maskCase/2 == 255, so the accepted range is [-255, 255].
    static int maxOffset = maskCase/2 ;
    static int minOffset = -maxOffset;

    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  If the number of digits required to
     represent the integer value is greater than the desired number, then all the digits
     that are required are actually produced.  (hex2, hex4 and hex8 first mask the
     value to 8, 16 and 32 bits respectively, so for them this case cannot arise.)
239 */ 240 241 static String hex(long n) { return Long.toHexString(n).toUpperCase(); } 242 243 static String hex2(long n) { 244 String q = Long.toHexString(n & 0xFF).toUpperCase(); 245 return "00".substring(Math.min(2, q.length())) + q; 246 } 247 248 static String hex4(long n) { 249 String q = Long.toHexString(n & 0xFFFF).toUpperCase(); 250 return "0000".substring(Math.min(4, q.length())) + q; 251 } 252 253 static String hex8(long n) { 254 String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase(); 255 return "00000000".substring(Math.min(8, q.length())) + q; 256 } 257 258 static String hex16(long n) { 259 String q = Long.toHexString(n).toUpperCase(); 260 return "0000000000000000".substring(Math.min(16, q.length())) + q; 261 } 262 263 static String dec3(long n) { 264 String q = Long.toString(n); 265 return " ".substring(Math.min(3, q.length())) + q; 266 } 267 268 static String dec5(long n) { 269 String q = Long.toString(n); 270 return " ".substring(Math.min(5, q.length())) + q; 271 } 272 273 /* This routine is called when some failure occurs. */ 274 275 static void FAIL(String s) { 276 System.out.println("** " + s); 277 } 278 279 /** 280 * Given the data from the Unicode specification file, this routine builds a map. 281 * 282 * The specification file is assumed to contain its data in sorted order by 283 * character code; as a result, the array passed as an argument to this method 284 * has its components in the same sorted order, with one entry for each defined 285 * Unicode character or character range. (A range is indicated by two consecutive 286 * entries, such that the name of the first entry begins with "<" and ends with 287 * "First>" and the second entry begins with "<" and ends with "Last>".) This is 288 * therefore a sparse representation of the character property data. 289 * 290 * The resulting map is dense representation of the character data. It contains 291 * 2^16 = 65536 entries, each of which is a long integer. 
(Right now only 32 bits 292 * of this long value are used, but type long is used rather than int to facilitate 293 * future extensions of this source code generator that might require more than 294 * 32 bits to encode relevant character properties.) Entry k holds the encoded 295 * properties for character k. 296 * 297 * Method buildMap manages the transformation from the sparse representation to 298 * the dense representation. It calls method buildOne to handle the encoding 299 * of character property data from a single UnicodeSpec object into 32 bits. 300 * For undefined characters, method buildOne is not called and the map entry for 301 * that character is set to UnicodeSpec.UNASSIGNED. 302 * 303 * @param data character property data from the Unicode specification file 304 * @return an array of length 65536 with one entry for every possible char value 305 * 306 * @see GenerateCharacter#buildOne 307 */ 308 309 static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList) 310 { 311 long[] result; 312 if (bLatin1 == true) { 313 result = new long[256]; 314 } else { 315 result = new long[1<<16]; 316 } 317 int k=0; 318 int codePoint = plane<<16; 319 UnicodeSpec nonCharSpec = new UnicodeSpec(); 320 for (int j = 0; j < data.length && k < result.length; j++) { 321 if (data[j].codePoint == codePoint) { 322 result[k] = buildOne(codePoint, data[j], specialMaps); 323 ++k; 324 ++codePoint; 325 } 326 else if(data[j].codePoint > codePoint) { 327 if (data[j].name.endsWith("Last>")) { 328 // build map data for all chars except last in range 329 while (codePoint < data[j].codePoint && k < result.length) { 330 result[k] = buildOne(codePoint, data[j], specialMaps); 331 ++k; 332 ++codePoint; 333 } 334 } 335 else { 336 // we have a few unassigned chars before data[j].codePoint 337 while (codePoint < data[j].codePoint && k < result.length) { 338 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 339 ++k; 340 ++codePoint; 341 } 342 } 343 k = 
data[j].codePoint & 0xFFFF; 344 codePoint = data[j].codePoint; 345 result[k] = buildOne(codePoint, data[j], specialMaps); 346 ++k; 347 ++codePoint; 348 } 349 else { 350 System.out.println("An error has occured during spec mapping."); 351 System.exit(0); 352 } 353 } 354 // if there are still unprocessed chars, process them 355 // as unassigned/undefined. 356 codePoint = (plane<<16) | k; 357 while (k < result.length) { 358 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 359 ++k; 360 ++codePoint; 361 } 362 // now add all extra supported properties from PropList, to the 363 // upper 16-bit 364 addExProp(result, propList, "Other_Lowercase", maskOtherLowercase); 365 addExProp(result, propList, "Other_Uppercase", maskOtherUppercase); 366 addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic); 367 addExProp(result, propList, "Ideographic", maskIdeographic); 368 //addExProp(result, propList, "Other_Math", maskOtherMath); 369 //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP); 370 371 return result; 372 } 373 374 // The maximum and minimum offsets found while scanning the database 375 static int maxOffsetSeen = 0; 376 static int minOffsetSeen = 0; 377 378 /** 379 * Some Unicode separator characters are not considered Java whitespace. 380 * @param c character to test 381 * @return true if c in an invalid Java whitespace character, false otherwise. 382 */ 383 static boolean isInvalidJavaWhiteSpace(int c) { 384 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF}; 385 boolean retValue = false; 386 for(int x=0;x<exceptions.length;x++) { 387 if(c == exceptions[x]) { 388 retValue = true; 389 break; 390 } 391 } 392 return retValue; 393 394 } 395 396 /** 397 * Given the character property data for one Unicode character, encode the data 398 * of interest into a single long integer value. 
(Right now only 32 bits 399 * of this long value are used, but type long is used rather than int to facilitate 400 * future extensions of this source code generator that might require more than 401 * 32 bits to encode relevant character properties.) 402 * 403 * @param c the character code for which to encode property data 404 * @param us property data record from the Unicode specification file 405 * (its character code might not be equal to c if it specifies data 406 * for a range of characters) 407 * @return an encoded long value that contains the properties for a single char 408 * 409 * @see GenerateCharacter#buildMap 410 */ 411 412 static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) { 413 long resultA = 0; 414 // record the general category 415 resultA |= us.generalCategory; 416 417 // record the numeric properties 418 NUMERIC: { 419 STRANGE: { 420 int val = 0; 421 // c is A-Z 422 if ((c >= 0x0041) && (c <= 0x005A)) { 423 val = c - 0x0041; 424 resultA |= valueJavaSupradecimal; 425 // c is a-z 426 } else if ((c >= 0x0061) && (c <= 0x007A)) { 427 val = c - 0x0061; 428 resultA |= valueJavaSupradecimal; 429 // c is a full-width A-Z 430 } else if ((c >= 0xFF21) && (c <= 0xFF3A)) { 431 val = c - 0xFF21; 432 resultA |= valueJavaSupradecimal; 433 // c is a full-width a-z 434 } else if ((c >= 0xFF41) && (c <= 0xFF5A)) { 435 val = c - 0xFF41; 436 resultA |= valueJavaSupradecimal; 437 } else if (us.isDecimalValue()) { 438 val = us.decimalValue; 439 resultA |= valueDigit; 440 } else if (us.isDigitValue()) { 441 val = us.digitValue; 442 resultA |= valueDigit; 443 } else { 444 if (us.numericValue.length() == 0) { 445 break NUMERIC; // no numeric value at all 446 } else { 447 try { 448 val = Integer.parseInt(us.numericValue); 449 if (val >= 32 || val < 0) break STRANGE; 450 if (c == 0x215F) break STRANGE; 451 } catch(NumberFormatException e) { 452 break STRANGE; 453 } 454 resultA |= valueDigit; 455 } 456 } 457 if (val >= 32 || val < 0) break STRANGE; 458 
resultA |= ((val - c & maskDigit) << shiftDigitOffset); 459 break NUMERIC; 460 } // end STRANGE 461 resultA |= valueStrangeNumeric; 462 } // end NUMERIC 463 464 // record case mapping 465 int offset = 0; 466 // might have a 1:M mapping 467 int specialMap = SpecialCaseMap.find(c, specialCaseMaps); 468 boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1); 469 if (bHasUpper) { 470 resultA |= maskUpperCase; 471 } 472 if (specialMap != -1) { 473 // has mapping, but cannot record the 474 // proper offset; can only flag it and provide special case 475 // code in Character.java 476 offset = -1; 477 } 478 else if (us.hasUpperMap()) { 479 offset = c - us.upperMap; 480 } 481 482 if (us.hasLowerMap()) { 483 resultA |= maskLowerCase; 484 if (offset == 0) 485 offset = us.lowerMap - c; 486 else if (offset != (us.lowerMap - c)) { 487 if (DEBUG) { 488 FAIL("Character " + hex(c) + 489 " has incompatible lowercase and uppercase mappings"); 490 } 491 } 492 } 493 if ((us.hasTitleMap() && us.titleMap != us.upperMap) || 494 (bHasUpper && us.hasLowerMap())) { 495 resultA |= maskTitleCase; 496 } 497 if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) { 498 System.out.println("Warning: Character " + hex4(c) + " has upper but " + 499 "no title case; Java won't know this"); 500 } 501 if (offset < minOffsetSeen) minOffsetSeen = offset; 502 if (offset > maxOffsetSeen) maxOffsetSeen = offset; 503 if (offset > maxOffset || offset < minOffset) { 504 if (DEBUG) { 505 FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case"); 506 } 507 offset = maskCase; 508 } 509 resultA |= ((offset & maskCase) << shiftCaseOffset); 510 511 // record lexical info about this character 512 if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER 513 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER 514 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER 515 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER 516 || us.generalCategory == 
UnicodeSpec.OTHER_LETTER 517 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) { 518 resultA |= valueJavaUnicodeStart; 519 } 520 else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK 521 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK 522 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) { 523 resultA |= valueJavaUnicodePart; 524 } 525 else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) { 526 resultA |= valueJavaStartUnicodePart; 527 } 528 else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) { 529 resultA |= valueJavaOnlyStart; 530 } 531 else if (((c >= 0x0000) && (c <= 0x0008)) 532 || ((c >= 0x000E) && (c <= 0x001B)) 533 || ((c >= 0x007F) && (c <= 0x009F)) 534 || us.generalCategory == UnicodeSpec.FORMAT) { 535 resultA |= valueIgnorable; 536 } 537 else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR 538 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR 539 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) { 540 if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace; 541 } 542 else if (((c >= 0x0009) && (c <= 0x000D)) 543 || ((c >= 0x001C) && (c <= 0x001F))) { 544 resultA |= valueJavaWhitespace; 545 } 546 547 // record bidi category 548 if (!nobidi) { 549 int tmpBidi = 550 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS || 551 us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi); 552 resultA |= tmpBidi; 553 } 554 555 // record mirrored property 556 if (!nomirror) { 557 resultA |= us.mirrored ? 
maskMirrored : 0; 558 } 559 560 if (identifiers) { 561 long replacement = 0; 562 if ((resultA & maskIdentifierInfo) >= lowJavaStart) { 563 replacement |= bitJavaStart; 564 } 565 if ( ((resultA & nonzeroJavaPart) != 0) 566 && ((resultA & maskIdentifierInfo) != valueIgnorable)) { 567 replacement |= bitJavaPart; 568 } 569 resultA = replacement; 570 } 571 return resultA; 572 } 573 574 static void addExProp(long[] map, PropList propList, String prop, long mask) { 575 List<Integer> cps = propList.codepoints(prop); 576 if (cps != null) { 577 for (Integer cp : cps) { 578 if (cp < map.length) 579 map[cp] |= mask; 580 } 581 } 582 } 583 584 /** 585 * This is the heart of the table compression strategy. The inputs are a map 586 * and a number of bits (size). The map is simply an array of long integer values; 587 * the number of bits indicates how index values for that map are to be split. 588 * The length of the given map must be a multiple of (1 << size). The result is 589 * a new map z and a compressed table t such that for every valid index value k 590 * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k]. 591 * 592 * In other words, the index k can be split into two parts, namely the "size" 593 * low-order bits and all the remaining high-order bits; the high-order bits are then 594 * remapped by map z to produce an index into table t. In effect, the data of the 595 * original map m is broken up into blocks of size (1<<size); the compression relies 596 * on the expectation that many of these blocks will be identical and therefore need 597 * be represented only once in the compressed table t. 598 * 599 * This method is intended to be used iteratively. The first map to be handed 600 * to it is the one constructed by method buildMap. After that, the first of the 601 * two arrays returned by this method is fed back into it for further compression. 602 * At the end of the iteration, one has a starter map and a sequence of tables. 
603 * 604 * The algorithm used to implement this computation is straightforward and not 605 * especially clever. It uses brute-force linear search (the loop labeled MIDDLE) 606 * to locate identical blocks, so overall the time complexity of the algorithm 607 * is quadratic in the length of the input map. Fortunately, speed is not crucial 608 * to this application. 609 * 610 * @param map a map to be compressed 611 * @param size the number of index bits to be split off by the compression 612 * @return an array of length 2 containing two arrays; the first is a new map 613 * and the second is a compressed data table 614 * 615 * @see GenerateCharacter#buildMap 616 */ 617 618 static long[][] buildTable(long[] map, int size) { 619 int n = map.length; 620 if (((n >> size) << size) != n) { 621 FAIL("Length " + n + " is not a multiple of " + (1 << size)); 622 } 623 int m = 1 << size; 624 // We know the final length of the new map up front. 625 long[] newmap = new long[n >> size]; 626 // The buffer is used temporarily to hold data for the compressed table 627 // because we don't know its final length yet. 628 long[] buffer = new long[n]; 629 int ptr = 0; 630 OUTER: for (int i = 0; i < n; i += m) { 631 // For every block of size m in the original map... 632 MIDDLE: for (int j = 0; j < ptr; j += m) { 633 // Find out whether there is already a block just like it in the buffer. 634 for (int k = 0; k < m; k++) { 635 if (buffer[j+k] != map[i+k]) 636 continue MIDDLE; 637 } 638 // There is a block just like it at position j, so just 639 // put its index into the new map (thereby sharing it). 640 newmap[i >> size] = (j >> size); 641 continue OUTER; 642 } // end MIDDLE 643 // There is no block just like it already, so add it to 644 // the buffer and put its index into the new map. 
645 for (int k = 0; k < m; k++) { 646 buffer[ptr+k] = map[i+k]; 647 } 648 newmap[i >> size] = (ptr >> size); 649 ptr += m; 650 } // end OUTER 651 // Now we know how long the compressed table should be, 652 // so create a new array and copy data from the temporary buffer. 653 long[] newdata = new long[ptr]; 654 for (int j = 0; j < ptr; j++) { 655 newdata[j] = buffer[j]; 656 } 657 // Return the new map and the new data table. 658 long[][] result = { newmap, newdata }; 659 return result; 660 } 661 662 /** 663 * Once the compressed tables have been computed, this method reads in a 664 * template file for the source code to be generated and writes out the final 665 * source code by acting as a sort of specialized macro processor. 666 * 667 * The first output line is a comment saying that the file was automatically 668 * generated; it includes a timestamp. All other output is generated by 669 * reading a line from the template file, performing macro replacements, 670 * and then writing the resulting line or lines of code to the output file. 671 * 672 * This method handles the I/O, the timestamp comment, and the locating of 673 * macro calls within each input line. The method replaceCommand is called 674 * to generate replacement text for each macro call. 675 * 676 * Macro calls to be replaced are indicated in the template file by 677 * occurrences of the commandMarker "$$". The rest of the call may consist 678 * of Java letters (including the underscore "_") and also of balanced 679 * parentheses. 
680 * 681 * @param theTemplateFileName 682 * the file name for the template input file 683 * @param theOutputFileName 684 * the file name for the source code output file 685 * 686 * @see GenerateCharacter#replaceCommand 687 */ 688 689 static void generateCharacterClass(String theTemplateFileName, 690 String theOutputFileName) 691 throws FileNotFoundException, IOException { 692 BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName)); 693 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName))); 694 out.println(commentStart + 695 " This file was generated AUTOMATICALLY from a template file " + 696 new java.util.Date() + commentEnd); 697 int marklen = commandMarker.length(); 698 LOOP: while(true) { 699 try { 700 String line = in.readLine(); 701 if (line == null) break LOOP; 702 int pos = 0; 703 int depth = 0; 704 while ((pos = line.indexOf(commandMarker, pos)) >= 0) { 705 int newpos = pos + marklen; 706 char ch = 'x'; 707 SCAN: while (newpos < line.length() && 708 (Character.isJavaIdentifierStart(ch = line.charAt(newpos)) 709 || ch == '(' || (ch == ')' && depth > 0))) { 710 ++newpos; 711 if (ch == '(') { 712 ++depth; 713 } 714 else if (ch == ')') { 715 --depth; 716 if (depth == 0) 717 break SCAN; 718 } 719 } 720 String replacement = replaceCommand(line.substring(pos + marklen, newpos)); 721 line = line.substring(0, pos) + replacement + line.substring(newpos); 722 pos += replacement.length(); 723 } 724 out.println(line); 725 } 726 catch (IOException e) { 727 break LOOP; 728 } 729 } 730 in.close(); 731 out.close(); 732 } 733 734 /** 735 * The replaceCommand method takes a command (a macro call without the 736 * leading marker "$$") and computes replacement text for it. 737 * 738 * Most of the commands are simply names of integer constants that are defined 739 * in the source code of this GenerateCharacter class. 
The replacement text is 740 * simply the value of the constant as an appropriately formatted integer literal. 741 * 742 * Two cases are more complicated, however. The command "Tables" causes the 743 * final map and compressed tables to be emitted, with elaborate comments 744 * describing their contents. (This is actually handled by method genTables.) 745 * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates 746 * an expression that will return the character property data for the character 747 * whose code is the value of the variable "xxx". (this is handled by method 748 * "genAccess".) 749 * 750 * @param x a command from the template file to be replaced 751 * @return the replacement text, as a String 752 * 753 * @see GenerateCharacter#genTables 754 * @see GenerateCharacter#genAccess 755 * @see GenerateCharacter#generateCharacterClass 756 */ 757 758 static String replaceCommand(String x) { 759 if (x.equals("Tables")) return genTables(); 760 if (x.equals("Initializers")) return genInitializers(); 761 if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") && 762 x.substring(x.length()-1).equals(")") ) 763 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 
2 : 32)); 764 if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") && 765 x.substring(x.length()-1).equals(")") ) 766 return genAccess("B", x.substring(9, x.length()-1), 16); 767 if (x.equals("shiftType")) return Long.toString(shiftType); 768 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo); 769 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo); 770 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart); 771 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset); 772 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo); 773 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign); 774 if (x.equals("maskCase")) return "0x" + hex8(maskCase); 775 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset); 776 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase); 777 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase); 778 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase); 779 if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32); 780 if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32); 781 if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32); 782 if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32); 783 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable); 784 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart); 785 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart); 786 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart); 787 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart); 788 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace); 789 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart); 790 if (x.equals("nonzeroJavaPart")) return "0x" 
+ hex8(nonzeroJavaPart); 791 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart); 792 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart); 793 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart); 794 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart); 795 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart); 796 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset); 797 if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset); 798 if (x.equals("maskDigit")) return "0x" + hex(maskDigit); 799 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType); 800 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType); 801 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric); 802 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 803 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 804 if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal); 805 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 806 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 807 if (x.equals("maskType")) return "0x" + hex(maskType); 808 if (x.equals("shiftBidi")) return Long.toString(shiftBidi); 809 if (x.equals("maskBidi")) return "0x" + hex(maskBidi); 810 if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored); 811 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG])) 812 return Integer.toString(UnicodeSpec.UNASSIGNED); 813 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG])) 814 return Integer.toString(UnicodeSpec.UPPERCASE_LETTER); 815 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG])) 816 return Integer.toString(UnicodeSpec.LOWERCASE_LETTER); 817 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG])) 818 return Integer.toString(UnicodeSpec.TITLECASE_LETTER); 819 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG])) 820 return Integer.toString(UnicodeSpec.MODIFIER_LETTER); 821 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG])) 822 return Integer.toString(UnicodeSpec.OTHER_LETTER); 823 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG])) 824 return Integer.toString(UnicodeSpec.NON_SPACING_MARK); 825 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG])) 826 return Integer.toString(UnicodeSpec.ENCLOSING_MARK); 827 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG])) 828 return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK); 829 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG])) 830 return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER); 831 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG])) 832 return Integer.toString(UnicodeSpec.OTHER_NUMBER); 833 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG])) 834 return Integer.toString(UnicodeSpec.SPACE_SEPARATOR); 835 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG])) 836 return Integer.toString(UnicodeSpec.LINE_SEPARATOR); 837 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 838 return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR); 839 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG])) 840 return Integer.toString(UnicodeSpec.CONTROL); 841 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG])) 842 return 
Integer.toString(UnicodeSpec.FORMAT); 843 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG])) 844 return Integer.toString(UnicodeSpec.PRIVATE_USE); 845 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG])) 846 return Integer.toString(UnicodeSpec.SURROGATE); 847 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG])) 848 return Integer.toString(UnicodeSpec.DASH_PUNCTUATION); 849 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG])) 850 return Integer.toString(UnicodeSpec.START_PUNCTUATION); 851 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG])) 852 return Integer.toString(UnicodeSpec.END_PUNCTUATION); 853 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 854 return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION); 855 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 856 return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION); 857 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG])) 858 return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION); 859 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG])) 860 return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION); 861 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG])) 862 return Integer.toString(UnicodeSpec.LETTER_NUMBER); 863 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG])) 864 return Integer.toString(UnicodeSpec.MATH_SYMBOL); 865 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG])) 866 return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL); 867 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG])) 868 return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL); 869 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG])) 870 return Integer.toString(UnicodeSpec.OTHER_SYMBOL); 871 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG])) 872 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT); 873 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG])) 874 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING); 875 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG])) 876 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE); 877 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG])) 878 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT); 879 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG])) 880 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC); 881 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG])) 882 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING); 883 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG])) 884 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE); 885 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG])) 886 return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT); 887 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG])) 888 return 
Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER); 889 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 890 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR); 891 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG])) 892 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR); 893 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG])) 894 return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER); 895 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 896 return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR); 897 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG])) 898 return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK); 899 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG])) 900 return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL); 901 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 902 return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR); 903 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG])) 904 return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR); 905 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG])) 906 return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE); 907 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG])) 908 return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS); 909 FAIL("Unknown text substitution marker " + 
commandMarker + x); 910 return commandMarker + x; 911 } 912 913 /** 914 * The genTables method generates source code for all the lookup tables 915 * needed to represent the various Unicode character properties. 916 * It simply calls the method genTable once for each table to be generated 917 * and then generates a summary comment. 918 * 919 * @return the replacement text for the "Tables" command, as a String 920 * 921 * @see GenerateCharacter#genTable 922 * @see GenerateCharacter#replaceCommand 923 */ 924 static String genTables() { 925 int n = sizes.length; 926 StringBuffer result = new StringBuffer(); 927 // liu : Add a comment showing the source of this table 928 result.append(commentStart + " The following tables and code generated using:" + 929 commentEnd + "\n "); 930 result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n "); 931 932 if (plane == 0 && bLatin1 == false) { 933 genCaseMapTableDeclaration(result); 934 genCaseMapTable(initializers, specialCaseMaps); 935 } 936 int totalBytes = 0; 937 for (int k = 0; k < n - 1; k++) { 938 genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k], 939 sizes[k+1], false, false, k==0); 940 int s = bytes[k]; 941 if (s == 1 && useCharForByte) { 942 s = 2; 943 } 944 totalBytes += tables[k].length * s; 945 } 946 genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32), 947 sizes[n - 1], false, 0, true, !(identifiers), false); 948 949 // If we ever need more than 32 bits to represent the character properties, 950 // then a table "B" may be needed as well. 951 genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false); 952 953 totalBytes += ((((tables[n - 1].length * (identifiers ? 
2 : 32)) + 31) >> 5) << 2); 954 result.append(commentStart); 955 result.append(" In all, the character property tables require "); 956 result.append(totalBytes).append(" bytes.").append(commentEnd); 957 if (verbose) { 958 System.out.println("The character property tables require " 959 + totalBytes + " bytes."); 960 } 961 return result.toString(); 962 } 963 964 /** 965 * The genInitializers method generates the body of the 966 * ensureInitted() method, which enables lazy initialization of 967 * the case map table and other tables. 968 */ 969 static String genInitializers() { 970 return initializers.toString(); 971 } 972 973 /** 974 * Return the total number of bytes needed by all tables. This is a stripped- 975 * down copy of genTables(). 976 */ 977 static int getTotalBytes() { 978 int n = sizes.length; 979 int totalBytes = 0; 980 for (int k = 0; k < n - 1; k++) { 981 totalBytes += tables[k].length * bytes[k]; 982 } 983 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) 984 + 31) >> 5) << 2); 985 return totalBytes; 986 } 987 988 static void appendEscapedStringFragment(StringBuffer result, 989 char[] line, 990 int length, 991 boolean lastFragment) { 992 result.append(" \""); 993 for (int k=0; k<length; ++k) { 994 result.append("\\u"); 995 result.append(hex4(line[k])); 996 } 997 result.append("\""); 998 result.append(lastFragment ? 
";" : "+"); 999 result.append("\n"); 1000 } 1001 1002 static String SMALL_INITIALIZER = 1003 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1004 // " $$name = new $$type[$$size];\n"+ 1005 " int len = $$name_DATA.length();\n"+ 1006 " int j=0;\n"+ 1007 " for (int i=0; i<len; ++i) {\n"+ 1008 " int c = $$name_DATA.charAt(i);\n"+ 1009 " for (int k=0; k<$$entriesPerChar; ++k) {\n"+ 1010 " $$name[j++] = ($$type)c;\n"+ 1011 " c >>= $$bits;\n"+ 1012 " }\n"+ 1013 " }\n"+ 1014 " assert (j == $$size);\n"+ 1015 " }\n"; 1016 1017 static String SAME_SIZE_INITIALIZER = 1018 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1019 " assert ($$name_DATA.length() == $$size);\n"+ 1020 // " $$name = new $$type[$$size];\n"+ 1021 " for (int i=0; i<$$size; ++i)\n"+ 1022 " $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+ 1023 " }\n"; 1024 1025 static String BIG_INITIALIZER = 1026 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1027 // " $$name = new $$type[$$size];\n"+ 1028 " int len = $$name_DATA.length();\n"+ 1029 " int j=0;\n"+ 1030 " int charsInEntry=0;\n"+ 1031 " $$type entry=0;\n"+ 1032 " for (int i=0; i<len; ++i) {\n"+ 1033 " entry |= $$name_DATA.charAt(i);\n"+ 1034 " if (++charsInEntry == $$charsPerEntry) {\n"+ 1035 " $$name[j++] = entry;\n"+ 1036 " entry = 0;\n"+ 1037 " charsInEntry = 0;\n"+ 1038 " }\n"+ 1039 " else {\n"+ 1040 " entry <<= 16;\n"+ 1041 " }\n"+ 1042 " }\n"+ 1043 " assert (j == $$size);\n"+ 1044 " }\n"; 1045 1046 static String INT32_INITIALIZER = 1047 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1048 " char[] data = $$name_DATA.toCharArray();\n"+ 1049 " assert (data.length == ($$size * 2));\n"+ 1050 " int i = 0, j = 0;\n"+ 1051 " while (i < ($$size * 2)) {\n"+ 1052 " int entry = data[i++] << 16;\n"+ 1053 " $$name[j++] = entry | data[i++];\n"+ 1054 " }\n"+ 1055 " }\n"; 1056 1057 static void addInitializer(String name, String type, int entriesPerChar, 1058 int bits, int size) { 1059 1060 
String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER : 1061 ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER); 1062 if (entriesPerChar == -2) { 1063 template = INT32_INITIALIZER; 1064 } 1065 int marklen = commandMarker.length(); 1066 int pos = 0; 1067 while ((pos = template.indexOf(commandMarker, pos)) >= 0) { 1068 int newpos = pos + marklen; 1069 char ch = 'x'; 1070 while (newpos < template.length() && 1071 Character.isJavaIdentifierStart(ch = template.charAt(newpos)) && 1072 ch != '_') // Don't allow this in token names 1073 ++newpos; 1074 String token = template.substring(pos+marklen, newpos); 1075 String replacement = "ERROR"; 1076 1077 if (token.equals("name")) replacement = name; 1078 else if (token.equals("type")) replacement = type; 1079 else if (token.equals("bits")) replacement = ""+bits; 1080 else if (token.equals("size")) replacement = ""+size; 1081 else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar; 1082 else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar); 1083 else FAIL("Unrecognized token: " + token); 1084 1085 template = template.substring(0, pos) + replacement + template.substring(newpos); 1086 pos += replacement.length(); 1087 } 1088 initializers.append(template); 1089 } 1090 1091 /** 1092 * The genTable method generates source code for one lookup table. 1093 * Most of the complexity stems from handling various options as to 1094 * the type of the array components, the precise representation of the 1095 * values, the format in which to render each value, the number of values 1096 * to emit on each line of source code, and the kinds of useful comments 1097 * to be generated. 
*
     * Two output forms exist.  If the global flag tableAsString is set, the
     * table contents are emitted as a String constant (several entries packed
     * into each char, or several chars per entry) and, unless the array type
     * is char, an initializer that unpacks the string is registered through
     * addInitializer.  Otherwise the table is emitted as an array literal;
     * in that form entries narrower than 8 bits are packed into 32-bit words.
     *
     * @param result     a StringBuffer, to which the generated source code
     *                   text is to be appended
     * @param name       the name of the table
     * @param table      the table data (an array of long values)
     * @param extract    a distance, in bits, by which each entry of the table
     *                   is to be right-shifted before it is processed
     * @param bits       the number of bits (not bytes) to be used to represent
     *                   each table entry
     * @param size       the table data is divided up into blocks of size (1<<size);
     *                   in this method, this information is used only to affect
     *                   how many table values are to be generated per line and
     *                   which index is shown in the per-line comment
     * @param preshifted if this flag is true, then the table entries are to be
     *                   emitted in a preshifted form; that is, each value should
     *                   be left-shifted by the amount "shift", so that this work
     *                   is built into the table and need not be performed by an
     *                   explicit shift operator at run time
     * @param shift      this is the shift amount for preshifting of table entries
     * @param hexFormat  if this flag is true, table entries should be emitted as
     *                   hexadecimal literals; otherwise decimal literals are used
     * @param properties if this flag is true, the table entries are encoded
     *                   character properties rather than indexes into yet other tables;
     *                   therefore comments describing the encoded properties should
     *                   be generated
     * @param hexComment if this flag is true, each line of output is labelled with
     *                   a hexadecimal comment indicating the character values to
     *                   which that line applies; otherwise, decimal values indicating
     *                   table indices are generated
     *
     * @see GenerateCharacter#genTables
     * @see GenerateCharacter#replaceCommand
     */

    static void genTable(StringBuffer result, String name,
                         long[] table, int extract, int bits, int size,
                         boolean preshifted, int shift, boolean hexFormat,
                         boolean properties, boolean hexComment) {

        // Element type of the generated array, in C or Java syntax.
        String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
            bits == 2 ? (Csyntax ? "unsigned long" : "int") :
            bits == 4 ? (Csyntax ? "unsigned long" : "int") :
            bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
            bits == 16 ? (Csyntax ? "unsigned short" : "char") :
            bits == 32 ? (Csyntax ? "unsigned long" : "int") :
            (Csyntax ? "int64" : "long");
        // Largest value that is printed as a positive literal; in Java syntax
        // larger values are printed in negated form (see below).
        long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
            bits == 2 ? Integer.MAX_VALUE :
            bits == 4 ? Integer.MAX_VALUE :
            bits == 8 ? Byte.MAX_VALUE :
            bits == 16 ? Short.MAX_VALUE :
            bits == 32 ? Integer.MAX_VALUE :
            Long.MAX_VALUE;
        // Positive: number of entries packed into one 16-bit char of the data
        // string.  Negative: minus the number of chars needed for one entry.
        int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
        boolean shiftEntries = preshifted && shift != 0;
        if (bits == 8 && tableAsString && useCharForByte) {
            // Store each 8-bit entry in a char of its own.
            atype = "char";
            maxPosEntry = Character.MAX_VALUE;
            entriesPerChar = 1;
        }
        // A char table can be produced directly with String.toCharArray(),
        // so no unpacking initializer is needed for it.
        boolean noConversion = atype.equals("char");

        // Header comment giving the entry count and size of this table.
        result.append(commentStart);
        result.append(" The ").append(name).append(" table has ").append(table.length);
        result.append(" entries for a total of ");
        int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
        if (bits == 8 && useCharForByte) {
            sizeOfTable *= 2;
        }
        result.append(sizeOfTable);
        result.append(" bytes.").append(commentEnd).append("\n\n");
        if (Csyntax)
            result.append(" static ");
        else
            result.append(" static final ");
        result.append(atype);
        result.append(" ").append(name).append("[");
        // C needs an explicit array length; sub-byte entries are packed into
        // 32-bit words, which reduces the element count accordingly.
        if (Csyntax)
            result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
        if (tableAsString) {
            if (noConversion) {
                result.append("] = (\n");
            } else {
                result.append("] = new ").append(atype).append("["+table.length+"];\n ");
                result.append("static final String ").append(name).append("_DATA =\n");
            }
            // NOTE(review): CHARS_PER_LINE is not referenced anywhere else in this method.
            int CHARS_PER_LINE = 8;
            StringBuffer theString = new StringBuffer();
            int entriesInCharSoFar = 0;
            char ch = '\u0000';
            int charsPerEntry = -entriesPerChar;
            for (int j=0; j<table.length; ++j) {
                //long entry = table[j] >> extract;
                long entry;
                // For table "A" only the low 32 bits of each value are considered.
                if ("A".equals(name))
                    entry = (table[j] & 0xffffffffL) >> extract;
                else
                    entry = (table[j] >> extract);
                if (shiftEntries) entry <<= shift;
                if (entry >= (1L << bits)) {
                    FAIL("Entry too big");
                }
                if (entriesPerChar > 0) {
                    // Pack multiple entries into a character
                    // (each new entry enters at the top; earlier ones move down,
                    // so the first entry ends up in the lowest bits)
                    ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
                    ++entriesInCharSoFar;
                    if (entriesInCharSoFar == entriesPerChar) {
                        // Character is full
                        theString.append(ch);
                        entriesInCharSoFar = 0;
                        ch = '\u0000';
                    }
                }
                else {
                    // Use multiple characters per entry
                    // (most significant 16 bits are emitted first)
                    for (int k=0; k<charsPerEntry; ++k) {
                        ch = (char)(entry >> ((charsPerEntry-1)*16));
                        entry <<= 16;
                        theString.append(ch);
                    }
                }
            }
            // Flush a partially filled last character, shifting the entries
            // already collected down into their final positions.
            if (entriesInCharSoFar > 0) {
                while (entriesInCharSoFar < entriesPerChar) {
                    ch = (char)((int)ch >> bits);
                    ++entriesInCharSoFar;
                }
                theString.append(ch);
                entriesInCharSoFar = 0;
            }
            result.append(Utility.formatForSource(theString.toString(), " "));
            if (noConversion) {
                result.append(").toCharArray()");
            }
            result.append(";\n\n ");

            if (!noConversion) {
                addInitializer(name, atype, entriesPerChar, bits, table.length);
            }
        }
        else {
            result.append("] = {");
            // Preshifted entries narrower than 32 bits are wrapped in a cast.
            boolean castEntries = shiftEntries && (bits < 32);
            int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
                                            bits == 2 ? 16*4 :
                                            bits == 4 ? 8*4 :
                                            bits == 8 ? 8 :
                                            bits == 16 ? 8 :
                                            bits == 32 ? 4 : 2) :
                (bits == 8 ? 8 :
                 bits == 16 ? 8 : 4);
            // A line break is emitted whenever (j & printMask) == 0; a mask of 0
            // (used for property tables) therefore puts every entry on its own line.
            int printMask = properties ? 0 :
            Math.min(1 << size,
                     printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
            int commentShift = ((1 << size) == table.length) ? 0 : size;
            int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
            // Accumulates packed sub-byte entries across iterations.
            long val = 0;
            for (int j = 0; j < table.length; j++) {
                if ((j & printMask) == 0) {
                    // Strip trailing blanks before starting a new line.
                    while (result.charAt(result.length() - 1) == ' ')
                        result.setLength(result.length() - 1);
                    result.append("\n ");
                }
        PRINT:  {
                if (castEntries)
                    result.append("(").append(atype).append(")(");
                long entry = table[j] >> extract;
                // Number of sub-byte entries per 32-bit word, minus one
                // (k is only consulted when bits < 8).
                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
                int k = j & packMask;
                if (bits >= 8)
                    val = entry;
                else if (k == 0) {
                    // First entry of a packed word: start accumulating, print nothing yet.
                    val = entry;
                    break PRINT;
                }
                else {
                    val |= (entry << (k*bits));
                    // Print only once the word is complete.
                    if (k != packMask)
                        break PRINT;
                }
                if (val > maxPosEntry && !Csyntax) { // liu
                // For values that are out of range, convert them to in-range negative values.
                // Actually, output the '-' and convert them to the negative of the corresponding
                // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
                    result.append('-');
                    val = maxPosEntry + maxPosEntry + 2 - val;
                }
                if (hexFormat) {
                    result.append("0x");
                    if (bits == 8)
                        result.append(hex2((byte)val));
                    else if (bits == 16)
                        result.append(hex4((short)val));
                    else if (bits == 32 || bits < 8)
                        result.append(hex8((int)val));
                    else {
                        result.append(hex16((long)val));
                        if (!Csyntax)
                            result.append("L");
                    }
                }
                else {
                    if (bits == 8)
                        result.append(dec3(val));
                    else if (bits == 64) {
                        result.append(dec5(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                    else
                        result.append(dec5(val));
                }
                if (shiftEntries)
                    result.append("<<").append(shift);
                if (castEntries) result.append(")");
                if (j < (table.length - 1))
                    result.append(", ");
                else
                    result.append(" ");
                // At the end of each printed line, add a comment identifying it.
                if ((j & printMask) == printMask) {
                    result.append(" ").append(commentStart).append(" ");
                    if (hexComment)
                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
                    else
                        result.append(dec3((j & ~commentMask) >> commentShift));
                    if (properties) propertiesComments(result, val);
                    result.append(commentEnd);
                }
                } // end PRINT
            }
            result.append("\n };\n\n ");
        }
    }

    /**
     * Appends the declaration of the special case map array "charMap".
     *
     * @param result the buffer to which the declaration is appended
     */
    static void genCaseMapTableDeclaration(StringBuffer result) {
        String myTab = " ";
        result.append(myTab + "static final char[][][] charMap;\n");
    }

    /**
     * Appends an assignment that fills "charMap" with one element per special
     * case map: a one-character array holding the source character followed
     * by an array holding the characters of its upper case mapping.
     *
     * @param result          the buffer to which the code is appended
     * @param specialCaseMaps the special casing rules to be emitted
     */
    static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
        String myTab = " ";
        int ch;
        char[] map;
        result.append(myTab + "charMap = new char[][][] {\n");
        for (int x = 0; x < specialCaseMaps.length; x++) {
            ch = specialCaseMaps[x].getCharSource();
            map = specialCaseMaps[x].getUpperCaseMap();
            result.append(myTab + myTab);
            result.append("{ ");
            result.append("{\'\\u"+hex4(ch)+"\'}, {");
            // Every mapped character is followed by ", ", including the last one.
            for (int y = 0; y < map.length; y++) {
result.append("\'\\u"+hex4(map[y])+"\', "); 1349 } 1350 result.append("} },\n"); 1351 } 1352 result.append(myTab + "};\n"); 1353 1354 } 1355 1356 /** 1357 * The propertiesComments method generates comments describing encoded 1358 * character properties. 1359 * 1360 * @param result a StringBuffer, to which the generated source code 1361 * text is to be appended 1362 * @param val encoded character properties 1363 * 1364 * @see GenerateCharacter#genTable 1365 */ 1366 1367 static void propertiesComments(StringBuffer result, long val) { 1368 result.append(" "); 1369 switch ((int)(val & maskType)) { 1370 case UnicodeSpec.CONTROL: 1371 result.append("Cc"); 1372 break; 1373 case UnicodeSpec.FORMAT: 1374 result.append("Cf"); 1375 break; 1376 case UnicodeSpec.PRIVATE_USE: 1377 result.append("Co"); 1378 break; 1379 case UnicodeSpec.SURROGATE: 1380 result.append("Cs"); 1381 break; 1382 case UnicodeSpec.LOWERCASE_LETTER: 1383 result.append("Ll"); 1384 break; 1385 case UnicodeSpec.MODIFIER_LETTER: 1386 result.append("Lm"); 1387 break; 1388 case UnicodeSpec.OTHER_LETTER: 1389 result.append("Lo"); 1390 break; 1391 case UnicodeSpec.TITLECASE_LETTER: 1392 result.append("Lt"); 1393 break; 1394 case UnicodeSpec.UPPERCASE_LETTER: 1395 result.append("Lu"); 1396 break; 1397 case UnicodeSpec.COMBINING_SPACING_MARK: 1398 result.append("Mc"); 1399 break; 1400 case UnicodeSpec.ENCLOSING_MARK: 1401 result.append("Me"); 1402 break; 1403 case UnicodeSpec.NON_SPACING_MARK: 1404 result.append("Mn"); 1405 break; 1406 case UnicodeSpec.DECIMAL_DIGIT_NUMBER: 1407 result.append("Nd"); 1408 break; 1409 case UnicodeSpec.LETTER_NUMBER: 1410 result.append("Nl"); 1411 break; 1412 case UnicodeSpec.OTHER_NUMBER: 1413 result.append("No"); 1414 break; 1415 case UnicodeSpec.CONNECTOR_PUNCTUATION: 1416 result.append("Pc"); 1417 break; 1418 case UnicodeSpec.DASH_PUNCTUATION: 1419 result.append("Pd"); 1420 break; 1421 case UnicodeSpec.END_PUNCTUATION: 1422 result.append("Pe"); 1423 break; 1424 case 
UnicodeSpec.OTHER_PUNCTUATION: 1425 result.append("Po"); 1426 break; 1427 case UnicodeSpec.START_PUNCTUATION: 1428 result.append("Ps"); 1429 break; 1430 case UnicodeSpec.CURRENCY_SYMBOL: 1431 result.append("Sc"); 1432 break; 1433 case UnicodeSpec.MODIFIER_SYMBOL: 1434 result.append("Sk"); 1435 break; 1436 case UnicodeSpec.MATH_SYMBOL: 1437 result.append("Sm"); 1438 break; 1439 case UnicodeSpec.OTHER_SYMBOL: 1440 result.append("So"); 1441 break; 1442 case UnicodeSpec.LINE_SEPARATOR: 1443 result.append("Zl"); break; 1444 case UnicodeSpec.PARAGRAPH_SEPARATOR: 1445 result.append("Zp"); 1446 break; 1447 case UnicodeSpec.SPACE_SEPARATOR: 1448 result.append("Zs"); 1449 break; 1450 case UnicodeSpec.UNASSIGNED: 1451 result.append("unassigned"); 1452 break; 1453 } 1454 1455 switch ((int)((val & maskBidi) >> shiftBidi)) { 1456 case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT: 1457 result.append(", L"); 1458 break; 1459 case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT: 1460 result.append(", R"); 1461 break; 1462 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER: 1463 result.append(", EN"); 1464 break; 1465 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR: 1466 result.append(", ES"); 1467 break; 1468 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR: 1469 result.append(", ET"); 1470 break; 1471 case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER: 1472 result.append(", AN"); 1473 break; 1474 case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR: 1475 result.append(", CS"); 1476 break; 1477 case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR: 1478 result.append(", B"); 1479 break; 1480 case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR: 1481 result.append(", S"); 1482 break; 1483 case UnicodeSpec.DIRECTIONALITY_WHITESPACE: 1484 result.append(", WS"); 1485 break; 1486 case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS: 1487 result.append(", ON"); 1488 break; 1489 } 1490 if ((val & maskUpperCase) != 0) { 1491 result.append(", hasUpper (subtract "); 1492 result.append((val & 
maskCaseOffset) >> shiftCaseOffset).append(")"); 1493 } 1494 if ((val & maskLowerCase) != 0) { 1495 result.append(", hasLower (add "); 1496 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1497 } 1498 if ((val & maskTitleCase) != 0) { 1499 result.append(", hasTitle"); 1500 } 1501 if ((val & maskIdentifierInfo) == valueIgnorable) { 1502 result.append(", ignorable"); 1503 } 1504 if ((val & maskIdentifierInfo) == valueJavaUnicodePart) { 1505 result.append(", identifier part"); 1506 } 1507 if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) { 1508 result.append(", underscore"); 1509 } 1510 if ((val & maskIdentifierInfo) == valueJavaWhitespace) { 1511 result.append(", whitespace"); 1512 } 1513 if ((val & maskIdentifierInfo) == valueJavaOnlyStart) { 1514 result.append(", currency"); 1515 } 1516 if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) { 1517 result.append(", identifier start"); 1518 } 1519 if ((val & maskNumericType) == valueDigit) { 1520 result.append(", decimal "); 1521 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1522 } 1523 if ((val & maskNumericType) == valueStrangeNumeric) { 1524 result.append(", strange"); 1525 } 1526 if ((val & maskNumericType) == valueJavaSupradecimal) { 1527 result.append(", supradecimal "); 1528 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1529 } 1530 } 1531 1532 static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" }; 1533 1534 static String tableName(int j) { return tableNames[j]; } 1535 1536 /** 1537 * The genAccess method generates source code for one table access expression. 1538 * 1539 * Most of the complexity stems from handling various options as to 1540 * table representation, such as whether it contains values so large that 1541 * they are represented as negative values and whether the table values are 1542 * preshifted. 
This method also avoids such "ugly" expressions as shifting 1543 * by distance zero, masking when no masking is necessary, and so on. 1544 * For clarity, it generates expressions that do not rely on operator 1545 * precedence, but otherwise it avoids generating redundant parentheses. 1546 * 1547 * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]] 1548 * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example. 1549 * 1550 * @param tbl the name of the final table to be accessed 1551 * @param var the variable name that appeared in parentheses in the 1552 * "Lookup" command 1553 * @param bits the number of bits (not bytes) to be used to represent 1554 * the final table entry 1555 * @return the replacement text for the "Lookup(xxx)" command, as a String 1556 * 1557 * @see GenerateCharacter#replaceCommand 1558 */ 1559 1560 static String genAccess(String tbl, String var, int bits) { 1561 String access = null; 1562 int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0; 1563 for (int k = 0; k < sizes.length; k++) { 1564 int offset = ((k < sizes.length - 1) ? 0 : bitoffset); 1565 int shift = shifts[k] + offset; 1566 String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")"; 1567 int mask = (1 << (sizes[k] - offset)) - 1; 1568 String masked = (k == 0) ? shifted : 1569 "(" + shifted + "&0x" + hex(mask) + ")"; 1570 String index = (k == 0) ? masked : 1571 (mask == 0) ? access : "(" + access + "|" + masked + ")"; 1572 String indexNoParens = (index.charAt(0) != '(') ? index : 1573 index.substring(1, index.length() - 1); 1574 String tblname = (k == sizes.length - 1) ? tbl : tableName(k); 1575 String fetched = tblname + "[" + indexNoParens + "]"; 1576 String zeroextended = (zeroextend[k] == 0) ? fetched : 1577 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")"; 1578 int adjustment = preshifted[k] ? 0 : 1579 sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0); 1580 String adjusted = (preshifted[k] || adjustment == 0) ? 
zeroextended : 1581 "(" + zeroextended + "<<" + adjustment + ")"; 1582 String bitshift = (bits == 1) ? "(" + var + "&0x1F)" : 1583 (bits == 2) ? "((" + var + "&0xF)<<1)" : 1584 (bits == 4) ? "((" + var + "&7)<<2)" : null; 1585 String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted : 1586 "((" + adjusted + ">>" + bitshift + ")&" + 1587 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")"; 1588 access = extracted; 1589 } 1590 return access; 1591 } 1592 1593 /* The command line arguments are decoded and used to set the following 1594 global variables. 1595 */ 1596 1597 static boolean verbose = false; 1598 static boolean nobidi = false; 1599 static boolean nomirror = false; 1600 static boolean identifiers = false; 1601 static boolean Csyntax = false; 1602 static String TemplateFileName = null; 1603 static String OutputFileName = null; 1604 static String UnicodeSpecFileName = null; // liu 1605 static String SpecialCasingFileName = null; 1606 static String PropListFileName = null; 1607 static boolean useCharForByte = false; 1608 static int[] sizes; 1609 static int bins = 0; // liu; if > 0, then perform search 1610 static boolean tableAsString = false; 1611 static boolean bLatin1 = false; 1612 1613 static String commandLineDescription; 1614 1615 /* Other global variables, equal in length to the "sizes" array. */ 1616 1617 static int[] shifts; 1618 static int[] zeroextend; 1619 static int[] bytes; 1620 static boolean[] preshifted; 1621 static long[][] tables; 1622 1623 1624 /* Other global variables */ 1625 static String commentStart; 1626 static String commentEnd; 1627 1628 static StringBuffer initializers = new StringBuffer(); 1629 1630 /* special casing rules for 1:M toUpperCase mappings */ 1631 static SpecialCaseMap[] specialCaseMaps; 1632 1633 /** 1634 * Process the command line arguments. 
1635 * 1636 * The allowed flags in command line are: 1637 * <dl> 1638 * <dt> -verbose <dd> Emit comments to standard output describing 1639 * what's going on during the processing. 1640 * <dt> -nobidi <dd> Do not include bidi categories in the 1641 * encoded character properties. 1642 * <dt> -nomirror <dd> Do no include mirror property in the encoded 1643 * character properties. 1644 * <dt> -identifiers <dd> Generate tables for scanning identifiers only. 1645 * <dt> -c <dd> Output code in C syntax instead of Java syntax. 1646 * <dt> -o filename <dd> Specify output file name. 1647 * <dt> -template filename <dd> Specify template input file name. 1648 * <dt> -spec filename <dd> Specify Unicode spec file name. 1649 * <dt> -specialcasing filename <dd> Specify Unicode special casing file name. 1650 * <dt> -search bins <dd> Try different partitions into the specified 1651 * number of bins. E.g., for 2 bins, try 1652 * 16 0, 15 1,..., 0 16. 1653 * <dt> -string <dd> Create table as string. Only valid with Java 1654 * syntax. 1655 * <dt> -latin1 <dd> Create a latin 1 only property table. 1656 * </dl> 1657 * In addition, decimal literals may appear as command line arguments; 1658 * each one represents the number of bits of the character to be broken 1659 * off at each lookup step. If present, they must add up to 16 (the number 1660 * of bits in a char value). For smaller tables, the last value should 1661 * be 0; values other than the last one may not be zero. If no such 1662 * numeric values are provided, default values are used. 
1663 * 1664 * @param args the command line arguments, as an array of String 1665 * 1666 * @see GenerateCharacter#main 1667 */ 1668 1669 static void processArgs(String[] args) { 1670 StringBuffer desc = new StringBuffer("java GenerateCharacter"); 1671 for (int j=0; j<args.length; ++j) { 1672 desc.append(" " + args[j]); 1673 } 1674 for (int j = 0; j < args.length; j++) { 1675 if (args[j].equals("-verbose") || args[j].equals("-v")) 1676 verbose = true; 1677 else if (args[j].equals("-nobidi")) 1678 nobidi = true; 1679 else if (args[j].equals("-nomirror")) 1680 nomirror = true; 1681 else if (args[j].equals("-identifiers")) 1682 identifiers = true; 1683 else if (args[j].equals("-c")) 1684 Csyntax = true; 1685 else if (args[j].equals("-string")) 1686 tableAsString = true; 1687 else if (args[j].equals("-o")) { 1688 if (j == args.length - 1) { 1689 FAIL("File name missing after -o"); 1690 } 1691 else { 1692 OutputFileName = args[++j]; 1693 } 1694 } 1695 else if (args[j].equals("-search")) { 1696 if (j == args.length - 1) 1697 FAIL("Bin count missing after -search"); 1698 else { 1699 bins = Integer.parseInt(args[++j]); 1700 if (bins < 1 || bins > 10) 1701 FAIL("Bin count must be >= 1 and <= 10"); 1702 } 1703 } 1704 else if (args[j].equals("-template")) { 1705 if (j == args.length - 1) 1706 FAIL("File name missing after -template"); 1707 else 1708 TemplateFileName = args[++j]; 1709 } 1710 else if (args[j].equals("-spec")) { // liu 1711 if (j == args.length - 1) { 1712 FAIL("File name missing after -spec"); 1713 } 1714 else { 1715 UnicodeSpecFileName = args[++j]; 1716 } 1717 } 1718 else if (args[j].equals("-specialcasing")) { 1719 if (j == args.length -1) { 1720 FAIL("File name missing after -specialcasing"); 1721 } 1722 else { 1723 SpecialCasingFileName = args[++j]; 1724 } 1725 } 1726 else if (args[j].equals("-proplist")) { 1727 if (j == args.length -1) { 1728 FAIL("File name missing after -proplist"); 1729 } 1730 else { 1731 PropListFileName = args[++j]; 1732 } 1733 } 1734 
else if (args[j].equals("-plane")) { 1735 if (j == args.length -1) { 1736 FAIL("Plane number missing after -plane"); 1737 } 1738 else { 1739 plane = Integer.parseInt(args[++j]); 1740 } 1741 if (plane > 0) { 1742 bLatin1 = false; 1743 } 1744 } 1745 else if ("-usecharforbyte".equals(args[j])) { 1746 useCharForByte = true; 1747 } 1748 else if (args[j].equals("-latin1")) { 1749 bLatin1 = true; 1750 plane = 0; 1751 } 1752 else { 1753 try { 1754 int val = Integer.parseInt(args[j]); 1755 if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]); 1756 if (sizes == null) 1757 sizes = new int[1]; 1758 else { 1759 int[] newsizes = new int[sizes.length + 1]; 1760 System.arraycopy(sizes, 0, newsizes, 0, sizes.length); 1761 sizes = newsizes; 1762 } 1763 sizes[sizes.length - 1] = val; 1764 } 1765 catch(NumberFormatException e) { 1766 FAIL("Unknown switch: " + args[j]); 1767 } 1768 } 1769 } 1770 if (Csyntax && tableAsString) { 1771 FAIL("Can't specify table as string with C syntax"); 1772 } 1773 if (sizes == null) { 1774 desc.append(" ["); 1775 if (identifiers) { 1776 int[] newsizes = { 8, 4, 4 }; // Good default values 1777 desc.append("8 4 4]"); 1778 sizes = newsizes; 1779 } 1780 else { 1781 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 } 1782 desc.append("10 5 1]"); 1783 sizes = newsizes; 1784 } 1785 } 1786 if (UnicodeSpecFileName == null) { // liu 1787 UnicodeSpecFileName = DefaultUnicodeSpecFileName; 1788 desc.append(" [-spec " + UnicodeSpecFileName + ']'); 1789 } 1790 if (SpecialCasingFileName == null) { 1791 SpecialCasingFileName = DefaultSpecialCasingFileName; 1792 desc.append(" [-specialcasing " + SpecialCasingFileName + ']'); 1793 } 1794 if (PropListFileName == null) { 1795 PropListFileName = DefaultPropListFileName; 1796 desc.append(" [-proplist " + PropListFileName + ']'); 1797 } 1798 if (TemplateFileName == null) { 1799 TemplateFileName = (Csyntax ? 
DefaultCTemplateFileName 1800 : DefaultJavaTemplateFileName); 1801 desc.append(" [-template " + TemplateFileName + ']'); 1802 } 1803 if (OutputFileName == null) { 1804 OutputFileName = (Csyntax ? DefaultCOutputFileName 1805 : DefaultJavaOutputFileName); 1806 desc.append(" [-o " + OutputFileName + ']'); 1807 } 1808 commentStart = (Csyntax ? "/*" : "//"); 1809 commentEnd = (Csyntax ? " */" : ""); 1810 commandLineDescription = desc.toString(); 1811 } 1812 1813 private static void searchBins(long[] map, int binsOccupied) throws Exception { 1814 int bitsFree = 16; 1815 for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i]; 1816 if (binsOccupied == (bins-1)) { 1817 sizes[binsOccupied] = bitsFree; 1818 generateForSizes(map); 1819 } 1820 else { 1821 for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one 1822 sizes[binsOccupied] = i; 1823 searchBins(map, binsOccupied+1); 1824 } 1825 } 1826 } 1827 1828 private static void generateForSizes(long[] map) throws Exception { 1829 int sum = 0; 1830 shifts = new int[sizes.length]; 1831 for (int k = sizes.length - 1; k >= 0; k--) { 1832 shifts[k] = sum; 1833 sum += sizes[k]; 1834 } 1835 if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) { 1836 FAIL("Bit field widths total to " + sum + 1837 ": wrong total for map of size " + map.length); 1838 } 1839 // need a table for each set of lookup bits in char 1840 tables = new long[sizes.length][]; 1841 // the last table is the map 1842 tables[sizes.length - 1] = map; 1843 for (int j = sizes.length - 1; j > 0; j--) { 1844 if (verbose && bins==0) 1845 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]); 1846 long[][] temp = buildTable(tables[j], sizes[j]); 1847 tables[j-1] = temp[0]; 1848 tables[j] = temp[1]; 1849 } 1850 preshifted = new boolean[sizes.length]; 1851 zeroextend = new int[sizes.length]; 1852 bytes = new int[sizes.length]; 1853 for (int j = 0; j < sizes.length - 1; j++) { 1854 int len = tables[j+1].length; 1855 int size = 
sizes[j+1]; 1856 if (len > 0x100 && (len >> size) <= 0x100) { 1857 len >>= size; 1858 preshifted[j] = false; 1859 } 1860 else if (len > 0x10000 && (len >> size) <= 0x10000) { 1861 len >>= size; 1862 preshifted[j] = false; 1863 } 1864 else preshifted[j] = true; 1865 if (Csyntax) 1866 zeroextend[j] = 0; 1867 else if (len > 0x7F && len <= 0xFF) { 1868 if (!useCharForByte) { 1869 zeroextend[j] = 0xFF; 1870 } 1871 } else if (len > 0x7FFF && len <= 0xFFFF) 1872 zeroextend[j] = 0xFFFF; 1873 else zeroextend[j] = 0; 1874 if (len <= 0x100) bytes[j] = 1; 1875 else if (len <= 0x10000) bytes[j] = 2; 1876 else bytes[j] = 4; 1877 } 1878 preshifted[sizes.length - 1] = true; 1879 zeroextend[sizes.length - 1] = 0; 1880 bytes[sizes.length - 1] = 0; 1881 if (bins > 0) { 1882 int totalBytes = getTotalBytes(); 1883 String access = genAccess("A", "ch", (identifiers ? 2 : 32)); 1884 int accessComplexity = 0; 1885 for (int j=0; j<access.length(); ++j) { 1886 char ch = access.charAt(j); 1887 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity; 1888 if (ch == '<' || ch == '>') ++j; 1889 } 1890 System.out.print("("); 1891 for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]); 1892 System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access); 1893 return; 1894 } 1895 if (verbose) { 1896 System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted"); 1897 for (int j = 0; j < sizes.length; j++) { 1898 System.out.println(dec5(j) + "\t" + 1899 dec5(sizes[j]) + "\t" + 1900 dec5(tables[j].length) + "\t" + 1901 dec5(shifts[j]) + "\t" + 1902 dec5(zeroextend[j]) + "\t" + 1903 dec5(bytes[j]) + "\t " + 1904 preshifted[j]); 1905 } 1906 } 1907 if (verbose) { 1908 System.out.println("Generating source code for class Character"); 1909 System.out.println("A table access looks like " + 1910 genAccess("A", "ch", (identifiers ? 
2 : 32))); 1911 } 1912 generateCharacterClass(TemplateFileName, OutputFileName); 1913 } 1914 1915 /** 1916 * The main program for generating source code for the Character class. 1917 * The basic outline of its operation is: 1918 * <ol> 1919 * <li> Process the command line arguments. One result of this process 1920 * is a list of sizes (measured in bits and summing to 16). 1921 * <li> Get the Unicode character property data from the specification file. 1922 * <li> From that, build a map that has, for each character code, its 1923 * relevant properties encoded as a long integer value. 1924 * <li> Repeatedly compress the map, producing a compressed table and a 1925 * new map. This is done once for each size value in the list. 1926 * When this is done, we have a set of tables. 1927 * <li> Make some decisions about table representation; record these 1928 * decisions in arrays named preshifted, zeroextend, and bytes. 1929 * <li> Generate the source code for the class Character by performing 1930 * macro processing on a template file. 
1931 * </ol> 1932 * 1933 * @param args the command line arguments, as an array of String 1934 * 1935 * @see GenerateCharacter#processArgs 1936 * @see UnicodeSpec@readSpecFile 1937 * @see GenerateCharacter#buildMap 1938 * @see GenerateCharacter#buildTable 1939 * @see GenerateCharacter#generateCharacterClass 1940 */ 1941 1942 public static void main(String[] args) { 1943 processArgs(args); 1944 try { 1945 1946 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane); 1947 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane); 1948 PropList propList = PropList.readSpecFile(new File(PropListFileName), plane); 1949 1950 if (verbose) { 1951 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu 1952 } 1953 long[] map = buildMap(data, specialCaseMaps, propList); 1954 if (verbose) { 1955 System.err.println("Completed building of initial map"); 1956 } 1957 1958 if (bins == 0) { 1959 generateForSizes(map); 1960 } 1961 else { 1962 while (bins > 0) { 1963 sizes = new int[bins]; 1964 searchBins(map, 0); 1965 --bins; 1966 } 1967 } 1968 if (verbose && false) { 1969 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" + 1970 hex8(maxOffsetSeen)); 1971 System.out.println(" allowed: -" + hex8(-minOffset) + "..+" + 1972 hex8(maxOffset)); 1973 } 1974 } 1975 catch (FileNotFoundException e) { FAIL(e.toString()); } 1976 catch (IOException e) { FAIL(e.toString()); } 1977 catch (Throwable e) { 1978 System.out.println("Unexpected exception:"); 1979 e.printStackTrace(); 1980 FAIL("Unexpected exception!"); 1981 } 1982 if (verbose) { System.out.println("Done!");} 1983 } 1984 1985 } // end class