1 2 /* 3 * Copyright 2002-2003 Sun Microsystems, Inc. All Rights Reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Sun designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Sun in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, 23 * CA 95054 USA or visit www.sun.com if you need additional information or 24 * have any questions. 25 */ 26 27 package build.tools.generatecharacter; 28 29 import java.io.IOException; 30 import java.io.FileNotFoundException; 31 import java.io.BufferedReader; 32 import java.io.FileReader; 33 import java.io.PrintWriter; 34 import java.io.BufferedWriter; 35 import java.io.FileWriter; 36 import java.io.File; 37 38 /** 39 * This program generates the source code for the class java.lang.Character. 40 * It also generates native C code that can perform the same operations. 41 * It requires two external input data files: 42 * <ul> 43 * <li> Unicode specification file 44 * <li> Character class template file 45 * </ul> 46 * The Unicode specification file is available from the Unicode consortium. 
47 * It has character specification lines that look like this: 48 * <listing> 49 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 50 * </listing> 51 * The Character class template file is filled in with additional 52 * information to produce the file Character.java, which can then be 53 * compiled by a Java compiler. The template file contains certain 54 * markers consisting of an alphabetic name string preceded by "$$". 55 * Such markers are replaced with generated program text. As a special 56 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of 57 * alphabetic characters constituting a variable name. The character "_" 58 * is considered alphabetic for these purposes. 59 * 60 * @author Guy Steele 61 * @author Alan Liu 62 * @author John O'Conner 63 */ 64 65 public class GenerateCharacter { 66 67 final static boolean DEBUG = false; 68 69 final static int MAX_UNICODE_VALUE = 0xFFFF; 70 final static String commandMarker = "$$"; 71 static String ROOT = ""; 72 static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt"; 73 static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt"; 74 static String DefaultJavaTemplateFileName = ROOT + "Character.java.template"; 75 static String DefaultJavaOutputFileName = ROOT + "Character.java"; 76 static String DefaultCTemplateFileName = ROOT + "Character.c.template"; 77 static String DefaultCOutputFileName = ROOT + "Character.c"; 78 79 static String CharacterDataClassName = "CharacterData"; 80 static int plane = 0; 81 82 /* The overall idea is that, in the generated Character class source code, 83 most character property data is stored in a special multi-level table whose 84 structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn]. 85 The integers must sum to 16 (the number of bits in a character). 86 The first table is indexed by the k1 high-order bits of the character code. 
87 The result is concatenated to the next k2 bits of the character code to index 88 the second table, and so on. Eventually the kn low-order bits of the character 89 code are concatenated and used to index one of two tables A and B; A contains 90 32-bit integer entries and B contains 16-bit short entries. The 48 bits that 91 can be thus obtained encode the properties for the character. 92 93 The default specification is [9, 4, 3, 0]. This particular table format was 94 designed by conducting an exhaustive search of table formats to minimize the 95 space consumed by the tables: the first and third tables need have only byte 96 values (the second table must have short values). Another good choice is 97 [10, 6, 0], which produces a larger table but allows particularly fast table 98 lookup code. 99 100 In each case, where the word "concatenated" is used, this may imply 101 first a << and then a | operation, or perhaps just a | operation if 102 the values in the table can be preshifted (generally possible if the table 103 entries are short rather than byte). 104 */ 105 106 /* The character properties are currently encoded into 32 bits in the following manner: 107 1 bit Mirrored property. 108 4 bits Bidirectional category (see below) (unused if -nobidi switch specified) 109 9 bits A signed offset used for converting case . 110 1 bit If 1, adding the signed offset converts the character to lowercase. 111 1 bit If 1, subtracting the signed offset converts the character to uppercase. 112 Note: for a titlecase character, both of the preceding bits will be 1 113 and the signed offset will be 1. 114 1 bit If 1, this character has a titlecase equivalent (possibly itself); 115 in this case, the two bits before this bit can be used to decide 116 whether this character is in fact uppercase, lowercase, or titlecase. 117 3 bits This field provides a quick way to lex identifiers. 
118 The eight possible values for this field are as follows: 119 0 May not be part of an identifier 120 1 Ignorable control; may continue a Unicode identifier or Java identifier 121 2 May continue a Java identifier but not a Unicode identifier (unused) 122 3 May continue a Unicode identifier or Java identifier 123 4 Is a Java whitespace character 124 5 May start or continue a Java identifier; 125 may continue but not start a Unicode identifier 126 (this value is used for connector punctuation such as _) 127 6 May start or continue a Java identifier; 128 may not occur in a Unicode identifier 129 (this value is used for currency symbols such as $) 130 7 May start or continue a Unicode identifier or Java identifier 131 Thus: 132 5, 6, 7 may start a Java identifier 133 1, 2, 3, 5, 6, 7 may continue a Java identifier 134 7 may start a Unicode identifier 135 1, 3, 5, 7 may continue a Unicode identifier 136 1 is ignorable within an identifier 137 4 is Java whitespace 138 2 bits This field indicates whether the character has a numeric property. 139 The four possible values for this field are as follows: 140 0 This character has no numeric property. 141 1 Adding the digit offset to the character code and then 142 masking with 0x1F will produce the desired numeric value. 143 2 This character has a "strange" numeric value. 144 3 A Java supradecimal digit: adding the digit offset to the 145 character code, then masking with 0x1F, then adding 10 146 will produce the desired numeric value. 147 5 bits The digit offset (see description of previous field) 148 5 bits Character type (see below) 149 */ 150 151 152 // bit masks identify each component of a 32-bit property field described 153 // above. 154 // shift* indicates how many shifts right must happen to get the 155 // indicated property value in the lowest bits of the 32-bit space. 
156 private static final int 157 shiftType = 0, maskType = 0x001F, 158 shiftDigitOffset = 5, maskDigitOffset = 0x03E0, 159 shiftNumericType = 10, maskNumericType = 0x0C00, 160 shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000, 161 maskUnicodePart = 0x1000, 162 shiftCaseInfo = 15, maskCaseInfo = 0x38000, 163 maskLowerCase = 0x20000, 164 maskUpperCase = 0x10000, 165 maskTitleCase = 0x08000, 166 shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000, 167 shiftCaseOffsetSign = 5, 168 // used only when calculating and 169 // storing digit offsets from char values 170 maskDigit = 0x001F, 171 // case offset are 9 bits 172 maskCase = 0x01FF, 173 shiftBidi = 27, maskBidi = 0x78000000, 174 shiftMirrored = 31, maskMirrored = 0x80000000, 175 shiftPlane = 16, maskPlane = 0xFF0000; 176 177 // Can compare masked values with these to determine 178 // numeric or lexical types. 179 public static int 180 valueNotNumeric = 0x0000, 181 valueDigit = 0x0400, 182 valueStrangeNumeric = 0x0800, 183 valueJavaSupradecimal = 0x0C00, 184 valueIgnorable = 0x1000, 185 valueJavaOnlyPart = 0x2000, 186 valueJavaUnicodePart = 0x3000, 187 valueJavaWhitespace = 0x4000, 188 valueJavaStartUnicodePart = 0x5000, 189 valueJavaOnlyStart = 0x6000, 190 valueJavaUnicodeStart = 0x7000, 191 lowJavaStart = 0x5000, 192 nonzeroJavaPart = 0x3000, 193 valueUnicodeStart = 0x7000; 194 195 // these values are used when only identifier properties are generated 196 // for use in verifier code. Shortens the property down to a single byte. 197 private static final int 198 bitJavaStart = 0x02, 199 bitJavaPart = 0x01, 200 maskIsJavaIdentifierPart = bitJavaPart, 201 maskIsJavaIdentifierStart = bitJavaStart; 202 203 static int maxOffset = maskCase/2 ; 204 static int minOffset = -maxOffset; 205 206 /* The following routines provide simple, concise formatting of long integer values. 207 The number in the name of the method indicates the desired number of characters 208 to be produced. 
If the number of digits required to represent the integer value 209 is less than that number, then the output is padded on the left with zeros 210 (for hex) or with spaces (for decimal). If the number of digits required to 211 represent the integer value is greater than the desired number, then all the digits 212 that are required are actually produced. 213 */ 214 215 static String hex(long n) { return Long.toHexString(n).toUpperCase(); } 216 217 static String hex2(long n) { 218 String q = Long.toHexString(n & 0xFF).toUpperCase(); 219 return "00".substring(Math.min(2, q.length())) + q; 220 } 221 222 static String hex4(long n) { 223 String q = Long.toHexString(n & 0xFFFF).toUpperCase(); 224 return "0000".substring(Math.min(4, q.length())) + q; 225 } 226 227 static String hex8(long n) { 228 String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase(); 229 return "00000000".substring(Math.min(8, q.length())) + q; 230 } 231 232 static String hex16(long n) { 233 String q = Long.toHexString(n).toUpperCase(); 234 return "0000000000000000".substring(Math.min(16, q.length())) + q; 235 } 236 237 static String dec3(long n) { 238 String q = Long.toString(n); 239 return " ".substring(Math.min(3, q.length())) + q; 240 } 241 242 static String dec5(long n) { 243 String q = Long.toString(n); 244 return " ".substring(Math.min(5, q.length())) + q; 245 } 246 247 /* This routine is called when some failure occurs. */ 248 249 static void FAIL(String s) { 250 System.out.println("** " + s); 251 } 252 253 /** 254 * Given the data from the Unicode specification file, this routine builds a map. 255 * 256 * The specification file is assumed to contain its data in sorted order by 257 * character code; as a result, the array passed as an argument to this method 258 * has its components in the same sorted order, with one entry for each defined 259 * Unicode character or character range. 
(A range is indicated by two consecutive 260 * entries, such that the name of the first entry begins with "<" and ends with 261 * "First>" and the second entry begins with "<" and ends with "Last>".) This is 262 * therefore a sparse representation of the character property data. 263 * 264 * The resulting map is dense representation of the character data. It contains 265 * 2^16 = 65536 entries, each of which is a long integer. (Right now only 32 bits 266 * of this long value are used, but type long is used rather than int to facilitate 267 * future extensions of this source code generator that might require more than 268 * 32 bits to encode relevant character properties.) Entry k holds the encoded 269 * properties for character k. 270 * 271 * Method buildMap manages the transformation from the sparse representation to 272 * the dense representation. It calls method buildOne to handle the encoding 273 * of character property data from a single UnicodeSpec object into 32 bits. 274 * For undefined characters, method buildOne is not called and the map entry for 275 * that character is set to UnicodeSpec.UNASSIGNED. 
276 * 277 * @param data character property data from the Unicode specification file 278 * @return an array of length 65536 with one entry for every possible char value 279 * 280 * @see GenerateCharacter#buildOne 281 */ 282 283 static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps) { 284 long[] result; 285 if (bLatin1 == true) { 286 result = new long[256]; 287 } else { 288 result = new long[1<<16]; 289 } 290 int k=0; 291 int codePoint = plane<<16; 292 UnicodeSpec nonCharSpec = new UnicodeSpec(); 293 for (int j = 0; j < data.length && k < result.length; j++) { 294 if (data[j].codePoint == codePoint) { 295 result[k] = buildOne(codePoint, data[j], specialMaps); 296 ++k; 297 ++codePoint; 298 } 299 else if(data[j].codePoint > codePoint) { 300 if (data[j].name.endsWith("Last>")) { 301 // build map data for all chars except last in range 302 while (codePoint < data[j].codePoint && k < result.length) { 303 result[k] = buildOne(codePoint, data[j], specialMaps); 304 ++k; 305 ++codePoint; 306 } 307 } 308 else { 309 // we have a few unassigned chars before data[j].codePoint 310 while (codePoint < data[j].codePoint && k < result.length) { 311 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 312 ++k; 313 ++codePoint; 314 } 315 } 316 k = data[j].codePoint & 0xFFFF; 317 codePoint = data[j].codePoint; 318 result[k] = buildOne(codePoint, data[j], specialMaps); 319 ++k; 320 ++codePoint; 321 322 } 323 else { 324 System.out.println("An error has occured during spec mapping."); 325 System.exit(0); 326 } 327 } 328 // if there are still unprocessed chars, process them 329 // as unassigned/undefined. 
330 codePoint = (plane<<16) | k; 331 while (k < result.length) { 332 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 333 ++k; 334 ++codePoint; 335 } 336 return result; 337 } 338 339 // The maximum and minimum offsets found while scanning the database 340 static int maxOffsetSeen = 0; 341 static int minOffsetSeen = 0; 342 343 /** 344 * Some Unicode separator characters are not considered Java whitespace. 345 * @param c character to test 346 * @return true if c in an invalid Java whitespace character, false otherwise. 347 */ 348 static boolean isInvalidJavaWhiteSpace(int c) { 349 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF}; 350 boolean retValue = false; 351 for(int x=0;x<exceptions.length;x++) { 352 if(c == exceptions[x]) { 353 retValue = true; 354 break; 355 } 356 } 357 return retValue; 358 359 } 360 361 /** 362 * Given the character property data for one Unicode character, encode the data 363 * of interest into a single long integer value. (Right now only 32 bits 364 * of this long value are used, but type long is used rather than int to facilitate 365 * future extensions of this source code generator that might require more than 366 * 32 bits to encode relevant character properties.) 
367 * 368 * @param c the character code for which to encode property data 369 * @param us property data record from the Unicode specification file 370 * (its character code might not be equal to c if it specifies data 371 * for a range of characters) 372 * @return an encoded long value that contains the properties for a single char 373 * 374 * @see GenerateCharacter#buildMap 375 */ 376 377 static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) { 378 long resultA = 0; 379 // record the general category 380 resultA |= us.generalCategory; 381 382 // record the numeric properties 383 NUMERIC: { 384 STRANGE: { 385 int val = 0; 386 // c is A-Z 387 if ((c >= 0x0041) && (c <= 0x005A)) { 388 val = c - 0x0041; 389 resultA |= valueJavaSupradecimal; 390 // c is a-z 391 } else if ((c >= 0x0061) && (c <= 0x007A)) { 392 val = c - 0x0061; 393 resultA |= valueJavaSupradecimal; 394 // c is a full-width A-Z 395 } else if ((c >= 0xFF21) && (c <= 0xFF3A)) { 396 val = c - 0xFF21; 397 resultA |= valueJavaSupradecimal; 398 // c is a full-width a-z 399 } else if ((c >= 0xFF41) && (c <= 0xFF5A)) { 400 val = c - 0xFF41; 401 resultA |= valueJavaSupradecimal; 402 } else if (us.isDecimalValue()) { 403 val = us.decimalValue; 404 resultA |= valueDigit; 405 } else if (us.isDigitValue()) { 406 val = us.digitValue; 407 resultA |= valueDigit; 408 } else { 409 if (us.numericValue.length() == 0) { 410 break NUMERIC; // no numeric value at all 411 } else { 412 try { 413 val = Integer.parseInt(us.numericValue); 414 if (val >= 32 || val < 0) break STRANGE; 415 if (c == 0x215F) break STRANGE; 416 } catch(NumberFormatException e) { 417 break STRANGE; 418 } 419 resultA |= valueDigit; 420 } 421 } 422 if (val >= 32 || val < 0) break STRANGE; 423 resultA |= ((val - c & maskDigit) << shiftDigitOffset); 424 break NUMERIC; 425 } // end STRANGE 426 resultA |= valueStrangeNumeric; 427 } // end NUMERIC 428 429 // record case mapping 430 int offset = 0; 431 // might have a 1:M mapping 432 int 
specialMap = SpecialCaseMap.find(c, specialCaseMaps); 433 boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1); 434 if (bHasUpper) { 435 resultA |= maskUpperCase; 436 } 437 if (specialMap != -1) { 438 // has mapping, but cannot record the 439 // proper offset; can only flag it and provide special case 440 // code in Character.java 441 offset = -1; 442 } 443 else if (us.hasUpperMap()) { 444 offset = c - us.upperMap; 445 } 446 447 if (us.hasLowerMap()) { 448 resultA |= maskLowerCase; 449 if (offset == 0) 450 offset = us.lowerMap - c; 451 else if (offset != (us.lowerMap - c)) { 452 if (DEBUG) { 453 FAIL("Character " + hex(c) + 454 " has incompatible lowercase and uppercase mappings"); 455 } 456 } 457 } 458 if ((us.hasTitleMap() && us.titleMap != us.upperMap) || 459 (bHasUpper && us.hasLowerMap())) { 460 resultA |= maskTitleCase; 461 } 462 if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) { 463 System.out.println("Warning: Character " + hex4(c) + " has upper but " + 464 "no title case; Java won't know this"); 465 } 466 if (offset < minOffsetSeen) minOffsetSeen = offset; 467 if (offset > maxOffsetSeen) maxOffsetSeen = offset; 468 if (offset > maxOffset || offset < minOffset) { 469 if (DEBUG) { 470 FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case"); 471 } 472 offset = maskCase; 473 } 474 resultA |= ((offset & maskCase) << shiftCaseOffset); 475 476 477 // record lexical info about this character 478 if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER 479 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER 480 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER 481 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER 482 || us.generalCategory == UnicodeSpec.OTHER_LETTER 483 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) { 484 resultA |= valueJavaUnicodeStart; 485 } 486 else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK 487 || us.generalCategory == 
UnicodeSpec.NON_SPACING_MARK 488 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) { 489 resultA |= valueJavaUnicodePart; 490 } 491 else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) { 492 resultA |= valueJavaStartUnicodePart; 493 } 494 else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) { 495 resultA |= valueJavaOnlyStart; 496 } 497 else if (((c >= 0x0000) && (c <= 0x0008)) 498 || ((c >= 0x000E) && (c <= 0x001B)) 499 || ((c >= 0x007F) && (c <= 0x009F)) 500 || us.generalCategory == UnicodeSpec.FORMAT) { 501 resultA |= valueIgnorable; 502 } 503 else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR 504 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR 505 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) { 506 if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace; 507 } 508 else if (((c >= 0x0009) && (c <= 0x000D)) 509 || ((c >= 0x001C) && (c <= 0x001F))) { 510 resultA |= valueJavaWhitespace; 511 } 512 513 // record bidi category 514 if (!nobidi) { 515 int tmpBidi = 516 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS || 517 us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi); 518 resultA |= tmpBidi; 519 } 520 521 // record mirrored property 522 if (!nomirror) { 523 resultA |= us.mirrored ? maskMirrored : 0; 524 } 525 526 if (identifiers) { 527 long replacement = 0; 528 if ((resultA & maskIdentifierInfo) >= lowJavaStart) { 529 replacement |= bitJavaStart; 530 } 531 if ( ((resultA & nonzeroJavaPart) != 0) 532 && ((resultA & maskIdentifierInfo) != valueIgnorable)) { 533 replacement |= bitJavaPart; 534 } 535 resultA = replacement; 536 } 537 return resultA; 538 } 539 540 /** 541 * This is the heart of the table compression strategy. The inputs are a map 542 * and a number of bits (size). The map is simply an array of long integer values; 543 * the number of bits indicates how index values for that map are to be split. 544 * The length of the given map must be a multiple of (1 << size). 
The result is 545 * a new map z and a compressed table t such that for every valid index value k 546 * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k]. 547 * 548 * In other words, the index k can be split into two parts, namely the "size" 549 * low-order bits and all the remaining high-order bits; the high-order bits are then 550 * remapped by map z to produce an index into table t. In effect, the data of the 551 * original map m is broken up into blocks of size (1<<size); the compression relies 552 * on the expectation that many of these blocks will be identical and therefore need 553 * be represented only once in the compressed table t. 554 * 555 * This method is intended to be used iteratively. The first map to be handed 556 * to it is the one constructed by method buildMap. After that, the first of the 557 * two arrays returned by this method is fed back into it for further compression. 558 * At the end of the iteration, one has a starter map and a sequence of tables. 559 * 560 * The algorithm used to implement this computation is straightforward and not 561 * especially clever. It uses brute-force linear search (the loop labeled MIDDLE) 562 * to locate identical blocks, so overall the time complexity of the algorithm 563 * is quadratic in the length of the input map. Fortunately, speed is not crucial 564 * to this application. 565 * 566 * @param map a map to be compressed 567 * @param size the number of index bits to be split off by the compression 568 * @return an array of length 2 containing two arrays; the first is a new map 569 * and the second is a compressed data table 570 * 571 * @see GenerateCharacter#buildMap 572 */ 573 574 static long[][] buildTable(long[] map, int size) { 575 int n = map.length; 576 if (((n >> size) << size) != n) { 577 FAIL("Length " + n + " is not a multiple of " + (1 << size)); 578 } 579 int m = 1 << size; 580 // We know the final length of the new map up front. 
581 long[] newmap = new long[n >> size]; 582 // The buffer is used temporarily to hold data for the compressed table 583 // because we don't know its final length yet. 584 long[] buffer = new long[n]; 585 int ptr = 0; 586 OUTER: for (int i = 0; i < n; i += m) { 587 // For every block of size m in the original map... 588 MIDDLE: for (int j = 0; j < ptr; j += m) { 589 // Find out whether there is already a block just like it in the buffer. 590 for (int k = 0; k < m; k++) { 591 if (buffer[j+k] != map[i+k]) 592 continue MIDDLE; 593 } 594 // There is a block just like it at position j, so just 595 // put its index into the new map (thereby sharing it). 596 newmap[i >> size] = (j >> size); 597 continue OUTER; 598 } // end MIDDLE 599 // There is no block just like it already, so add it to 600 // the buffer and put its index into the new map. 601 for (int k = 0; k < m; k++) { 602 buffer[ptr+k] = map[i+k]; 603 } 604 newmap[i >> size] = (ptr >> size); 605 ptr += m; 606 } // end OUTER 607 // Now we know how long the compressed table should be, 608 // so create a new array and copy data from the temporary buffer. 609 long[] newdata = new long[ptr]; 610 for (int j = 0; j < ptr; j++) { 611 newdata[j] = buffer[j]; 612 } 613 // Return the new map and the new data table. 614 long[][] result = { newmap, newdata }; 615 return result; 616 } 617 618 /** 619 * Once the compressed tables have been computed, this method reads in a 620 * template file for the source code to be generated and writes out the final 621 * source code by acting as a sort of specialized macro processor. 622 * 623 * The first output line is a comment saying that the file was automatically 624 * generated; it includes a timestamp. All other output is generated by 625 * reading a line from the template file, performing macro replacements, 626 * and then writing the resulting line or lines of code to the output file. 
627 * 628 * This method handles the I/O, the timestamp comment, and the locating of 629 * macro calls within each input line. The method replaceCommand is called 630 * to generate replacement text for each macro call. 631 * 632 * Macro calls to be replaced are indicated in the template file by 633 * occurrences of the commandMarker "$$". The rest of the call may consist 634 * of Java letters (including the underscore "_") and also of balanced 635 * parentheses. 636 * 637 * @param theTemplateFileName 638 * the file name for the template input file 639 * @param theOutputFileName 640 * the file name for the source code output file 641 * 642 * @see GenerateCharacter#replaceCommand 643 */ 644 645 static void generateCharacterClass(String theTemplateFileName, 646 String theOutputFileName) 647 throws FileNotFoundException, IOException { 648 BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName)); 649 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName))); 650 out.println(commentStart + 651 " This file was generated AUTOMATICALLY from a template file " + 652 new java.util.Date() + commentEnd); 653 int marklen = commandMarker.length(); 654 LOOP: while(true) { 655 try { 656 String line = in.readLine(); 657 if (line == null) break LOOP; 658 int pos = 0; 659 int depth = 0; 660 while ((pos = line.indexOf(commandMarker, pos)) >= 0) { 661 int newpos = pos + marklen; 662 char ch = 'x'; 663 SCAN: while (newpos < line.length() && 664 (Character.isJavaIdentifierStart(ch = line.charAt(newpos)) 665 || ch == '(' || (ch == ')' && depth > 0))) { 666 ++newpos; 667 if (ch == '(') { 668 ++depth; 669 } 670 else if (ch == ')') { 671 --depth; 672 if (depth == 0) 673 break SCAN; 674 } 675 } 676 String replacement = replaceCommand(line.substring(pos + marklen, newpos)); 677 line = line.substring(0, pos) + replacement + line.substring(newpos); 678 pos += replacement.length(); 679 } 680 out.println(line); 681 } 682 catch (IOException e) { 683 
break LOOP; 684 } 685 } 686 in.close(); 687 out.close(); 688 } 689 690 /** 691 * The replaceCommand method takes a command (a macro call without the 692 * leading marker "$$") and computes replacement text for it. 693 * 694 * Most of the commands are simply names of integer constants that are defined 695 * in the source code of this GenerateCharacter class. The replacement text is 696 * simply the value of the constant as an appropriately formatted integer literal. 697 * 698 * Two cases are more complicated, however. The command "Tables" causes the 699 * final map and compressed tables to be emitted, with elaborate comments 700 * describing their contents. (This is actually handled by method genTables.) 701 * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates 702 * an expression that will return the character property data for the character 703 * whose code is the value of the variable "xxx". (this is handled by method 704 * "genAccess".) 705 * 706 * @param x a command from the template file to be replaced 707 * @return the replacement text, as a String 708 * 709 * @see GenerateCharacter#genTables 710 * @see GenerateCharacter#genAccess 711 * @see GenerateCharacter#generateCharacterClass 712 */ 713 714 static String replaceCommand(String x) { 715 if (x.equals("Tables")) return genTables(); 716 if (x.equals("Initializers")) return genInitializers(); 717 if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") && 718 x.substring(x.length()-1).equals(")") ) 719 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 
2 : 32)); 720 if (x.equals("shiftType")) return Long.toString(shiftType); 721 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo); 722 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo); 723 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart); 724 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset); 725 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo); 726 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign); 727 if (x.equals("maskCase")) return "0x" + hex8(maskCase); 728 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset); 729 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase); 730 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase); 731 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase); 732 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable); 733 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart); 734 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart); 735 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart); 736 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart); 737 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace); 738 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart); 739 if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart); 740 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart); 741 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart); 742 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart); 743 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart); 744 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart); 745 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset); 746 if 
(x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset); 747 if (x.equals("maskDigit")) return "0x" + hex(maskDigit); 748 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType); 749 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType); 750 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric); 751 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 752 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 753 if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal); 754 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 755 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 756 if (x.equals("maskType")) return "0x" + hex(maskType); 757 if (x.equals("shiftBidi")) return Long.toString(shiftBidi); 758 if (x.equals("maskBidi")) return "0x" + hex(maskBidi); 759 if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored); 760 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG])) 761 return Integer.toString(UnicodeSpec.UNASSIGNED); 762 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG])) 763 return Integer.toString(UnicodeSpec.UPPERCASE_LETTER); 764 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG])) 765 return Integer.toString(UnicodeSpec.LOWERCASE_LETTER); 766 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG])) 767 return Integer.toString(UnicodeSpec.TITLECASE_LETTER); 768 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG])) 769 return Integer.toString(UnicodeSpec.MODIFIER_LETTER); 770 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG])) 771 return Integer.toString(UnicodeSpec.OTHER_LETTER); 772 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG])) 773 return Integer.toString(UnicodeSpec.NON_SPACING_MARK); 774 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG])) 775 return Integer.toString(UnicodeSpec.ENCLOSING_MARK); 776 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG])) 777 return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK); 778 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG])) 779 return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER); 780 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG])) 781 return Integer.toString(UnicodeSpec.OTHER_NUMBER); 782 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG])) 783 return Integer.toString(UnicodeSpec.SPACE_SEPARATOR); 784 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG])) 785 return Integer.toString(UnicodeSpec.LINE_SEPARATOR); 786 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 787 return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR); 788 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG])) 789 return Integer.toString(UnicodeSpec.CONTROL); 790 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG])) 791 return Integer.toString(UnicodeSpec.FORMAT); 792 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG])) 793 return Integer.toString(UnicodeSpec.PRIVATE_USE); 794 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG])) 795 return Integer.toString(UnicodeSpec.SURROGATE); 796 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG])) 797 return Integer.toString(UnicodeSpec.DASH_PUNCTUATION); 798 if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG])) 799 return Integer.toString(UnicodeSpec.START_PUNCTUATION); 800 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG])) 801 return Integer.toString(UnicodeSpec.END_PUNCTUATION); 802 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 803 return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION); 804 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 805 return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION); 806 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG])) 807 return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION); 808 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG])) 809 return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION); 810 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG])) 811 return Integer.toString(UnicodeSpec.LETTER_NUMBER); 812 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG])) 813 return Integer.toString(UnicodeSpec.MATH_SYMBOL); 814 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG])) 815 return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL); 816 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG])) 817 return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL); 818 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG])) 819 return Integer.toString(UnicodeSpec.OTHER_SYMBOL); 820 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG])) 821 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT); 822 if 
(x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG])) 823 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING); 824 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG])) 825 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE); 826 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG])) 827 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT); 828 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG])) 829 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC); 830 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG])) 831 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING); 832 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG])) 833 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE); 834 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG])) 835 return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT); 836 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG])) 837 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER); 838 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 839 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR); 840 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG])) 841 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR); 842 if 
(x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG])) 843 return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER); 844 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 845 return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR); 846 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG])) 847 return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK); 848 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG])) 849 return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL); 850 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 851 return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR); 852 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG])) 853 return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR); 854 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG])) 855 return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE); 856 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG])) 857 return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS); 858 FAIL("Unknown text substitution marker " + commandMarker + x); 859 return commandMarker + x; 860 } 861 862 /** 863 * The genTables method generates source code for all the lookup tables 864 * needed to represent the various Unicode character properties. 865 * It simply calls the method genTable once for each table to be generated 866 * and then generates a summary comment. 
867 * 868 * @return the replacement text for the "Tables" command, as a String 869 * 870 * @see GenerateCharacter#genTable 871 * @see GenerateCharacter#replaceCommand 872 */ 873 static String genTables() { 874 int n = sizes.length; 875 StringBuffer result = new StringBuffer(); 876 // liu : Add a comment showing the source of this table 877 result.append(commentStart + " The following tables and code generated using:" + 878 commentEnd + "\n "); 879 result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n "); 880 881 if (plane == 0 && bLatin1 == false) { 882 genCaseMapTableDeclaration(result); 883 genCaseMapTable(initializers, specialCaseMaps); 884 } 885 int totalBytes = 0; 886 for (int k = 0; k < n - 1; k++) { 887 genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k], 888 sizes[k+1], false, false, k==0); 889 int s = bytes[k]; 890 if (s == 1 && useCharForByte) { 891 s = 2; 892 } 893 totalBytes += tables[k].length * s; 894 } 895 genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32), 896 sizes[n - 1], false, 0, true, !(identifiers), false); 897 898 // If we ever need more than 32 bits to represent the character properties, 899 // then a table "B" may be needed as well. 900 // genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false); 901 902 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2); 903 result.append(commentStart); 904 result.append(" In all, the character property tables require "); 905 result.append(totalBytes).append(" bytes.").append(commentEnd); 906 if (verbose) { 907 System.out.println("The character property tables require " 908 + totalBytes + " bytes."); 909 } 910 return result.toString(); 911 } 912 913 /** 914 * The genInitializers method generates the body of the 915 * ensureInitted() method, which enables lazy initialization of 916 * the case map table and other tables. 
917 */ 918 static String genInitializers() { 919 return initializers.toString(); 920 } 921 922 /** 923 * Return the total number of bytes needed by all tables. This is a stripped- 924 * down copy of genTables(). 925 */ 926 static int getTotalBytes() { 927 int n = sizes.length; 928 int totalBytes = 0; 929 for (int k = 0; k < n - 1; k++) { 930 totalBytes += tables[k].length * bytes[k]; 931 } 932 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) 933 + 31) >> 5) << 2); 934 return totalBytes; 935 } 936 937 static void appendEscapedStringFragment(StringBuffer result, 938 char[] line, 939 int length, 940 boolean lastFragment) { 941 result.append(" \""); 942 for (int k=0; k<length; ++k) { 943 result.append("\\u"); 944 result.append(hex4(line[k])); 945 } 946 result.append("\""); 947 result.append(lastFragment ? ";" : "+"); 948 result.append("\n"); 949 } 950 951 static String SMALL_INITIALIZER = 952 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 953 // " $$name = new $$type[$$size];\n"+ 954 " int len = $$name_DATA.length();\n"+ 955 " int j=0;\n"+ 956 " for (int i=0; i<len; ++i) {\n"+ 957 " int c = $$name_DATA.charAt(i);\n"+ 958 " for (int k=0; k<$$entriesPerChar; ++k) {\n"+ 959 " $$name[j++] = ($$type)c;\n"+ 960 " c >>= $$bits;\n"+ 961 " }\n"+ 962 " }\n"+ 963 " assert (j == $$size);\n"+ 964 " }\n"; 965 966 static String SAME_SIZE_INITIALIZER = 967 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 968 " assert ($$name_DATA.length() == $$size);\n"+ 969 // " $$name = new $$type[$$size];\n"+ 970 " for (int i=0; i<$$size; ++i)\n"+ 971 " $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+ 972 " }\n"; 973 974 static String BIG_INITIALIZER = 975 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 976 // " $$name = new $$type[$$size];\n"+ 977 " int len = $$name_DATA.length();\n"+ 978 " int j=0;\n"+ 979 " int charsInEntry=0;\n"+ 980 " $$type entry=0;\n"+ 981 " for (int i=0; i<len; ++i) {\n"+ 982 " entry |= 
$$name_DATA.charAt(i);\n"+ 983 " if (++charsInEntry == $$charsPerEntry) {\n"+ 984 " $$name[j++] = entry;\n"+ 985 " entry = 0;\n"+ 986 " charsInEntry = 0;\n"+ 987 " }\n"+ 988 " else {\n"+ 989 " entry <<= 16;\n"+ 990 " }\n"+ 991 " }\n"+ 992 " assert (j == $$size);\n"+ 993 " }\n"; 994 995 static String INT32_INITIALIZER = 996 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 997 " char[] data = $$name_DATA.toCharArray();\n"+ 998 " assert (data.length == ($$size * 2));\n"+ 999 " int i = 0, j = 0;\n"+ 1000 " while (i < ($$size * 2)) {\n"+ 1001 " int entry = data[i++] << 16;\n"+ 1002 " $$name[j++] = entry | data[i++];\n"+ 1003 " }\n"+ 1004 " }\n"; 1005 1006 static void addInitializer(String name, String type, int entriesPerChar, 1007 int bits, int size) { 1008 1009 String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER : 1010 ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER); 1011 if (entriesPerChar == -2) { 1012 template = INT32_INITIALIZER; 1013 } 1014 int marklen = commandMarker.length(); 1015 int pos = 0; 1016 while ((pos = template.indexOf(commandMarker, pos)) >= 0) { 1017 int newpos = pos + marklen; 1018 char ch = 'x'; 1019 while (newpos < template.length() && 1020 Character.isJavaIdentifierStart(ch = template.charAt(newpos)) && 1021 ch != '_') // Don't allow this in token names 1022 ++newpos; 1023 String token = template.substring(pos+marklen, newpos); 1024 String replacement = "ERROR"; 1025 1026 if (token.equals("name")) replacement = name; 1027 else if (token.equals("type")) replacement = type; 1028 else if (token.equals("bits")) replacement = ""+bits; 1029 else if (token.equals("size")) replacement = ""+size; 1030 else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar; 1031 else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar); 1032 else FAIL("Unrecognized token: " + token); 1033 1034 template = template.substring(0, pos) + replacement + template.substring(newpos); 1035 pos += 
replacement.length(); 1036 } 1037 initializers.append(template); 1038 } 1039 1040 /** 1041 * The genTable method generates source code for one lookup table. 1042 * Most of the complexity stems from handling various options as to 1043 * the type of the array components, the precise representation of the 1044 * values, the format in which to render each value, the number of values 1045 * to emit on each line of source code, and the kinds of useful comments 1046 * to be generated. 1047 * 1048 * @param result a StringBuffer, to which the generated source code 1049 * text is to be appended 1050 * @param name the name of the table 1051 * @param table the table data (an array of long values) 1052 * @param extract a distance, in bits, by which each entry of the table 1053 * is to be right-shifted before it is processed 1054 * @param bits the number of bits (not bytes) to be used to represent 1055 * each table entry 1056 * @param size the table data is divided up into blocks of size (1<<size); 1057 * in this method, this information is used only to affect 1058 * how many table values are to be generated per line 1059 * @param preshifted if this flag is true, then the table entries are to be 1060 * emitted in a preshifted form; that is, each value should 1061 * be left-shifted by the amount "shift", so that this work 1062 * is built into the table and need not be performed by an 1063 * explicit shift operator at run time 1064 * @param shift this is the shift amount for preshifting of table entries 1065 * @param hexFormat if this flag is true, table entries should be emitted as 1066 * hexadecimal literals; otherwise decimal literals are used 1067 * @param properties if this flag is true, the table entries are encoded 1068 * character properties rather than indexes into yet other tables; 1069 * therefore comments describing the encoded properties should 1070 * be generated 1071 * @param hexComment if this flag is true, each line of output is labelled with 1072 * a hexadecimal 
comment indicating the character values to 1073 * which that line applies; otherwise, decimal values indicating 1074 * table indices are generated 1075 * 1076 * @see GenerateCharacter#genTables 1077 * @see GenerateCharacter#replaceCommand 1078 */ 1079 1080 static void genTable(StringBuffer result, String name, 1081 long[] table, int extract, int bits, int size, 1082 boolean preshifted, int shift, boolean hexFormat, 1083 boolean properties, boolean hexComment) { 1084 1085 String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") : 1086 bits == 2 ? (Csyntax ? "unsigned long" : "int") : 1087 bits == 4 ? (Csyntax ? "unsigned long" : "int") : 1088 bits == 8 ? (Csyntax ? "unsigned char" : "byte") : 1089 bits == 16 ? (Csyntax ? "unsigned short" : "char") : 1090 bits == 32 ? (Csyntax ? "unsigned long" : "int") : 1091 (Csyntax ? "int64" : "long"); 1092 long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu 1093 bits == 2 ? Integer.MAX_VALUE : 1094 bits == 4 ? Integer.MAX_VALUE : 1095 bits == 8 ? Byte.MAX_VALUE : 1096 bits == 16 ? Short.MAX_VALUE : 1097 bits == 32 ? Integer.MAX_VALUE : 1098 Long.MAX_VALUE; 1099 int entriesPerChar = bits <= 16 ? 
(16 / bits) : -(bits / 16); 1100 boolean shiftEntries = preshifted && shift != 0; 1101 if (bits == 8 && tableAsString && useCharForByte) { 1102 atype = "char"; 1103 maxPosEntry = Character.MAX_VALUE; 1104 entriesPerChar = 1; 1105 } 1106 boolean noConversion = atype.equals("char"); 1107 1108 result.append(commentStart); 1109 result.append(" The ").append(name).append(" table has ").append(table.length); 1110 result.append(" entries for a total of "); 1111 int sizeOfTable = ((table.length * bits + 31) >> 5) << 2; 1112 if (bits == 8 && useCharForByte) { 1113 sizeOfTable *= 2; 1114 } 1115 result.append(sizeOfTable); 1116 result.append(" bytes.").append(commentEnd).append("\n\n"); 1117 if (Csyntax) 1118 result.append(" static "); 1119 else 1120 result.append(" static final "); 1121 result.append(atype); 1122 result.append(" ").append(name).append("["); 1123 if (Csyntax) 1124 result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0)); 1125 if (tableAsString) { 1126 if (noConversion) { 1127 result.append("] = (\n"); 1128 } else { 1129 result.append("] = new ").append(atype).append("["+table.length+"];\n "); 1130 result.append("static final String ").append(name).append("_DATA =\n"); 1131 } 1132 int CHARS_PER_LINE = 8; 1133 StringBuffer theString = new StringBuffer(); 1134 int entriesInCharSoFar = 0; 1135 char ch = '\u0000'; 1136 int charsPerEntry = -entriesPerChar; 1137 for (int j=0; j<table.length; ++j) { 1138 long entry = table[j] >> extract; 1139 if (shiftEntries) entry <<= shift; 1140 if (entry >= (1L << bits)) { 1141 FAIL("Entry too big"); 1142 } 1143 if (entriesPerChar > 0) { 1144 // Pack multiple entries into a character 1145 ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits)); 1146 ++entriesInCharSoFar; 1147 if (entriesInCharSoFar == entriesPerChar) { 1148 // Character is full 1149 theString.append(ch); 1150 entriesInCharSoFar = 0; 1151 ch = '\u0000'; 1152 } 1153 } 1154 else { 1155 // Use multiple characters per entry 1156 
for (int k=0; k<charsPerEntry; ++k) { 1157 ch = (char)(entry >> ((charsPerEntry-1)*16)); 1158 entry <<= 16; 1159 theString.append(ch); 1160 } 1161 } 1162 } 1163 if (entriesInCharSoFar > 0) { 1164 while (entriesInCharSoFar < entriesPerChar) { 1165 ch = (char)((int)ch >> bits); 1166 ++entriesInCharSoFar; 1167 } 1168 theString.append(ch); 1169 entriesInCharSoFar = 0; 1170 } 1171 result.append(Utility.formatForSource(theString.toString(), " ")); 1172 if (noConversion) { 1173 result.append(").toCharArray()"); 1174 } 1175 result.append(";\n\n "); 1176 1177 if (!noConversion) { 1178 addInitializer(name, atype, entriesPerChar, bits, table.length); 1179 } 1180 } 1181 else { 1182 result.append("] = {"); 1183 boolean castEntries = shiftEntries && (bits < 32); 1184 int printPerLine = hexFormat ? (bits == 1 ? 32*4 : 1185 bits == 2 ? 16*4 : 1186 bits == 4 ? 8*4 : 1187 bits == 8 ? 8 : 1188 bits == 16 ? 8 : 1189 bits == 32 ? 4 : 2) : 1190 (bits == 8 ? 8 : 1191 bits == 16 ? 8 : 4); 1192 int printMask = properties ? 0 : 1193 Math.min(1 << size, 1194 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1; 1195 int commentShift = ((1 << size) == table.length) ? 0 : size; 1196 int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1; 1197 long val = 0; 1198 for (int j = 0; j < table.length; j++) { 1199 if ((j & printMask) == 0) { 1200 while (result.charAt(result.length() - 1) == ' ') 1201 result.setLength(result.length() - 1); 1202 result.append("\n "); 1203 } 1204 PRINT: { 1205 if (castEntries) 1206 result.append("(").append(atype).append(")("); 1207 long entry = table[j] >> extract; 1208 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 
3 : 2)) - 1); 1209 int k = j & packMask; 1210 if (bits >= 8) 1211 val = entry; 1212 else if (k == 0) { 1213 val = entry; 1214 break PRINT; 1215 } 1216 else { 1217 val |= (entry << (k*bits)); 1218 if (k != packMask) 1219 break PRINT; 1220 } 1221 if (val > maxPosEntry && !Csyntax) { // liu 1222 // For values that are out of range, convert them to in-range negative values. 1223 // Actually, output the '-' and convert them to the negative of the corresponding 1224 // in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126. 1225 result.append('-'); 1226 val = maxPosEntry + maxPosEntry + 2 - val; 1227 } 1228 if (hexFormat) { 1229 result.append("0x"); 1230 if (bits == 8) 1231 result.append(hex2((byte)val)); 1232 else if (bits == 16) 1233 result.append(hex4((short)val)); 1234 else if (bits == 32 || bits < 8) 1235 result.append(hex8((int)val)); 1236 else { 1237 result.append(hex16((long)val)); 1238 if (!Csyntax) 1239 result.append("L"); 1240 } 1241 } 1242 else { 1243 if (bits == 8) 1244 result.append(dec3(val)); 1245 else if (bits == 64) { 1246 result.append(dec5(val)); 1247 if (!Csyntax) 1248 result.append("L"); 1249 } 1250 else 1251 result.append(dec5(val)); 1252 } 1253 if (shiftEntries) 1254 result.append("<<").append(shift); 1255 if (castEntries) result.append(")"); 1256 if (j < (table.length - 1)) 1257 result.append(", "); 1258 else 1259 result.append(" "); 1260 if ((j & printMask) == printMask) { 1261 result.append(" ").append(commentStart).append(" "); 1262 if (hexComment) 1263 result.append("0x").append(hex4((j & ~commentMask) << (16 - size))); 1264 else 1265 result.append(dec3((j & ~commentMask) >> commentShift)); 1266 if (properties) propertiesComments(result, val); 1267 result.append(commentEnd); 1268 } 1269 } // end PRINT 1270 } 1271 result.append("\n };\n\n "); 1272 } 1273 } 1274 1275 static void genCaseMapTableDeclaration(StringBuffer result) { 1276 String myTab = " "; 1277 result.append(myTab + "static final char[][][] charMap;\n"); 1278 } 1279 
1280 static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){ 1281 String myTab = " "; 1282 int ch; 1283 char[] map; 1284 result.append(myTab + "charMap = new char[][][] {\n"); 1285 for (int x = 0; x < specialCaseMaps.length; x++) { 1286 ch = specialCaseMaps[x].getCharSource(); 1287 map = specialCaseMaps[x].getUpperCaseMap(); 1288 result.append(myTab + myTab); 1289 result.append("{ "); 1290 result.append("{\'\\u"+hex4(ch)+"\'}, {"); 1291 for (int y = 0; y < map.length; y++) { 1292 result.append("\'\\u"+hex4(map[y])+"\', "); 1293 } 1294 result.append("} },\n"); 1295 } 1296 result.append(myTab + "};\n"); 1297 1298 } 1299 1300 /** 1301 * The propertiesComments method generates comments describing encoded 1302 * character properties. 1303 * 1304 * @param result a StringBuffer, to which the generated source code 1305 * text is to be appended 1306 * @param val encoded character properties 1307 * 1308 * @see GenerateCharacter#genTable 1309 */ 1310 1311 static void propertiesComments(StringBuffer result, long val) { 1312 result.append(" "); 1313 switch ((int)(val & maskType)) { 1314 case UnicodeSpec.CONTROL: 1315 result.append("Cc"); 1316 break; 1317 case UnicodeSpec.FORMAT: 1318 result.append("Cf"); 1319 break; 1320 case UnicodeSpec.PRIVATE_USE: 1321 result.append("Co"); 1322 break; 1323 case UnicodeSpec.SURROGATE: 1324 result.append("Cs"); 1325 break; 1326 case UnicodeSpec.LOWERCASE_LETTER: 1327 result.append("Ll"); 1328 break; 1329 case UnicodeSpec.MODIFIER_LETTER: 1330 result.append("Lm"); 1331 break; 1332 case UnicodeSpec.OTHER_LETTER: 1333 result.append("Lo"); 1334 break; 1335 case UnicodeSpec.TITLECASE_LETTER: 1336 result.append("Lt"); 1337 break; 1338 case UnicodeSpec.UPPERCASE_LETTER: 1339 result.append("Lu"); 1340 break; 1341 case UnicodeSpec.COMBINING_SPACING_MARK: 1342 result.append("Mc"); 1343 break; 1344 case UnicodeSpec.ENCLOSING_MARK: 1345 result.append("Me"); 1346 break; 1347 case UnicodeSpec.NON_SPACING_MARK: 1348 
result.append("Mn"); 1349 break; 1350 case UnicodeSpec.DECIMAL_DIGIT_NUMBER: 1351 result.append("Nd"); 1352 break; 1353 case UnicodeSpec.LETTER_NUMBER: 1354 result.append("Nl"); 1355 break; 1356 case UnicodeSpec.OTHER_NUMBER: 1357 result.append("No"); 1358 break; 1359 case UnicodeSpec.CONNECTOR_PUNCTUATION: 1360 result.append("Pc"); 1361 break; 1362 case UnicodeSpec.DASH_PUNCTUATION: 1363 result.append("Pd"); 1364 break; 1365 case UnicodeSpec.END_PUNCTUATION: 1366 result.append("Pe"); 1367 break; 1368 case UnicodeSpec.OTHER_PUNCTUATION: 1369 result.append("Po"); 1370 break; 1371 case UnicodeSpec.START_PUNCTUATION: 1372 result.append("Ps"); 1373 break; 1374 case UnicodeSpec.CURRENCY_SYMBOL: 1375 result.append("Sc"); 1376 break; 1377 case UnicodeSpec.MODIFIER_SYMBOL: 1378 result.append("Sk"); 1379 break; 1380 case UnicodeSpec.MATH_SYMBOL: 1381 result.append("Sm"); 1382 break; 1383 case UnicodeSpec.OTHER_SYMBOL: 1384 result.append("So"); 1385 break; 1386 case UnicodeSpec.LINE_SEPARATOR: 1387 result.append("Zl"); break; 1388 case UnicodeSpec.PARAGRAPH_SEPARATOR: 1389 result.append("Zp"); 1390 break; 1391 case UnicodeSpec.SPACE_SEPARATOR: 1392 result.append("Zs"); 1393 break; 1394 case UnicodeSpec.UNASSIGNED: 1395 result.append("unassigned"); 1396 break; 1397 } 1398 1399 switch ((int)((val & maskBidi) >> shiftBidi)) { 1400 case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT: 1401 result.append(", L"); 1402 break; 1403 case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT: 1404 result.append(", R"); 1405 break; 1406 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER: 1407 result.append(", EN"); 1408 break; 1409 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR: 1410 result.append(", ES"); 1411 break; 1412 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR: 1413 result.append(", ET"); 1414 break; 1415 case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER: 1416 result.append(", AN"); 1417 break; 1418 case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR: 1419 result.append(", 
CS"); 1420 break; 1421 case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR: 1422 result.append(", B"); 1423 break; 1424 case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR: 1425 result.append(", S"); 1426 break; 1427 case UnicodeSpec.DIRECTIONALITY_WHITESPACE: 1428 result.append(", WS"); 1429 break; 1430 case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS: 1431 result.append(", ON"); 1432 break; 1433 } 1434 if ((val & maskUpperCase) != 0) { 1435 result.append(", hasUpper (subtract "); 1436 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1437 } 1438 if ((val & maskLowerCase) != 0) { 1439 result.append(", hasLower (add "); 1440 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1441 } 1442 if ((val & maskTitleCase) != 0) { 1443 result.append(", hasTitle"); 1444 } 1445 if ((val & maskIdentifierInfo) == valueIgnorable) { 1446 result.append(", ignorable"); 1447 } 1448 if ((val & maskIdentifierInfo) == valueJavaUnicodePart) { 1449 result.append(", identifier part"); 1450 } 1451 if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) { 1452 result.append(", underscore"); 1453 } 1454 if ((val & maskIdentifierInfo) == valueJavaWhitespace) { 1455 result.append(", whitespace"); 1456 } 1457 if ((val & maskIdentifierInfo) == valueJavaOnlyStart) { 1458 result.append(", currency"); 1459 } 1460 if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) { 1461 result.append(", identifier start"); 1462 } 1463 if ((val & maskNumericType) == valueDigit) { 1464 result.append(", decimal "); 1465 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1466 } 1467 if ((val & maskNumericType) == valueStrangeNumeric) { 1468 result.append(", strange"); 1469 } 1470 if ((val & maskNumericType) == valueJavaSupradecimal) { 1471 result.append(", supradecimal "); 1472 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1473 } 1474 } 1475 1476 static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" }; 1477 1478 static 
String tableName(int j) { return tableNames[j]; }

    /**
     * The genAccess method generates source code for one table access expression.
     *
     * Most of the complexity stems from handling various options as to
     * table representation, such as whether it contains values so large that
     * they are represented as negative values and whether the table values are
     * preshifted.  This method also avoids such "ugly" expressions as shifting
     * by distance zero, masking when no masking is necessary, and so on.
     * For clarity, it generates expressions that do not rely on operator
     * precedence, but otherwise it avoids generating redundant parentheses.
     *
     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
     *
     * When bits is 1, 2 or 4, the low 5, 4 or 3 bits of the variable are not
     * used to index the final table; they are used instead to shift the
     * fetched element right so that the requested bit field can be masked
     * out.  When bits is 8 or more the fetched element is used directly.
     * Other values of bits below 8 are not supported: no shift expression
     * exists for them and the text "null" would end up in the generated code.
     *
     * This method reads the global arrays sizes, shifts, zeroextend and
     * preshifted, so those must have been set up (see generateForSizes)
     * before it is called.
     *
     * @param tbl the name of the final table to be accessed
     * @param var the variable name that appeared in parentheses in the
     *          "Lookup" command
     * @param bits the number of bits (not bytes) to be used to represent
     *          the final table entry
     * @return the replacement text for the "Lookup(xxx)" command, as a String
     *
     * @see GenerateCharacter#replaceCommand
     */

    static String genAccess(String tbl, String var, int bits) {
        String access = null;
        // Number of low-order bits of var that select a packed bit field
        // inside one fetched element rather than an element of the table.
        int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
        // Shift distance (as generated source text) that brings the selected
        // bit field down to bit 0.  It depends only on var and bits, so it is
        // computed once instead of on every loop iteration.
        String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
            (bits == 2) ? "((" + var + "&0xF)<<1)" :
            (bits == 4) ? "((" + var + "&7)<<2)" : null;
        for (int k = 0; k < sizes.length; k++) {
            // Only the last level gives up bitoffset bits of its index.
            int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
            int shift = shifts[k] + offset;
            String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
            int mask = (1 << (sizes[k] - offset)) - 1;
            // The first level uses all remaining high bits, so it needs no mask.
            String masked = (k == 0) ? shifted :
                "(" + shifted + "&0x" + hex(mask) + ")";
            // A zero mask means this level contributes no index bits; the
            // value fetched from the previous level is then the whole index.
            String index = (k == 0) ? masked :
                (mask == 0) ? access : "(" + access + "|" + masked + ")";
            // Inside "[...]" the outermost pair of parentheses is redundant.
            String indexNoParens = (index.charAt(0) != '(') ? index :
                index.substring(1, index.length() - 1);
            String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
            String fetched = tblname + "[" + indexNoParens + "]";
            String zeroextended = (zeroextend[k] == 0) ? fetched :
                "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
            // preshifted[k] is always true for the last level (see
            // generateForSizes), so sizes[k+1] is never read out of bounds.
            int adjustment = preshifted[k] ? 0 :
                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
            String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
                "(" + zeroextended + "<<" + adjustment + ")";
            // For the final table with sub-byte entries, shift and mask out
            // the requested bit field.
            String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
                "((" + adjusted + ">>" + bitshift + ")&" +
                (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
            access = extracted;
        }
        return access;
    }

    /* The command line arguments are decoded and used to set the following
       global variables.
     */

    static boolean verbose = false;
    static boolean nobidi = false;
    static boolean nomirror = false;
    static boolean identifiers = false;
    static boolean Csyntax = false;
    static String TemplateFileName = null;
    static String OutputFileName = null;
    static String UnicodeSpecFileName = null; // liu
    static String SpecialCasingFileName = null;
    static boolean useCharForByte = false;
    static int[] sizes;
    static int bins = 0; // liu; if > 0, then perform search
    static boolean tableAsString = false;
    static boolean bLatin1 = false;

    static String commandLineDescription;

    /* Other global variables, equal in length to the "sizes" array.
     */

    static int[] shifts;
    static int[] zeroextend;
    static int[] bytes;
    static boolean[] preshifted;
    static long[][] tables;


    /* Other global variables */
    static String commentStart;
    static String commentEnd;

    static StringBuffer initializers = new StringBuffer();

    /* special casing rules for 1:M toUpperCase mappings */
    static SpecialCaseMap[] specialCaseMaps;

    /**
     * Process the command line arguments.
     *
     * The allowed flags in command line are:
     * <dl>
     * <dt> -verbose, -v <dd> Emit comments to standard output describing
     *                  what's going on during the processing.
     * <dt> -nobidi <dd> Do not include bidi categories in the
     *                  encoded character properties.
     * <dt> -nomirror <dd> Do not include mirror property in the encoded
     *                  character properties.
     * <dt> -identifiers <dd> Generate tables for scanning identifiers only.
     * <dt> -c <dd> Output code in C syntax instead of Java syntax.
     * <dt> -o filename <dd> Specify output file name.
     * <dt> -template filename <dd> Specify template input file name.
     * <dt> -spec filename <dd> Specify Unicode spec file name.
     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
     * <dt> -search bins <dd> Try different partitions into the specified
     *                  number of bins (1 to 10).  E.g., for 2 bins, try
     *                  16 0, 15 1,..., 0 16.
     * <dt> -string <dd> Create table as string.  Only valid with Java
     *                  syntax.
     * <dt> -latin1 <dd> Create a latin 1 only property table.  Also resets
     *                  the plane number to 0.
     * <dt> -plane number <dd> Specify the plane number handed to the spec
     *                  file readers.  A plane number greater than 0
     *                  cancels -latin1.
     * <dt> -usecharforbyte <dd> With Java syntax, do not mask fetched
     *                  intermediate-table values with 0xFF when they fit
     *                  in 8 bits.
     * </dl>
     * In addition, decimal literals may appear as command line arguments;
     * each one represents the number of bits of the character to be broken
     * off at each lookup step.  If present, they must add up to 16 (the number
     * of bits in a char value).  For smaller tables, the last value should
     * be 0; values other than the last one may not be zero.  If no such
     * numeric values are provided, default values are used.
     *
     * Every problem with the arguments, including a non-numeric value after
     * -search or -plane, is reported through FAIL.  Defaults that were filled
     * in are appended, in square brackets, to commandLineDescription.
     *
     * @param args the command line arguments, as an array of String
     *
     * @see GenerateCharacter#main
     */

    static void processArgs(String[] args) {
        StringBuffer desc = new StringBuffer("java GenerateCharacter");
        for (int j=0; j<args.length; ++j) {
            desc.append(' ').append(args[j]);
        }
        for (int j = 0; j < args.length; j++) {
            if (args[j].equals("-verbose") || args[j].equals("-v"))
                verbose = true;
            else if (args[j].equals("-nobidi"))
                nobidi = true;
            else if (args[j].equals("-nomirror"))
                nomirror = true;
            else if (args[j].equals("-identifiers"))
                identifiers = true;
            else if (args[j].equals("-c"))
                Csyntax = true;
            else if (args[j].equals("-string"))
                tableAsString = true;
            else if (args[j].equals("-o")) {
                if (j == args.length - 1) {
                    FAIL("File name missing after -o");
                }
                else {
                    OutputFileName = args[++j];
                }
            }
            else if (args[j].equals("-search")) {
                if (j == args.length - 1)
                    FAIL("Bin count missing after -search");
                else {
                    String text = args[++j];
                    // Report a malformed number like any other argument
                    // error instead of letting the exception escape.
                    try {
                        bins = Integer.parseInt(text);
                    }
                    catch (NumberFormatException e) {
                        FAIL("Bin count after -search must be an integer: " + text);
                    }
                    if (bins < 1 || bins > 10)
                        FAIL("Bin count must be >= 1 and <= 10");
                }
            }
            else if (args[j].equals("-template")) {
                if (j == args.length - 1)
                    FAIL("File name missing after -template");
                else
                    TemplateFileName = args[++j];
            }
            else if (args[j].equals("-spec")) { // liu
                if (j == args.length - 1) {
                    FAIL("File name missing after -spec");
                }
                else {
                    UnicodeSpecFileName = args[++j];
                }
            }
            else if (args[j].equals("-specialcasing")) {
                if (j == args.length -1) {
                    FAIL("File name missing after -specialcasing");
                }
                else {
                    SpecialCasingFileName = args[++j];
                }
            }
            else if (args[j].equals("-plane")) {
                if (j == args.length -1) {
                    FAIL("Plane number missing after -plane");
                }
                else {
                    String text = args[++j];
                    // Same treatment as -search: no raw NumberFormatException.
                    try {
                        plane = Integer.parseInt(text);
                    }
                    catch (NumberFormatException e) {
                        FAIL("Plane number after -plane must be an integer: " + text);
                    }
                }
                // A supplementary plane and the Latin-1-only table are
                // mutually exclusive; the later option wins.
                if (plane > 0) {
                    bLatin1 = false;
                }
            }
            else if ("-usecharforbyte".equals(args[j])) {
                useCharForByte = true;
            }
            else if (args[j].equals("-latin1")) {
                bLatin1 = true;
                plane = 0;
            }
            else {
                // Anything else must be a bit field width; each one is
                // appended to the sizes array in command line order.
                try {
                    int val = Integer.parseInt(args[j]);
                    if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
                    if (sizes == null)
                        sizes = new int[1];
                    else {
                        int[] newsizes = new int[sizes.length + 1];
                        System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
                        sizes = newsizes;
                    }
                    sizes[sizes.length - 1] = val;
                }
                catch(NumberFormatException e) {
                    FAIL("Unknown switch: " + args[j]);
                }
            }
        }
        if (Csyntax && tableAsString) {
            FAIL("Can't specify table as string with C syntax");
        }
        if (sizes == null) {
            desc.append(" [");
            if (identifiers) {
                int[] newsizes = { 8, 4, 4 }; // Good default values
                desc.append("8 4 4]");
                sizes = newsizes;
            }
            else {
                int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
                desc.append("10 5 1]");
                sizes = newsizes;
            }
        }
        if (UnicodeSpecFileName == null) { // liu
            UnicodeSpecFileName = DefaultUnicodeSpecFileName;
            desc.append(" [-spec " + UnicodeSpecFileName + ']');
        }
        if (SpecialCasingFileName == null) {
            SpecialCasingFileName = DefaultSpecialCasingFileName;
            desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
        }
        if (TemplateFileName == null) {
            TemplateFileName = (Csyntax ? DefaultCTemplateFileName
                                : DefaultJavaTemplateFileName);
            desc.append(" [-template " + TemplateFileName + ']');
        }
        if (OutputFileName == null) {
            OutputFileName = (Csyntax ? DefaultCOutputFileName
                              : DefaultJavaOutputFileName);
            desc.append(" [-o " + OutputFileName + ']');
        }
        commentStart = (Csyntax ? "/*" : "//");
        commentEnd = (Csyntax ? " */" : "");
        commandLineDescription = desc.toString();
    }

    /**
     * Recursively enumerates the ways of splitting 16 bits over the first
     * "bins" entries of the sizes array and calls generateForSizes for each
     * complete split.
     *
     * Entries 0 .. binsOccupied-1 of sizes are taken as already chosen.
     * If only the last bin is left it receives all remaining bits (possibly
     * zero); otherwise the current bin is tried with every width from 1 up
     * to one less than the number of bits still free.
     *
     * @param map the property map, passed through to generateForSizes
     * @param binsOccupied the number of leading entries of sizes that are
     *          already fixed
     */

    private static void searchBins(long[] map, int binsOccupied) throws Exception {
        int bitsFree = 16;
        for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
        if (binsOccupied == (bins-1)) {
            sizes[binsOccupied] = bitsFree;
            generateForSizes(map);
        }
        else {
            for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
                sizes[binsOccupied] = i;
                searchBins(map, binsOccupied+1);
            }
        }
    }

    /**
     * Builds the multi-level lookup tables for the current sizes array and
     * then either reports statistics (search mode, bins > 0) or generates
     * the output file.
     *
     * The steps are:
     * <ol>
     * <li> Compute shifts[k] as the sum of the sizes of all later levels and
     *      check, via FAIL, that the total width matches the map length.
     * <li> Starting from the map as the last table, call buildTable once per
     *      level, working from the last level toward the first; element 0 of
     *      each result becomes the table of the preceding level and element 1
     *      replaces the table of the current level.
     * <li> Decide for every level but the last whether its entries are
     *      stored preshifted, which zero-extension mask the generated Java
     *      code needs, and how many bytes (1, 2 or 4) an entry takes.
     * <li> In search mode, print the sizes, the result of getTotalBytes, an
     *      operator count for the access expression and the expression
     *      itself, then return without generating any file.
     * <li> Otherwise call generateCharacterClass.
     * </ol>
     *
     * @param map the property map, one long per character code
     */

    private static void generateForSizes(long[] map) throws Exception {
        int sum = 0;
        shifts = new int[sizes.length];
        for (int k = sizes.length - 1; k >= 0; k--) {
            shifts[k] = sum;
            sum += sizes[k];
        }
        // The widths must give the smallest power of two that covers the map.
        if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
            FAIL("Bit field widths total to " + sum +
                 ": wrong total for map of size " + map.length);
        }
        // need a table for each set of lookup bits in char
        tables = new long[sizes.length][];
        // the last table is the map
        tables[sizes.length - 1] = map;
        for (int j = sizes.length - 1; j > 0; j--) {
            if (verbose && bins==0)
                System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
            long[][] temp = buildTable(tables[j], sizes[j]);
            tables[j-1] = temp[0];
            tables[j] = temp[1];
        }
        preshifted = new boolean[sizes.length];
        zeroextend = new int[sizes.length];
        bytes = new int[sizes.length];
        for (int j = 0; j < sizes.length - 1; j++) {
            // The length of the next table bounds the values stored in table j.
            int len = tables[j+1].length;
            int size = sizes[j+1];
            // Store unshifted indices when that lets an entry fit into one
            // byte (or two bytes) where the shifted value would not.
            if (len > 0x100 && (len >> size) <= 0x100) {
                len >>= size;
                preshifted[j] = false;
            }
            else if (len > 0x10000 && (len >> size) <= 0x10000) {
                len >>= size;
                preshifted[j] = false;
            }
            else preshifted[j] = true;
            // A mask is only chosen for Java syntax, and only when the range
            // exceeds the positive half of the entry width.
            if (Csyntax)
                zeroextend[j] = 0;
            else if (len > 0x7F && len <= 0xFF) {
                // With -usecharforbyte the entry keeps the 0 the array was
                // created with, i.e. no mask.
                if (!useCharForByte) {
                    zeroextend[j] = 0xFF;
                }
            } else if (len > 0x7FFF && len <= 0xFFFF)
                zeroextend[j] = 0xFFFF;
            else zeroextend[j] = 0;
            if (len <= 0x100) bytes[j] = 1;
            else if (len <= 0x10000) bytes[j] = 2;
            else bytes[j] = 4;
        }
        preshifted[sizes.length - 1] = true;
        zeroextend[sizes.length - 1] = 0;
        bytes[sizes.length - 1] = 0;
        if (bins > 0) {
            int totalBytes = getTotalBytes();
            String access = genAccess("A", "ch", (identifiers ? 2 : 32));
            // Count the operators in the access expression; the extra ++j
            // skips the second character of "<<" and ">>" so that each shift
            // is counted once.
            int accessComplexity = 0;
            for (int j=0; j<access.length(); ++j) {
                char ch = access.charAt(j);
                if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
                if (ch == '<' || ch == '>') ++j;
            }
            System.out.print("(");
            for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
            System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
            return;
        }
        if (verbose) {
            System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
            for (int j = 0; j < sizes.length; j++) {
                System.out.println(dec5(j) + "\t" +
                                   dec5(sizes[j]) + "\t" +
                                   dec5(tables[j].length) + "\t" +
                                   dec5(shifts[j]) + "\t" +
                                   dec5(zeroextend[j]) + "\t" +
                                   dec5(bytes[j]) + "\t " +
                                   preshifted[j]);
            }
        }
        if (verbose) {
            System.out.println("Generating source code for class Character");
            System.out.println("A table access looks like " +
                               genAccess("A", "ch", (identifiers ? 2 : 32)));
        }
        generateCharacterClass(TemplateFileName, OutputFileName);
    }

    /**
     * The main program for generating source code for the Character class.
     * The basic outline of its operation is:
     * <ol>
     * <li> Process the command line arguments.  One result of this process
     *      is a list of sizes (measured in bits and summing to 16).
     * <li> Get the Unicode character property data from the specification file.
     * <li> From that, build a map that has, for each character code, its
     *      relevant properties encoded as a long integer value.
     * <li> Repeatedly compress the map, producing a compressed table and a
     *      new map.  This is done once for each size value in the list.
     *      When this is done, we have a set of tables.
     * <li> Make some decisions about table representation; record these
     *      decisions in arrays named preshifted, zeroextend, and bytes.
     * <li> Generate the source code for the class Character by performing
     *      macro processing on a template file.
     * </ol>
     * When -search was given, the compression and decision steps are instead
     * repeated for every partition into bins, bins-1, ..., 1 tables and only
     * statistics are printed.
     *
     * @param args the command line arguments, as an array of String
     *
     * @see GenerateCharacter#processArgs
     * @see UnicodeSpec#readSpecFile
     * @see GenerateCharacter#buildMap
     * @see GenerateCharacter#buildTable
     * @see GenerateCharacter#generateCharacterClass
     */

    public static void main(String[] args) {
        processArgs(args);
        try {

            UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);

            specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
            if (verbose) {
                System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
            }
            long[] map = buildMap(data, specialCaseMaps);
            if (verbose) {
                System.err.println("Completed building of initial map");
            }

            if (bins == 0) {
                generateForSizes(map);
            }
            else {
                // Search mode: try every partition for each table count
                // from the requested number of bins down to 1.
                while (bins > 0) {
                    sizes = new int[bins];
                    searchBins(map, 0);
                    --bins;
                }
            }
            // Diagnostic output, deliberately disabled by the constant false.
            if (verbose && false) {
                System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
                                   hex8(maxOffsetSeen));
                System.out.println(" allowed: -" + hex8(-minOffset) + "..+" +
                                   hex8(maxOffset));
            }
        }
        catch (FileNotFoundException e) { FAIL(e.toString()); }
        catch (IOException e) { FAIL(e.toString()); }
        catch (Throwable e) {
            System.out.println("Unexpected exception:");
            e.printStackTrace();
            FAIL("Unexpected exception!");
        }
        if (verbose) { System.out.println("Done!");}
    }

}   // end class