1 2 /* 3 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 package build.tools.generatecharacter; 28 29 import java.io.IOException; 30 import java.io.FileNotFoundException; 31 import java.io.BufferedReader; 32 import java.io.FileReader; 33 import java.io.PrintWriter; 34 import java.io.BufferedWriter; 35 import java.io.FileWriter; 36 import java.io.File; 37 38 import build.tools.generatecharacter.CharacterName; 39 40 /** 41 * This program generates the source code for the class java.lang.Character. 42 * It also generates native C code that can perform the same operations. 43 * It requires two external input data files: 44 * <ul> 45 * <li> Unicode specification file 46 * <li> Character class template file 47 * </ul> 48 * The Unicode specification file is available from the Unicode consortium. 49 * It has character specification lines that look like this: 50 * <listing> 51 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; 52 * </listing> 53 * The Character class template file is filled in with additional 54 * information to produce the file Character.java, which can then be 55 * compiled by a Java compiler. The template file contains certain 56 * markers consisting of an alphabetic name string preceded by "$$". 57 * Such markers are replaced with generated program text. As a special 58 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of 59 * alphabetic characters constituting a variable name. The character "_" 60 * is considered alphabetic for these purposes. 61 * 62 * @author Guy Steele 63 * @author Alan Liu 64 * @author John O'Conner 65 */ 66 67 public class GenerateCharacter { 68 69 final static boolean DEBUG = false; 70 71 final static int MAX_UNICODE_VALUE = 0xFFFF; 72 final static String commandMarker = "$$"; 73 static String ROOT = ""; 74 static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt"; 75 static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt"; 76 static String DefaultJavaTemplateFileName = ROOT + "Character.java.template"; 77 static String DefaultJavaOutputFileName = ROOT + "Character.java"; 78 static String DefaultCTemplateFileName = ROOT + "Character.c.template"; 79 static String DefaultCOutputFileName = ROOT + "Character.c"; 80 81 static String CharacterDataClassName = "CharacterData"; 82 static int plane = 0; 83 84 /* The overall idea is that, in the generated Character class source code, 85 most character property data is stored in a special multi-level table whose 86 structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn]. 87 The integers must sum to 16 (the number of bits in a character). 88 The first table is indexed by the k1 high-order bits of the character code. 89 The result is concatenated to the next k2 bits of the character code to index 90 the second table, and so on. Eventually the kn low-order bits of the character 91 code are concatenated and used to index one of two tables A and B; A contains 92 32-bit integer entries and B contains 16-bit short entries. The 48 bits that 93 can be thus obtained encode the properties for the character. 94 95 The default specification is [9, 4, 3, 0]. This particular table format was 96 designed by conducting an exhaustive search of table formats to minimize the 97 space consumed by the tables: the first and third tables need have only byte 98 values (the second table must have short values). Another good choice is 99 [10, 6, 0], which produces a larger table but allows particularly fast table 100 lookup code. 101 102 In each case, where the word "concatenated" is used, this may imply 103 first a << and then a | operation, or perhaps just a | operation if 104 the values in the table can be preshifted (generally possible if the table 105 entries are short rather than byte). 106 */ 107 108 /* The character properties are currently encoded into 32 bits in the following manner: 109 1 bit Mirrored property. 110 4 bits Bidirectional category (see below) (unused if -nobidi switch specified) 111 9 bits A signed offset used for converting case . 112 1 bit If 1, adding the signed offset converts the character to lowercase. 113 1 bit If 1, subtracting the signed offset converts the character to uppercase. 114 Note: for a titlecase character, both of the preceding bits will be 1 115 and the signed offset will be 1. 116 1 bit If 1, this character has a titlecase equivalent (possibly itself); 117 in this case, the two bits before this bit can be used to decide 118 whether this character is in fact uppercase, lowercase, or titlecase. 119 3 bits This field provides a quick way to lex identifiers. 120 The eight possible values for this field are as follows: 121 0 May not be part of an identifier 122 1 Ignorable control; may continue a Unicode identifier or Java identifier 123 2 May continue a Java identifier but not a Unicode identifier (unused) 124 3 May continue a Unicode identifier or Java identifier 125 4 Is a Java whitespace character 126 5 May start or continue a Java identifier; 127 may continue but not start a Unicode identifier 128 (this value is used for connector punctuation such as _) 129 6 May start or continue a Java identifier; 130 may not occur in a Unicode identifier 131 (this value is used for currency symbols such as $) 132 7 May start or continue a Unicode identifier or Java identifier 133 Thus: 134 5, 6, 7 may start a Java identifier 135 1, 2, 3, 5, 6, 7 may continue a Java identifier 136 7 may start a Unicode identifier 137 1, 3, 5, 7 may continue a Unicode identifier 138 1 is ignorable within an identifier 139 4 is Java whitespace 140 2 bits This field indicates whether the character has a numeric property. 141 The four possible values for this field are as follows: 142 0 This character has no numeric property. 143 1 Adding the digit offset to the character code and then 144 masking with 0x1F will produce the desired numeric value. 145 2 This character has a "strange" numeric value. 146 3 A Java supradecimal digit: adding the digit offset to the 147 character code, then masking with 0x1F, then adding 10 148 will produce the desired numeric value. 149 5 bits The digit offset (see description of previous field) 150 5 bits Character type (see below) 151 */ 152 153 154 // bit masks identify each component of a 32-bit property field described 155 // above. 156 // shift* indicates how many shifts right must happen to get the 157 // indicated property value in the lowest bits of the 32-bit space. 158 private static final int 159 shiftType = 0, maskType = 0x001F, 160 shiftDigitOffset = 5, maskDigitOffset = 0x03E0, 161 shiftNumericType = 10, maskNumericType = 0x0C00, 162 shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000, 163 maskUnicodePart = 0x1000, 164 shiftCaseInfo = 15, maskCaseInfo = 0x38000, 165 maskLowerCase = 0x20000, 166 maskUpperCase = 0x10000, 167 maskTitleCase = 0x08000, 168 shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000, 169 shiftCaseOffsetSign = 5, 170 // used only when calculating and 171 // storing digit offsets from char values 172 maskDigit = 0x001F, 173 // case offset are 9 bits 174 maskCase = 0x01FF, 175 shiftBidi = 27, maskBidi = 0x78000000, 176 shiftMirrored = 31, maskMirrored = 0x80000000, 177 shiftPlane = 16, maskPlane = 0xFF0000; 178 179 // Can compare masked values with these to determine 180 // numeric or lexical types. 181 public static int 182 valueNotNumeric = 0x0000, 183 valueDigit = 0x0400, 184 valueStrangeNumeric = 0x0800, 185 valueJavaSupradecimal = 0x0C00, 186 valueIgnorable = 0x1000, 187 valueJavaOnlyPart = 0x2000, 188 valueJavaUnicodePart = 0x3000, 189 valueJavaWhitespace = 0x4000, 190 valueJavaStartUnicodePart = 0x5000, 191 valueJavaOnlyStart = 0x6000, 192 valueJavaUnicodeStart = 0x7000, 193 lowJavaStart = 0x5000, 194 nonzeroJavaPart = 0x3000, 195 valueUnicodeStart = 0x7000; 196 197 // these values are used when only identifier properties are generated 198 // for use in verifier code. Shortens the property down to a single byte. 199 private static final int 200 bitJavaStart = 0x02, 201 bitJavaPart = 0x01, 202 maskIsJavaIdentifierPart = bitJavaPart, 203 maskIsJavaIdentifierStart = bitJavaStart; 204 205 static int maxOffset = maskCase/2 ; 206 static int minOffset = -maxOffset; 207 208 /* The following routines provide simple, concise formatting of long integer values. 209 The number in the name of the method indicates the desired number of characters 210 to be produced. If the number of digits required to represent the integer value 211 is less than that number, then the output is padded on the left with zeros 212 (for hex) or with spaces (for decimal). If the number of digits required to 213 represent the integer value is greater than the desired number, then all the digits 214 that are required are actually produced. 215 */ 216 217 static String hex(long n) { return Long.toHexString(n).toUpperCase(); } 218 219 static String hex2(long n) { 220 String q = Long.toHexString(n & 0xFF).toUpperCase(); 221 return "00".substring(Math.min(2, q.length())) + q; 222 } 223 224 static String hex4(long n) { 225 String q = Long.toHexString(n & 0xFFFF).toUpperCase(); 226 return "0000".substring(Math.min(4, q.length())) + q; 227 } 228 229 static String hex8(long n) { 230 String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase(); 231 return "00000000".substring(Math.min(8, q.length())) + q; 232 } 233 234 static String hex16(long n) { 235 String q = Long.toHexString(n).toUpperCase(); 236 return "0000000000000000".substring(Math.min(16, q.length())) + q; 237 } 238 239 static String dec3(long n) { 240 String q = Long.toString(n); 241 return " ".substring(Math.min(3, q.length())) + q; 242 } 243 244 static String dec5(long n) { 245 String q = Long.toString(n); 246 return " ".substring(Math.min(5, q.length())) + q; 247 } 248 249 /* This routine is called when some failure occurs. */ 250 251 static void FAIL(String s) { 252 System.out.println("** " + s); 253 } 254 255 /** 256 * Given the data from the Unicode specification file, this routine builds a map. 257 * 258 * The specification file is assumed to contain its data in sorted order by 259 * character code; as a result, the array passed as an argument to this method 260 * has its components in the same sorted order, with one entry for each defined 261 * Unicode character or character range. (A range is indicated by two consecutive 262 * entries, such that the name of the first entry begins with "<" and ends with 263 * "First>" and the second entry begins with "<" and ends with "Last>".) This is 264 * therefore a sparse representation of the character property data. 265 * 266 * The resulting map is dense representation of the character data. It contains 267 * 2^16 = 65536 entries, each of which is a long integer. (Right now only 32 bits 268 * of this long value are used, but type long is used rather than int to facilitate 269 * future extensions of this source code generator that might require more than 270 * 32 bits to encode relevant character properties.) Entry k holds the encoded 271 * properties for character k. 272 * 273 * Method buildMap manages the transformation from the sparse representation to 274 * the dense representation. It calls method buildOne to handle the encoding 275 * of character property data from a single UnicodeSpec object into 32 bits. 276 * For undefined characters, method buildOne is not called and the map entry for 277 * that character is set to UnicodeSpec.UNASSIGNED. 278 * 279 * @param data character property data from the Unicode specification file 280 * @return an array of length 65536 with one entry for every possible char value 281 * 282 * @see GenerateCharacter#buildOne 283 */ 284 285 static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps) { 286 long[] result; 287 if (bLatin1 == true) { 288 result = new long[256]; 289 } else { 290 result = new long[1<<16]; 291 } 292 int k=0; 293 int codePoint = plane<<16; 294 UnicodeSpec nonCharSpec = new UnicodeSpec(); 295 for (int j = 0; j < data.length && k < result.length; j++) { 296 if (data[j].codePoint == codePoint) { 297 result[k] = buildOne(codePoint, data[j], specialMaps); 298 ++k; 299 ++codePoint; 300 } 301 else if(data[j].codePoint > codePoint) { 302 if (data[j].name.endsWith("Last>")) { 303 // build map data for all chars except last in range 304 while (codePoint < data[j].codePoint && k < result.length) { 305 result[k] = buildOne(codePoint, data[j], specialMaps); 306 ++k; 307 ++codePoint; 308 } 309 } 310 else { 311 // we have a few unassigned chars before data[j].codePoint 312 while (codePoint < data[j].codePoint && k < result.length) { 313 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 314 ++k; 315 ++codePoint; 316 } 317 } 318 k = data[j].codePoint & 0xFFFF; 319 codePoint = data[j].codePoint; 320 result[k] = buildOne(codePoint, data[j], specialMaps); 321 ++k; 322 ++codePoint; 323 324 } 325 else { 326 System.out.println("An error has occured during spec mapping."); 327 System.exit(0); 328 } 329 } 330 // if there are still unprocessed chars, process them 331 // as unassigned/undefined. 332 codePoint = (plane<<16) | k; 333 while (k < result.length) { 334 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 335 ++k; 336 ++codePoint; 337 } 338 return result; 339 } 340 341 // The maximum and minimum offsets found while scanning the database 342 static int maxOffsetSeen = 0; 343 static int minOffsetSeen = 0; 344 345 /** 346 * Some Unicode separator characters are not considered Java whitespace. 347 * @param c character to test 348 * @return true if c in an invalid Java whitespace character, false otherwise. 349 */ 350 static boolean isInvalidJavaWhiteSpace(int c) { 351 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF}; 352 boolean retValue = false; 353 for(int x=0;x<exceptions.length;x++) { 354 if(c == exceptions[x]) { 355 retValue = true; 356 break; 357 } 358 } 359 return retValue; 360 361 } 362 363 /** 364 * Given the character property data for one Unicode character, encode the data 365 * of interest into a single long integer value. (Right now only 32 bits 366 * of this long value are used, but type long is used rather than int to facilitate 367 * future extensions of this source code generator that might require more than 368 * 32 bits to encode relevant character properties.) 369 * 370 * @param c the character code for which to encode property data 371 * @param us property data record from the Unicode specification file 372 * (its character code might not be equal to c if it specifies data 373 * for a range of characters) 374 * @return an encoded long value that contains the properties for a single char 375 * 376 * @see GenerateCharacter#buildMap 377 */ 378 379 static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) { 380 long resultA = 0; 381 // record the general category 382 resultA |= us.generalCategory; 383 384 // record the numeric properties 385 NUMERIC: { 386 STRANGE: { 387 int val = 0; 388 // c is A-Z 389 if ((c >= 0x0041) && (c <= 0x005A)) { 390 val = c - 0x0041; 391 resultA |= valueJavaSupradecimal; 392 // c is a-z 393 } else if ((c >= 0x0061) && (c <= 0x007A)) { 394 val = c - 0x0061; 395 resultA |= valueJavaSupradecimal; 396 // c is a full-width A-Z 397 } else if ((c >= 0xFF21) && (c <= 0xFF3A)) { 398 val = c - 0xFF21; 399 resultA |= valueJavaSupradecimal; 400 // c is a full-width a-z 401 } else if ((c >= 0xFF41) && (c <= 0xFF5A)) { 402 val = c - 0xFF41; 403 resultA |= valueJavaSupradecimal; 404 } else if (us.isDecimalValue()) { 405 val = us.decimalValue; 406 resultA |= valueDigit; 407 } else if (us.isDigitValue()) { 408 val = us.digitValue; 409 resultA |= valueDigit; 410 } else { 411 if (us.numericValue.length() == 0) { 412 break NUMERIC; // no numeric value at all 413 } else { 414 try { 415 val = Integer.parseInt(us.numericValue); 416 if (val >= 32 || val < 0) break STRANGE; 417 if (c == 0x215F) break STRANGE; 418 } catch(NumberFormatException e) { 419 break STRANGE; 420 } 421 resultA |= valueDigit; 422 } 423 } 424 if (val >= 32 || val < 0) break STRANGE; 425 resultA |= ((val - c & maskDigit) << shiftDigitOffset); 426 break NUMERIC; 427 } // end STRANGE 428 resultA |= valueStrangeNumeric; 429 } // end NUMERIC 430 431 // record case mapping 432 int offset = 0; 433 // might have a 1:M mapping 434 int specialMap = SpecialCaseMap.find(c, specialCaseMaps); 435 boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1); 436 if (bHasUpper) { 437 resultA |= maskUpperCase; 438 } 439 if (specialMap != -1) { 440 // has mapping, but cannot record the 441 // proper offset; can only flag it and provide special case 442 // code in Character.java 443 offset = -1; 444 } 445 else if (us.hasUpperMap()) { 446 offset = c - us.upperMap; 447 } 448 449 if (us.hasLowerMap()) { 450 resultA |= maskLowerCase; 451 if (offset == 0) 452 offset = us.lowerMap - c; 453 else if (offset != (us.lowerMap - c)) { 454 if (DEBUG) { 455 FAIL("Character " + hex(c) + 456 " has incompatible lowercase and uppercase mappings"); 457 } 458 } 459 } 460 if ((us.hasTitleMap() && us.titleMap != us.upperMap) || 461 (bHasUpper && us.hasLowerMap())) { 462 resultA |= maskTitleCase; 463 } 464 if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) { 465 System.out.println("Warning: Character " + hex4(c) + " has upper but " + 466 "no title case; Java won't know this"); 467 } 468 if (offset < minOffsetSeen) minOffsetSeen = offset; 469 if (offset > maxOffsetSeen) maxOffsetSeen = offset; 470 if (offset > maxOffset || offset < minOffset) { 471 if (DEBUG) { 472 FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case"); 473 } 474 offset = maskCase; 475 } 476 resultA |= ((offset & maskCase) << shiftCaseOffset); 477 478 479 // record lexical info about this character 480 if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER 481 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER 482 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER 483 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER 484 || us.generalCategory == UnicodeSpec.OTHER_LETTER 485 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) { 486 resultA |= valueJavaUnicodeStart; 487 } 488 else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK 489 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK 490 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) { 491 resultA |= valueJavaUnicodePart; 492 } 493 else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) { 494 resultA |= valueJavaStartUnicodePart; 495 } 496 else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) { 497 resultA |= valueJavaOnlyStart; 498 } 499 else if (((c >= 0x0000) && (c <= 0x0008)) 500 || ((c >= 0x000E) && (c <= 0x001B)) 501 || ((c >= 0x007F) && (c <= 0x009F)) 502 || us.generalCategory == UnicodeSpec.FORMAT) { 503 resultA |= valueIgnorable; 504 } 505 else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR 506 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR 507 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) { 508 if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace; 509 } 510 else if (((c >= 0x0009) && (c <= 0x000D)) 511 || ((c >= 0x001C) && (c <= 0x001F))) { 512 resultA |= valueJavaWhitespace; 513 } 514 515 // record bidi category 516 if (!nobidi) { 517 int tmpBidi = 518 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS || 519 us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi); 520 resultA |= tmpBidi; 521 } 522 523 // record mirrored property 524 if (!nomirror) { 525 resultA |= us.mirrored ? maskMirrored : 0; 526 } 527 528 if (identifiers) { 529 long replacement = 0; 530 if ((resultA & maskIdentifierInfo) >= lowJavaStart) { 531 replacement |= bitJavaStart; 532 } 533 if ( ((resultA & nonzeroJavaPart) != 0) 534 && ((resultA & maskIdentifierInfo) != valueIgnorable)) { 535 replacement |= bitJavaPart; 536 } 537 resultA = replacement; 538 } 539 return resultA; 540 } 541 542 /** 543 * This is the heart of the table compression strategy. The inputs are a map 544 * and a number of bits (size). The map is simply an array of long integer values; 545 * the number of bits indicates how index values for that map are to be split. 546 * The length of the given map must be a multiple of (1 << size). The result is 547 * a new map z and a compressed table t such that for every valid index value k 548 * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k]. 549 * 550 * In other words, the index k can be split into two parts, namely the "size" 551 * low-order bits and all the remaining high-order bits; the high-order bits are then 552 * remapped by map z to produce an index into table t. In effect, the data of the 553 * original map m is broken up into blocks of size (1<<size); the compression relies 554 * on the expectation that many of these blocks will be identical and therefore need 555 * be represented only once in the compressed table t. 556 * 557 * This method is intended to be used iteratively. The first map to be handed 558 * to it is the one constructed by method buildMap. After that, the first of the 559 * two arrays returned by this method is fed back into it for further compression. 560 * At the end of the iteration, one has a starter map and a sequence of tables. 561 * 562 * The algorithm used to implement this computation is straightforward and not 563 * especially clever. It uses brute-force linear search (the loop labeled MIDDLE) 564 * to locate identical blocks, so overall the time complexity of the algorithm 565 * is quadratic in the length of the input map. Fortunately, speed is not crucial 566 * to this application. 567 * 568 * @param map a map to be compressed 569 * @param size the number of index bits to be split off by the compression 570 * @return an array of length 2 containing two arrays; the first is a new map 571 * and the second is a compressed data table 572 * 573 * @see GenerateCharacter#buildMap 574 */ 575 576 static long[][] buildTable(long[] map, int size) { 577 int n = map.length; 578 if (((n >> size) << size) != n) { 579 FAIL("Length " + n + " is not a multiple of " + (1 << size)); 580 } 581 int m = 1 << size; 582 // We know the final length of the new map up front. 583 long[] newmap = new long[n >> size]; 584 // The buffer is used temporarily to hold data for the compressed table 585 // because we don't know its final length yet. 586 long[] buffer = new long[n]; 587 int ptr = 0; 588 OUTER: for (int i = 0; i < n; i += m) { 589 // For every block of size m in the original map... 590 MIDDLE: for (int j = 0; j < ptr; j += m) { 591 // Find out whether there is already a block just like it in the buffer. 592 for (int k = 0; k < m; k++) { 593 if (buffer[j+k] != map[i+k]) 594 continue MIDDLE; 595 } 596 // There is a block just like it at position j, so just 597 // put its index into the new map (thereby sharing it). 598 newmap[i >> size] = (j >> size); 599 continue OUTER; 600 } // end MIDDLE 601 // There is no block just like it already, so add it to 602 // the buffer and put its index into the new map. 603 for (int k = 0; k < m; k++) { 604 buffer[ptr+k] = map[i+k]; 605 } 606 newmap[i >> size] = (ptr >> size); 607 ptr += m; 608 } // end OUTER 609 // Now we know how long the compressed table should be, 610 // so create a new array and copy data from the temporary buffer. 611 long[] newdata = new long[ptr]; 612 for (int j = 0; j < ptr; j++) { 613 newdata[j] = buffer[j]; 614 } 615 // Return the new map and the new data table. 616 long[][] result = { newmap, newdata }; 617 return result; 618 } 619 620 /** 621 * Once the compressed tables have been computed, this method reads in a 622 * template file for the source code to be generated and writes out the final 623 * source code by acting as a sort of specialized macro processor. 624 * 625 * The first output line is a comment saying that the file was automatically 626 * generated; it includes a timestamp. All other output is generated by 627 * reading a line from the template file, performing macro replacements, 628 * and then writing the resulting line or lines of code to the output file. 629 * 630 * This method handles the I/O, the timestamp comment, and the locating of 631 * macro calls within each input line. The method replaceCommand is called 632 * to generate replacement text for each macro call. 633 * 634 * Macro calls to be replaced are indicated in the template file by 635 * occurrences of the commandMarker "$$". The rest of the call may consist 636 * of Java letters (including the underscore "_") and also of balanced 637 * parentheses. 638 * 639 * @param theTemplateFileName 640 * the file name for the template input file 641 * @param theOutputFileName 642 * the file name for the source code output file 643 * 644 * @see GenerateCharacter#replaceCommand 645 */ 646 647 static void generateCharacterClass(String theTemplateFileName, 648 String theOutputFileName) 649 throws FileNotFoundException, IOException { 650 BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName)); 651 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName))); 652 out.println(commentStart + 653 " This file was generated AUTOMATICALLY from a template file " + 654 new java.util.Date() + commentEnd); 655 int marklen = commandMarker.length(); 656 LOOP: while(true) { 657 try { 658 String line = in.readLine(); 659 if (line == null) break LOOP; 660 int pos = 0; 661 int depth = 0; 662 while ((pos = line.indexOf(commandMarker, pos)) >= 0) { 663 int newpos = pos + marklen; 664 char ch = 'x'; 665 SCAN: while (newpos < line.length() && 666 (Character.isJavaIdentifierStart(ch = line.charAt(newpos)) 667 || ch == '(' || (ch == ')' && depth > 0))) { 668 ++newpos; 669 if (ch == '(') { 670 ++depth; 671 } 672 else if (ch == ')') { 673 --depth; 674 if (depth == 0) 675 break SCAN; 676 } 677 } 678 String replacement = replaceCommand(line.substring(pos + marklen, newpos)); 679 line = line.substring(0, pos) + replacement + line.substring(newpos); 680 pos += replacement.length(); 681 } 682 out.println(line); 683 } 684 catch (IOException e) { 685 break LOOP; 686 } 687 } 688 in.close(); 689 out.close(); 690 } 691 692 /** 693 * The replaceCommand method takes a command (a macro call without the 694 * leading marker "$$") and computes replacement text for it. 695 * 696 * Most of the commands are simply names of integer constants that are defined 697 * in the source code of this GenerateCharacter class. The replacement text is 698 * simply the value of the constant as an appropriately formatted integer literal. 699 * 700 * Two cases are more complicated, however. The command "Tables" causes the 701 * final map and compressed tables to be emitted, with elaborate comments 702 * describing their contents. (This is actually handled by method genTables.) 703 * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates 704 * an expression that will return the character property data for the character 705 * whose code is the value of the variable "xxx". (this is handled by method 706 * "genAccess".) 707 * 708 * @param x a command from the template file to be replaced 709 * @return the replacement text, as a String 710 * 711 * @see GenerateCharacter#genTables 712 * @see GenerateCharacter#genAccess 713 * @see GenerateCharacter#generateCharacterClass 714 */ 715 716 static String replaceCommand(String x) { 717 if (x.equals("Tables")) return genTables(); 718 if (x.equals("Initializers")) return genInitializers(); 719 if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") && 720 x.substring(x.length()-1).equals(")") ) 721 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32)); 722 if (x.equals("shiftType")) return Long.toString(shiftType); 723 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo); 724 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo); 725 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart); 726 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset); 727 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo); 728 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign); 729 if (x.equals("maskCase")) return "0x" + hex8(maskCase); 730 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset); 731 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase); 732 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase); 733 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase); 734 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable); 735 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart); 736 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart); 737 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart); 738 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart); 739 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace); 740 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart); 741 if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart); 742 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart); 743 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart); 744 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart); 745 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart); 746 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart); 747 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset); 748 if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset); 749 if (x.equals("maskDigit")) return "0x" + hex(maskDigit); 750 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType); 751 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType); 752 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric); 753 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 754 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 755 if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal); 756 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit); 757 if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric); 758 if (x.equals("maskType")) return "0x" + hex(maskType); 759 if (x.equals("shiftBidi")) return Long.toString(shiftBidi); 760 if (x.equals("maskBidi")) return "0x" + hex(maskBidi); 761 if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored); 762 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG])) 763 return Integer.toString(UnicodeSpec.UNASSIGNED); 764 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG])) 765 return Integer.toString(UnicodeSpec.UPPERCASE_LETTER); 766 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG])) 767 return Integer.toString(UnicodeSpec.LOWERCASE_LETTER); 768 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG])) 769 return Integer.toString(UnicodeSpec.TITLECASE_LETTER); 770 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG])) 771 return Integer.toString(UnicodeSpec.MODIFIER_LETTER); 772 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG])) 773 return Integer.toString(UnicodeSpec.OTHER_LETTER); 774 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG])) 775 return Integer.toString(UnicodeSpec.NON_SPACING_MARK); 776 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG])) 777 return Integer.toString(UnicodeSpec.ENCLOSING_MARK); 778 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG])) 779 return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK); 780 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG])) 781 return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER); 782 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG])) 783 return Integer.toString(UnicodeSpec.OTHER_NUMBER); 784 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG])) 785 return Integer.toString(UnicodeSpec.SPACE_SEPARATOR); 786 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG])) 787 return Integer.toString(UnicodeSpec.LINE_SEPARATOR); 788 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 789 return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR); 790 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG])) 791 return Integer.toString(UnicodeSpec.CONTROL); 792 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG])) 793 return Integer.toString(UnicodeSpec.FORMAT); 794 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG])) 795 return Integer.toString(UnicodeSpec.PRIVATE_USE); 796 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG])) 797 return Integer.toString(UnicodeSpec.SURROGATE); 798 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG])) 799 return Integer.toString(UnicodeSpec.DASH_PUNCTUATION); 800 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG])) 801 return Integer.toString(UnicodeSpec.START_PUNCTUATION); 802 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG])) 803 return Integer.toString(UnicodeSpec.END_PUNCTUATION); 804 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 805 return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION); 806 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG])) 807 return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION); 808 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG])) 809 return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION); 810 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG])) 811 return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION); 812 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG])) 813 return Integer.toString(UnicodeSpec.LETTER_NUMBER); 814 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG])) 815 return Integer.toString(UnicodeSpec.MATH_SYMBOL); 816 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG])) 817 return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL); 818 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG])) 819 return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL); 820 if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG])) 821 return Integer.toString(UnicodeSpec.OTHER_SYMBOL); 822 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG])) 823 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT); 824 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG])) 825 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING); 826 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG])) 827 return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE); 828 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG])) 829 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT); 830 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG])) 831 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC); 832 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG])) 833 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING); 834 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG])) 835 return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE); 836 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG])) 837 return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT); 838 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG])) 839 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER); 840 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 841 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR); 842 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG])) 843 return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR); 844 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG])) 845 return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER); 846 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG])) 847 return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR); 848 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG])) 849 return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK); 850 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG])) 851 return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL); 852 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG])) 853 return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR); 854 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG])) 855 return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR); 856 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG])) 857 return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE); 858 if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG])) 859 return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS); 860 FAIL("Unknown text substitution marker " + commandMarker + x); 861 return commandMarker + x; 862 } 863 864 /** 865 * The genTables method generates source code for all the lookup tables 866 * needed to represent the various Unicode character properties. 867 * It simply calls the method genTable once for each table to be generated 868 * and then generates a summary comment. 869 * 870 * @return the replacement text for the "Tables" command, as a String 871 * 872 * @see GenerateCharacter#genTable 873 * @see GenerateCharacter#replaceCommand 874 */ 875 static String genTables() { 876 int n = sizes.length; 877 StringBuffer result = new StringBuffer(); 878 // liu : Add a comment showing the source of this table 879 result.append(commentStart + " The following tables and code generated using:" + 880 commentEnd + "\n "); 881 result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n "); 882 883 if (plane == 0 && bLatin1 == false) { 884 genCaseMapTableDeclaration(result); 885 genCaseMapTable(initializers, specialCaseMaps); 886 } 887 int totalBytes = 0; 888 for (int k = 0; k < n - 1; k++) { 889 genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k], 890 sizes[k+1], false, false, k==0); 891 int s = bytes[k]; 892 if (s == 1 && useCharForByte) { 893 s = 2; 894 } 895 totalBytes += tables[k].length * s; 896 } 897 genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32), 898 sizes[n - 1], false, 0, true, !(identifiers), false); 899 900 // If we ever need more than 32 bits to represent the character properties, 901 // then a table "B" may be needed as well. 902 // genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false); 903 904 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2); 905 result.append(commentStart); 906 result.append(" In all, the character property tables require "); 907 result.append(totalBytes).append(" bytes.").append(commentEnd); 908 if (verbose) { 909 System.out.println("The character property tables require " 910 + totalBytes + " bytes."); 911 } 912 return result.toString(); 913 } 914 915 /** 916 * The genInitializers method generates the body of the 917 * ensureInitted() method, which enables lazy initialization of 918 * the case map table and other tables. 919 */ 920 static String genInitializers() { 921 return initializers.toString(); 922 } 923 924 /** 925 * Return the total number of bytes needed by all tables. This is a stripped- 926 * down copy of genTables(). 927 */ 928 static int getTotalBytes() { 929 int n = sizes.length; 930 int totalBytes = 0; 931 for (int k = 0; k < n - 1; k++) { 932 totalBytes += tables[k].length * bytes[k]; 933 } 934 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) 935 + 31) >> 5) << 2); 936 return totalBytes; 937 } 938 939 static void appendEscapedStringFragment(StringBuffer result, 940 char[] line, 941 int length, 942 boolean lastFragment) { 943 result.append(" \""); 944 for (int k=0; k<length; ++k) { 945 result.append("\\u"); 946 result.append(hex4(line[k])); 947 } 948 result.append("\""); 949 result.append(lastFragment ? ";" : "+"); 950 result.append("\n"); 951 } 952 953 static String SMALL_INITIALIZER = 954 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 955 // " $$name = new $$type[$$size];\n"+ 956 " int len = $$name_DATA.length();\n"+ 957 " int j=0;\n"+ 958 " for (int i=0; i<len; ++i) {\n"+ 959 " int c = $$name_DATA.charAt(i);\n"+ 960 " for (int k=0; k<$$entriesPerChar; ++k) {\n"+ 961 " $$name[j++] = ($$type)c;\n"+ 962 " c >>= $$bits;\n"+ 963 " }\n"+ 964 " }\n"+ 965 " assert (j == $$size);\n"+ 966 " }\n"; 967 968 static String SAME_SIZE_INITIALIZER = 969 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 970 " assert ($$name_DATA.length() == $$size);\n"+ 971 // " $$name = new $$type[$$size];\n"+ 972 " for (int i=0; i<$$size; ++i)\n"+ 973 " $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+ 974 " }\n"; 975 976 static String BIG_INITIALIZER = 977 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 978 // " $$name = new $$type[$$size];\n"+ 979 " int len = $$name_DATA.length();\n"+ 980 " int j=0;\n"+ 981 " int charsInEntry=0;\n"+ 982 " $$type entry=0;\n"+ 983 " for (int i=0; i<len; ++i) {\n"+ 984 " entry |= $$name_DATA.charAt(i);\n"+ 985 " if (++charsInEntry == $$charsPerEntry) {\n"+ 986 " $$name[j++] = entry;\n"+ 987 " entry = 0;\n"+ 988 " charsInEntry = 0;\n"+ 989 " }\n"+ 990 " else {\n"+ 991 " entry <<= 16;\n"+ 992 " }\n"+ 993 " }\n"+ 994 " assert (j == $$size);\n"+ 995 " }\n"; 996 997 static String INT32_INITIALIZER = 998 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 999 " char[] data = $$name_DATA.toCharArray();\n"+ 1000 " assert (data.length == ($$size * 2));\n"+ 1001 " int i = 0, j = 0;\n"+ 1002 " while (i < ($$size * 2)) {\n"+ 1003 " int entry = data[i++] << 16;\n"+ 1004 " $$name[j++] = entry | data[i++];\n"+ 1005 " }\n"+ 1006 " }\n"; 1007 1008 static void addInitializer(String name, String type, int entriesPerChar, 1009 int bits, int size) { 1010 1011 String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER : 1012 ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER); 1013 if (entriesPerChar == -2) { 1014 template = INT32_INITIALIZER; 1015 } 1016 int marklen = commandMarker.length(); 1017 int pos = 0; 1018 while ((pos = template.indexOf(commandMarker, pos)) >= 0) { 1019 int newpos = pos + marklen; 1020 char ch = 'x'; 1021 while (newpos < template.length() && 1022 Character.isJavaIdentifierStart(ch = template.charAt(newpos)) && 1023 ch != '_') // Don't allow this in token names 1024 ++newpos; 1025 String token = template.substring(pos+marklen, newpos); 1026 String replacement = "ERROR"; 1027 1028 if (token.equals("name")) replacement = name; 1029 else if (token.equals("type")) replacement = type; 1030 else if (token.equals("bits")) replacement = ""+bits; 1031 else if (token.equals("size")) replacement = ""+size; 1032 else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar; 1033 else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar); 1034 else FAIL("Unrecognized token: " + token); 1035 1036 template = template.substring(0, pos) + replacement + template.substring(newpos); 1037 pos += replacement.length(); 1038 } 1039 initializers.append(template); 1040 } 1041 1042 /** 1043 * The genTable method generates source code for one lookup table. 1044 * Most of the complexity stems from handling various options as to 1045 * the type of the array components, the precise representation of the 1046 * values, the format in which to render each value, the number of values 1047 * to emit on each line of source code, and the kinds of useful comments 1048 * to be generated. 1049 * 1050 * @param result a StringBuffer, to which the generated source code 1051 * text is to be appended 1052 * @param name the name of the table 1053 * @param table the table data (an array of long values) 1054 * @param extract a distance, in bits, by which each entry of the table 1055 * is to be right-shifted before it is processed 1056 * @param bits the number of bits (not bytes) to be used to represent 1057 * each table entry 1058 * @param size the table data is divided up into blocks of size (1<<size); 1059 * in this method, this information is used only to affect 1060 * how many table values are to be generated per line 1061 * @param preshifted if this flag is true, then the table entries are to be 1062 * emitted in a preshifted form; that is, each value should 1063 * be left-shifted by the amount "shift", so that this work 1064 * is built into the table and need not be performed by an 1065 * explicit shift operator at run time 1066 * @param shift this is the shift amount for preshifting of table entries 1067 * @param hexFormat if this flag is true, table entries should be emitted as 1068 * hexadecimal literals; otherwise decimal literals are used 1069 * @param properties if this flag is true, the table entries are encoded 1070 * character properties rather than indexes into yet other tables; 1071 * therefore comments describing the encoded properties should 1072 * be generated 1073 * @param hexComment if this flag is true, each line of output is labelled with 1074 * a hexadecimal comment indicating the character values to 1075 * which that line applies; otherwise, decimal values indicating 1076 * table indices are generated 1077 * 1078 * @see GenerateCharacter#genTables 1079 * @see GenerateCharacter#replaceCommand 1080 */ 1081 1082 static void genTable(StringBuffer result, String name, 1083 long[] table, int extract, int bits, int size, 1084 boolean preshifted, int shift, boolean hexFormat, 1085 boolean properties, boolean hexComment) { 1086 1087 String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") : 1088 bits == 2 ? (Csyntax ? "unsigned long" : "int") : 1089 bits == 4 ? (Csyntax ? "unsigned long" : "int") : 1090 bits == 8 ? (Csyntax ? "unsigned char" : "byte") : 1091 bits == 16 ? (Csyntax ? "unsigned short" : "char") : 1092 bits == 32 ? (Csyntax ? "unsigned long" : "int") : 1093 (Csyntax ? "int64" : "long"); 1094 long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu 1095 bits == 2 ? Integer.MAX_VALUE : 1096 bits == 4 ? Integer.MAX_VALUE : 1097 bits == 8 ? Byte.MAX_VALUE : 1098 bits == 16 ? Short.MAX_VALUE : 1099 bits == 32 ? Integer.MAX_VALUE : 1100 Long.MAX_VALUE; 1101 int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16); 1102 boolean shiftEntries = preshifted && shift != 0; 1103 if (bits == 8 && tableAsString && useCharForByte) { 1104 atype = "char"; 1105 maxPosEntry = Character.MAX_VALUE; 1106 entriesPerChar = 1; 1107 } 1108 boolean noConversion = atype.equals("char"); 1109 1110 result.append(commentStart); 1111 result.append(" The ").append(name).append(" table has ").append(table.length); 1112 result.append(" entries for a total of "); 1113 int sizeOfTable = ((table.length * bits + 31) >> 5) << 2; 1114 if (bits == 8 && useCharForByte) { 1115 sizeOfTable *= 2; 1116 } 1117 result.append(sizeOfTable); 1118 result.append(" bytes.").append(commentEnd).append("\n\n"); 1119 if (Csyntax) 1120 result.append(" static "); 1121 else 1122 result.append(" static final "); 1123 result.append(atype); 1124 result.append(" ").append(name).append("["); 1125 if (Csyntax) 1126 result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0)); 1127 if (tableAsString) { 1128 if (noConversion) { 1129 result.append("] = (\n"); 1130 } else { 1131 result.append("] = new ").append(atype).append("["+table.length+"];\n "); 1132 result.append("static final String ").append(name).append("_DATA =\n"); 1133 } 1134 int CHARS_PER_LINE = 8; 1135 StringBuffer theString = new StringBuffer(); 1136 int entriesInCharSoFar = 0; 1137 char ch = '\u0000'; 1138 int charsPerEntry = -entriesPerChar; 1139 for (int j=0; j<table.length; ++j) { 1140 long entry = table[j] >> extract; 1141 if (shiftEntries) entry <<= shift; 1142 if (entry >= (1L << bits)) { 1143 FAIL("Entry too big"); 1144 } 1145 if (entriesPerChar > 0) { 1146 // Pack multiple entries into a character 1147 ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits)); 1148 ++entriesInCharSoFar; 1149 if (entriesInCharSoFar == entriesPerChar) { 1150 // Character is full 1151 theString.append(ch); 1152 entriesInCharSoFar = 0; 1153 ch = '\u0000'; 1154 } 1155 } 1156 else { 1157 // Use multiple characters per entry 1158 for (int k=0; k<charsPerEntry; ++k) { 1159 ch = (char)(entry >> ((charsPerEntry-1)*16)); 1160 entry <<= 16; 1161 theString.append(ch); 1162 } 1163 } 1164 } 1165 if (entriesInCharSoFar > 0) { 1166 while (entriesInCharSoFar < entriesPerChar) { 1167 ch = (char)((int)ch >> bits); 1168 ++entriesInCharSoFar; 1169 } 1170 theString.append(ch); 1171 entriesInCharSoFar = 0; 1172 } 1173 result.append(Utility.formatForSource(theString.toString(), " ")); 1174 if (noConversion) { 1175 result.append(").toCharArray()"); 1176 } 1177 result.append(";\n\n "); 1178 1179 if (!noConversion) { 1180 addInitializer(name, atype, entriesPerChar, bits, table.length); 1181 } 1182 } 1183 else { 1184 result.append("] = {"); 1185 boolean castEntries = shiftEntries && (bits < 32); 1186 int printPerLine = hexFormat ? (bits == 1 ? 32*4 : 1187 bits == 2 ? 16*4 : 1188 bits == 4 ? 8*4 : 1189 bits == 8 ? 8 : 1190 bits == 16 ? 8 : 1191 bits == 32 ? 4 : 2) : 1192 (bits == 8 ? 8 : 1193 bits == 16 ? 8 : 4); 1194 int printMask = properties ? 0 : 1195 Math.min(1 << size, 1196 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1; 1197 int commentShift = ((1 << size) == table.length) ? 0 : size; 1198 int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1; 1199 long val = 0; 1200 for (int j = 0; j < table.length; j++) { 1201 if ((j & printMask) == 0) { 1202 while (result.charAt(result.length() - 1) == ' ') 1203 result.setLength(result.length() - 1); 1204 result.append("\n "); 1205 } 1206 PRINT: { 1207 if (castEntries) 1208 result.append("(").append(atype).append(")("); 1209 long entry = table[j] >> extract; 1210 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1); 1211 int k = j & packMask; 1212 if (bits >= 8) 1213 val = entry; 1214 else if (k == 0) { 1215 val = entry; 1216 break PRINT; 1217 } 1218 else { 1219 val |= (entry << (k*bits)); 1220 if (k != packMask) 1221 break PRINT; 1222 } 1223 if (val > maxPosEntry && !Csyntax) { // liu 1224 // For values that are out of range, convert them to in-range negative values. 1225 // Actually, output the '-' and convert them to the negative of the corresponding 1226 // in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126. 1227 result.append('-'); 1228 val = maxPosEntry + maxPosEntry + 2 - val; 1229 } 1230 if (hexFormat) { 1231 result.append("0x"); 1232 if (bits == 8) 1233 result.append(hex2((byte)val)); 1234 else if (bits == 16) 1235 result.append(hex4((short)val)); 1236 else if (bits == 32 || bits < 8) 1237 result.append(hex8((int)val)); 1238 else { 1239 result.append(hex16((long)val)); 1240 if (!Csyntax) 1241 result.append("L"); 1242 } 1243 } 1244 else { 1245 if (bits == 8) 1246 result.append(dec3(val)); 1247 else if (bits == 64) { 1248 result.append(dec5(val)); 1249 if (!Csyntax) 1250 result.append("L"); 1251 } 1252 else 1253 result.append(dec5(val)); 1254 } 1255 if (shiftEntries) 1256 result.append("<<").append(shift); 1257 if (castEntries) result.append(")"); 1258 if (j < (table.length - 1)) 1259 result.append(", "); 1260 else 1261 result.append(" "); 1262 if ((j & printMask) == printMask) { 1263 result.append(" ").append(commentStart).append(" "); 1264 if (hexComment) 1265 result.append("0x").append(hex4((j & ~commentMask) << (16 - size))); 1266 else 1267 result.append(dec3((j & ~commentMask) >> commentShift)); 1268 if (properties) propertiesComments(result, val); 1269 result.append(commentEnd); 1270 } 1271 } // end PRINT 1272 } 1273 result.append("\n };\n\n "); 1274 } 1275 } 1276 1277 static void genCaseMapTableDeclaration(StringBuffer result) { 1278 String myTab = " "; 1279 result.append(myTab + "static final char[][][] charMap;\n"); 1280 } 1281 1282 static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){ 1283 String myTab = " "; 1284 int ch; 1285 char[] map; 1286 result.append(myTab + "charMap = new char[][][] {\n"); 1287 for (int x = 0; x < specialCaseMaps.length; x++) { 1288 ch = specialCaseMaps[x].getCharSource(); 1289 map = specialCaseMaps[x].getUpperCaseMap(); 1290 result.append(myTab + myTab); 1291 result.append("{ "); 1292 result.append("{\'\\u"+hex4(ch)+"\'}, {"); 1293 for (int y = 0; y < map.length; y++) { 1294 result.append("\'\\u"+hex4(map[y])+"\', "); 1295 } 1296 result.append("} },\n"); 1297 } 1298 result.append(myTab + "};\n"); 1299 1300 } 1301 1302 /** 1303 * The propertiesComments method generates comments describing encoded 1304 * character properties. 1305 * 1306 * @param result a StringBuffer, to which the generated source code 1307 * text is to be appended 1308 * @param val encoded character properties 1309 * 1310 * @see GenerateCharacter#genTable 1311 */ 1312 1313 static void propertiesComments(StringBuffer result, long val) { 1314 result.append(" "); 1315 switch ((int)(val & maskType)) { 1316 case UnicodeSpec.CONTROL: 1317 result.append("Cc"); 1318 break; 1319 case UnicodeSpec.FORMAT: 1320 result.append("Cf"); 1321 break; 1322 case UnicodeSpec.PRIVATE_USE: 1323 result.append("Co"); 1324 break; 1325 case UnicodeSpec.SURROGATE: 1326 result.append("Cs"); 1327 break; 1328 case UnicodeSpec.LOWERCASE_LETTER: 1329 result.append("Ll"); 1330 break; 1331 case UnicodeSpec.MODIFIER_LETTER: 1332 result.append("Lm"); 1333 break; 1334 case UnicodeSpec.OTHER_LETTER: 1335 result.append("Lo"); 1336 break; 1337 case UnicodeSpec.TITLECASE_LETTER: 1338 result.append("Lt"); 1339 break; 1340 case UnicodeSpec.UPPERCASE_LETTER: 1341 result.append("Lu"); 1342 break; 1343 case UnicodeSpec.COMBINING_SPACING_MARK: 1344 result.append("Mc"); 1345 break; 1346 case UnicodeSpec.ENCLOSING_MARK: 1347 result.append("Me"); 1348 break; 1349 case UnicodeSpec.NON_SPACING_MARK: 1350 result.append("Mn"); 1351 break; 1352 case UnicodeSpec.DECIMAL_DIGIT_NUMBER: 1353 result.append("Nd"); 1354 break; 1355 case UnicodeSpec.LETTER_NUMBER: 1356 result.append("Nl"); 1357 break; 1358 case UnicodeSpec.OTHER_NUMBER: 1359 result.append("No"); 1360 break; 1361 case UnicodeSpec.CONNECTOR_PUNCTUATION: 1362 result.append("Pc"); 1363 break; 1364 case UnicodeSpec.DASH_PUNCTUATION: 1365 result.append("Pd"); 1366 break; 1367 case UnicodeSpec.END_PUNCTUATION: 1368 result.append("Pe"); 1369 break; 1370 case UnicodeSpec.OTHER_PUNCTUATION: 1371 result.append("Po"); 1372 break; 1373 case UnicodeSpec.START_PUNCTUATION: 1374 result.append("Ps"); 1375 break; 1376 case UnicodeSpec.CURRENCY_SYMBOL: 1377 result.append("Sc"); 1378 break; 1379 case UnicodeSpec.MODIFIER_SYMBOL: 1380 result.append("Sk"); 1381 break; 1382 case UnicodeSpec.MATH_SYMBOL: 1383 result.append("Sm"); 1384 break; 1385 case UnicodeSpec.OTHER_SYMBOL: 1386 result.append("So"); 1387 break; 1388 case UnicodeSpec.LINE_SEPARATOR: 1389 result.append("Zl"); break; 1390 case UnicodeSpec.PARAGRAPH_SEPARATOR: 1391 result.append("Zp"); 1392 break; 1393 case UnicodeSpec.SPACE_SEPARATOR: 1394 result.append("Zs"); 1395 break; 1396 case UnicodeSpec.UNASSIGNED: 1397 result.append("unassigned"); 1398 break; 1399 } 1400 1401 switch ((int)((val & maskBidi) >> shiftBidi)) { 1402 case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT: 1403 result.append(", L"); 1404 break; 1405 case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT: 1406 result.append(", R"); 1407 break; 1408 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER: 1409 result.append(", EN"); 1410 break; 1411 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR: 1412 result.append(", ES"); 1413 break; 1414 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR: 1415 result.append(", ET"); 1416 break; 1417 case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER: 1418 result.append(", AN"); 1419 break; 1420 case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR: 1421 result.append(", CS"); 1422 break; 1423 case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR: 1424 result.append(", B"); 1425 break; 1426 case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR: 1427 result.append(", S"); 1428 break; 1429 case UnicodeSpec.DIRECTIONALITY_WHITESPACE: 1430 result.append(", WS"); 1431 break; 1432 case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS: 1433 result.append(", ON"); 1434 break; 1435 } 1436 if ((val & maskUpperCase) != 0) { 1437 result.append(", hasUpper (subtract "); 1438 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1439 } 1440 if ((val & maskLowerCase) != 0) { 1441 result.append(", hasLower (add "); 1442 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1443 } 1444 if ((val & maskTitleCase) != 0) { 1445 result.append(", hasTitle"); 1446 } 1447 if ((val & maskIdentifierInfo) == valueIgnorable) { 1448 result.append(", ignorable"); 1449 } 1450 if ((val & maskIdentifierInfo) == valueJavaUnicodePart) { 1451 result.append(", identifier part"); 1452 } 1453 if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) { 1454 result.append(", underscore"); 1455 } 1456 if ((val & maskIdentifierInfo) == valueJavaWhitespace) { 1457 result.append(", whitespace"); 1458 } 1459 if ((val & maskIdentifierInfo) == valueJavaOnlyStart) { 1460 result.append(", currency"); 1461 } 1462 if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) { 1463 result.append(", identifier start"); 1464 } 1465 if ((val & maskNumericType) == valueDigit) { 1466 result.append(", decimal "); 1467 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1468 } 1469 if ((val & maskNumericType) == valueStrangeNumeric) { 1470 result.append(", strange"); 1471 } 1472 if ((val & maskNumericType) == valueJavaSupradecimal) { 1473 result.append(", supradecimal "); 1474 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1475 } 1476 } 1477 1478 static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" }; 1479 1480 static String tableName(int j) { return tableNames[j]; } 1481 1482 /** 1483 * The genAccess method generates source code for one table access expression. 1484 * 1485 * Most of the complexity stems from handling various options as to 1486 * table representation, such as whether it contains values so large that 1487 * they are represented as negative values and whether the table values are 1488 * preshifted. This method also avoids such "ugly" expressions as shifting 1489 * by distance zero, masking when no masking is necessary, and so on. 1490 * For clarity, it generates expressions that do not rely on operator 1491 * precedence, but otherwise it avoids generating redundant parentheses. 1492 * 1493 * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]] 1494 * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example. 1495 * 1496 * @param tbl the name of the final table to be accessed 1497 * @param var the variable name that appeared in parentheses in the 1498 * "Lookup" command 1499 * @param bits the number of bits (not bytes) to be used to represent 1500 * the final table entry 1501 * @return the replacement text for the "Lookup(xxx)" command, as a String 1502 * 1503 * @see GenerateCharacter#replaceCommand 1504 */ 1505 1506 static String genAccess(String tbl, String var, int bits) { 1507 String access = null; 1508 int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0; 1509 for (int k = 0; k < sizes.length; k++) { 1510 int offset = ((k < sizes.length - 1) ? 0 : bitoffset); 1511 int shift = shifts[k] + offset; 1512 String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")"; 1513 int mask = (1 << (sizes[k] - offset)) - 1; 1514 String masked = (k == 0) ? shifted : 1515 "(" + shifted + "&0x" + hex(mask) + ")"; 1516 String index = (k == 0) ? masked : 1517 (mask == 0) ? access : "(" + access + "|" + masked + ")"; 1518 String indexNoParens = (index.charAt(0) != '(') ? index : 1519 index.substring(1, index.length() - 1); 1520 String tblname = (k == sizes.length - 1) ? tbl : tableName(k); 1521 String fetched = tblname + "[" + indexNoParens + "]"; 1522 String zeroextended = (zeroextend[k] == 0) ? fetched : 1523 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")"; 1524 int adjustment = preshifted[k] ? 0 : 1525 sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0); 1526 String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended : 1527 "(" + zeroextended + "<<" + adjustment + ")"; 1528 String bitshift = (bits == 1) ? "(" + var + "&0x1F)" : 1529 (bits == 2) ? "((" + var + "&0xF)<<1)" : 1530 (bits == 4) ? "((" + var + "&7)<<2)" : null; 1531 String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted : 1532 "((" + adjusted + ">>" + bitshift + ")&" + 1533 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")"; 1534 access = extracted; 1535 } 1536 return access; 1537 } 1538 1539 /* The command line arguments are decoded and used to set the following 1540 global variables. 1541 */ 1542 1543 static boolean verbose = false; 1544 static boolean nobidi = false; 1545 static boolean nomirror = false; 1546 static boolean identifiers = false; 1547 static boolean Csyntax = false; 1548 static String TemplateFileName = null; 1549 static String OutputFileName = null; 1550 static String UnicodeSpecFileName = null; // liu 1551 static String SpecialCasingFileName = null; 1552 static boolean useCharForByte = false; 1553 static int[] sizes; 1554 static int bins = 0; // liu; if > 0, then perform search 1555 static boolean tableAsString = false; 1556 static boolean bLatin1 = false; 1557 1558 static String commandLineDescription; 1559 1560 /* Other global variables, equal in length to the "sizes" array. */ 1561 1562 static int[] shifts; 1563 static int[] zeroextend; 1564 static int[] bytes; 1565 static boolean[] preshifted; 1566 static long[][] tables; 1567 1568 1569 /* Other global variables */ 1570 static String commentStart; 1571 static String commentEnd; 1572 1573 static StringBuffer initializers = new StringBuffer(); 1574 1575 /* special casing rules for 1:M toUpperCase mappings */ 1576 static SpecialCaseMap[] specialCaseMaps; 1577 1578 /** 1579 * Process the command line arguments. 1580 * 1581 * The allowed flags in command line are: 1582 * <dl> 1583 * <dt> -verbose <dd> Emit comments to standard output describing 1584 * what's going on during the processing. 1585 * <dt> -nobidi <dd> Do not include bidi categories in the 1586 * encoded character properties. 1587 * <dt> -nomirror <dd> Do no include mirror property in the encoded 1588 * character properties. 1589 * <dt> -identifiers <dd> Generate tables for scanning identifiers only. 1590 * <dt> -c <dd> Output code in C syntax instead of Java syntax. 1591 * <dt> -o filename <dd> Specify output file name. 1592 * <dt> -template filename <dd> Specify template input file name. 1593 * <dt> -spec filename <dd> Specify Unicode spec file name. 1594 * <dt> -specialcasing filename <dd> Specify Unicode special casing file name. 1595 * <dt> -search bins <dd> Try different partitions into the specified 1596 * number of bins. E.g., for 2 bins, try 1597 * 16 0, 15 1,..., 0 16. 1598 * <dt> -string <dd> Create table as string. Only valid with Java 1599 * syntax. 1600 * <dt> -latin1 <dd> Create a latin 1 only property table. 1601 * </dl> 1602 * In addition, decimal literals may appear as command line arguments; 1603 * each one represents the number of bits of the character to be broken 1604 * off at each lookup step. If present, they must add up to 16 (the number 1605 * of bits in a char value). For smaller tables, the last value should 1606 * be 0; values other than the last one may not be zero. If no such 1607 * numeric values are provided, default values are used. 1608 * 1609 * @param args the command line arguments, as an array of String 1610 * 1611 * @see GenerateCharacter#main 1612 */ 1613 1614 static void processArgs(String[] args) { 1615 StringBuffer desc = new StringBuffer("java GenerateCharacter"); 1616 for (int j=0; j<args.length; ++j) { 1617 desc.append(" " + args[j]); 1618 } 1619 for (int j = 0; j < args.length; j++) { 1620 if (args[j].equals("-verbose") || args[j].equals("-v")) 1621 verbose = true; 1622 else if (args[j].equals("-nobidi")) 1623 nobidi = true; 1624 else if (args[j].equals("-nomirror")) 1625 nomirror = true; 1626 else if (args[j].equals("-identifiers")) 1627 identifiers = true; 1628 else if (args[j].equals("-c")) 1629 Csyntax = true; 1630 else if (args[j].equals("-string")) 1631 tableAsString = true; 1632 else if (args[j].equals("-o")) { 1633 if (j == args.length - 1) { 1634 FAIL("File name missing after -o"); 1635 } 1636 else { 1637 OutputFileName = args[++j]; 1638 } 1639 } 1640 else if (args[j].equals("-search")) { 1641 if (j == args.length - 1) 1642 FAIL("Bin count missing after -search"); 1643 else { 1644 bins = Integer.parseInt(args[++j]); 1645 if (bins < 1 || bins > 10) 1646 FAIL("Bin count must be >= 1 and <= 10"); 1647 } 1648 } 1649 else if (args[j].equals("-template")) { 1650 if (j == args.length - 1) 1651 FAIL("File name missing after -template"); 1652 else 1653 TemplateFileName = args[++j]; 1654 } 1655 else if (args[j].equals("-spec")) { // liu 1656 if (j == args.length - 1) { 1657 FAIL("File name missing after -spec"); 1658 } 1659 else { 1660 UnicodeSpecFileName = args[++j]; 1661 } 1662 } 1663 else if (args[j].equals("-specialcasing")) { 1664 if (j == args.length -1) { 1665 FAIL("File name missing after -specialcasing"); 1666 } 1667 else { 1668 SpecialCasingFileName = args[++j]; 1669 } 1670 } 1671 else if (args[j].equals("-plane")) { 1672 if (j == args.length -1) { 1673 FAIL("Plane number missing after -plane"); 1674 } 1675 else { 1676 plane = Integer.parseInt(args[++j]); 1677 } 1678 if (plane > 0) { 1679 bLatin1 = false; 1680 } 1681 } 1682 else if ("-usecharforbyte".equals(args[j])) { 1683 useCharForByte = true; 1684 } 1685 else if (args[j].equals("-latin1")) { 1686 bLatin1 = true; 1687 plane = 0; 1688 } 1689 else { 1690 try { 1691 int val = Integer.parseInt(args[j]); 1692 if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]); 1693 if (sizes == null) 1694 sizes = new int[1]; 1695 else { 1696 int[] newsizes = new int[sizes.length + 1]; 1697 System.arraycopy(sizes, 0, newsizes, 0, sizes.length); 1698 sizes = newsizes; 1699 } 1700 sizes[sizes.length - 1] = val; 1701 } 1702 catch(NumberFormatException e) { 1703 FAIL("Unknown switch: " + args[j]); 1704 } 1705 } 1706 } 1707 if (Csyntax && tableAsString) { 1708 FAIL("Can't specify table as string with C syntax"); 1709 } 1710 if (sizes == null) { 1711 desc.append(" ["); 1712 if (identifiers) { 1713 int[] newsizes = { 8, 4, 4 }; // Good default values 1714 desc.append("8 4 4]"); 1715 sizes = newsizes; 1716 } 1717 else { 1718 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 } 1719 desc.append("10 5 1]"); 1720 sizes = newsizes; 1721 } 1722 } 1723 if (UnicodeSpecFileName == null) { // liu 1724 UnicodeSpecFileName = DefaultUnicodeSpecFileName; 1725 desc.append(" [-spec " + UnicodeSpecFileName + ']'); 1726 } 1727 if (SpecialCasingFileName == null) { 1728 SpecialCasingFileName = DefaultSpecialCasingFileName; 1729 desc.append(" [-specialcasing " + SpecialCasingFileName + ']'); 1730 } 1731 if (TemplateFileName == null) { 1732 TemplateFileName = (Csyntax ? DefaultCTemplateFileName 1733 : DefaultJavaTemplateFileName); 1734 desc.append(" [-template " + TemplateFileName + ']'); 1735 } 1736 if (OutputFileName == null) { 1737 OutputFileName = (Csyntax ? DefaultCOutputFileName 1738 : DefaultJavaOutputFileName); 1739 desc.append(" [-o " + OutputFileName + ']'); 1740 } 1741 commentStart = (Csyntax ? "/*" : "//"); 1742 commentEnd = (Csyntax ? " */" : ""); 1743 commandLineDescription = desc.toString(); 1744 } 1745 1746 private static void searchBins(long[] map, int binsOccupied) throws Exception { 1747 int bitsFree = 16; 1748 for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i]; 1749 if (binsOccupied == (bins-1)) { 1750 sizes[binsOccupied] = bitsFree; 1751 generateForSizes(map); 1752 } 1753 else { 1754 for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one 1755 sizes[binsOccupied] = i; 1756 searchBins(map, binsOccupied+1); 1757 } 1758 } 1759 } 1760 1761 private static void generateForSizes(long[] map) throws Exception { 1762 int sum = 0; 1763 shifts = new int[sizes.length]; 1764 for (int k = sizes.length - 1; k >= 0; k--) { 1765 shifts[k] = sum; 1766 sum += sizes[k]; 1767 } 1768 if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) { 1769 FAIL("Bit field widths total to " + sum + 1770 ": wrong total for map of size " + map.length); 1771 } 1772 // need a table for each set of lookup bits in char 1773 tables = new long[sizes.length][]; 1774 // the last table is the map 1775 tables[sizes.length - 1] = map; 1776 for (int j = sizes.length - 1; j > 0; j--) { 1777 if (verbose && bins==0) 1778 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]); 1779 long[][] temp = buildTable(tables[j], sizes[j]); 1780 tables[j-1] = temp[0]; 1781 tables[j] = temp[1]; 1782 } 1783 preshifted = new boolean[sizes.length]; 1784 zeroextend = new int[sizes.length]; 1785 bytes = new int[sizes.length]; 1786 for (int j = 0; j < sizes.length - 1; j++) { 1787 int len = tables[j+1].length; 1788 int size = sizes[j+1]; 1789 if (len > 0x100 && (len >> size) <= 0x100) { 1790 len >>= size; 1791 preshifted[j] = false; 1792 } 1793 else if (len > 0x10000 && (len >> size) <= 0x10000) { 1794 len >>= size; 1795 preshifted[j] = false; 1796 } 1797 else preshifted[j] = true; 1798 if (Csyntax) 1799 zeroextend[j] = 0; 1800 else if (len > 0x7F && len <= 0xFF) { 1801 if (!useCharForByte) { 1802 zeroextend[j] = 0xFF; 1803 } 1804 } else if (len > 0x7FFF && len <= 0xFFFF) 1805 zeroextend[j] = 0xFFFF; 1806 else zeroextend[j] = 0; 1807 if (len <= 0x100) bytes[j] = 1; 1808 else if (len <= 0x10000) bytes[j] = 2; 1809 else bytes[j] = 4; 1810 } 1811 preshifted[sizes.length - 1] = true; 1812 zeroextend[sizes.length - 1] = 0; 1813 bytes[sizes.length - 1] = 0; 1814 if (bins > 0) { 1815 int totalBytes = getTotalBytes(); 1816 String access = genAccess("A", "ch", (identifiers ? 2 : 32)); 1817 int accessComplexity = 0; 1818 for (int j=0; j<access.length(); ++j) { 1819 char ch = access.charAt(j); 1820 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity; 1821 if (ch == '<' || ch == '>') ++j; 1822 } 1823 System.out.print("("); 1824 for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]); 1825 System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access); 1826 return; 1827 } 1828 if (verbose) { 1829 System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted"); 1830 for (int j = 0; j < sizes.length; j++) { 1831 System.out.println(dec5(j) + "\t" + 1832 dec5(sizes[j]) + "\t" + 1833 dec5(tables[j].length) + "\t" + 1834 dec5(shifts[j]) + "\t" + 1835 dec5(zeroextend[j]) + "\t" + 1836 dec5(bytes[j]) + "\t " + 1837 preshifted[j]); 1838 } 1839 } 1840 if (verbose) { 1841 System.out.println("Generating source code for class Character"); 1842 System.out.println("A table access looks like " + 1843 genAccess("A", "ch", (identifiers ? 2 : 32))); 1844 } 1845 generateCharacterClass(TemplateFileName, OutputFileName); 1846 } 1847 1848 /** 1849 * The main program for generating source code for the Character class. 1850 * The basic outline of its operation is: 1851 * <ol> 1852 * <li> Process the command line arguments. One result of this process 1853 * is a list of sizes (measured in bits and summing to 16). 1854 * <li> Get the Unicode character property data from the specification file. 1855 * <li> From that, build a map that has, for each character code, its 1856 * relevant properties encoded as a long integer value. 1857 * <li> Repeatedly compress the map, producing a compressed table and a 1858 * new map. This is done once for each size value in the list. 1859 * When this is done, we have a set of tables. 1860 * <li> Make some decisions about table representation; record these 1861 * decisions in arrays named preshifted, zeroextend, and bytes. 1862 * <li> Generate the source code for the class Character by performing 1863 * macro processing on a template file. 1864 * </ol> 1865 * 1866 * @param args the command line arguments, as an array of String 1867 * 1868 * @see GenerateCharacter#processArgs 1869 * @see UnicodeSpec@readSpecFile 1870 * @see GenerateCharacter#buildMap 1871 * @see GenerateCharacter#buildTable 1872 * @see GenerateCharacter#generateCharacterClass 1873 */ 1874 1875 public static void main(String[] args) { 1876 processArgs(args); 1877 try { 1878 1879 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane); 1880 1881 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane); 1882 if (verbose) { 1883 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu 1884 } 1885 long[] map = buildMap(data, specialCaseMaps); 1886 if (verbose) { 1887 System.err.println("Completed building of initial map"); 1888 } 1889 1890 if (bins == 0) { 1891 generateForSizes(map); 1892 } 1893 else { 1894 while (bins > 0) { 1895 sizes = new int[bins]; 1896 searchBins(map, 0); 1897 --bins; 1898 } 1899 } 1900 if (verbose && false) { 1901 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" + 1902 hex8(maxOffsetSeen)); 1903 System.out.println(" allowed: -" + hex8(-minOffset) + "..+" + 1904 hex8(maxOffset)); 1905 } 1906 } 1907 catch (FileNotFoundException e) { FAIL(e.toString()); } 1908 catch (IOException e) { FAIL(e.toString()); } 1909 catch (Throwable e) { 1910 System.out.println("Unexpected exception:"); 1911 e.printStackTrace(); 1912 FAIL("Unexpected exception!"); 1913 } 1914 if (verbose) { System.out.println("Done!");} 1915 } 1916 1917 } // end class