1 /* 2 * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package build.tools.generatecharacter; 27 28 import java.io.IOException; 29 import java.io.FileNotFoundException; 30 import java.io.BufferedReader; 31 import java.io.FileReader; 32 import java.io.PrintWriter; 33 import java.io.BufferedWriter; 34 import java.io.FileWriter; 35 import java.io.File; 36 import java.util.List; 37 38 import build.tools.generatecharacter.CharacterName; 39 40 /** 41 * This program generates the source code for the class java.lang.Character. 42 * It also generates native C code that can perform the same operations. 
* It requires two external input data files:
 * <ul>
 * <li> Unicode specification file
 * <li> Character class template file
 * </ul>
 * The Unicode specification file is available from the Unicode consortium.
 * It has character specification lines that look like this:
 * <listing>
 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
 * </listing>
 * The Character class template file is filled in with additional
 * information to produce the file Character.java, which can then be
 * compiled by a Java compiler.  The template file contains certain
 * markers consisting of an alphabetic name string preceded by "$$".
 * Such markers are replaced with generated program text.  As a special
 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
 * alphabetic characters constituting a variable name.  The character "_"
 * is considered alphabetic for these purposes.
 *
 * @author  Guy Steele
 * @author  Alan Liu
 * @author  John O'Conner
 */

public class GenerateCharacter {

    // When true, buildOne reports inconsistent or out-of-range case offsets
    // through FAIL; when false those conditions are handled silently.
    final static boolean DEBUG = false;

    // Prefix that introduces a macro call in a template line
    // (see generateCharacterClass).
    final static String commandMarker = "$$";

    // Common prefix for the default file names below (empty by default).
    static String ROOT = "";

    // Default input and output file names.
    // NOTE(review): presumably overridable from the command line; the code that
    // consumes these defaults is not shown here - confirm.
    static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
    static String DefaultPropListFileName     = ROOT + "PropList.txt";
    static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
    static String DefaultJavaOutputFileName   = ROOT + "Character.java";
    static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
    static String DefaultCOutputFileName      = ROOT + "Character.c";

    // Unicode plane being generated; buildMap starts at code point (plane << 16).
    static int plane = 0;

    /* The overall idea is that, in the generated Character class source code,
       most character property data is stored in a special multi-level table whose
       structure is defined by a sequence of nonnegative
integers [k1, k2, ..., kn].
       The integers must sum to 16 (the number of bits in a character).
       The first table is indexed by the k1 high-order bits of the character code.
       The result is concatenated to the next k2 bits of the character code to index
       the second table, and so on.  Eventually the kn low-order bits of the character
       code are concatenated and used to index one of two tables A and B; A contains
       32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
       can be thus obtained encode the properties for the character.

       The default specification is [9, 4, 3, 0].  This particular table format was
       designed by conducting an exhaustive search of table formats to minimize the
       space consumed by the tables: the first and third tables need have only byte
       values (the second table must have short values).  Another good choice is
       [10, 6, 0], which produces a larger table but allows particularly fast table
       lookup code.

       In each case, where the word "concatenated" is used, this may imply
       first a << and then a | operation, or perhaps just a | operation if
       the values in the table can be preshifted (generally possible if the table
       entries are short rather than byte).
    */

    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).

        A: the low 32 bits are defined in the following manner:

        1 bit       Mirrored property.
        4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
        9 bits      A signed offset used for converting case.
        1 bit       If 1, adding the signed offset converts the character to lowercase.
        1 bit       If 1, subtracting the signed offset converts the character to uppercase.
                    Note: for a titlecase character, both of the preceding bits will be 1
                    and the signed offset will be 1.
120 1 bit If 1, this character has a titlecase equivalent (possibly itself); 121 in this case, the two bits before this bit can be used to decide 122 whether this character is in fact uppercase, lowercase, or titlecase. 123 3 bits This field provides a quick way to lex identifiers. 124 The eight possible values for this field are as follows: 125 0 May not be part of an identifier 126 1 Ignorable control; may continue a Unicode identifier or Java identifier 127 2 May continue a Java identifier but not a Unicode identifier (unused) 128 3 May continue a Unicode identifier or Java identifier 129 4 Is a Java whitespace character 130 5 May start or continue a Java identifier; 131 may continue but not start a Unicode identifier 132 (this value is used for connector punctuation such as _) 133 6 May start or continue a Java identifier; 134 may not occur in a Unicode identifier 135 (this value is used for currency symbols such as $) 136 7 May start or continue a Unicode identifier or Java identifier 137 Thus: 138 5, 6, 7 may start a Java identifier 139 1, 2, 3, 5, 6, 7 may continue a Java identifier 140 7 may start a Unicode identifier 141 1, 3, 5, 7 may continue a Unicode identifier 142 1 is ignorable within an identifier 143 4 is Java whitespace 144 2 bits This field indicates whether the character has a numeric property. 145 The four possible values for this field are as follows: 146 0 This character has no numeric property. 147 1 Adding the digit offset to the character code and then 148 masking with 0x1F will produce the desired numeric value. 149 2 This character has a "strange" numeric value. 150 3 A Java supradecimal digit: adding the digit offset to the 151 character code, then masking with 0x1F, then adding 10 152 will produce the desired numeric value. 
5 bits      The digit offset (see description of previous field)
        5 bits      Character type (see below)

        B: the high 16 bits are defined as:
        1 bit       Other_Lowercase property
        1 bit       Other_Uppercase property
        1 bit       Other_Alphabetic property
        1 bit       Other_Math property
        1 bit       Ideographic property
        1 bit       Noncharacter codepoint property
        1 bit       ID_Start property
        1 bit       ID_Continue property
        (only these eight bits are assigned by the mask constants below)
     */


    // Bit masks identify each component of the 32-bit property field (part A)
    // described above.
    // shift* indicates how many bits the value must be shifted right to bring
    // the indicated property into the lowest bits of the 32-bit space.
    private static final int
        shiftType           = 0,        maskType            =       0x001F,
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
                                        // lowest bit of the identifier field; set for the
                                        // odd field values 1, 3, 5 and 7 ("may continue a
                                        // Unicode identifier" in the table above)
                                        maskUnicodePart     =       0x1000,
                                        // maskCaseInfo is the union of the three case flags
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
                                        maskLowerCase       =      0x20000,
                                        maskUpperCase       =      0x10000,
                                        maskTitleCase       =      0x08000,
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
        shiftCaseOffsetSign = 5,
                                        // used only when calculating and
                                        // storing digit offsets from char values
                                        maskDigit           =   0x001F,
                                        // case offsets are 9 bits wide
                                        maskCase            =   0x01FF,
        shiftBidi           = 27,       maskBidi            =   0x78000000,
        shiftMirrored       = 31,       // maskMirrored is declared separately, as a long
        shiftPlane          = 16,       maskPlane           =   0xFF0000;

    // maskMirrored is bit 31.  It must be a long: the int literal 0x80000000 is
    // negative and would sign-extend into the upper (B) bits when OR-ed into
    // the long property value.
    private static final long maskMirrored = 0x80000000L;

    // Bit masks for the property field stored in the B table.  In the combined
    // long value built by this generator they occupy bits 32 and up.
    private static final long
        maskOtherLowercase  = 0x100000000L,
        maskOtherUppercase  = 0x200000000L,
        maskOtherAlphabetic = 0x400000000L,
        maskOtherMath       = 0x800000000L,
        maskIdeographic     = 0x1000000000L,
        maskNoncharacterCP  = 0x2000000000L,
        maskIDStart         = 0x4000000000L,
        maskIDContinue      = 0x8000000000L;

    // Masked property values can be compared with these to determine
    // numeric or lexical types.  The first four are values of the numeric-type
    // field (maskNumericType); the rest are values of the identifier field
    // (maskIdentifierInfo).
    public static int
        valueNotNumeric             = 0x0000,
        valueDigit                  = 0x0400,
        valueStrangeNumeric         = 0x0800,
        valueJavaSupradecimal       = 0x0C00,
        valueIgnorable              = 0x1000,
        valueJavaOnlyPart           = 0x2000,
        valueJavaUnicodePart        = 0x3000,
        valueJavaWhitespace         = 0x4000,
        valueJavaStartUnicodePart   = 0x5000,
        valueJavaOnlyStart          = 0x6000,
        valueJavaUnicodeStart       = 0x7000,
        // identifier field values >= lowJavaStart may start a Java identifier
        lowJavaStart                = 0x5000,
        // a non-ignorable character with any of these bits set may continue
        // a Java identifier (see the "identifiers" branch of buildOne)
        nonzeroJavaPart             = 0x3000,
        valueUnicodeStart           = 0x7000;

    // These values are used when only identifier properties are generated
    // for use in verifier code.  Shortens the property down to a single byte.
    private static final int
        bitJavaStart            = 0x02,
        bitJavaPart             = 0x01,
        maskIsJavaIdentifierPart = bitJavaPart,
        maskIsJavaIdentifierStart = bitJavaStart;

    // Largest and smallest case offsets that fit the case-offset field
    // (maskCase/2 = 255, so the range is -255..255); buildOne stores maskCase
    // for anything outside this range.
    static int maxOffset = maskCase/2 ;
    static int minOffset = -maxOffset;

    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  If the number of digits required to
     represent the integer value is greater than the desired number, then all the digits
     that are required are actually produced.
244 */ 245 246 static String hex(long n) { return Long.toHexString(n).toUpperCase(); } 247 248 static String hex2(long n) { 249 String q = Long.toHexString(n & 0xFF).toUpperCase(); 250 return "00".substring(Math.min(2, q.length())) + q; 251 } 252 253 static String hex4(long n) { 254 String q = Long.toHexString(n & 0xFFFF).toUpperCase(); 255 return "0000".substring(Math.min(4, q.length())) + q; 256 } 257 258 static String hex8(long n) { 259 String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase(); 260 return "00000000".substring(Math.min(8, q.length())) + q; 261 } 262 263 static String hex16(long n) { 264 String q = Long.toHexString(n).toUpperCase(); 265 return "0000000000000000".substring(Math.min(16, q.length())) + q; 266 } 267 268 static String dec3(long n) { 269 String q = Long.toString(n); 270 return " ".substring(Math.min(3, q.length())) + q; 271 } 272 273 static String dec5(long n) { 274 String q = Long.toString(n); 275 return " ".substring(Math.min(5, q.length())) + q; 276 } 277 278 /* This routine is called when some failure occurs. */ 279 280 static void FAIL(String s) { 281 System.out.println("** " + s); 282 } 283 284 /** 285 * Given the data from the Unicode specification file, this routine builds a map. 286 * 287 * The specification file is assumed to contain its data in sorted order by 288 * character code; as a result, the array passed as an argument to this method 289 * has its components in the same sorted order, with one entry for each defined 290 * Unicode character or character range. (A range is indicated by two consecutive 291 * entries, such that the name of the first entry begins with "<" and ends with 292 * "First>" and the second entry begins with "<" and ends with "Last>".) This is 293 * therefore a sparse representation of the character property data. 294 * 295 * The resulting map is dense representation of the character data. It contains 296 * 2^16 = 65536 entries, each of which is a long integer. 
(Right now only 32 bits 297 * of this long value are used, but type long is used rather than int to facilitate 298 * future extensions of this source code generator that might require more than 299 * 32 bits to encode relevant character properties.) Entry k holds the encoded 300 * properties for character k. 301 * 302 * Method buildMap manages the transformation from the sparse representation to 303 * the dense representation. It calls method buildOne to handle the encoding 304 * of character property data from a single UnicodeSpec object into 32 bits. 305 * For undefined characters, method buildOne is not called and the map entry for 306 * that character is set to UnicodeSpec.UNASSIGNED. 307 * 308 * @param data character property data from the Unicode specification file 309 * @return an array of length 65536 with one entry for every possible char value 310 * 311 * @see GenerateCharacter#buildOne 312 */ 313 314 static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList) 315 { 316 long[] result; 317 if (bLatin1 == true) { 318 result = new long[256]; 319 } else { 320 result = new long[1<<16]; 321 } 322 int k=0; 323 int codePoint = plane<<16; 324 UnicodeSpec nonCharSpec = new UnicodeSpec(); 325 for (int j = 0; j < data.length && k < result.length; j++) { 326 if (data[j].codePoint == codePoint) { 327 result[k] = buildOne(codePoint, data[j], specialMaps); 328 ++k; 329 ++codePoint; 330 } 331 else if(data[j].codePoint > codePoint) { 332 if (data[j].name.endsWith("Last>")) { 333 // build map data for all chars except last in range 334 while (codePoint < data[j].codePoint && k < result.length) { 335 result[k] = buildOne(codePoint, data[j], specialMaps); 336 ++k; 337 ++codePoint; 338 } 339 } 340 else { 341 // we have a few unassigned chars before data[j].codePoint 342 while (codePoint < data[j].codePoint && k < result.length) { 343 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 344 ++k; 345 ++codePoint; 346 } 347 } 348 k = 
data[j].codePoint & 0xFFFF; 349 codePoint = data[j].codePoint; 350 result[k] = buildOne(codePoint, data[j], specialMaps); 351 ++k; 352 ++codePoint; 353 } 354 else { 355 System.out.println("An error has occured during spec mapping."); 356 System.exit(0); 357 } 358 } 359 // if there are still unprocessed chars, process them 360 // as unassigned/undefined. 361 codePoint = (plane<<16) | k; 362 while (k < result.length) { 363 result[k] = buildOne(codePoint, nonCharSpec, specialMaps); 364 ++k; 365 ++codePoint; 366 } 367 // now add all extra supported properties from PropList, to the 368 // upper 16-bit 369 addExProp(result, propList, "Other_Lowercase", maskOtherLowercase); 370 addExProp(result, propList, "Other_Uppercase", maskOtherUppercase); 371 addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic); 372 addExProp(result, propList, "Ideographic", maskIdeographic); 373 //addExProp(result, propList, "Other_Math", maskOtherMath); 374 //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP); 375 addExProp(result, propList, "ID_Start", maskIDStart); 376 addExProp(result, propList, "ID_Continue", maskIDContinue); 377 378 return result; 379 } 380 381 // The maximum and minimum offsets found while scanning the database 382 static int maxOffsetSeen = 0; 383 static int minOffsetSeen = 0; 384 385 /** 386 * Some Unicode separator characters are not considered Java whitespace. 387 * @param c character to test 388 * @return true if c in an invalid Java whitespace character, false otherwise. 389 */ 390 static boolean isInvalidJavaWhiteSpace(int c) { 391 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF}; 392 boolean retValue = false; 393 for(int x=0;x<exceptions.length;x++) { 394 if(c == exceptions[x]) { 395 retValue = true; 396 break; 397 } 398 } 399 return retValue; 400 401 } 402 403 /** 404 * Given the character property data for one Unicode character, encode the data 405 * of interest into a single long integer value. 
(Right now only 32 bits
     * of this long value are used, but type long is used rather than int to facilitate
     * future extensions of this source code generator that might require more than
     * 32 bits to encode relevant character properties.)
     *
     * Besides computing the result, this method updates minOffsetSeen and
     * maxOffsetSeen with the case offset of every character it encodes, and may
     * print a warning (when verbose) or a failure message (when DEBUG).
     *
     * @param c   the character code for which to encode property data
     * @param us  property data record from the Unicode specification file
     *            (its character code might not be equal to c if it specifies data
     *            for a range of characters)
     * @param specialMaps special case mappings.
     *            NOTE(review): the body never reads this parameter; the lookup
     *            below uses the static field specialCaseMaps instead - confirm
     *            which one is intended.
     * @return   an encoded long value that contains the properties for a single char
     *
     * @see GenerateCharacter#buildMap
     */

    static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
        long resultA = 0;
        // record the general category
        resultA |= us.generalCategory;

        // record the numeric properties
        // Leaving NUMERIC skips the "strange numeric" marking below; leaving
        // STRANGE falls through to it.
        NUMERIC: {
        STRANGE: {
            int val = 0;
            // c is A-Z
            if ((c >= 0x0041) && (c <= 0x005A)) {
                val = c - 0x0041;
                resultA |= valueJavaSupradecimal;
            // c is a-z
            } else if ((c >= 0x0061) && (c <= 0x007A)) {
                val = c - 0x0061;
                resultA |= valueJavaSupradecimal;
            // c is a full-width A-Z
            } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
                val = c - 0xFF21;
                resultA |= valueJavaSupradecimal;
            // c is a full-width a-z
            } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
                val = c - 0xFF41;
                resultA |= valueJavaSupradecimal;
            } else if (us.isDecimalValue()) {
                val = us.decimalValue;
                resultA |= valueDigit;
            } else if (us.isDigitValue()) {
                val = us.digitValue;
                resultA |= valueDigit;
            } else {
                if (us.numericValue.length() == 0) {
                    break NUMERIC;                      // no numeric value at all
                } else {
                    try {
                        val = Integer.parseInt(us.numericValue);
                        // values that do not fit the 5-bit digit field are "strange"
                        if (val >= 32 || val < 0) break STRANGE;
                        // U+215F is always treated as a strange numeric
                        if (c == 0x215F) break STRANGE;
                    } catch(NumberFormatException e) {
                        // not a plain integer, so it cannot be stored as a digit offset
                        break STRANGE;
                    }
                    resultA |= valueDigit;
                }
            }
            if (val >= 32 || val < 0) break STRANGE;
            // store (val - c) masked to 5 bits, so that adding the character code
            // and masking with maskDigit again yields val
            resultA |= ((val - c & maskDigit) << shiftDigitOffset);
            break NUMERIC;
        } // end STRANGE
        resultA |= valueStrangeNumeric;
        } // end NUMERIC

        // record case mapping
        int offset = 0;
        // might have a 1:M mapping
        // NOTE(review): uses the static specialCaseMaps, not the specialMaps parameter.
        int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
        boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
        if (bHasUpper) {
            resultA |= maskUpperCase;
        }
        if (specialMap != -1) {
            // has mapping, but cannot record the
            // proper offset; can only flag it and provide special case
            // code in Character.java
            offset = -1;
        }
        else if (us.hasUpperMap()) {
            offset = c - us.upperMap;
        }

        if (us.hasLowerMap()) {
            resultA |= maskLowerCase;
            if (offset == 0)
                offset = us.lowerMap - c;
            else if (offset != (us.lowerMap - c)) {
                // only one offset can be stored; a mismatch is reported in DEBUG
                // builds and the upper-case offset is kept
                if (DEBUG) {
                    FAIL("Character " + hex(c) +
                         " has incompatible lowercase and uppercase mappings");
                }
            }
        }
        if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
            (bHasUpper && us.hasLowerMap())) {
            resultA |= maskTitleCase;
        }
        if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
            System.out.println("Warning: Character " + hex4(c) + " has upper but " +
                               "no title case; Java won't know this");
        }
        // track the extremes before out-of-range offsets are replaced below
        if (offset < minOffsetSeen) minOffsetSeen = offset;
        if (offset > maxOffsetSeen) maxOffsetSeen = offset;
        if (offset > maxOffset || offset < minOffset) {
            if (DEBUG) {
                FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
            }
            // an offset that does not fit is stored as maskCase (all field bits set)
            offset = maskCase;
        }
        resultA |= ((offset & maskCase) << shiftCaseOffset);

        // record lexical info about this character
        // (the first matching rule wins, so the order of these tests matters)
        if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
            || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
            || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
            || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
            || us.generalCategory == UnicodeSpec.OTHER_LETTER
            || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
            resultA |= valueJavaUnicodeStart;
        }
        else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
            || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
            || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
            resultA |= valueJavaUnicodePart;
        }
        else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
            resultA |= valueJavaStartUnicodePart;
        }
        else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
            resultA |= valueJavaOnlyStart;
        }
        else if (((c >= 0x0000) && (c <= 0x0008))
            || ((c >= 0x000E) && (c <= 0x001B))
            || ((c >= 0x007F) && (c <= 0x009F))
            || us.generalCategory == UnicodeSpec.FORMAT) {
            resultA |= valueIgnorable;
        }
        else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
            || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
            || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
            // separators count as Java whitespace unless explicitly excluded
            if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
        }
        else if (((c >= 0x0009) && (c <= 0x000D))
            || ((c >= 0x001C) && (c <= 0x001F))) {
            resultA |= valueJavaWhitespace;
        }

        // record bidi category
        if (!nobidi) {
            // categories that are missing (-1) or beyond DIRECTIONALITY_OTHER_NEUTRALS
            // are stored with all bidi bits set
            int tmpBidi =
                (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
                    us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
            resultA |= tmpBidi;
        }

        // record mirrored property
        if (!nomirror) {
            resultA |= us.mirrored ? maskMirrored : 0;
        }

        if (identifiers) {
            // identifier-only mode: everything computed above is replaced by a
            // two-bit summary (bitJavaStart, bitJavaPart)
            long replacement = 0;
            if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
                replacement |= bitJavaStart;
            }
            if ( ((resultA & nonzeroJavaPart) != 0)
                    && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
                replacement |= bitJavaPart;
            }
            resultA = replacement;
        }
        return resultA;
    }

    /**
     * ORs {@code mask} into the map entry of every code point that
     * {@code propList} lists for property {@code prop}.  Code points at or
     * beyond {@code map.length} are skipped, and nothing happens when the
     * property list returns null for {@code prop}.
     *
     * @param map      the property map to update in place
     * @param propList source of the code points carrying each property
     * @param prop     name of the property to look up
     * @param mask     bit(s) to set for each listed code point
     */
    static void addExProp(long[] map, PropList propList, String prop, long mask) {
        List<Integer> cps = propList.codepoints(prop);
        if (cps != null) {
            for (Integer cp : cps) {
                if (cp < map.length)
                    map[cp] |= mask;
            }
        }
    }

    /**
     * This is the heart of the table compression strategy.  The inputs are a map
     * and a number of bits (size).  The map is simply an array of long integer values;
     * the number of bits indicates how index values for that map are to be split.
     * The length of the given map must be a multiple of (1 << size).  The result is
     * a new map z and a compressed table t such that for every valid index value k
     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
     *
     * In other words, the index k can be split into two parts, namely the "size"
     * low-order bits and all the remaining high-order bits; the high-order bits are then
     * remapped by map z to produce an index into table t.  In effect, the data of the
     * original map m is broken up into blocks of size (1<<size); the compression relies
     * on the expectation that many of these blocks will be identical and therefore need
     * be represented only once in the compressed table t.
     *
     * This method is intended to be used iteratively.  The first map to be handed
     * to it is the one constructed by method buildMap.  After that, the first of the
     * two arrays returned by this method is fed back into it for further compression.
     * At the end of the iteration, one has a starter map and a sequence of tables.
610 * 611 * The algorithm used to implement this computation is straightforward and not 612 * especially clever. It uses brute-force linear search (the loop labeled MIDDLE) 613 * to locate identical blocks, so overall the time complexity of the algorithm 614 * is quadratic in the length of the input map. Fortunately, speed is not crucial 615 * to this application. 616 * 617 * @param map a map to be compressed 618 * @param size the number of index bits to be split off by the compression 619 * @return an array of length 2 containing two arrays; the first is a new map 620 * and the second is a compressed data table 621 * 622 * @see GenerateCharacter#buildMap 623 */ 624 625 static long[][] buildTable(long[] map, int size) { 626 int n = map.length; 627 if (((n >> size) << size) != n) { 628 FAIL("Length " + n + " is not a multiple of " + (1 << size)); 629 } 630 int m = 1 << size; 631 // We know the final length of the new map up front. 632 long[] newmap = new long[n >> size]; 633 // The buffer is used temporarily to hold data for the compressed table 634 // because we don't know its final length yet. 635 long[] buffer = new long[n]; 636 int ptr = 0; 637 OUTER: for (int i = 0; i < n; i += m) { 638 // For every block of size m in the original map... 639 MIDDLE: for (int j = 0; j < ptr; j += m) { 640 // Find out whether there is already a block just like it in the buffer. 641 for (int k = 0; k < m; k++) { 642 if (buffer[j+k] != map[i+k]) 643 continue MIDDLE; 644 } 645 // There is a block just like it at position j, so just 646 // put its index into the new map (thereby sharing it). 647 newmap[i >> size] = (j >> size); 648 continue OUTER; 649 } // end MIDDLE 650 // There is no block just like it already, so add it to 651 // the buffer and put its index into the new map. 
652 for (int k = 0; k < m; k++) { 653 buffer[ptr+k] = map[i+k]; 654 } 655 newmap[i >> size] = (ptr >> size); 656 ptr += m; 657 } // end OUTER 658 // Now we know how long the compressed table should be, 659 // so create a new array and copy data from the temporary buffer. 660 long[] newdata = new long[ptr]; 661 for (int j = 0; j < ptr; j++) { 662 newdata[j] = buffer[j]; 663 } 664 // Return the new map and the new data table. 665 long[][] result = { newmap, newdata }; 666 return result; 667 } 668 669 /** 670 * Once the compressed tables have been computed, this method reads in a 671 * template file for the source code to be generated and writes out the final 672 * source code by acting as a sort of specialized macro processor. 673 * 674 * The first output line is a comment saying that the file was automatically 675 * generated; it includes a timestamp. All other output is generated by 676 * reading a line from the template file, performing macro replacements, 677 * and then writing the resulting line or lines of code to the output file. 678 * 679 * This method handles the I/O, the timestamp comment, and the locating of 680 * macro calls within each input line. The method replaceCommand is called 681 * to generate replacement text for each macro call. 682 * 683 * Macro calls to be replaced are indicated in the template file by 684 * occurrences of the commandMarker "$$". The rest of the call may consist 685 * of Java letters (including the underscore "_") and also of balanced 686 * parentheses. 
687 * 688 * @param theTemplateFileName 689 * the file name for the template input file 690 * @param theOutputFileName 691 * the file name for the source code output file 692 * 693 * @see GenerateCharacter#replaceCommand 694 */ 695 696 static void generateCharacterClass(String theTemplateFileName, 697 String theOutputFileName) 698 throws FileNotFoundException, IOException { 699 BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName)); 700 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName))); 701 out.println(commentStart + 702 " This file was generated AUTOMATICALLY from a template file " + 703 new java.util.Date() + commentEnd); 704 int marklen = commandMarker.length(); 705 LOOP: while(true) { 706 try { 707 String line = in.readLine(); 708 if (line == null) break LOOP; 709 int pos = 0; 710 int depth = 0; 711 while ((pos = line.indexOf(commandMarker, pos)) >= 0) { 712 int newpos = pos + marklen; 713 char ch = 'x'; 714 SCAN: while (newpos < line.length() && 715 (Character.isJavaIdentifierStart(ch = line.charAt(newpos)) 716 || ch == '(' || (ch == ')' && depth > 0))) { 717 ++newpos; 718 if (ch == '(') { 719 ++depth; 720 } 721 else if (ch == ')') { 722 --depth; 723 if (depth == 0) 724 break SCAN; 725 } 726 } 727 String replacement = replaceCommand(line.substring(pos + marklen, newpos)); 728 line = line.substring(0, pos) + replacement + line.substring(newpos); 729 pos += replacement.length(); 730 } 731 out.println(line); 732 } 733 catch (IOException e) { 734 break LOOP; 735 } 736 } 737 in.close(); 738 out.close(); 739 } 740 741 /** 742 * The replaceCommand method takes a command (a macro call without the 743 * leading marker "$$") and computes replacement text for it. 744 * 745 * Most of the commands are simply names of integer constants that are defined 746 * in the source code of this GenerateCharacter class. 
The replacement text is
 * simply the value of the constant as an appropriately formatted integer literal.
 *
 * A few cases are more complicated, however.  The command "Tables" causes the
 * final map and compressed tables to be emitted, with elaborate comments
 * describing their contents.  (This is actually handled by method genTables.)
 * The command "Initializers" is replaced by the initializer code that has been
 * accumulated so far.  (This is handled by method genInitializers.)
 * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
 * an expression that will return the character property data for the character
 * whose code is the value of the variable "xxx".  The command "LookupEx(xxx)"
 * does the same against table "B".  (Both are handled by method genAccess.)
 * Finally, the long names of the Unicode general categories and bidirectional
 * categories are replaced by their numeric codes.
 *
 * @param x a command from the template file to be replaced
 * @return the replacement text, as a String; if the command is not recognized,
 *         FAIL is called and the marker followed by the command is returned
 *
 * @see GenerateCharacter#genTables
 * @see GenerateCharacter#genAccess
 * @see GenerateCharacter#generateCharacterClass
 */

static String replaceCommand(String x) {
    if (x.equals("Tables")) return genTables();
    if (x.equals("Initializers")) return genInitializers();
    // The length guards require at least one character between the parentheses.
    if (x.length() >= 9 && x.startsWith("Lookup(") && x.endsWith(")"))
        return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
    if (x.length() >= 11 && x.startsWith("LookupEx(") && x.endsWith(")"))
        return genAccess("B", x.substring(9, x.length()-1), 16);
    if (x.equals("shiftType")) return Long.toString(shiftType);
    if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
    if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
    if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
    if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
    if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
    if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
    if (x.equals("maskCase")) return "0x" + hex8(maskCase);
    if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
    if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
    if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
    if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
    // The "Other*"/ID masks are emitted from the upper 32 bits of the property word.
    if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
    if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
    if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
    if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
    if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
    if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
    if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
    if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
    if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
    if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
    if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
    if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
    if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
    if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
    if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
    if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
    if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
    if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
    if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
    if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
    if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
    if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
    if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
    if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
    if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
    if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
    if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
    if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
    if (x.equals("maskType")) return "0x" + hex(maskType);
    if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
    if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
    if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
    // Long category names map to their numeric codes; only the codes listed in
    // the tables below are recognized, and they are tried in table order.
    for (int category : GENERAL_CATEGORY_MARKERS) {
        if (x.equals(UnicodeSpec.generalCategoryList[category][UnicodeSpec.LONG]))
            return Integer.toString(category);
    }
    for (int directionality : BIDI_CATEGORY_MARKERS) {
        if (x.equals(UnicodeSpec.bidiCategoryList[directionality][UnicodeSpec.LONG]))
            return Integer.toString(directionality);
    }
    FAIL("Unknown text substitution marker " + commandMarker + x);
    return commandMarker + x;
}

/** General category codes whose long names replaceCommand accepts, in lookup order. */
private static final int[] GENERAL_CATEGORY_MARKERS = {
    UnicodeSpec.UNASSIGNED,
    UnicodeSpec.UPPERCASE_LETTER,
    UnicodeSpec.LOWERCASE_LETTER,
    UnicodeSpec.TITLECASE_LETTER,
    UnicodeSpec.MODIFIER_LETTER,
    UnicodeSpec.OTHER_LETTER,
    UnicodeSpec.NON_SPACING_MARK,
    UnicodeSpec.ENCLOSING_MARK,
    UnicodeSpec.COMBINING_SPACING_MARK,
    UnicodeSpec.DECIMAL_DIGIT_NUMBER,
    UnicodeSpec.OTHER_NUMBER,
    UnicodeSpec.SPACE_SEPARATOR,
    UnicodeSpec.LINE_SEPARATOR,
    UnicodeSpec.PARAGRAPH_SEPARATOR,
    UnicodeSpec.CONTROL,
    UnicodeSpec.FORMAT,
    UnicodeSpec.PRIVATE_USE,
    UnicodeSpec.SURROGATE,
    UnicodeSpec.DASH_PUNCTUATION,
    UnicodeSpec.START_PUNCTUATION,
    UnicodeSpec.END_PUNCTUATION,
    UnicodeSpec.INITIAL_QUOTE_PUNCTUATION,
    UnicodeSpec.FINAL_QUOTE_PUNCTUATION,
    UnicodeSpec.CONNECTOR_PUNCTUATION,
    UnicodeSpec.OTHER_PUNCTUATION,
    UnicodeSpec.LETTER_NUMBER,
    UnicodeSpec.MATH_SYMBOL,
    UnicodeSpec.CURRENCY_SYMBOL,
    UnicodeSpec.MODIFIER_SYMBOL,
    UnicodeSpec.OTHER_SYMBOL,
};

/** Bidirectional category codes whose long names replaceCommand accepts, in lookup order. */
private static final int[] BIDI_CATEGORY_MARKERS = {
    UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT,
    UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
    UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
    UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT,
    UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
    UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
    UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
    UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT,
    UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER,
    UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
    UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
    UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER,
    UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
    UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK,
    UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL,
    UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR,
    UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR,
    UnicodeSpec.DIRECTIONALITY_WHITESPACE,
    UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS,
    UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE,
    UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE,
    UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE,
    UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE,
};

/**
 * The genTables method generates source code for all the lookup tables
 * needed to represent the various Unicode character properties.
 * It simply calls the method genTable once for each table to be generated
 * and then generates a summary comment.
935 * 936 * @return the replacement text for the "Tables" command, as a String 937 * 938 * @see GenerateCharacter#genTable 939 * @see GenerateCharacter#replaceCommand 940 */ 941 static String genTables() { 942 int n = sizes.length; 943 StringBuffer result = new StringBuffer(); 944 // liu : Add a comment showing the source of this table 945 result.append(commentStart + " The following tables and code generated using:" + 946 commentEnd + "\n "); 947 result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n "); 948 949 if (plane == 0 && bLatin1 == false) { 950 genCaseMapTableDeclaration(result); 951 genCaseMapTable(initializers, specialCaseMaps); 952 } 953 int totalBytes = 0; 954 for (int k = 0; k < n - 1; k++) { 955 genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k], 956 sizes[k+1], false, false, k==0); 957 int s = bytes[k]; 958 if (s == 1 && useCharForByte) { 959 s = 2; 960 } 961 totalBytes += tables[k].length * s; 962 } 963 genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32), 964 sizes[n - 1], false, 0, true, !(identifiers), false); 965 966 // If we ever need more than 32 bits to represent the character properties, 967 // then a table "B" may be needed as well. 968 genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false); 969 970 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2); 971 result.append(commentStart); 972 result.append(" In all, the character property tables require "); 973 result.append(totalBytes).append(" bytes.").append(commentEnd); 974 if (verbose) { 975 System.out.println("The character property tables require " 976 + totalBytes + " bytes."); 977 } 978 return result.toString(); 979 } 980 981 /** 982 * The genInitializers method generates the body of the 983 * ensureInitted() method, which enables lazy initialization of 984 * the case map table and other tables. 
985 */ 986 static String genInitializers() { 987 return initializers.toString(); 988 } 989 990 /** 991 * Return the total number of bytes needed by all tables. This is a stripped- 992 * down copy of genTables(). 993 */ 994 static int getTotalBytes() { 995 int n = sizes.length; 996 int totalBytes = 0; 997 for (int k = 0; k < n - 1; k++) { 998 totalBytes += tables[k].length * bytes[k]; 999 } 1000 totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) 1001 + 31) >> 5) << 2); 1002 return totalBytes; 1003 } 1004 1005 static void appendEscapedStringFragment(StringBuffer result, 1006 char[] line, 1007 int length, 1008 boolean lastFragment) { 1009 result.append(" \""); 1010 for (int k=0; k<length; ++k) { 1011 result.append("\\u"); 1012 result.append(hex4(line[k])); 1013 } 1014 result.append("\""); 1015 result.append(lastFragment ? ";" : "+"); 1016 result.append("\n"); 1017 } 1018 1019 static String SMALL_INITIALIZER = 1020 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1021 // " $$name = new $$type[$$size];\n"+ 1022 " int len = $$name_DATA.length();\n"+ 1023 " int j=0;\n"+ 1024 " for (int i=0; i<len; ++i) {\n"+ 1025 " int c = $$name_DATA.charAt(i);\n"+ 1026 " for (int k=0; k<$$entriesPerChar; ++k) {\n"+ 1027 " $$name[j++] = ($$type)c;\n"+ 1028 " c >>= $$bits;\n"+ 1029 " }\n"+ 1030 " }\n"+ 1031 " assert (j == $$size);\n"+ 1032 " }\n"; 1033 1034 static String SAME_SIZE_INITIALIZER = 1035 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1036 " assert ($$name_DATA.length() == $$size);\n"+ 1037 // " $$name = new $$type[$$size];\n"+ 1038 " for (int i=0; i<$$size; ++i)\n"+ 1039 " $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+ 1040 " }\n"; 1041 1042 static String BIG_INITIALIZER = 1043 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1044 // " $$name = new $$type[$$size];\n"+ 1045 " int len = $$name_DATA.length();\n"+ 1046 " int j=0;\n"+ 1047 " int charsInEntry=0;\n"+ 1048 " $$type entry=0;\n"+ 1049 " for (int 
i=0; i<len; ++i) {\n"+ 1050 " entry |= $$name_DATA.charAt(i);\n"+ 1051 " if (++charsInEntry == $$charsPerEntry) {\n"+ 1052 " $$name[j++] = entry;\n"+ 1053 " entry = 0;\n"+ 1054 " charsInEntry = 0;\n"+ 1055 " }\n"+ 1056 " else {\n"+ 1057 " entry <<= 16;\n"+ 1058 " }\n"+ 1059 " }\n"+ 1060 " assert (j == $$size);\n"+ 1061 " }\n"; 1062 1063 static String INT32_INITIALIZER = 1064 " { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+ 1065 " char[] data = $$name_DATA.toCharArray();\n"+ 1066 " assert (data.length == ($$size * 2));\n"+ 1067 " int i = 0, j = 0;\n"+ 1068 " while (i < ($$size * 2)) {\n"+ 1069 " int entry = data[i++] << 16;\n"+ 1070 " $$name[j++] = entry | data[i++];\n"+ 1071 " }\n"+ 1072 " }\n"; 1073 1074 static void addInitializer(String name, String type, int entriesPerChar, 1075 int bits, int size) { 1076 1077 String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER : 1078 ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER); 1079 if (entriesPerChar == -2) { 1080 template = INT32_INITIALIZER; 1081 } 1082 int marklen = commandMarker.length(); 1083 int pos = 0; 1084 while ((pos = template.indexOf(commandMarker, pos)) >= 0) { 1085 int newpos = pos + marklen; 1086 char ch = 'x'; 1087 while (newpos < template.length() && 1088 Character.isJavaIdentifierStart(ch = template.charAt(newpos)) && 1089 ch != '_') // Don't allow this in token names 1090 ++newpos; 1091 String token = template.substring(pos+marklen, newpos); 1092 String replacement = "ERROR"; 1093 1094 if (token.equals("name")) replacement = name; 1095 else if (token.equals("type")) replacement = type; 1096 else if (token.equals("bits")) replacement = ""+bits; 1097 else if (token.equals("size")) replacement = ""+size; 1098 else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar; 1099 else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar); 1100 else FAIL("Unrecognized token: " + token); 1101 1102 template = template.substring(0, pos) + 
replacement + template.substring(newpos); 1103 pos += replacement.length(); 1104 } 1105 initializers.append(template); 1106 } 1107 1108 /** 1109 * The genTable method generates source code for one lookup table. 1110 * Most of the complexity stems from handling various options as to 1111 * the type of the array components, the precise representation of the 1112 * values, the format in which to render each value, the number of values 1113 * to emit on each line of source code, and the kinds of useful comments 1114 * to be generated. 1115 * 1116 * @param result a StringBuffer, to which the generated source code 1117 * text is to be appended 1118 * @param name the name of the table 1119 * @param table the table data (an array of long values) 1120 * @param extract a distance, in bits, by which each entry of the table 1121 * is to be right-shifted before it is processed 1122 * @param bits the number of bits (not bytes) to be used to represent 1123 * each table entry 1124 * @param size the table data is divided up into blocks of size (1<<size); 1125 * in this method, this information is used only to affect 1126 * how many table values are to be generated per line 1127 * @param preshifted if this flag is true, then the table entries are to be 1128 * emitted in a preshifted form; that is, each value should 1129 * be left-shifted by the amount "shift", so that this work 1130 * is built into the table and need not be performed by an 1131 * explicit shift operator at run time 1132 * @param shift this is the shift amount for preshifting of table entries 1133 * @param hexFormat if this flag is true, table entries should be emitted as 1134 * hexadecimal literals; otherwise decimal literals are used 1135 * @param properties if this flag is true, the table entries are encoded 1136 * character properties rather than indexes into yet other tables; 1137 * therefore comments describing the encoded properties should 1138 * be generated 1139 * @param hexComment if this flag is true, each 
line of output is labelled with 1140 * a hexadecimal comment indicating the character values to 1141 * which that line applies; otherwise, decimal values indicating 1142 * table indices are generated 1143 * 1144 * @see GenerateCharacter#genTables 1145 * @see GenerateCharacter#replaceCommand 1146 */ 1147 1148 static void genTable(StringBuffer result, String name, 1149 long[] table, int extract, int bits, int size, 1150 boolean preshifted, int shift, boolean hexFormat, 1151 boolean properties, boolean hexComment) { 1152 1153 String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") : 1154 bits == 2 ? (Csyntax ? "unsigned long" : "int") : 1155 bits == 4 ? (Csyntax ? "unsigned long" : "int") : 1156 bits == 8 ? (Csyntax ? "unsigned char" : "byte") : 1157 bits == 16 ? (Csyntax ? "unsigned short" : "char") : 1158 bits == 32 ? (Csyntax ? "unsigned long" : "int") : 1159 (Csyntax ? "int64" : "long"); 1160 long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu 1161 bits == 2 ? Integer.MAX_VALUE : 1162 bits == 4 ? Integer.MAX_VALUE : 1163 bits == 8 ? Byte.MAX_VALUE : 1164 bits == 16 ? Short.MAX_VALUE : 1165 bits == 32 ? Integer.MAX_VALUE : 1166 Long.MAX_VALUE; 1167 int entriesPerChar = bits <= 16 ? 
(16 / bits) : -(bits / 16); 1168 boolean shiftEntries = preshifted && shift != 0; 1169 if (bits == 8 && tableAsString && useCharForByte) { 1170 atype = "char"; 1171 maxPosEntry = Character.MAX_VALUE; 1172 entriesPerChar = 1; 1173 } 1174 boolean noConversion = atype.equals("char"); 1175 1176 result.append(commentStart); 1177 result.append(" The ").append(name).append(" table has ").append(table.length); 1178 result.append(" entries for a total of "); 1179 int sizeOfTable = ((table.length * bits + 31) >> 5) << 2; 1180 if (bits == 8 && useCharForByte) { 1181 sizeOfTable *= 2; 1182 } 1183 result.append(sizeOfTable); 1184 result.append(" bytes.").append(commentEnd).append("\n\n"); 1185 if (Csyntax) 1186 result.append(" static "); 1187 else 1188 result.append(" static final "); 1189 result.append(atype); 1190 result.append(" ").append(name).append("["); 1191 if (Csyntax) 1192 result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0)); 1193 if (tableAsString) { 1194 if (noConversion) { 1195 result.append("] = (\n"); 1196 } else { 1197 result.append("] = new ").append(atype).append("["+table.length+"];\n "); 1198 result.append("static final String ").append(name).append("_DATA =\n"); 1199 } 1200 int CHARS_PER_LINE = 8; 1201 StringBuffer theString = new StringBuffer(); 1202 int entriesInCharSoFar = 0; 1203 char ch = '\u0000'; 1204 int charsPerEntry = -entriesPerChar; 1205 for (int j=0; j<table.length; ++j) { 1206 //long entry = table[j] >> extract; 1207 long entry; 1208 if ("A".equals(name)) 1209 entry = (table[j] & 0xffffffffL) >> extract; 1210 else 1211 entry = (table[j] >> extract); 1212 if (shiftEntries) entry <<= shift; 1213 if (entry >= (1L << bits)) { 1214 FAIL("Entry too big"); 1215 } 1216 if (entriesPerChar > 0) { 1217 // Pack multiple entries into a character 1218 ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits)); 1219 ++entriesInCharSoFar; 1220 if (entriesInCharSoFar == entriesPerChar) { 1221 // Character is full 1222 
theString.append(ch); 1223 entriesInCharSoFar = 0; 1224 ch = '\u0000'; 1225 } 1226 } 1227 else { 1228 // Use multiple characters per entry 1229 for (int k=0; k<charsPerEntry; ++k) { 1230 ch = (char)(entry >> ((charsPerEntry-1)*16)); 1231 entry <<= 16; 1232 theString.append(ch); 1233 } 1234 } 1235 } 1236 if (entriesInCharSoFar > 0) { 1237 while (entriesInCharSoFar < entriesPerChar) { 1238 ch = (char)((int)ch >> bits); 1239 ++entriesInCharSoFar; 1240 } 1241 theString.append(ch); 1242 entriesInCharSoFar = 0; 1243 } 1244 result.append(Utility.formatForSource(theString.toString(), " ")); 1245 if (noConversion) { 1246 result.append(").toCharArray()"); 1247 } 1248 result.append(";\n\n "); 1249 1250 if (!noConversion) { 1251 addInitializer(name, atype, entriesPerChar, bits, table.length); 1252 } 1253 } 1254 else { 1255 result.append("] = {"); 1256 boolean castEntries = shiftEntries && (bits < 32); 1257 int printPerLine = hexFormat ? (bits == 1 ? 32*4 : 1258 bits == 2 ? 16*4 : 1259 bits == 4 ? 8*4 : 1260 bits == 8 ? 8 : 1261 bits == 16 ? 8 : 1262 bits == 32 ? 4 : 2) : 1263 (bits == 8 ? 8 : 1264 bits == 16 ? 8 : 4); 1265 int printMask = properties ? 0 : 1266 Math.min(1 << size, 1267 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1; 1268 int commentShift = ((1 << size) == table.length) ? 0 : size; 1269 int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1; 1270 long val = 0; 1271 for (int j = 0; j < table.length; j++) { 1272 if ((j & printMask) == 0) { 1273 while (result.charAt(result.length() - 1) == ' ') 1274 result.setLength(result.length() - 1); 1275 result.append("\n "); 1276 } 1277 PRINT: { 1278 if (castEntries) 1279 result.append("(").append(atype).append(")("); 1280 long entry = table[j] >> extract; 1281 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 
3 : 2)) - 1); 1282 int k = j & packMask; 1283 if (bits >= 8) 1284 val = entry; 1285 else if (k == 0) { 1286 val = entry; 1287 break PRINT; 1288 } 1289 else { 1290 val |= (entry << (k*bits)); 1291 if (k != packMask) 1292 break PRINT; 1293 } 1294 if (val > maxPosEntry && !Csyntax) { // liu 1295 // For values that are out of range, convert them to in-range negative values. 1296 // Actually, output the '-' and convert them to the negative of the corresponding 1297 // in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126. 1298 result.append('-'); 1299 val = maxPosEntry + maxPosEntry + 2 - val; 1300 } 1301 if (hexFormat) { 1302 result.append("0x"); 1303 if (bits == 8) 1304 result.append(hex2((byte)val)); 1305 else if (bits == 16) 1306 result.append(hex4((short)val)); 1307 else if (bits == 32 || bits < 8) 1308 result.append(hex8((int)val)); 1309 else { 1310 result.append(hex16(val)); 1311 if (!Csyntax) 1312 result.append("L"); 1313 } 1314 } 1315 else { 1316 if (bits == 8) 1317 result.append(dec3(val)); 1318 else if (bits == 64) { 1319 result.append(dec5(val)); 1320 if (!Csyntax) 1321 result.append("L"); 1322 } 1323 else 1324 result.append(dec5(val)); 1325 } 1326 if (shiftEntries) 1327 result.append("<<").append(shift); 1328 if (castEntries) result.append(")"); 1329 if (j < (table.length - 1)) 1330 result.append(", "); 1331 else 1332 result.append(" "); 1333 if ((j & printMask) == printMask) { 1334 result.append(" ").append(commentStart).append(" "); 1335 if (hexComment) 1336 result.append("0x").append(hex4((j & ~commentMask) << (16 - size))); 1337 else 1338 result.append(dec3((j & ~commentMask) >> commentShift)); 1339 if (properties) propertiesComments(result, val); 1340 result.append(commentEnd); 1341 } 1342 } // end PRINT 1343 } 1344 result.append("\n };\n\n "); 1345 } 1346 } 1347 1348 static void genCaseMapTableDeclaration(StringBuffer result) { 1349 String myTab = " "; 1350 result.append(myTab + "static final char[][][] charMap;\n"); 1351 } 1352 1353 
static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){ 1354 String myTab = " "; 1355 int ch; 1356 char[] map; 1357 result.append(myTab + "charMap = new char[][][] {\n"); 1358 for (int x = 0; x < specialCaseMaps.length; x++) { 1359 ch = specialCaseMaps[x].getCharSource(); 1360 map = specialCaseMaps[x].getUpperCaseMap(); 1361 result.append(myTab + myTab); 1362 result.append("{ "); 1363 result.append("{\'\\u"+hex4(ch)+"\'}, {"); 1364 for (int y = 0; y < map.length; y++) { 1365 result.append("\'\\u"+hex4(map[y])+"\', "); 1366 } 1367 result.append("} },\n"); 1368 } 1369 result.append(myTab + "};\n"); 1370 1371 } 1372 1373 /** 1374 * The propertiesComments method generates comments describing encoded 1375 * character properties. 1376 * 1377 * @param result a StringBuffer, to which the generated source code 1378 * text is to be appended 1379 * @param val encoded character properties 1380 * 1381 * @see GenerateCharacter#genTable 1382 */ 1383 1384 static void propertiesComments(StringBuffer result, long val) { 1385 result.append(" "); 1386 switch ((int)(val & maskType)) { 1387 case UnicodeSpec.CONTROL: 1388 result.append("Cc"); 1389 break; 1390 case UnicodeSpec.FORMAT: 1391 result.append("Cf"); 1392 break; 1393 case UnicodeSpec.PRIVATE_USE: 1394 result.append("Co"); 1395 break; 1396 case UnicodeSpec.SURROGATE: 1397 result.append("Cs"); 1398 break; 1399 case UnicodeSpec.LOWERCASE_LETTER: 1400 result.append("Ll"); 1401 break; 1402 case UnicodeSpec.MODIFIER_LETTER: 1403 result.append("Lm"); 1404 break; 1405 case UnicodeSpec.OTHER_LETTER: 1406 result.append("Lo"); 1407 break; 1408 case UnicodeSpec.TITLECASE_LETTER: 1409 result.append("Lt"); 1410 break; 1411 case UnicodeSpec.UPPERCASE_LETTER: 1412 result.append("Lu"); 1413 break; 1414 case UnicodeSpec.COMBINING_SPACING_MARK: 1415 result.append("Mc"); 1416 break; 1417 case UnicodeSpec.ENCLOSING_MARK: 1418 result.append("Me"); 1419 break; 1420 case UnicodeSpec.NON_SPACING_MARK: 1421 
result.append("Mn"); 1422 break; 1423 case UnicodeSpec.DECIMAL_DIGIT_NUMBER: 1424 result.append("Nd"); 1425 break; 1426 case UnicodeSpec.LETTER_NUMBER: 1427 result.append("Nl"); 1428 break; 1429 case UnicodeSpec.OTHER_NUMBER: 1430 result.append("No"); 1431 break; 1432 case UnicodeSpec.CONNECTOR_PUNCTUATION: 1433 result.append("Pc"); 1434 break; 1435 case UnicodeSpec.DASH_PUNCTUATION: 1436 result.append("Pd"); 1437 break; 1438 case UnicodeSpec.END_PUNCTUATION: 1439 result.append("Pe"); 1440 break; 1441 case UnicodeSpec.OTHER_PUNCTUATION: 1442 result.append("Po"); 1443 break; 1444 case UnicodeSpec.START_PUNCTUATION: 1445 result.append("Ps"); 1446 break; 1447 case UnicodeSpec.CURRENCY_SYMBOL: 1448 result.append("Sc"); 1449 break; 1450 case UnicodeSpec.MODIFIER_SYMBOL: 1451 result.append("Sk"); 1452 break; 1453 case UnicodeSpec.MATH_SYMBOL: 1454 result.append("Sm"); 1455 break; 1456 case UnicodeSpec.OTHER_SYMBOL: 1457 result.append("So"); 1458 break; 1459 case UnicodeSpec.LINE_SEPARATOR: 1460 result.append("Zl"); break; 1461 case UnicodeSpec.PARAGRAPH_SEPARATOR: 1462 result.append("Zp"); 1463 break; 1464 case UnicodeSpec.SPACE_SEPARATOR: 1465 result.append("Zs"); 1466 break; 1467 case UnicodeSpec.UNASSIGNED: 1468 result.append("unassigned"); 1469 break; 1470 } 1471 1472 switch ((int)((val & maskBidi) >> shiftBidi)) { 1473 case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT: 1474 result.append(", L"); 1475 break; 1476 case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT: 1477 result.append(", R"); 1478 break; 1479 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER: 1480 result.append(", EN"); 1481 break; 1482 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR: 1483 result.append(", ES"); 1484 break; 1485 case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR: 1486 result.append(", ET"); 1487 break; 1488 case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER: 1489 result.append(", AN"); 1490 break; 1491 case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR: 1492 result.append(", 
CS"); 1493 break; 1494 case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR: 1495 result.append(", B"); 1496 break; 1497 case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR: 1498 result.append(", S"); 1499 break; 1500 case UnicodeSpec.DIRECTIONALITY_WHITESPACE: 1501 result.append(", WS"); 1502 break; 1503 case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS: 1504 result.append(", ON"); 1505 break; 1506 } 1507 if ((val & maskUpperCase) != 0) { 1508 result.append(", hasUpper (subtract "); 1509 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1510 } 1511 if ((val & maskLowerCase) != 0) { 1512 result.append(", hasLower (add "); 1513 result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")"); 1514 } 1515 if ((val & maskTitleCase) != 0) { 1516 result.append(", hasTitle"); 1517 } 1518 if ((val & maskIdentifierInfo) == valueIgnorable) { 1519 result.append(", ignorable"); 1520 } 1521 if ((val & maskIdentifierInfo) == valueJavaUnicodePart) { 1522 result.append(", identifier part"); 1523 } 1524 if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) { 1525 result.append(", underscore"); 1526 } 1527 if ((val & maskIdentifierInfo) == valueJavaWhitespace) { 1528 result.append(", whitespace"); 1529 } 1530 if ((val & maskIdentifierInfo) == valueJavaOnlyStart) { 1531 result.append(", currency"); 1532 } 1533 if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) { 1534 result.append(", identifier start"); 1535 } 1536 if ((val & maskNumericType) == valueDigit) { 1537 result.append(", decimal "); 1538 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1539 } 1540 if ((val & maskNumericType) == valueStrangeNumeric) { 1541 result.append(", strange"); 1542 } 1543 if ((val & maskNumericType) == valueJavaSupradecimal) { 1544 result.append(", supradecimal "); 1545 result.append((val & maskDigitOffset) >> shiftDigitOffset); 1546 } 1547 } 1548 1549 static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" }; 1550 1551 static 
String tableName(int j) { return tableNames[j]; } 1552 1553 /** 1554 * The genAccess method generates source code for one table access expression. 1555 * 1556 * Most of the complexity stems from handling various options as to 1557 * table representation, such as whether it contains values so large that 1558 * they are represented as negative values and whether the table values are 1559 * preshifted. This method also avoids such "ugly" expressions as shifting 1560 * by distance zero, masking when no masking is necessary, and so on. 1561 * For clarity, it generates expressions that do not rely on operator 1562 * precedence, but otherwise it avoids generating redundant parentheses. 1563 * 1564 * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]] 1565 * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example. 1566 * 1567 * @param tbl the name of the final table to be accessed 1568 * @param var the variable name that appeared in parentheses in the 1569 * "Lookup" command 1570 * @param bits the number of bits (not bytes) to be used to represent 1571 * the final table entry 1572 * @return the replacement text for the "Lookup(xxx)" command, as a String 1573 * 1574 * @see GenerateCharacter#replaceCommand 1575 */ 1576 1577 static String genAccess(String tbl, String var, int bits) { 1578 String access = null; 1579 int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0; 1580 for (int k = 0; k < sizes.length; k++) { 1581 int offset = ((k < sizes.length - 1) ? 0 : bitoffset); 1582 int shift = shifts[k] + offset; 1583 String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")"; 1584 int mask = (1 << (sizes[k] - offset)) - 1; 1585 String masked = (k == 0) ? shifted : 1586 "(" + shifted + "&0x" + hex(mask) + ")"; 1587 String index = (k == 0) ? masked : 1588 (mask == 0) ? access : "(" + access + "|" + masked + ")"; 1589 String indexNoParens = (index.charAt(0) != '(') ? 
index : 1590 index.substring(1, index.length() - 1); 1591 String tblname = (k == sizes.length - 1) ? tbl : tableName(k); 1592 String fetched = tblname + "[" + indexNoParens + "]"; 1593 String zeroextended = (zeroextend[k] == 0) ? fetched : 1594 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")"; 1595 int adjustment = preshifted[k] ? 0 : 1596 sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0); 1597 String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended : 1598 "(" + zeroextended + "<<" + adjustment + ")"; 1599 String bitshift = (bits == 1) ? "(" + var + "&0x1F)" : 1600 (bits == 2) ? "((" + var + "&0xF)<<1)" : 1601 (bits == 4) ? "((" + var + "&7)<<2)" : null; 1602 String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted : 1603 "((" + adjusted + ">>" + bitshift + ")&" + 1604 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")"; 1605 access = extracted; 1606 } 1607 return access; 1608 } 1609 1610 /* The command line arguments are decoded and used to set the following 1611 global variables. 1612 */ 1613 1614 static boolean verbose = false; 1615 static boolean nobidi = false; 1616 static boolean nomirror = false; 1617 static boolean identifiers = false; 1618 static boolean Csyntax = false; 1619 static String TemplateFileName = null; 1620 static String OutputFileName = null; 1621 static String UnicodeSpecFileName = null; // liu 1622 static String SpecialCasingFileName = null; 1623 static String PropListFileName = null; 1624 static String DerivedPropsFileName = null; 1625 static boolean useCharForByte = false; 1626 static int[] sizes; 1627 static int bins = 0; // liu; if > 0, then perform search 1628 static boolean tableAsString = false; 1629 static boolean bLatin1 = false; 1630 1631 static String commandLineDescription; 1632 1633 /* Other global variables, equal in length to the "sizes" array. 
*/ 1634 1635 static int[] shifts; 1636 static int[] zeroextend; 1637 static int[] bytes; 1638 static boolean[] preshifted; 1639 static long[][] tables; 1640 1641 1642 /* Other global variables */ 1643 static String commentStart; 1644 static String commentEnd; 1645 1646 static StringBuffer initializers = new StringBuffer(); 1647 1648 /* special casing rules for 1:M toUpperCase mappings */ 1649 static SpecialCaseMap[] specialCaseMaps; 1650 1651 /** 1652 * Process the command line arguments. 1653 * 1654 * The allowed flags in command line are: 1655 * <dl> 1656 * <dt> -verbose <dd> Emit comments to standard output describing 1657 * what's going on during the processing. 1658 * <dt> -nobidi <dd> Do not include bidi categories in the 1659 * encoded character properties. 1660 * <dt> -nomirror <dd> Do no include mirror property in the encoded 1661 * character properties. 1662 * <dt> -identifiers <dd> Generate tables for scanning identifiers only. 1663 * <dt> -c <dd> Output code in C syntax instead of Java syntax. 1664 * <dt> -o filename <dd> Specify output file name. 1665 * <dt> -template filename <dd> Specify template input file name. 1666 * <dt> -spec filename <dd> Specify Unicode spec file name. 1667 * <dt> -specialcasing filename <dd> Specify Unicode special casing file name. 1668 * <dt> -search bins <dd> Try different partitions into the specified 1669 * number of bins. E.g., for 2 bins, try 1670 * 16 0, 15 1,..., 0 16. 1671 * <dt> -string <dd> Create table as string. Only valid with Java 1672 * syntax. 1673 * <dt> -latin1 <dd> Create a latin 1 only property table. 1674 * </dl> 1675 * In addition, decimal literals may appear as command line arguments; 1676 * each one represents the number of bits of the character to be broken 1677 * off at each lookup step. If present, they must add up to 16 (the number 1678 * of bits in a char value). For smaller tables, the last value should 1679 * be 0; values other than the last one may not be zero. 
If no such 1680 * numeric values are provided, default values are used. 1681 * 1682 * @param args the command line arguments, as an array of String 1683 * 1684 * @see GenerateCharacter#main 1685 */ 1686 1687 static void processArgs(String[] args) { 1688 StringBuffer desc = new StringBuffer("java GenerateCharacter"); 1689 for (int j=0; j<args.length; ++j) { 1690 desc.append(" " + args[j]); 1691 } 1692 for (int j = 0; j < args.length; j++) { 1693 if (args[j].equals("-verbose") || args[j].equals("-v")) 1694 verbose = true; 1695 else if (args[j].equals("-nobidi")) 1696 nobidi = true; 1697 else if (args[j].equals("-nomirror")) 1698 nomirror = true; 1699 else if (args[j].equals("-identifiers")) 1700 identifiers = true; 1701 else if (args[j].equals("-c")) 1702 Csyntax = true; 1703 else if (args[j].equals("-string")) 1704 tableAsString = true; 1705 else if (args[j].equals("-o")) { 1706 if (j == args.length - 1) { 1707 FAIL("File name missing after -o"); 1708 } 1709 else { 1710 OutputFileName = args[++j]; 1711 } 1712 } 1713 else if (args[j].equals("-search")) { 1714 if (j == args.length - 1) 1715 FAIL("Bin count missing after -search"); 1716 else { 1717 bins = Integer.parseInt(args[++j]); 1718 if (bins < 1 || bins > 10) 1719 FAIL("Bin count must be >= 1 and <= 10"); 1720 } 1721 } 1722 else if (args[j].equals("-template")) { 1723 if (j == args.length - 1) 1724 FAIL("File name missing after -template"); 1725 else 1726 TemplateFileName = args[++j]; 1727 } 1728 else if (args[j].equals("-spec")) { // liu 1729 if (j == args.length - 1) { 1730 FAIL("File name missing after -spec"); 1731 } 1732 else { 1733 UnicodeSpecFileName = args[++j]; 1734 } 1735 } 1736 else if (args[j].equals("-specialcasing")) { 1737 if (j == args.length -1) { 1738 FAIL("File name missing after -specialcasing"); 1739 } 1740 else { 1741 SpecialCasingFileName = args[++j]; 1742 } 1743 } 1744 else if (args[j].equals("-proplist")) { 1745 if (j == args.length -1) { 1746 FAIL("File name missing after -proplist"); 
1747 } 1748 else { 1749 PropListFileName = args[++j]; 1750 } 1751 } 1752 else if (args[j].equals("-derivedprops")) { 1753 if (j == args.length -1) { 1754 FAIL("File name missing after -derivedprops"); 1755 } 1756 else { 1757 DerivedPropsFileName = args[++j]; 1758 } 1759 } 1760 else if (args[j].equals("-plane")) { 1761 if (j == args.length -1) { 1762 FAIL("Plane number missing after -plane"); 1763 } 1764 else { 1765 plane = Integer.parseInt(args[++j]); 1766 } 1767 if (plane > 0) { 1768 bLatin1 = false; 1769 } 1770 } 1771 else if ("-usecharforbyte".equals(args[j])) { 1772 useCharForByte = true; 1773 } 1774 else if (args[j].equals("-latin1")) { 1775 bLatin1 = true; 1776 plane = 0; 1777 } 1778 else { 1779 try { 1780 int val = Integer.parseInt(args[j]); 1781 if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]); 1782 if (sizes == null) 1783 sizes = new int[1]; 1784 else { 1785 int[] newsizes = new int[sizes.length + 1]; 1786 System.arraycopy(sizes, 0, newsizes, 0, sizes.length); 1787 sizes = newsizes; 1788 } 1789 sizes[sizes.length - 1] = val; 1790 } 1791 catch(NumberFormatException e) { 1792 FAIL("Unknown switch: " + args[j]); 1793 } 1794 } 1795 } 1796 if (Csyntax && tableAsString) { 1797 FAIL("Can't specify table as string with C syntax"); 1798 } 1799 if (sizes == null) { 1800 desc.append(" ["); 1801 if (identifiers) { 1802 int[] newsizes = { 8, 4, 4 }; // Good default values 1803 desc.append("8 4 4]"); 1804 sizes = newsizes; 1805 } 1806 else { 1807 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 } 1808 desc.append("10 5 1]"); 1809 sizes = newsizes; 1810 } 1811 } 1812 if (UnicodeSpecFileName == null) { // liu 1813 UnicodeSpecFileName = DefaultUnicodeSpecFileName; 1814 desc.append(" [-spec " + UnicodeSpecFileName + ']'); 1815 } 1816 if (SpecialCasingFileName == null) { 1817 SpecialCasingFileName = DefaultSpecialCasingFileName; 1818 desc.append(" [-specialcasing " + SpecialCasingFileName + ']'); 1819 } 1820 if 
(PropListFileName == null) { 1821 PropListFileName = DefaultPropListFileName; 1822 desc.append(" [-proplist " + PropListFileName + ']'); 1823 } 1824 if (DerivedPropsFileName == null) { 1825 DerivedPropsFileName = DefaultDerivedPropsFileName; 1826 desc.append(" [-derivedprops " + DerivedPropsFileName + ']'); 1827 } 1828 if (TemplateFileName == null) { 1829 TemplateFileName = (Csyntax ? DefaultCTemplateFileName 1830 : DefaultJavaTemplateFileName); 1831 desc.append(" [-template " + TemplateFileName + ']'); 1832 } 1833 if (OutputFileName == null) { 1834 OutputFileName = (Csyntax ? DefaultCOutputFileName 1835 : DefaultJavaOutputFileName); 1836 desc.append(" [-o " + OutputFileName + ']'); 1837 } 1838 commentStart = (Csyntax ? "/*" : "//"); 1839 commentEnd = (Csyntax ? " */" : ""); 1840 commandLineDescription = desc.toString(); 1841 } 1842 1843 private static void searchBins(long[] map, int binsOccupied) throws Exception { 1844 int bitsFree = 16; 1845 for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i]; 1846 if (binsOccupied == (bins-1)) { 1847 sizes[binsOccupied] = bitsFree; 1848 generateForSizes(map); 1849 } 1850 else { 1851 for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one 1852 sizes[binsOccupied] = i; 1853 searchBins(map, binsOccupied+1); 1854 } 1855 } 1856 } 1857 1858 private static void generateForSizes(long[] map) throws Exception { 1859 int sum = 0; 1860 shifts = new int[sizes.length]; 1861 for (int k = sizes.length - 1; k >= 0; k--) { 1862 shifts[k] = sum; 1863 sum += sizes[k]; 1864 } 1865 if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) { 1866 FAIL("Bit field widths total to " + sum + 1867 ": wrong total for map of size " + map.length); 1868 } 1869 // need a table for each set of lookup bits in char 1870 tables = new long[sizes.length][]; 1871 // the last table is the map 1872 tables[sizes.length - 1] = map; 1873 for (int j = sizes.length - 1; j > 0; j--) { 1874 if (verbose && bins==0) 1875 
System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]); 1876 long[][] temp = buildTable(tables[j], sizes[j]); 1877 tables[j-1] = temp[0]; 1878 tables[j] = temp[1]; 1879 } 1880 preshifted = new boolean[sizes.length]; 1881 zeroextend = new int[sizes.length]; 1882 bytes = new int[sizes.length]; 1883 for (int j = 0; j < sizes.length - 1; j++) { 1884 int len = tables[j+1].length; 1885 int size = sizes[j+1]; 1886 if (len > 0x100 && (len >> size) <= 0x100) { 1887 len >>= size; 1888 preshifted[j] = false; 1889 } 1890 else if (len > 0x10000 && (len >> size) <= 0x10000) { 1891 len >>= size; 1892 preshifted[j] = false; 1893 } 1894 else preshifted[j] = true; 1895 if (Csyntax) 1896 zeroextend[j] = 0; 1897 else if (len > 0x7F && len <= 0xFF) { 1898 if (!useCharForByte) { 1899 zeroextend[j] = 0xFF; 1900 } 1901 } else if (len > 0x7FFF && len <= 0xFFFF) 1902 zeroextend[j] = 0xFFFF; 1903 else zeroextend[j] = 0; 1904 if (len <= 0x100) bytes[j] = 1; 1905 else if (len <= 0x10000) bytes[j] = 2; 1906 else bytes[j] = 4; 1907 } 1908 preshifted[sizes.length - 1] = true; 1909 zeroextend[sizes.length - 1] = 0; 1910 bytes[sizes.length - 1] = 0; 1911 if (bins > 0) { 1912 int totalBytes = getTotalBytes(); 1913 String access = genAccess("A", "ch", (identifiers ? 
2 : 32)); 1914 int accessComplexity = 0; 1915 for (int j=0; j<access.length(); ++j) { 1916 char ch = access.charAt(j); 1917 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity; 1918 if (ch == '<' || ch == '>') ++j; 1919 } 1920 System.out.print("("); 1921 for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]); 1922 System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access); 1923 return; 1924 } 1925 if (verbose) { 1926 System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted"); 1927 for (int j = 0; j < sizes.length; j++) { 1928 System.out.println(dec5(j) + "\t" + 1929 dec5(sizes[j]) + "\t" + 1930 dec5(tables[j].length) + "\t" + 1931 dec5(shifts[j]) + "\t" + 1932 dec5(zeroextend[j]) + "\t" + 1933 dec5(bytes[j]) + "\t " + 1934 preshifted[j]); 1935 } 1936 } 1937 if (verbose) { 1938 System.out.println("Generating source code for class Character"); 1939 System.out.println("A table access looks like " + 1940 genAccess("A", "ch", (identifiers ? 2 : 32))); 1941 } 1942 generateCharacterClass(TemplateFileName, OutputFileName); 1943 } 1944 1945 /** 1946 * The main program for generating source code for the Character class. 1947 * The basic outline of its operation is: 1948 * <ol> 1949 * <li> Process the command line arguments. One result of this process 1950 * is a list of sizes (measured in bits and summing to 16). 1951 * <li> Get the Unicode character property data from the specification file. 1952 * <li> From that, build a map that has, for each character code, its 1953 * relevant properties encoded as a long integer value. 1954 * <li> Repeatedly compress the map, producing a compressed table and a 1955 * new map. This is done once for each size value in the list. 1956 * When this is done, we have a set of tables. 1957 * <li> Make some decisions about table representation; record these 1958 * decisions in arrays named preshifted, zeroextend, and bytes. 
1959 * <li> Generate the source code for the class Character by performing 1960 * macro processing on a template file. 1961 * </ol> 1962 * 1963 * @param args the command line arguments, as an array of String 1964 * 1965 * @see GenerateCharacter#processArgs 1966 * @see UnicodeSpec@readSpecFile 1967 * @see GenerateCharacter#buildMap 1968 * @see GenerateCharacter#buildTable 1969 * @see GenerateCharacter#generateCharacterClass 1970 */ 1971 1972 public static void main(String[] args) { 1973 processArgs(args); 1974 try { 1975 1976 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane); 1977 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane); 1978 PropList propList = PropList.readSpecFile(new File(PropListFileName), plane); 1979 propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane)); 1980 1981 if (verbose) { 1982 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu 1983 } 1984 long[] map = buildMap(data, specialCaseMaps, propList); 1985 if (verbose) { 1986 System.err.println("Completed building of initial map"); 1987 } 1988 1989 if (bins == 0) { 1990 generateForSizes(map); 1991 } 1992 else { 1993 while (bins > 0) { 1994 sizes = new int[bins]; 1995 searchBins(map, 0); 1996 --bins; 1997 } 1998 } 1999 if (verbose && false) { 2000 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" + 2001 hex8(maxOffsetSeen)); 2002 System.out.println(" allowed: -" + hex8(-minOffset) + "..+" + 2003 hex8(maxOffset)); 2004 } 2005 } 2006 catch (FileNotFoundException e) { FAIL(e.toString()); } 2007 catch (IOException e) { FAIL(e.toString()); } 2008 catch (Throwable e) { 2009 System.out.println("Unexpected exception:"); 2010 e.printStackTrace(); 2011 FAIL("Unexpected exception!"); 2012 } 2013 if (verbose) { System.out.println("Done!");} 2014 } 2015 2016 } // end class