Old make/tools/src/build/tools/generatecharacter/GenerateCharacter.java

   1 
   2 /*
   3  * Copyright 2002-2003 Sun Microsystems, Inc.  All Rights Reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.  Sun designates this
   9  * particular file as subject to the "Classpath" exception as provided
  10  * by Sun in the LICENSE file that accompanied this code.
  11  *
  12  * This code is distributed in the hope that it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15  * version 2 for more details (a copy is included in the LICENSE file that
  16  * accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License version
  19  * 2 along with this work; if not, write to the Free Software Foundation,
  20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  */
  26 
  27 package build.tools.generatecharacter;
  28 
  29 import java.io.IOException;
  30 import java.io.FileNotFoundException;
  31 import java.io.BufferedReader;
  32 import java.io.FileReader;
  33 import java.io.PrintWriter;
  34 import java.io.BufferedWriter;
  35 import java.io.FileWriter;
  36 import java.io.File;
  37 
  38 /**
  39  * This program generates the source code for the class java.lang.Character.
  40  * It also generates native C code that can perform the same operations.
  41  * It requires two external input data files:
  42  * <ul>
  43  * <li> Unicode specification file
  44  * <li> Character class template file
  45  * </ul>
  46  * The Unicode specification file is available from the Unicode consortium.
  47  * It has character specification lines that look like this:
  48  * <listing>
  49  * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
  50  * </listing>
  51  * The Character class template file is filled in with additional
  52  * information to produce the file Character.java, which can then be
  53  * compiled by a Java compiler.  The template file contains certain
  54  * markers consisting of an alphabetic name string preceded by "$$".
  55  * Such markers are replaced with generated program text.  As a special
  56  * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
  57  * alphabetic characters constituting a variable name.  The character "_"
  58  * is considered alphabetic for these purposes.
  59  *
  60  * @author  Guy Steele
  61  * @author  Alan Liu
  62  * @author  John O'Conner
  63  */
  64 
  65 public class GenerateCharacter {
  66 
  67     final static boolean DEBUG = false;
  68 
  69     final static int MAX_UNICODE_VALUE = 0xFFFF;
  70     final static String commandMarker = "$$";
  71     static String ROOT                        = "";
  72     static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
  73     static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
  74     static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
  75     static String DefaultJavaOutputFileName   = ROOT + "Character.java";
  76     static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
  77     static String DefaultCOutputFileName      = ROOT + "Character.c";
  78 
  79     static String CharacterDataClassName      = "CharacterData";
  80         static int plane = 0;
  81 
  82     /* The overall idea is that, in the generated Character class source code,
  83     most character property data is stored in a special multi-level table whose
  84     structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
  85     The integers must sum to 16 (the number of bits in a character).
  86     The first table is indexed by the k1 high-order bits of the character code.
  87     The result is concatenated to the next k2 bits of the character code to index
  88     the second table, and so on.  Eventually the kn low-order bits of the character
  89     code are concatenated and used to index one of two tables A and B; A contains
  90     32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
  91     can be thus obtained encode the properties for the character.
  92 
  93     The default specification is [9, 4, 3, 0].  This particular table format was
  94     designed by conducting an exhaustive search of table formats to minimize the
  95     space consumed by the tables: the first and third tables need have only byte
  96     values (the second table must have short values).  Another good choice is
  97     [10, 6, 0], which produces a larger table but allows particularly fast table
  98     lookup code.
  99 
 100     In each case, where the word "concatenated" is used, this may imply
 101     first a << and then a | operation, or perhaps just a | operation if
 102     the values in the table can be preshifted (generally possible if the table
 103     entries are short rather than byte).
 104     */
 105 
 106     /* The character properties are currently encoded into 32 bits in the following manner:
 107     1 bit Mirrored property.
 108     4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
    9 bits      A signed offset used for converting case.
 110     1 bit       If 1, adding the signed offset converts the character to lowercase.
 111     1 bit       If 1, subtracting the signed offset converts the character to uppercase.
 112         Note: for a titlecase character, both of the preceding bits will be 1
 113         and the signed offset will be 1.
 114     1 bit   If 1, this character has a titlecase equivalent (possibly itself);
 115         in this case, the two bits before this bit can be used to decide
 116         whether this character is in fact uppercase, lowercase, or titlecase.
 117     3 bits      This field provides a quick way to lex identifiers.
 118         The eight possible values for this field are as follows:
 119         0  May not be part of an identifier
 120         1  Ignorable control; may continue a Unicode identifier or Java identifier
 121         2  May continue a Java identifier but not a Unicode identifier (unused)
 122         3  May continue a Unicode identifier or Java identifier
 123         4  Is a Java whitespace character
 124         5  May start or continue a Java identifier;
 125            may continue but not start a Unicode identifier
 126            (this value is used for connector punctuation such as _)
 127         6  May start or continue a Java identifier;
 128            may not occur in a Unicode identifier
 129            (this value is used for currency symbols such as $)
 130         7  May start or continue a Unicode identifier or Java identifier
 131         Thus:
 132            5, 6, 7 may start a Java identifier
 133            1, 2, 3, 5, 6, 7 may continue a Java identifier
 134            7 may start a Unicode identifier
 135            1, 3, 5, 7 may continue a Unicode identifier
 136            1 is ignorable within an identifier
 137            4 is Java whitespace
 138     2 bits      This field indicates whether the character has a numeric property.
 139         The four possible values for this field are as follows:
 140         0  This character has no numeric property.
 141         1  Adding the digit offset to the character code and then
 142            masking with 0x1F will produce the desired numeric value.
 143         2  This character has a "strange" numeric value.
 144         3  A Java supradecimal digit: adding the digit offset to the
 145            character code, then masking with 0x1F, then adding 10
 146            will produce the desired numeric value.
 147     5 bits  The digit offset (see description of previous field)
 148     5 bits      Character type (see below)
 149     */
 150 
 151 
 152     // bit masks identify each component of a 32-bit property field described
 153     // above.
 154     // shift* indicates how many shifts right must happen to get the
 155     // indicated property value in the lowest bits of the 32-bit space.
 156     private static final int
 157         shiftType           = 0,        maskType            =       0x001F,
 158         shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
 159         shiftNumericType    = 10,       maskNumericType     =       0x0C00,
 160         shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
 161                                         maskUnicodePart     =       0x1000,
 162         shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
 163                                         maskLowerCase       =      0x20000,
 164                                         maskUpperCase       =      0x10000,
 165                                         maskTitleCase       =      0x08000,
 166         shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
 167         shiftCaseOffsetSign = 5,
 168                                         // used only when calculating and
 169                                         // storing digit offsets from char values
 170                                         maskDigit               =   0x001F,
 171                                         // case offset are 9 bits
 172                                         maskCase                =   0x01FF,
 173         shiftBidi           = 27,       maskBidi              = 0x78000000,
 174         shiftMirrored       = 31,       maskMirrored          = 0x80000000,
 175         shiftPlane          = 16,       maskPlane = 0xFF0000;
 176 
 177     // Can compare masked values with these to determine
 178     // numeric or lexical types.
 179     public static int
 180         valueNotNumeric             = 0x0000,
 181         valueDigit                  = 0x0400,
 182         valueStrangeNumeric         = 0x0800,
 183         valueJavaSupradecimal       = 0x0C00,
 184         valueIgnorable              = 0x1000,
 185         valueJavaOnlyPart           = 0x2000,
 186         valueJavaUnicodePart        = 0x3000,
 187         valueJavaWhitespace         = 0x4000,
 188         valueJavaStartUnicodePart   = 0x5000,
 189         valueJavaOnlyStart          = 0x6000,
 190         valueJavaUnicodeStart       = 0x7000,
 191         lowJavaStart                = 0x5000,
 192         nonzeroJavaPart             = 0x3000,
 193         valueUnicodeStart           = 0x7000;
 194 
 195     // these values are used when only identifier properties are generated
 196     // for use in verifier code. Shortens the property down to a single byte.
 197     private static final int
 198         bitJavaStart            = 0x02,
 199         bitJavaPart             = 0x01,
 200         maskIsJavaIdentifierPart = bitJavaPart,
 201         maskIsJavaIdentifierStart = bitJavaStart;
 202 
 203     static int maxOffset = maskCase/2 ;
 204     static int minOffset = -maxOffset;
 205 
    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  If a decimal value requires more digits
     than the desired number, then all the digits that are required are actually
     produced.  The routines hex2, hex4 and hex8 instead mask the value to its low
     8, 16 or 32 bits, so they never produce more than the desired number of digits.
    */
 214 
 215     static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
 216 
 217     static String hex2(long n) {
 218         String q = Long.toHexString(n & 0xFF).toUpperCase();
 219         return "00".substring(Math.min(2, q.length())) + q;
 220     }
 221 
 222     static String hex4(long n) {
 223         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
 224         return "0000".substring(Math.min(4, q.length())) + q;
 225     }
 226 
 227     static String hex8(long n) {
 228         String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
 229         return "00000000".substring(Math.min(8, q.length())) + q;
 230     }
 231 
 232     static String hex16(long n) {
 233         String q = Long.toHexString(n).toUpperCase();
 234         return "0000000000000000".substring(Math.min(16, q.length())) + q;
 235     }
 236 
 237     static String dec3(long n) {
 238         String q = Long.toString(n);
 239         return "   ".substring(Math.min(3, q.length())) + q;
 240     }
 241 
 242     static String dec5(long n) {
 243         String q = Long.toString(n);
 244         return "     ".substring(Math.min(5, q.length())) + q;
 245     }
 246 
 247     /* This routine is called when some failure occurs. */
 248 
 249     static void FAIL(String s) {
 250         System.out.println("** " + s);
 251     }
 252 
 253     /**
 254     * Given the data from the Unicode specification file, this routine builds a map.
 255     *
 256     * The specification file is assumed to contain its data in sorted order by
 257     * character code; as a result, the array passed as an argument to this method
 258     * has its components in the same sorted order, with one entry for each defined
 259         * Unicode character or character range.  (A range is indicated by two consecutive
 260     * entries, such that the name of the first entry begins with "<" and ends with
 261     * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
 262     * therefore a sparse representation of the character property data.
 263     *
 264     * The resulting map is dense representation of the character data.  It contains
 265     * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
 266     * of this long value are used, but type long is used rather than int to facilitate
 267     * future extensions of this source code generator that might require more than
 268     * 32 bits to encode relevant character properties.)  Entry k holds the encoded
 269     * properties for character k.
 270     *
 271     * Method buildMap manages the transformation from the sparse representation to
 272     * the dense representation.  It calls method buildOne to handle the encoding
 273     * of character property data from a single UnicodeSpec object into 32 bits.
 274     * For undefined characters, method buildOne is not called and the map entry for
 275     * that character is set to UnicodeSpec.UNASSIGNED.
 276     *
 277     * @param data       character property data from the Unicode specification file
 278     * @return   an array of length 65536 with one entry for every possible char value
 279     *
 280     * @see GenerateCharacter#buildOne
 281     */
 282 
 283     static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps) {
 284         long[] result;
 285         if (bLatin1 == true) {
 286             result = new long[256];
 287         } else {
 288             result = new long[1<<16];
 289         }
 290         int k=0;
 291                 int codePoint = plane<<16;
 292         UnicodeSpec nonCharSpec = new UnicodeSpec();
 293         for (int j = 0; j < data.length && k < result.length; j++) {
 294             if (data[j].codePoint == codePoint) {
 295                 result[k] = buildOne(codePoint, data[j], specialMaps);
 296                 ++k;
 297                                 ++codePoint;
 298             }
 299             else if(data[j].codePoint > codePoint) {
 300                 if (data[j].name.endsWith("Last>")) {
 301                     // build map data for all chars except last in range
 302                     while (codePoint < data[j].codePoint && k < result.length) {
 303                         result[k] = buildOne(codePoint, data[j], specialMaps);
 304                         ++k;
 305                                                 ++codePoint;
 306                     }
 307                 }
 308                 else {
 309                     // we have a few unassigned chars before data[j].codePoint
 310                     while (codePoint < data[j].codePoint && k < result.length) {
 311                         result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 312                         ++k;
 313                                                 ++codePoint;
 314                     }
 315                 }
 316                 k = data[j].codePoint & 0xFFFF;
 317                                 codePoint = data[j].codePoint;
 318                 result[k] = buildOne(codePoint, data[j], specialMaps);
 319                 ++k;
 320                                 ++codePoint;
 321 
 322             }
 323             else {
 324                 System.out.println("An error has occured during spec mapping.");
 325                 System.exit(0);
 326             }
 327         }
 328         // if there are still unprocessed chars, process them
 329         // as unassigned/undefined.
 330         codePoint = (plane<<16) | k;
 331         while (k < result.length) {
 332             result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 333             ++k;
 334                         ++codePoint;
 335         }
 336         return result;
 337     }
 338 
 339     // The maximum and minimum offsets found while scanning the database
 340     static int maxOffsetSeen = 0;
 341     static int minOffsetSeen = 0;
 342 
 343     /**
 344      * Some Unicode separator characters are not considered Java whitespace.
 345      * @param c character to test
 346      * @return true if c in an invalid Java whitespace character, false otherwise.
 347      */
 348     static boolean isInvalidJavaWhiteSpace(int c) {
 349         int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
 350         boolean retValue = false;
 351         for(int x=0;x<exceptions.length;x++) {
 352             if(c == exceptions[x]) {
 353                 retValue = true;
 354                 break;
 355             }
 356         }
 357         return retValue;
 358 
 359     }
 360 
 361     /**
 362     * Given the character property data for one Unicode character, encode the data
 363     * of interest into a single long integer value.  (Right now only 32 bits
 364     * of this long value are used, but type long is used rather than int to facilitate
 365     * future extensions of this source code generator that might require more than
 366     * 32 bits to encode relevant character properties.)
 367     *
 368     * @param c   the character code for which to encode property data
 369     * @param us  property data record from the Unicode specification file
 370     *            (its character code might not be equal to c if it specifies data
 371     *            for a range of characters)
 372     * @return   an encoded long value that contains the properties for a single char
 373     *
 374     * @see GenerateCharacter#buildMap
 375     */
 376 
 377     static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
 378         long resultA = 0;
 379         // record the general category
 380         resultA |= us.generalCategory;
 381 
 382     // record the numeric properties
 383     NUMERIC: {
 384         STRANGE: {
 385             int val = 0;
 386         // c is A-Z
 387             if ((c >= 0x0041) && (c <= 0x005A)) {
 388                 val = c - 0x0041;
 389                 resultA |= valueJavaSupradecimal;
 390         // c is a-z
 391             } else if ((c >= 0x0061) && (c <= 0x007A)) {
 392                 val = c - 0x0061;
 393                 resultA |= valueJavaSupradecimal;
 394             // c is a full-width A-Z
 395             } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
 396                 val = c - 0xFF21;
 397                 resultA |= valueJavaSupradecimal;
 398             // c is a full-width a-z
 399             } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
 400                 val = c - 0xFF41;
 401                 resultA |= valueJavaSupradecimal;
 402             } else if (us.isDecimalValue()) {
 403                 val = us.decimalValue;
 404                 resultA |= valueDigit;
 405             } else if (us.isDigitValue()) {
 406                 val = us.digitValue;
 407                 resultA |= valueDigit;
 408             } else {
 409                 if (us.numericValue.length() == 0) {
 410                     break NUMERIC;                      // no numeric value at all
 411                 } else {
 412                     try {
 413                         val = Integer.parseInt(us.numericValue);
 414                         if (val >= 32 || val < 0) break STRANGE;
 415                         if (c == 0x215F) break STRANGE;
 416                     } catch(NumberFormatException e) {
 417                         break STRANGE;
 418                     }
 419                     resultA |= valueDigit;
 420                 }
 421             }
 422             if (val >= 32 || val < 0) break STRANGE;
 423             resultA |= ((val - c & maskDigit) << shiftDigitOffset);
 424             break NUMERIC;
 425         } // end STRANGE
 426         resultA |= valueStrangeNumeric;
 427         } // end NUMERIC
 428 
 429     // record case mapping
 430         int offset = 0;
 431         // might have a 1:M mapping
 432         int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
 433         boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
 434         if (bHasUpper) {
 435             resultA |= maskUpperCase;
 436         }
 437         if (specialMap != -1) {
 438             // has mapping, but cannot record the
 439             // proper offset; can only flag it and provide special case
 440             // code in Character.java
 441             offset = -1;
 442         }
 443         else if (us.hasUpperMap())  {
 444             offset = c - us.upperMap;
 445         }
 446 
 447         if (us.hasLowerMap()) {
 448             resultA |= maskLowerCase;
 449             if (offset == 0)
 450                 offset = us.lowerMap - c;
 451             else if (offset != (us.lowerMap - c)) {
 452                 if (DEBUG) {
 453                 FAIL("Character " + hex(c) +
 454                 " has incompatible lowercase and uppercase mappings");
 455                 }
 456             }
 457         }
 458         if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
 459                 (bHasUpper && us.hasLowerMap())) {
 460             resultA |= maskTitleCase;
 461         }
 462         if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
 463           System.out.println("Warning: Character " + hex4(c) + " has upper but " +
 464                              "no title case; Java won't know this");
 465         }
 466         if (offset < minOffsetSeen) minOffsetSeen = offset;
 467         if (offset > maxOffsetSeen) maxOffsetSeen = offset;
 468         if (offset > maxOffset || offset < minOffset) {
 469             if (DEBUG) {
 470             FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
 471             }
 472             offset = maskCase;
 473         }
 474         resultA |= ((offset & maskCase) << shiftCaseOffset);
 475 
 476 
 477     // record lexical info about this character
 478         if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
 479                 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
 480                 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
 481                 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
 482                 || us.generalCategory == UnicodeSpec.OTHER_LETTER
 483                 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
 484             resultA |= valueJavaUnicodeStart;
 485         }
 486         else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
 487                 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
 488                 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
 489             resultA |= valueJavaUnicodePart;
 490         }
 491         else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
 492             resultA |= valueJavaStartUnicodePart;
 493         }
 494         else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
 495             resultA |= valueJavaOnlyStart;
 496         }
 497         else if (((c >= 0x0000) && (c <= 0x0008))
 498                 || ((c >= 0x000E) && (c <= 0x001B))
 499                 || ((c >= 0x007F) && (c <= 0x009F))
 500                 || us.generalCategory == UnicodeSpec.FORMAT) {
 501             resultA |= valueIgnorable;
 502         }
 503         else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
 504                 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
 505                 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
 506             if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
 507         }
 508         else if (((c >= 0x0009) && (c <= 0x000D))
 509                 || ((c >= 0x001C) && (c <= 0x001F))) {
 510             resultA |= valueJavaWhitespace;
 511         }
 512 
 513         // record bidi category
 514         if (!nobidi) {
 515             int tmpBidi =
 516                 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
 517                     us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
 518             resultA |= tmpBidi;
 519         }
 520 
 521         // record mirrored property
 522         if (!nomirror) {
 523             resultA |= us.mirrored ? maskMirrored : 0;
 524         }
 525 
 526         if (identifiers) {
 527             long replacement = 0;
 528             if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
 529                 replacement |= bitJavaStart;
 530             }
 531             if ( ((resultA & nonzeroJavaPart) != 0)
 532                     && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
 533                 replacement |= bitJavaPart;
 534             }
 535             resultA = replacement;
 536         }
 537         return resultA;
 538     }
 539 
 540     /**
 541     * This is the heart of the table compression strategy.  The inputs are a map
 542     * and a number of bits (size).  The map is simply an array of long integer values;
 543     * the number of bits indicates how index values for that map are to be split.
 544     * The length of the given map must be a multiple of (1 << size).  The result is
 545     * a new map z and a compressed table t such that for every valid index value k
 546     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
 547     *
 548     * In other words, the index k can be split into two parts, namely the "size"
 549     * low-order bits and all the remaining high-order bits; the high-order bits are then
 550     * remapped by map z to produce an index into table t.  In effect, the data of the
 551     * original map m is broken up into blocks of size (1<<size); the compression relies
 552     * on the expectation that many of these blocks will be identical and therefore need
 553     * be represented only once in the compressed table t.
 554     *
 555     * This method is intended to be used iteratively.  The first map to be handed
 556     * to it is the one constructed by method buildMap.  After that, the first of the
 557     * two arrays returned by this method is fed back into it for further compression.
 558     * At the end of the iteration, one has a starter map and a sequence of tables.
 559     *
 560     * The algorithm used to implement this computation is straightforward and not
 561     * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
 562     * to locate identical blocks, so overall the time complexity of the algorithm
 563     * is quadratic in the length of the input map.  Fortunately, speed is not crucial
 564     * to this application.
 565     *
 566     * @param map                a map to be compressed
 567     * @param size       the number of index bits to be split off by the compression
 568     * @return   an array of length 2 containing two arrays; the first is a new map
 569     *           and the second is a compressed data table
 570     *
 571     * @see GenerateCharacter#buildMap
 572     */
 573 
 574     static long[][] buildTable(long[] map, int size) {
 575         int n = map.length;
 576         if (((n >> size) << size) != n) {
 577             FAIL("Length " + n + " is not a multiple of " + (1 << size));
 578         }
 579         int m = 1 << size;
 580         // We know the final length of the new map up front.
 581         long[] newmap = new long[n >> size];
 582         // The buffer is used temporarily to hold data for the compressed table
 583         // because we don't know its final length yet.
 584         long[] buffer = new long[n];
 585         int ptr = 0;
 586 OUTER:  for (int i = 0; i < n; i += m) {
 587             // For every block of size m in the original map...
 588     MIDDLE: for (int j = 0; j < ptr; j += m) {
 589             // Find out whether there is already a block just like it in the buffer.
 590                 for (int k = 0; k < m; k++) {
 591                     if (buffer[j+k] != map[i+k])
 592                         continue MIDDLE;
 593                 }
 594                 // There is a block just like it at position j, so just
 595                 // put its index into the new map (thereby sharing it).
 596                 newmap[i >> size] = (j >> size);
 597                 continue OUTER;
 598             } // end MIDDLE
 599             // There is no block just like it already, so add it to
 600             // the buffer and put its index into the new map.
 601             for (int k = 0; k < m; k++) {
 602                 buffer[ptr+k] = map[i+k];
 603             }
 604             newmap[i >> size] = (ptr >> size);
 605             ptr += m;
 606         } // end OUTER
 607         // Now we know how long the compressed table should be,
 608         // so create a new array and copy data from the temporary buffer.
 609         long[] newdata = new long[ptr];
 610         for (int j = 0; j < ptr; j++) {
 611             newdata[j] = buffer[j];
 612         }
 613         // Return the new map and the new data table.
 614         long[][] result = { newmap, newdata };
 615         return result;
 616     }
 617 
 618     /**
 619     * Once the compressed tables have been computed, this method reads in a
 620     * template file for the source code to be generated and writes out the final
 621     * source code by acting as a sort of specialized macro processor.
 622     *
 623     * The first output line is a comment saying that the file was automatically
 624     * generated; it includes a timestamp.  All other output is generated by
 625     * reading a line from the template file, performing macro replacements,
 626     * and then writing the resulting line or lines of code to the output file.
 627     *
 628     * This method handles the I/O, the timestamp comment, and the locating of
 629     * macro calls within each input line.  The method replaceCommand is called
 630     * to generate replacement text for each macro call.
 631     *
 632     * Macro calls to be replaced are indicated in the template file by
 633     * occurrences of the commandMarker "$$".  The rest of the call may consist
 634     * of Java letters (including the underscore "_") and also of balanced
 635     * parentheses.
 636     *
 637     * @param theTemplateFileName
 638     *           the file name for the template input file
 639     * @param theOutputFileName
 640     *           the file name for the source code output file
 641     *
 642     *     @see GenerateCharacter#replaceCommand
 643     */
 644 
 645     static void generateCharacterClass(String theTemplateFileName,
 646                      String theOutputFileName)
 647             throws FileNotFoundException, IOException {
 648         BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
 649         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
 650         out.println(commentStart +
 651             " This file was generated AUTOMATICALLY from a template file " +
 652             new java.util.Date() + commentEnd);
 653         int marklen = commandMarker.length();
 654         LOOP: while(true) {
 655             try {
 656                 String line = in.readLine();
 657                 if (line == null) break LOOP;
 658                 int pos = 0;
 659                 int depth = 0;
 660                 while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
 661                     int newpos = pos + marklen;
 662                     char ch = 'x';
 663                     SCAN: while (newpos < line.length() &&
 664                             (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
 665                             || ch == '(' || (ch == ')' && depth > 0))) {
 666                         ++newpos;
 667                         if (ch == '(') {
 668                             ++depth;
 669                         }
 670                         else if (ch == ')') {
 671                             --depth;
 672                             if (depth == 0)
 673                                 break SCAN;
 674                         }
 675                     }
 676                     String replacement = replaceCommand(line.substring(pos + marklen, newpos));
 677                     line = line.substring(0, pos) + replacement + line.substring(newpos);
 678                     pos += replacement.length();
 679                 }
 680                 out.println(line);
 681             }
 682             catch (IOException e) {
 683                 break LOOP;
 684             }
 685         }
 686         in.close();
 687         out.close();
 688     }
 689 
 690     /**
 691     * The replaceCommand method takes a command (a macro call without the
 692     * leading marker "$$") and computes replacement text for it.
 693     *
 694     * Most of the commands are simply names of integer constants that are defined
 695     * in the source code of this GenerateCharacter class.  The replacement text is
 696     * simply the value of the constant as an appropriately formatted integer literal.
 697     *
 698     * Two cases are more complicated, however.  The command "Tables" causes the
 699     * final map and compressed tables to be emitted, with elaborate comments
 700     * describing their contents.  (This is actually handled by method genTables.)
 701     * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
 702     * an expression that will return the character property data for the character
 703     * whose code is the value of the variable "xxx".  (this is handled by method
 704     * "genAccess".)
 705     *
 706     * @param x  a command from the template file to be replaced
 707     * @return   the replacement text, as a String
 708     *
 709     * @see GenerateCharacter#genTables
 710     * @see GenerateCharacter#genAccess
 711     * @see GenerateCharacter#generateCharacterClass
 712     */
 713 
 714     static String replaceCommand(String x) {
 715         if (x.equals("Tables")) return genTables();
 716         if (x.equals("Initializers")) return genInitializers();
 717         if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
 718                 x.substring(x.length()-1).equals(")") )
 719             return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
 720         if (x.equals("shiftType")) return Long.toString(shiftType);
 721         if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
 722         if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
 723         if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
 724         if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
 725         if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
 726         if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
 727         if (x.equals("maskCase")) return "0x" + hex8(maskCase);
 728         if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
 729         if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
 730         if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
 731         if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
 732         if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
 733         if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
 734         if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
 735         if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
 736         if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
 737         if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
 738         if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
 739         if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
 740         if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
 741         if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
 742         if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
 743         if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
 744         if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
 745         if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
 746         if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
 747         if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
 748         if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
 749         if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
 750         if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
 751         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 752         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 753         if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
 754         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 755         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 756         if (x.equals("maskType")) return "0x" + hex(maskType);
 757         if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
 758         if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
 759         if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
 760         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
 761             return Integer.toString(UnicodeSpec.UNASSIGNED);
 762         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
 763             return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
 764         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
 765             return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
 766         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
 767             return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
 768         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
 769              return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
 770         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
 771              return Integer.toString(UnicodeSpec.OTHER_LETTER);
 772         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
 773              return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
 774         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
 775              return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
 776         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
 777              return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
 778         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
 779              return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
 780         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
 781              return Integer.toString(UnicodeSpec.OTHER_NUMBER);
 782         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
 783              return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
 784         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
 785              return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
 786         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 787              return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
 788         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
 789             return Integer.toString(UnicodeSpec.CONTROL);
 790         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
 791             return Integer.toString(UnicodeSpec.FORMAT);
 792         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
 793             return Integer.toString(UnicodeSpec.PRIVATE_USE);
 794         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
 795             return Integer.toString(UnicodeSpec.SURROGATE);
 796         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
 797             return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
 798         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
 799             return Integer.toString(UnicodeSpec.START_PUNCTUATION);
 800         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
 801             return Integer.toString(UnicodeSpec.END_PUNCTUATION);
 802         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 803             return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
 804         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 805             return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
 806         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
 807             return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
 808         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
 809             return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
 810         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
 811             return Integer.toString(UnicodeSpec.LETTER_NUMBER);
 812         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
 813             return Integer.toString(UnicodeSpec.MATH_SYMBOL);
 814         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
 815             return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
 816         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
 817             return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
 818         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
 819             return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
 820         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
 821             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
 822         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
 823             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
 824         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
 825             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
 826         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
 827             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
 828         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
 829             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
 830         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
 831             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
 832         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
 833             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
 834         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
 835             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
 836         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
 837             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
 838         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 839             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
 840         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
 841             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
 842         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
 843             return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
 844         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 845             return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
 846         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
 847             return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
 848          if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
 849             return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
 850         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 851             return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
 852         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
 853             return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
 854         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
 855             return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
 856         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
 857             return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
 858         FAIL("Unknown text substitution marker " + commandMarker + x);
 859         return commandMarker + x;
 860     }
 861 
 862     /**
 863     * The genTables method generates source code for all the lookup tables
 864     * needed to represent the various Unicode character properties.
 865     * It simply calls the method genTable once for each table to be generated
 866     * and then generates a summary comment.
 867     *
 868     * @return   the replacement text for the "Tables" command, as a String
 869     *
 870     * @see GenerateCharacter#genTable
 871     * @see GenerateCharacter#replaceCommand
 872     */
 873     static String genTables() {
 874         int n = sizes.length;
 875         StringBuffer result = new StringBuffer();
 876         // liu : Add a comment showing the source of this table
 877         result.append(commentStart + " The following tables and code generated using:" +
 878                   commentEnd + "\n  ");
 879         result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
 880 
 881                 if (plane == 0 && bLatin1 == false) {
 882             genCaseMapTableDeclaration(result);
 883             genCaseMapTable(initializers, specialCaseMaps);
 884                 }
 885         int totalBytes = 0;
 886         for (int k = 0; k < n - 1; k++) {
 887             genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
 888                 sizes[k+1], false, false, k==0);
 889             int s = bytes[k];
 890             if (s == 1 && useCharForByte) {
 891                 s = 2;
 892             }
 893             totalBytes += tables[k].length * s;
 894         }
 895         genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
 896             sizes[n - 1], false, 0, true, !(identifiers), false);
 897 
 898         // If we ever need more than 32 bits to represent the character properties,
 899         // then a table "B" may be needed as well.
 900         //  genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
 901 
 902         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
 903         result.append(commentStart);
 904         result.append(" In all, the character property tables require ");
 905         result.append(totalBytes).append(" bytes.").append(commentEnd);
 906         if (verbose) {
 907             System.out.println("The character property tables require "
 908                  + totalBytes + " bytes.");
 909         }
 910         return result.toString();
 911     }
 912 
 913     /**
 914      * The genInitializers method generates the body of the
 915      * ensureInitted() method, which enables lazy initialization of
 916      * the case map table and other tables.
 917      */
 918     static String genInitializers() {
 919         return initializers.toString();
 920     }
 921 
 922     /**
 923      * Return the total number of bytes needed by all tables.  This is a stripped-
 924      * down copy of genTables().
 925      */
 926     static int getTotalBytes() {
 927         int n = sizes.length;
 928         int totalBytes = 0;
 929         for (int k = 0; k < n - 1; k++) {
 930             totalBytes += tables[k].length * bytes[k];
 931         }
 932         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
 933                          + 31) >> 5) << 2);
 934         return totalBytes;
 935     }
 936 
 937     static void appendEscapedStringFragment(StringBuffer result,
 938                                             char[] line,
 939                                             int length,
 940                                             boolean lastFragment) {
 941         result.append("    \"");
 942         for (int k=0; k<length; ++k) {
 943             result.append("\\u");
 944             result.append(hex4(line[k]));
 945         }
 946         result.append("\"");
 947         result.append(lastFragment ? ";" : "+");
 948         result.append("\n");
 949     }
 950 
 951     static String SMALL_INITIALIZER =
 952         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
 953         // "            $$name = new $$type[$$size];\n"+
 954         "            int len = $$name_DATA.length();\n"+
 955         "            int j=0;\n"+
 956         "            for (int i=0; i<len; ++i) {\n"+
 957         "                int c = $$name_DATA.charAt(i);\n"+
 958         "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
 959         "                    $$name[j++] = ($$type)c;\n"+
 960         "                    c >>= $$bits;\n"+
 961         "                }\n"+
 962         "            }\n"+
 963         "            assert (j == $$size);\n"+
 964         "        }\n";
 965 
 966     static String SAME_SIZE_INITIALIZER =
 967         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
 968         "            assert ($$name_DATA.length() == $$size);\n"+
 969         // "            $$name = new $$type[$$size];\n"+
 970         "            for (int i=0; i<$$size; ++i)\n"+
 971         "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
 972         "        }\n";
 973 
 974     static String BIG_INITIALIZER =
 975         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
 976         // "            $$name = new $$type[$$size];\n"+
 977         "            int len = $$name_DATA.length();\n"+
 978         "            int j=0;\n"+
 979         "            int charsInEntry=0;\n"+
 980         "            $$type entry=0;\n"+
 981         "            for (int i=0; i<len; ++i) {\n"+
 982         "                entry |= $$name_DATA.charAt(i);\n"+
 983         "                if (++charsInEntry == $$charsPerEntry) {\n"+
 984         "                    $$name[j++] = entry;\n"+
 985         "                    entry = 0;\n"+
 986         "                    charsInEntry = 0;\n"+
 987         "                }\n"+
 988         "                else {\n"+
 989         "                    entry <<= 16;\n"+
 990         "                }\n"+
 991         "            }\n"+
 992         "            assert (j == $$size);\n"+
 993         "        }\n";
 994 
 995     static String INT32_INITIALIZER =
 996         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
 997         "            char[] data = $$name_DATA.toCharArray();\n"+
 998         "            assert (data.length == ($$size * 2));\n"+
 999         "            int i = 0, j = 0;\n"+
1000         "            while (i < ($$size * 2)) {\n"+
1001         "                int entry = data[i++] << 16;\n"+
1002         "                $$name[j++] = entry | data[i++];\n"+
1003         "            }\n"+
1004         "        }\n";
1005 
1006     static void addInitializer(String name, String type, int entriesPerChar,
1007                                int bits, int size) {
1008 
1009         String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1010                           ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1011         if (entriesPerChar == -2) {
1012             template = INT32_INITIALIZER;
1013         }
1014         int marklen = commandMarker.length();
1015         int pos = 0;
1016         while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1017             int newpos = pos + marklen;
1018             char ch = 'x';
1019             while (newpos < template.length() &&
1020                    Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1021                    ch != '_') // Don't allow this in token names
1022                 ++newpos;
1023             String token = template.substring(pos+marklen, newpos);
1024             String replacement = "ERROR";
1025 
1026             if (token.equals("name")) replacement = name;
1027             else if (token.equals("type")) replacement = type;
1028             else if (token.equals("bits")) replacement = ""+bits;
1029             else if (token.equals("size")) replacement = ""+size;
1030             else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1031             else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1032             else FAIL("Unrecognized token: " + token);
1033 
1034             template = template.substring(0, pos) + replacement + template.substring(newpos);
1035             pos += replacement.length();
1036         }
1037         initializers.append(template);
1038     }
1039 
1040     /**
1041     * The genTable method generates source code for one lookup table.
1042     * Most of the complexity stems from handling various options as to
1043     * the type of the array components, the precise representation of the
1044     * values, the format in which to render each value, the number of values
1045     * to emit on each line of source code, and the kinds of useful comments
1046     * to be generated.
1047     *
1048     * @param result     a StringBuffer, to which the generated source code
1049     *                   text is to be appended
1050     * @param name       the name of the table
1051     * @param table      the table data (an array of long values)
1052     * @param extract    a distance, in bits, by which each entry of the table
1053     *                   is to be right-shifted before it is processed
1054     * @param bits       the number of bits (not bytes) to be used to represent
1055     *                   each table entry
1056     * @param size       the table data is divided up into blocks of size (1<<size);
1057     *                   in this method, this information is used only to affect
1058     *                   how many table values are to be generated per line
1059     * @param preshifted if this flag is true, then the table entries are to be
1060     *                   emitted in a preshifted form; that is, each value should
1061     *                   be left-shifted by the amount "shift", so that this work
1062     *                   is built into the table and need not be performed by an
1063     *                   explicit shift operator at run time
1064     * @param shift      this is the shift amount for preshifting of table entries
1065     * @param hexFormat  if this flag is true, table entries should be emitted as
1066     *                   hexadecimal literals; otherwise decimal literals are used
1067     * @param properties if this flag is true, the table entries are encoded
1068     *                   character properties rather than indexes into yet other tables;
1069     *                   therefore comments describing the encoded properties should
1070     *                   be generated
1071     * @param hexComment if this flag is true, each line of output is labelled with
1072     *                   a hexadecimal comment indicating the character values to
1073     *                   which that line applies; otherwise, decimal values indicating
1074     *                   table indices are generated
1075     *
1076     * @see GenerateCharacter#genTables
1077     * @see GenerateCharacter#replaceCommand
1078     */
1079 
1080     static void genTable(StringBuffer result, String name,
1081              long[] table, int extract, int bits, int size,
1082              boolean preshifted, int shift, boolean hexFormat,
1083              boolean properties, boolean hexComment) {
1084 
1085         String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1086             bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1087             bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1088             bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1089             bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1090             bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1091             (Csyntax ? "int64" : "long");
1092         long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1093             bits == 2 ? Integer.MAX_VALUE :
1094             bits == 4 ? Integer.MAX_VALUE :
1095             bits == 8 ? Byte.MAX_VALUE :
1096             bits == 16 ? Short.MAX_VALUE :
1097             bits == 32 ? Integer.MAX_VALUE :
1098             Long.MAX_VALUE;
1099         int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1100         boolean shiftEntries = preshifted && shift != 0;
1101         if (bits == 8 && tableAsString && useCharForByte) {
1102             atype = "char";
1103             maxPosEntry = Character.MAX_VALUE;
1104             entriesPerChar = 1;
1105         }
1106         boolean noConversion = atype.equals("char");
1107 
1108         result.append(commentStart);
1109         result.append(" The ").append(name).append(" table has ").append(table.length);
1110         result.append(" entries for a total of ");
1111         int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1112         if (bits == 8 && useCharForByte) {
1113             sizeOfTable *= 2;
1114         }
1115         result.append(sizeOfTable);
1116         result.append(" bytes.").append(commentEnd).append("\n\n");
1117         if (Csyntax)
1118             result.append("  static ");
1119         else
1120             result.append("  static final ");
1121         result.append(atype);
1122         result.append(" ").append(name).append("[");
1123         if (Csyntax)
1124             result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1125         if (tableAsString) {
1126             if (noConversion) {
1127                 result.append("] = (\n");
1128             } else {
1129                 result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1130                 result.append("static final String ").append(name).append("_DATA =\n");
1131             }
1132             int CHARS_PER_LINE = 8;
1133             StringBuffer theString = new StringBuffer();
1134             int entriesInCharSoFar = 0;
1135             char ch = '\u0000';
1136             int charsPerEntry = -entriesPerChar;
1137             for (int j=0; j<table.length; ++j) {
1138                 long entry = table[j] >> extract;
1139                 if (shiftEntries) entry <<= shift;
1140                 if (entry >= (1L << bits)) {
1141                     FAIL("Entry too big");
1142                 }
1143                 if (entriesPerChar > 0) {
1144                     // Pack multiple entries into a character
1145                     ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1146                     ++entriesInCharSoFar;
1147                     if (entriesInCharSoFar == entriesPerChar) {
1148                         // Character is full
1149                         theString.append(ch);
1150                         entriesInCharSoFar = 0;
1151                         ch = '\u0000';
1152                     }
1153                 }
1154                 else {
1155                     // Use multiple characters per entry
1156                     for (int k=0; k<charsPerEntry; ++k) {
1157                         ch = (char)(entry >> ((charsPerEntry-1)*16));
1158                         entry <<= 16;
1159                         theString.append(ch);
1160                     }
1161                 }
1162             }
1163             if (entriesInCharSoFar > 0) {
1164                 while (entriesInCharSoFar < entriesPerChar) {
1165                     ch = (char)((int)ch >> bits);
1166                     ++entriesInCharSoFar;
1167                 }
1168                 theString.append(ch);
1169                 entriesInCharSoFar = 0;
1170             }
1171             result.append(Utility.formatForSource(theString.toString(), "    "));
1172             if (noConversion) {
1173                 result.append(").toCharArray()");
1174             }
1175             result.append(";\n\n  ");
1176 
1177             if (!noConversion) {
1178                 addInitializer(name, atype, entriesPerChar, bits, table.length);
1179             }
1180         }
1181         else {
1182             result.append("] = {");
1183             boolean castEntries = shiftEntries && (bits < 32);
1184             int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1185                 bits == 2 ? 16*4 :
1186                 bits == 4 ? 8*4 :
1187                 bits == 8 ? 8 :
1188                 bits == 16 ? 8 :
1189                 bits == 32 ? 4 : 2) :
1190                 (bits == 8 ? 8 :
1191                 bits == 16 ? 8 : 4);
1192             int printMask = properties ? 0 :
1193             Math.min(1 << size,
1194                 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1195             int commentShift = ((1 << size) == table.length) ? 0 : size;
1196             int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1197             long val = 0;
1198             for (int j = 0; j < table.length; j++) {
1199                 if ((j & printMask) == 0) {
1200                     while (result.charAt(result.length() - 1) == ' ')
1201                         result.setLength(result.length() - 1);
1202                     result.append("\n    ");
1203                 }
1204         PRINT:  {
1205                 if (castEntries)
1206                     result.append("(").append(atype).append(")(");
1207                 long entry = table[j] >> extract;
1208                 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1209                 int k = j & packMask;
1210                 if (bits >= 8)
1211                     val = entry;
1212                 else if (k == 0) {
1213                     val = entry;
1214                     break PRINT;
1215                 }
1216                 else {
1217                     val |= (entry << (k*bits));
1218                     if (k != packMask)
1219                         break PRINT;
1220                 }
1221                 if (val > maxPosEntry && !Csyntax) { // liu
1222                 // For values that are out of range, convert them to in-range negative values.
1223                 // Actually, output the '-' and convert them to the negative of the corresponding
1224                 // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1225                     result.append('-');
1226                     val = maxPosEntry + maxPosEntry + 2 - val;
1227                 }
1228                 if (hexFormat) {
1229                     result.append("0x");
1230                     if (bits == 8)
1231                         result.append(hex2((byte)val));
1232                     else if (bits == 16)
1233                         result.append(hex4((short)val));
1234                     else if (bits == 32 || bits < 8)
1235                         result.append(hex8((int)val));
1236                     else {
1237                         result.append(hex16((long)val));
1238                         if (!Csyntax)
1239                             result.append("L");
1240                     }
1241                 }
1242                 else {
1243                     if (bits == 8)
1244                         result.append(dec3(val));
1245                     else if (bits == 64) {
1246                         result.append(dec5(val));
1247                         if (!Csyntax)
1248                             result.append("L");
1249                     }
1250                     else
1251                         result.append(dec5(val));
1252                 }
1253                 if (shiftEntries)
1254                     result.append("<<").append(shift);
1255                 if (castEntries) result.append(")");
1256                 if (j < (table.length - 1))
1257                     result.append(", ");
1258                 else
1259                     result.append("  ");
1260                 if ((j & printMask) == printMask) {
1261                     result.append(" ").append(commentStart).append(" ");
1262                     if (hexComment)
1263                         result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1264                     else
1265                         result.append(dec3((j & ~commentMask) >> commentShift));
1266                     if (properties) propertiesComments(result, val);
1267                     result.append(commentEnd);
1268                 }
1269                 } // end PRINT
1270             }
1271             result.append("\n  };\n\n  ");
1272         }
1273     }
1274 
1275     static void genCaseMapTableDeclaration(StringBuffer result) {
1276         String myTab = "    ";
1277         result.append(myTab + "static final char[][][] charMap;\n");
1278     }
1279 
1280     static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1281         String myTab = "    ";
1282         int ch;
1283         char[] map;
1284         result.append(myTab + "charMap = new char[][][] {\n");
1285         for (int x = 0; x < specialCaseMaps.length; x++) {
1286             ch = specialCaseMaps[x].getCharSource();
1287             map = specialCaseMaps[x].getUpperCaseMap();
1288             result.append(myTab + myTab);
1289             result.append("{ ");
1290             result.append("{\'\\u"+hex4(ch)+"\'}, {");
1291             for (int y = 0; y < map.length; y++) {
1292                 result.append("\'\\u"+hex4(map[y])+"\', ");
1293             }
1294             result.append("} },\n");
1295         }
1296         result.append(myTab + "};\n");
1297 
1298     }
1299 
1300     /**
1301     * The propertiesComments method generates comments describing encoded
1302     * character properties.
1303     *
1304     * @param result     a StringBuffer, to which the generated source code
1305     *                   text is to be appended
1306     * @param val                encoded character properties
1307     *
1308     * @see GenerateCharacter#genTable
1309     */
1310 
1311     static void propertiesComments(StringBuffer result, long val) {
1312         result.append("   ");
1313         switch ((int)(val & maskType)) {
1314             case UnicodeSpec.CONTROL:
1315                 result.append("Cc");
1316                 break;
1317             case UnicodeSpec.FORMAT:
1318                 result.append("Cf");
1319                 break;
1320             case UnicodeSpec.PRIVATE_USE:
1321                 result.append("Co");
1322                 break;
1323             case UnicodeSpec.SURROGATE:
1324                 result.append("Cs");
1325                 break;
1326             case UnicodeSpec.LOWERCASE_LETTER:
1327                 result.append("Ll");
1328                 break;
1329             case UnicodeSpec.MODIFIER_LETTER:
1330                 result.append("Lm");
1331                 break;
1332             case UnicodeSpec.OTHER_LETTER:
1333                 result.append("Lo");
1334                 break;
1335             case UnicodeSpec.TITLECASE_LETTER:
1336                 result.append("Lt");
1337                 break;
1338             case UnicodeSpec.UPPERCASE_LETTER:
1339                 result.append("Lu");
1340                 break;
1341             case UnicodeSpec.COMBINING_SPACING_MARK:
1342                 result.append("Mc");
1343                 break;
1344             case UnicodeSpec.ENCLOSING_MARK:
1345                 result.append("Me");
1346                 break;
1347             case UnicodeSpec.NON_SPACING_MARK:
1348                 result.append("Mn");
1349                 break;
1350             case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1351                 result.append("Nd");
1352                 break;
1353             case UnicodeSpec.LETTER_NUMBER:
1354                 result.append("Nl");
1355                 break;
1356             case UnicodeSpec.OTHER_NUMBER:
1357                 result.append("No");
1358                 break;
1359             case UnicodeSpec.CONNECTOR_PUNCTUATION:
1360                 result.append("Pc");
1361                 break;
1362             case UnicodeSpec.DASH_PUNCTUATION:
1363                 result.append("Pd");
1364                 break;
1365             case UnicodeSpec.END_PUNCTUATION:
1366                 result.append("Pe");
1367                 break;
1368             case UnicodeSpec.OTHER_PUNCTUATION:
1369                 result.append("Po");
1370                 break;
1371             case UnicodeSpec.START_PUNCTUATION:
1372                 result.append("Ps");
1373                 break;
1374             case UnicodeSpec.CURRENCY_SYMBOL:
1375                 result.append("Sc");
1376                 break;
1377             case UnicodeSpec.MODIFIER_SYMBOL:
1378                 result.append("Sk");
1379                 break;
1380             case UnicodeSpec.MATH_SYMBOL:
1381                 result.append("Sm");
1382                 break;
1383             case UnicodeSpec.OTHER_SYMBOL:
1384                 result.append("So");
1385                 break;
1386             case UnicodeSpec.LINE_SEPARATOR:
1387                 result.append("Zl"); break;
1388             case UnicodeSpec.PARAGRAPH_SEPARATOR:
1389                 result.append("Zp");
1390                 break;
1391             case UnicodeSpec.SPACE_SEPARATOR:
1392                 result.append("Zs");
1393                 break;
1394             case UnicodeSpec.UNASSIGNED:
1395                 result.append("unassigned");
1396                 break;
1397         }
1398 
1399         switch ((int)((val & maskBidi) >> shiftBidi)) {
1400             case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1401                 result.append(", L");
1402                 break;
1403             case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1404                 result.append(", R");
1405                 break;
1406             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1407                 result.append(", EN");
1408                 break;
1409             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1410                 result.append(", ES");
1411                 break;
1412             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1413                 result.append(", ET");
1414                 break;
1415             case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1416                 result.append(", AN");
1417                 break;
1418             case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1419                 result.append(", CS");
1420                 break;
1421             case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1422                 result.append(", B");
1423                 break;
1424             case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1425                 result.append(", S");
1426                 break;
1427             case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1428                 result.append(", WS");
1429                 break;
1430             case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1431                 result.append(", ON");
1432                 break;
1433         }
1434         if ((val & maskUpperCase) != 0) {
1435             result.append(", hasUpper (subtract ");
1436             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1437         }
1438         if ((val & maskLowerCase) != 0) {
1439             result.append(", hasLower (add ");
1440             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1441         }
1442         if ((val & maskTitleCase) != 0) {
1443             result.append(", hasTitle");
1444         }
1445         if ((val & maskIdentifierInfo) == valueIgnorable) {
1446             result.append(", ignorable");
1447         }
1448         if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1449             result.append(", identifier part");
1450         }
1451         if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1452             result.append(", underscore");
1453         }
1454         if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1455             result.append(", whitespace");
1456         }
1457         if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1458             result.append(", currency");
1459         }
1460         if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1461             result.append(", identifier start");
1462         }
1463         if ((val & maskNumericType) == valueDigit) {
1464             result.append(", decimal ");
1465             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1466         }
1467         if ((val & maskNumericType) == valueStrangeNumeric) {
1468             result.append(", strange");
1469         }
1470         if ((val & maskNumericType) == valueJavaSupradecimal) {
1471             result.append(", supradecimal ");
1472             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1473         }
1474     }
1475 
1476     static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1477 
1478     static String tableName(int j) { return tableNames[j]; }
1479 
1480     /**
1481     * The genAccess method generates source code for one table access expression.
1482     *
1483     * Most of the complexity stems from handling various options as to
1484     * table representation, such as whether it contains values so large that
1485     * they are represented as negative values and whether the table values are
1486     * preshifted.  This method also avoids such "ugly" expressions as shifting
1487     * by distance zero, masking when no masking is necessary, and so on.
1488     * For clarity, it generates expressions that do not rely on operator
1489     * precedence, but otherwise it avoids generating redundant parentheses.
1490     *
1491     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1492     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1493     *
1494     * @param tbl                the name of the final table to be accessed
1495     * @param var                the variable name that appeared in parentheses in the
1496     *                           "Lookup" command
1497     * @param bits       the number of bits (not bytes) to be used to represent
1498     *                   the final table entry
1499     * @return   the replacement text for the "Lookup(xxx)" command, as a String
1500     *
1501     * @see GenerateCharacter#replaceCommand
1502     */
1503 
1504     static String genAccess(String tbl, String var, int bits) {
1505         String access = null;
1506         int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1507         for (int k = 0; k < sizes.length; k++) {
1508             int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1509             int shift = shifts[k] + offset;
1510             String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1511             int mask = (1 << (sizes[k] - offset)) - 1;
1512             String masked = (k == 0) ? shifted :
1513               "(" + shifted + "&0x" + hex(mask) + ")";
1514             String index = (k == 0) ? masked :
1515              (mask == 0) ? access : "(" + access + "|" + masked + ")";
1516             String indexNoParens = (index.charAt(0) != '(') ? index :
1517                  index.substring(1, index.length() - 1);
1518             String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1519             String fetched = tblname + "[" + indexNoParens + "]";
1520             String zeroextended = (zeroextend[k] == 0) ? fetched :
1521                 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1522             int adjustment = preshifted[k] ? 0 :
1523                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1524             String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1525                 "(" + zeroextended + "<<" + adjustment + ")";
1526             String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1527                 (bits == 2) ? "((" + var + "&0xF)<<1)" :
1528                 (bits == 4) ? "((" + var + "&7)<<2)" : null;
1529             String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1530                 "((" + adjusted + ">>" + bitshift + ")&" +
1531                 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1532             access = extracted;
1533         }
1534         return access;
1535     }
1536 
1537     /* The command line arguments are decoded by processArgs and used to set
1538      the following global variables.
1539      */
1540 
1541     static boolean verbose = false; // -verbose or -v: print progress information
1542     static boolean nobidi = false; // -nobidi: leave bidi categories out of the encoded properties
1543     static boolean nomirror = false; // -nomirror: leave the mirror property out of the encoded properties
1544     static boolean identifiers = false; // -identifiers: generate tables for scanning identifiers only
1545     static boolean Csyntax = false; // -c: output C syntax instead of Java syntax
1546     static String TemplateFileName = null; // -template: template input file; defaulted in processArgs
1547     static String OutputFileName = null; // -o: output file; defaulted in processArgs
1548     static String UnicodeSpecFileName = null; // liu; -spec: Unicode spec file; defaulted in processArgs
1549     static String SpecialCasingFileName = null; // -specialcasing: special casing file; defaulted in processArgs
1550     static boolean useCharForByte = false; // -usecharforbyte: when set, generateForSizes records no 0xFF zero-extension mask
1551     static int[] sizes; // bit field widths, one per lookup step (numeric arguments, or defaults)
1552     static int bins = 0; // liu; -search bin count; if > 0, then perform search
1553     static boolean tableAsString = false; // -string: create tables as strings (rejected together with -c)
1554     static boolean bLatin1 = false; // -latin1: Latin-1 only table; cleared again by -plane with a plane > 0
1555 
1556     static String commandLineDescription; // the command line as text, with defaulted options shown in brackets
1557 
1558     /* Other global variables, equal in length to the "sizes" array.
1559        They are (re)computed by generateForSizes. */
1560     static int[] shifts; // right shift applied to the character at each level (sum of the sizes to its right)
1561     static int[] zeroextend; // mask genAccess ANDs onto a fetched entry; 0 means no mask
1562     static int[] bytes; // 1, 2 or 4 per level, chosen from the (possibly shifted-down) length of the next table; 0 for the last level
1563     static boolean[] preshifted; // when false, genAccess shifts the fetched entry left before combining it
1564     static long[][] tables; // one lookup table per entry in sizes
1565 
1566 
1567     /* Other global variables */
1568     static String commentStart; // comment opener for the output syntax, set in processArgs
1569     static String commentEnd; // comment closer for the output syntax (empty for Java), set in processArgs
1570 
1571     static StringBuffer initializers = new StringBuffer(); // NOTE(review): presumably filled by addInitializer -- confirm
1572 
1573     /* special casing rules for 1:M toUpperCase mappings */
1574     static SpecialCaseMap[] specialCaseMaps;
1575 
1576     /**
1577     * Process the command line arguments.
1578     *
1579     * The allowed flags in command line are:
1580     * <dl>
1581     * <dt> -verbose             <dd> Emit comments to standard output describing
1582     *                                   what's going on during the processing.
1583     * <dt> -nobidi              <dd> Do not include bidi categories in the
1584     *                                   encoded character properties.
1585     * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1586     *                        character properties.
1587     * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1588     * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1589     * <dt> -o filename          <dd> Specify output file name.
1590     * <dt> -template filename   <dd> Specify template input file name.
1591     * <dt> -spec filename        <dd> Specify Unicode spec file name.
1592     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1593     * <dt> -search bins          <dd> Try different partitions into the specified
1594     *                                    number of bins.  E.g., for 2 bins, try
1595     *                                    16 0, 15 1,..., 0 16.
1596     * <dt> -string               <dd> Create table as string.  Only valid with Java
1597     *                                    syntax.
1598     * <dt> -latin1          <dd> Create a latin 1 only property table.
1599     * </dl>
1600     * In addition, decimal literals may appear as command line arguments;
1601     * each one represents the number of bits of the character to be broken
1602     * off at each lookup step.  If present, they must add up to 16 (the number
1603     * of bits in a char value).  For smaller tables, the last value should
1604     * be 0; values other than the last one may not be zero.  If no such
1605     * numeric values are provided, default values are used.
1606     *
1607     * @param args       the command line arguments, as an array of String
1608     *
1609     * @see GenerateCharacter#main
1610     */
1611 
1612     static void processArgs(String[] args) {
1613         StringBuffer desc = new StringBuffer("java GenerateCharacter");
1614         for (int j=0; j<args.length; ++j) {
1615             desc.append(" " + args[j]);
1616         }
1617         for (int j = 0; j < args.length; j++) {
1618             if (args[j].equals("-verbose") || args[j].equals("-v"))
1619                 verbose = true;
1620             else if (args[j].equals("-nobidi"))
1621                 nobidi = true;
1622             else if (args[j].equals("-nomirror"))
1623                 nomirror = true;
1624             else if (args[j].equals("-identifiers"))
1625                 identifiers = true;
1626             else if (args[j].equals("-c"))
1627                 Csyntax = true;
1628             else if (args[j].equals("-string"))
1629                 tableAsString = true;
1630             else if (args[j].equals("-o")) {
1631                 if (j == args.length - 1) {
1632                     FAIL("File name missing after -o");
1633                 }
1634                 else {
1635                     OutputFileName = args[++j];
1636                 }
1637             }
1638             else if (args[j].equals("-search")) {
1639                 if (j == args.length - 1)
1640                     FAIL("Bin count missing after -search");
1641                 else {
1642                     bins = Integer.parseInt(args[++j]);
1643                     if (bins < 1 || bins > 10)
1644                         FAIL("Bin count must be >= 1 and <= 10");
1645                 }
1646             }
1647             else if (args[j].equals("-template")) {
1648                 if (j == args.length - 1)
1649                     FAIL("File name missing after -template");
1650                 else
1651                     TemplateFileName = args[++j];
1652             }
1653             else if (args[j].equals("-spec")) { // liu
1654                 if (j == args.length - 1) {
1655                     FAIL("File name missing after -spec");
1656                 }
1657                 else {
1658                     UnicodeSpecFileName = args[++j];
1659                 }
1660             }
1661             else if (args[j].equals("-specialcasing")) {
1662                 if (j == args.length -1) {
1663                     FAIL("File name missing after -specialcasing");
1664                 }
1665                 else {
1666                     SpecialCasingFileName = args[++j];
1667                 }
1668             }
1669                         else if (args[j].equals("-plane")) {
1670                                 if (j == args.length -1) {
1671                                         FAIL("Plane number missing after -plane");
1672                                 }
1673                                 else {
1674                                         plane = Integer.parseInt(args[++j]);
1675                                 }
1676                                 if (plane > 0) {
1677                                         bLatin1 = false;
1678                                 }
1679                         }
1680                         else if ("-usecharforbyte".equals(args[j])) {
1681                                 useCharForByte = true;
1682                         }
1683             else if (args[j].equals("-latin1")) {
1684                 bLatin1 = true;
1685                 plane = 0;
1686             }
1687             else {
1688                 try {
1689                     int val = Integer.parseInt(args[j]);
1690                     if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1691                     if (sizes == null)
1692                         sizes = new int[1];
1693                     else {
1694                         int[] newsizes = new int[sizes.length + 1];
1695                         System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1696                         sizes = newsizes;
1697                     }
1698                     sizes[sizes.length - 1] = val;
1699                 }
1700                 catch(NumberFormatException e) {
1701                     FAIL("Unknown switch: " + args[j]);
1702                 }
1703             }
1704         }
1705         if (Csyntax && tableAsString) {
1706             FAIL("Can't specify table as string with C syntax");
1707         }
1708         if (sizes == null) {
1709             desc.append(" [");
1710             if (identifiers) {
1711                 int[] newsizes = { 8, 4, 4 };           // Good default values
1712                 desc.append("8 4 4]");
1713                 sizes = newsizes;
1714             }
1715             else {
1716                 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1717                 desc.append("10 5 1]");
1718                 sizes = newsizes;
1719             }
1720         }
1721         if (UnicodeSpecFileName == null) { // liu
1722             UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1723             desc.append(" [-spec " + UnicodeSpecFileName + ']');
1724         }
1725         if (SpecialCasingFileName == null) {
1726             SpecialCasingFileName = DefaultSpecialCasingFileName;
1727             desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1728         }
1729         if (TemplateFileName == null) {
1730             TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1731                   : DefaultJavaTemplateFileName);
1732             desc.append(" [-template " + TemplateFileName + ']');
1733         }
1734         if (OutputFileName == null) {
1735             OutputFileName = (Csyntax ? DefaultCOutputFileName
1736                     : DefaultJavaOutputFileName);
1737             desc.append(" [-o " + OutputFileName + ']');
1738         }
1739         commentStart = (Csyntax ? "/*" : "//");
1740         commentEnd = (Csyntax ? " */" : "");
1741         commandLineDescription = desc.toString();
1742     }
1743 
1744     private static void searchBins(long[] map, int binsOccupied) throws Exception {
1745         int bitsFree = 16;
1746         for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1747         if (binsOccupied == (bins-1)) {
1748             sizes[binsOccupied] = bitsFree;
1749             generateForSizes(map);
1750         }
1751         else {
1752             for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1753                 sizes[binsOccupied] = i;
1754                 searchBins(map, binsOccupied+1);
1755             }
1756         }
1757     }
1758 
1759     private static void generateForSizes(long[] map) throws Exception {
1760         int sum = 0;
1761         shifts = new int[sizes.length];
1762         for (int k = sizes.length - 1; k >= 0; k--) {
1763             shifts[k] = sum;
1764             sum += sizes[k];
1765         }
1766         if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1767             FAIL("Bit field widths total to " + sum +
1768              ": wrong total for map of size " + map.length);
1769         }
1770         // need a table for each set of lookup bits in char
1771         tables = new long[sizes.length][];
1772         // the last table is the map
1773         tables[sizes.length - 1] = map;
1774         for (int j = sizes.length - 1; j > 0; j--) {
1775             if (verbose && bins==0)
1776                 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1777             long[][] temp = buildTable(tables[j], sizes[j]);
1778             tables[j-1] = temp[0];
1779             tables[j] = temp[1];
1780         }
1781         preshifted = new boolean[sizes.length];
1782         zeroextend = new int[sizes.length];
1783         bytes = new int[sizes.length];
1784         for (int j = 0; j < sizes.length - 1; j++) {
1785             int len = tables[j+1].length;
1786             int size = sizes[j+1];
1787             if (len > 0x100 && (len >> size) <= 0x100) {
1788                 len >>= size;
1789                 preshifted[j] = false;
1790             }
1791             else if (len > 0x10000 && (len >> size) <= 0x10000) {
1792                 len >>= size;
1793                 preshifted[j] = false;
1794             }
1795             else preshifted[j] = true;
1796             if (Csyntax)
1797                 zeroextend[j] = 0;
1798             else if (len > 0x7F && len <= 0xFF) {
1799                 if (!useCharForByte) {
1800                     zeroextend[j] = 0xFF;
1801                 }
1802             } else if (len > 0x7FFF && len <= 0xFFFF)
1803                 zeroextend[j] = 0xFFFF;
1804             else zeroextend[j] = 0;
1805             if (len <= 0x100) bytes[j] = 1;
1806             else if (len <= 0x10000) bytes[j] = 2;
1807             else bytes[j] = 4;
1808         }
1809         preshifted[sizes.length - 1] = true;
1810         zeroextend[sizes.length - 1] = 0;
1811         bytes[sizes.length - 1] = 0;
1812         if (bins > 0) {
1813             int totalBytes = getTotalBytes();
1814             String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1815             int accessComplexity = 0;
1816             for (int j=0; j<access.length(); ++j) {
1817                 char ch = access.charAt(j);
1818                 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1819                 if (ch == '<' || ch == '>') ++j;
1820             }
1821             System.out.print("(");
1822             for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1823             System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1824             return;
1825         }
1826         if (verbose) {
1827             System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1828             for (int j = 0; j < sizes.length; j++) {
1829                 System.out.println(dec5(j) + "\t" +
1830                     dec5(sizes[j]) + "\t" +
1831                     dec5(tables[j].length) + "\t" +
1832                     dec5(shifts[j]) + "\t" +
1833                     dec5(zeroextend[j]) + "\t" +
1834                     dec5(bytes[j]) + "\t " +
1835                     preshifted[j]);
1836             }
1837         }
1838         if (verbose) {
1839             System.out.println("Generating source code for class Character");
1840             System.out.println("A table access looks like " +
1841                          genAccess("A", "ch", (identifiers ? 2 : 32)));
1842         }
1843         generateCharacterClass(TemplateFileName, OutputFileName);
1844     }
1845 
1846     /**
1847     * The main program for generating source code for the Character class.
1848     * The basic outline of its operation is:
1849     * <ol>
1850     * <li> Process the command line arguments.  One result of this process
1851     *           is a list of sizes (measured in bits and summing to 16).
1852     * <li> Get the Unicode character property data from the specification file.
1853     * <li> From that, build a map that has, for each character code, its
1854     *           relevant properties encoded as a long integer value.
1855     * <li> Repeatedly compress the map, producing a compressed table and a
1856     *           new map.  This is done once for each size value in the list.
1857     *           When this is done, we have a set of tables.
1858     * <li> Make some decisions about table representation; record these
1859     *           decisions in arrays named preshifted, zeroextend, and bytes.
1860     * <li> Generate the source code for the class Character by performing
1861     *           macro processing on a template file.
1862     * </ol>
1863     *
1864     * @param args       the command line arguments, as an array of String
1865     *
1866     * @see GenerateCharacter#processArgs
1867     * @see UnicodeSpec#readSpecFile
1868     * @see GenerateCharacter#buildMap
1869     * @see GenerateCharacter#buildTable
1870     * @see GenerateCharacter#generateCharacterClass
1871     */
1872 
1873     public static void main(String[] args) { // command-line entry point of the generator
1874         processArgs(args); // runs first; presumably sets the static options read below (file names, plane, bins, verbose) - confirm in processArgs
1875         try {
1876             // Read the Unicode spec file for the selected plane.
1877             UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1878             // Read the special-casing file for the same plane into the static specialCaseMaps field.
1879             specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1880             if (verbose) {
1881                 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1882             }
1883             long[] map = buildMap(data, specialCaseMaps); // combine both inputs into the long[] map handed to the steps below
1884             if (verbose) {
1885                 System.err.println("Completed building of initial map"); // note: goes to stderr, unlike the other verbose messages in this method
1886             }
1887             // bins == 0: a single generateForSizes run. Otherwise: one searchBins run per sizes length, from bins down to 1.
1888             if (bins == 0) {
1889                 generateForSizes(map);
1890             }
1891             else {
1892                 while (bins > 0) { // assumes searchBins leaves bins unchanged - TODO confirm
1893                     sizes = new int[bins]; // fresh array of length bins for this search pass
1894                     searchBins(map, 0);
1895                     --bins; // next pass uses one fewer entry
1896                 }
1897             }
1898             if (verbose && false) { // condition is always false: this offset-range report is disabled
1899                 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1900                              hex8(maxOffsetSeen));
1901                 System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1902                              hex8(maxOffset));
1903             }
1904         }
1905         catch (FileNotFoundException e) { FAIL(e.toString()); } // same handling as the IOException case below
1906         catch (IOException e) { FAIL(e.toString()); }
1907         catch (Throwable e) { // anything else: print a notice and the stack trace, then report through FAIL
1908             System.out.println("Unexpected exception:");
1909             e.printStackTrace();
1910             FAIL("Unexpected exception!");
1911         }
1912         if (verbose) { System.out.println("Done!");} // NOTE(review): also reached after a failure unless FAIL terminates the program - confirm
1913     }
1914 
1915 }   // end class