1 
   2 /*
   3  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.  Oracle designates this
   9  * particular file as subject to the "Classpath" exception as provided
  10  * by Oracle in the LICENSE file that accompanied this code.
  11  *
  12  * This code is distributed in the hope that it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15  * version 2 for more details (a copy is included in the LICENSE file that
  16  * accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License version
  19  * 2 along with this work; if not, write to the Free Software Foundation,
  20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  23  * or visit www.oracle.com if you need additional information or have any
  24  * questions.
  25  */
  26 
  27 package build.tools.generatecharacter;
  28 
  29 import java.io.IOException;
  30 import java.io.FileNotFoundException;
  31 import java.io.BufferedReader;
  32 import java.io.FileReader;
  33 import java.io.PrintWriter;
  34 import java.io.BufferedWriter;
  35 import java.io.FileWriter;
  36 import java.io.File;
  37 
  38 import build.tools.generatecharacter.CharacterName;
  39 
  40 /**
  41  * This program generates the source code for the class java.lang.Character.
  42  * It also generates native C code that can perform the same operations.
  43  * It requires two external input data files:
  44  * <ul>
  45  * <li> Unicode specification file
  46  * <li> Character class template file
  47  * </ul>
  48  * The Unicode specification file is available from the Unicode consortium.
  49  * It has character specification lines that look like this:
  50  * <listing>
  51  * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
  52  * </listing>
  53  * The Character class template file is filled in with additional
  54  * information to produce the file Character.java, which can then be
  55  * compiled by a Java compiler.  The template file contains certain
  56  * markers consisting of an alphabetic name string preceded by "$$".
  57  * Such markers are replaced with generated program text.  As a special
  58  * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
  59  * alphabetic characters constituting a variable name.  The character "_"
  60  * is considered alphabetic for these purposes.
  61  *
  62  * @author  Guy Steele
  63  * @author  Alan Liu
  64  * @author  John O'Conner
  65  */
  66 
  67 public class GenerateCharacter {
  68 
    // When true, buildOne reports incompatible case mappings and out-of-range
    // case offsets through FAIL; when false those diagnostics are suppressed.
    final static boolean DEBUG = false;

    // Largest 16-bit value; presumably the highest char value handled per
    // plane -- its uses are outside this part of the file, TODO confirm.
    final static int MAX_UNICODE_VALUE = 0xFFFF;
    // Prefix that introduces a macro call in a template file.
    final static String commandMarker = "$$";
    // Prefix prepended to every default file name below (empty by default).
    static String ROOT                        = "";
    // Default input, template and output file names, all relative to ROOT.
    static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
    static String DefaultJavaOutputFileName   = ROOT + "Character.java";
    static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
    static String DefaultCOutputFileName      = ROOT + "Character.c";

    // NOTE(review): presumably the name of the generated data class; it is
    // not referenced in the visible part of the file -- confirm.
    static String CharacterDataClassName      = "CharacterData";
    // Index of the plane being generated; buildMap starts its table at
    // code point (plane << 16).
        static int plane = 0;
  83 
  84     /* The overall idea is that, in the generated Character class source code,
  85     most character property data is stored in a special multi-level table whose
  86     structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
  87     The integers must sum to 16 (the number of bits in a character).
  88     The first table is indexed by the k1 high-order bits of the character code.
  89     The result is concatenated to the next k2 bits of the character code to index
  90     the second table, and so on.  Eventually the kn low-order bits of the character
  91     code are concatenated and used to index one of two tables A and B; A contains
  92     32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
  93     can be thus obtained encode the properties for the character.
  94 
  95     The default specification is [9, 4, 3, 0].  This particular table format was
  96     designed by conducting an exhaustive search of table formats to minimize the
  97     space consumed by the tables: the first and third tables need have only byte
  98     values (the second table must have short values).  Another good choice is
  99     [10, 6, 0], which produces a larger table but allows particularly fast table
 100     lookup code.
 101 
 102     In each case, where the word "concatenated" is used, this may imply
 103     first a << and then a | operation, or perhaps just a | operation if
 104     the values in the table can be preshifted (generally possible if the table
 105     entries are short rather than byte).
 106     */
 107 
 108     /* The character properties are currently encoded into 32 bits in the following manner:
 109     1 bit Mirrored property.
 110     4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
 111     9 bits      A signed offset used for converting case .
 112     1 bit       If 1, adding the signed offset converts the character to lowercase.
 113     1 bit       If 1, subtracting the signed offset converts the character to uppercase.
 114         Note: for a titlecase character, both of the preceding bits will be 1
 115         and the signed offset will be 1.
 116     1 bit   If 1, this character has a titlecase equivalent (possibly itself);
 117         in this case, the two bits before this bit can be used to decide
 118         whether this character is in fact uppercase, lowercase, or titlecase.
 119     3 bits      This field provides a quick way to lex identifiers.
 120         The eight possible values for this field are as follows:
 121         0  May not be part of an identifier
 122         1  Ignorable control; may continue a Unicode identifier or Java identifier
 123         2  May continue a Java identifier but not a Unicode identifier (unused)
 124         3  May continue a Unicode identifier or Java identifier
 125         4  Is a Java whitespace character
 126         5  May start or continue a Java identifier;
 127            may continue but not start a Unicode identifier
 128            (this value is used for connector punctuation such as _)
 129         6  May start or continue a Java identifier;
 130            may not occur in a Unicode identifier
 131            (this value is used for currency symbols such as $)
 132         7  May start or continue a Unicode identifier or Java identifier
 133         Thus:
 134            5, 6, 7 may start a Java identifier
 135            1, 2, 3, 5, 6, 7 may continue a Java identifier
 136            7 may start a Unicode identifier
 137            1, 3, 5, 7 may continue a Unicode identifier
 138            1 is ignorable within an identifier
 139            4 is Java whitespace
 140     2 bits      This field indicates whether the character has a numeric property.
 141         The four possible values for this field are as follows:
 142         0  This character has no numeric property.
 143         1  Adding the digit offset to the character code and then
 144            masking with 0x1F will produce the desired numeric value.
 145         2  This character has a "strange" numeric value.
 146         3  A Java supradecimal digit: adding the digit offset to the
 147            character code, then masking with 0x1F, then adding 10
 148            will produce the desired numeric value.
 149     5 bits  The digit offset (see description of previous field)
 150     5 bits      Character type (see below)
 151     */
 152 
 153 
    // Bit masks identify each component of the 32-bit property field described
    // above.
    // shift* indicates how many right shifts are needed to bring the
    // indicated property value into the lowest bits of the 32-bit space.
    //
    // Layout, low bits first: type (5 bits), digit offset (5), numeric type (2),
    // identifier info (3), case info (3), case offset (9), bidi (4), mirrored (1).
    private static final int
        shiftType           = 0,        maskType            =       0x001F,
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
                                        // lowest bit of the identifier-info field
                                        maskUnicodePart     =       0x1000,
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
                                        // the three individual bits of the case-info field
                                        maskLowerCase       =      0x20000,
                                        maskUpperCase       =      0x10000,
                                        maskTitleCase       =      0x08000,
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
        // NOTE(review): not referenced in the visible part of the file -- confirm its use
        shiftCaseOffsetSign = 5,
                                        // used only when calculating and
                                        // storing digit offsets from char values
                                        // (applied to the value before it is shifted into place)
                                        maskDigit               =   0x001F,
                                        // case offsets are 9 bits wide
                                        // (applied to the value before it is shifted into place)
                                        maskCase                =   0x01FF,
        shiftBidi           = 27,       maskBidi              = 0x78000000,
        // maskMirrored is negative as an int because it is the sign bit
        shiftMirrored       = 31,       maskMirrored          = 0x80000000,
        // NOTE(review): these bits overlap the case fields above, so they presumably
        // apply to code points rather than to property words -- confirm
        shiftPlane          = 16,       maskPlane = 0xFF0000;
 178 
    // Property values, already shifted into field position.  Mask a property
    // word with maskNumericType or maskIdentifierInfo and compare the result
    // with these to determine numeric or lexical types.
    public static int
        // values of the numeric-type field (mask 0x0C00)
        valueNotNumeric             = 0x0000,
        valueDigit                  = 0x0400,
        valueStrangeNumeric         = 0x0800,
        valueJavaSupradecimal       = 0x0C00,
        // values 1..7 of the identifier-info field (mask 0x7000), in the
        // order listed in the property layout comment above
        valueIgnorable              = 0x1000,
        valueJavaOnlyPart           = 0x2000,
        valueJavaUnicodePart        = 0x3000,
        valueJavaWhitespace         = 0x4000,
        valueJavaStartUnicodePart   = 0x5000,
        valueJavaOnlyStart          = 0x6000,
        valueJavaUnicodeStart       = 0x7000,
        // identifier-info values >= lowJavaStart (5, 6, 7) may start a Java identifier
        lowJavaStart                = 0x5000,
        // low two bits of the identifier-info field; buildOne treats a character as a
        // Java identifier part when they are nonzero and the field is not valueIgnorable
        nonzeroJavaPart             = 0x3000,
        valueUnicodeStart           = 0x7000;
 196 
    // These values are used when only identifier properties are generated
    // for use in verifier code.  In that mode buildOne replaces the whole
    // property word with a combination of the two bits below, which shortens
    // the property down to a single byte.
    private static final int
        bitJavaStart            = 0x02,
        bitJavaPart             = 0x01,
        maskIsJavaIdentifierPart = bitJavaPart,
        maskIsJavaIdentifierStart = bitJavaStart;

    // Largest and smallest case offsets that buildOne stores directly
    // (maskCase/2 = 255 and -255).  For anything outside this range buildOne
    // stores maskCase itself in the case-offset field instead.
    static int maxOffset = maskCase/2 ;
    static int minOffset = -maxOffset;
 207 
    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  If the number of digits required to
     represent the integer value is greater than the desired number, then all the digits
     that are required are actually produced.  The exceptions are hex2, hex4 and hex8,
     which first mask the value to its low 8, 16 or 32 bits and therefore never
     produce more than the named number of digits.
    */
 216 
 217     static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
 218 
 219     static String hex2(long n) {
 220         String q = Long.toHexString(n & 0xFF).toUpperCase();
 221         return "00".substring(Math.min(2, q.length())) + q;
 222     }
 223 
 224     static String hex4(long n) {
 225         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
 226         return "0000".substring(Math.min(4, q.length())) + q;
 227     }
 228 
 229     static String hex8(long n) {
 230         String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
 231         return "00000000".substring(Math.min(8, q.length())) + q;
 232     }
 233 
 234     static String hex16(long n) {
 235         String q = Long.toHexString(n).toUpperCase();
 236         return "0000000000000000".substring(Math.min(16, q.length())) + q;
 237     }
 238 
 239     static String dec3(long n) {
 240         String q = Long.toString(n);
 241         return "   ".substring(Math.min(3, q.length())) + q;
 242     }
 243 
 244     static String dec5(long n) {
 245         String q = Long.toString(n);
 246         return "     ".substring(Math.min(5, q.length())) + q;
 247     }
 248 
 249     /* This routine is called when some failure occurs. */
 250 
 251     static void FAIL(String s) {
 252         System.out.println("** " + s);
 253     }
 254 
 255     /**
 256     * Given the data from the Unicode specification file, this routine builds a map.
 257     *
 258     * The specification file is assumed to contain its data in sorted order by
 259     * character code; as a result, the array passed as an argument to this method
 260     * has its components in the same sorted order, with one entry for each defined
 261         * Unicode character or character range.  (A range is indicated by two consecutive
 262     * entries, such that the name of the first entry begins with "<" and ends with
 263     * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
 264     * therefore a sparse representation of the character property data.
 265     *
    * The resulting map is a dense representation of the character data.  It contains
    * 2^16 = 65536 entries (or only 256 entries when Latin-1 generation is selected),
    * each of which is a long integer.  (Right now only 32 bits
    * of this long value are used, but type long is used rather than int to facilitate
    * future extensions of this source code generator that might require more than
    * 32 bits to encode relevant character properties.)  Entry k holds the encoded
    * properties for character k of the plane being generated.
 272     *
    * Method buildMap manages the transformation from the sparse representation to
    * the dense representation.  It calls method buildOne to handle the encoding
    * of character property data from a single UnicodeSpec object into 32 bits.
    * For undefined characters, method buildOne is called with a default-constructed
    * UnicodeSpec object instead of an entry from the specification file.
 278     *
    * @param data       character property data from the Unicode specification file
    * @param specialMaps special case mapping data, handed unchanged to buildOne
    * @return   an array of length 65536 (256 in Latin-1 mode) with one entry for
    *           every char value covered
 281     *
 282     * @see GenerateCharacter#buildOne
 283     */
 284 
 285     static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps) {
 286         long[] result;
 287         if (bLatin1 == true) {
 288             result = new long[256];
 289         } else {
 290             result = new long[1<<16];
 291         }
 292         int k=0;
 293                 int codePoint = plane<<16;
 294         UnicodeSpec nonCharSpec = new UnicodeSpec();
 295         for (int j = 0; j < data.length && k < result.length; j++) {
 296             if (data[j].codePoint == codePoint) {
 297                 result[k] = buildOne(codePoint, data[j], specialMaps);
 298                 ++k;
 299                                 ++codePoint;
 300             }
 301             else if(data[j].codePoint > codePoint) {
 302                 if (data[j].name.endsWith("Last>")) {
 303                     // build map data for all chars except last in range
 304                     while (codePoint < data[j].codePoint && k < result.length) {
 305                         result[k] = buildOne(codePoint, data[j], specialMaps);
 306                         ++k;
 307                                                 ++codePoint;
 308                     }
 309                 }
 310                 else {
 311                     // we have a few unassigned chars before data[j].codePoint
 312                     while (codePoint < data[j].codePoint && k < result.length) {
 313                         result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 314                         ++k;
 315                                                 ++codePoint;
 316                     }
 317                 }
 318                 k = data[j].codePoint & 0xFFFF;
 319                                 codePoint = data[j].codePoint;
 320                 result[k] = buildOne(codePoint, data[j], specialMaps);
 321                 ++k;
 322                                 ++codePoint;
 323 
 324             }
 325             else {
 326                 System.out.println("An error has occured during spec mapping.");
 327                 System.exit(0);
 328             }
 329         }
 330         // if there are still unprocessed chars, process them
 331         // as unassigned/undefined.
 332         codePoint = (plane<<16) | k;
 333         while (k < result.length) {
 334             result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 335             ++k;
 336                         ++codePoint;
 337         }
 338         return result;
 339     }
 340 
    // The maximum and minimum case offsets found while scanning the database.
    // buildOne updates them for every character, before it replaces an
    // out-of-range offset with maskCase.
    static int maxOffsetSeen = 0;
    static int minOffsetSeen = 0;
 344 
 345     /**
 346      * Some Unicode separator characters are not considered Java whitespace.
 347      * @param c character to test
     * @return true if c is an invalid Java whitespace character, false otherwise.
 349      */
 350     static boolean isInvalidJavaWhiteSpace(int c) {
 351         int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
 352         boolean retValue = false;
 353         for(int x=0;x<exceptions.length;x++) {
 354             if(c == exceptions[x]) {
 355                 retValue = true;
 356                 break;
 357             }
 358         }
 359         return retValue;
 360 
 361     }
 362 
 363     /**
 364     * Given the character property data for one Unicode character, encode the data
 365     * of interest into a single long integer value.  (Right now only 32 bits
 366     * of this long value are used, but type long is used rather than int to facilitate
 367     * future extensions of this source code generator that might require more than
 368     * 32 bits to encode relevant character properties.)
 369     *
 370     * @param c   the character code for which to encode property data
 371     * @param us  property data record from the Unicode specification file
 372     *            (its character code might not be equal to c if it specifies data
 373     *            for a range of characters)
 374     * @return   an encoded long value that contains the properties for a single char
 375     *
 376     * @see GenerateCharacter#buildMap
 377     */
 378 
 379     static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
 380         long resultA = 0;
 381         // record the general category
 382         resultA |= us.generalCategory;
 383 
 384     // record the numeric properties
 385     NUMERIC: {
 386         STRANGE: {
 387             int val = 0;
 388         // c is A-Z
 389             if ((c >= 0x0041) && (c <= 0x005A)) {
 390                 val = c - 0x0041;
 391                 resultA |= valueJavaSupradecimal;
 392         // c is a-z
 393             } else if ((c >= 0x0061) && (c <= 0x007A)) {
 394                 val = c - 0x0061;
 395                 resultA |= valueJavaSupradecimal;
 396             // c is a full-width A-Z
 397             } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
 398                 val = c - 0xFF21;
 399                 resultA |= valueJavaSupradecimal;
 400             // c is a full-width a-z
 401             } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
 402                 val = c - 0xFF41;
 403                 resultA |= valueJavaSupradecimal;
 404             } else if (us.isDecimalValue()) {
 405                 val = us.decimalValue;
 406                 resultA |= valueDigit;
 407             } else if (us.isDigitValue()) {
 408                 val = us.digitValue;
 409                 resultA |= valueDigit;
 410             } else {
 411                 if (us.numericValue.length() == 0) {
 412                     break NUMERIC;                      // no numeric value at all
 413                 } else {
 414                     try {
 415                         val = Integer.parseInt(us.numericValue);
 416                         if (val >= 32 || val < 0) break STRANGE;
 417                         if (c == 0x215F) break STRANGE;
 418                     } catch(NumberFormatException e) {
 419                         break STRANGE;
 420                     }
 421                     resultA |= valueDigit;
 422                 }
 423             }
 424             if (val >= 32 || val < 0) break STRANGE;
 425             resultA |= ((val - c & maskDigit) << shiftDigitOffset);
 426             break NUMERIC;
 427         } // end STRANGE
 428         resultA |= valueStrangeNumeric;
 429         } // end NUMERIC
 430 
 431     // record case mapping
 432         int offset = 0;
 433         // might have a 1:M mapping
 434         int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
 435         boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
 436         if (bHasUpper) {
 437             resultA |= maskUpperCase;
 438         }
 439         if (specialMap != -1) {
 440             // has mapping, but cannot record the
 441             // proper offset; can only flag it and provide special case
 442             // code in Character.java
 443             offset = -1;
 444         }
 445         else if (us.hasUpperMap())  {
 446             offset = c - us.upperMap;
 447         }
 448 
 449         if (us.hasLowerMap()) {
 450             resultA |= maskLowerCase;
 451             if (offset == 0)
 452                 offset = us.lowerMap - c;
 453             else if (offset != (us.lowerMap - c)) {
 454                 if (DEBUG) {
 455                 FAIL("Character " + hex(c) +
 456                 " has incompatible lowercase and uppercase mappings");
 457                 }
 458             }
 459         }
 460         if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
 461                 (bHasUpper && us.hasLowerMap())) {
 462             resultA |= maskTitleCase;
 463         }
 464         if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
 465           System.out.println("Warning: Character " + hex4(c) + " has upper but " +
 466                              "no title case; Java won't know this");
 467         }
 468         if (offset < minOffsetSeen) minOffsetSeen = offset;
 469         if (offset > maxOffsetSeen) maxOffsetSeen = offset;
 470         if (offset > maxOffset || offset < minOffset) {
 471             if (DEBUG) {
 472             FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
 473             }
 474             offset = maskCase;
 475         }
 476         resultA |= ((offset & maskCase) << shiftCaseOffset);
 477 
 478 
 479     // record lexical info about this character
 480         if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
 481                 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
 482                 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
 483                 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
 484                 || us.generalCategory == UnicodeSpec.OTHER_LETTER
 485                 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
 486             resultA |= valueJavaUnicodeStart;
 487         }
 488         else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
 489                 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
 490                 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
 491             resultA |= valueJavaUnicodePart;
 492         }
 493         else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
 494             resultA |= valueJavaStartUnicodePart;
 495         }
 496         else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
 497             resultA |= valueJavaOnlyStart;
 498         }
 499         else if (((c >= 0x0000) && (c <= 0x0008))
 500                 || ((c >= 0x000E) && (c <= 0x001B))
 501                 || ((c >= 0x007F) && (c <= 0x009F))
 502                 || us.generalCategory == UnicodeSpec.FORMAT) {
 503             resultA |= valueIgnorable;
 504         }
 505         else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
 506                 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
 507                 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
 508             if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
 509         }
 510         else if (((c >= 0x0009) && (c <= 0x000D))
 511                 || ((c >= 0x001C) && (c <= 0x001F))) {
 512             resultA |= valueJavaWhitespace;
 513         }
 514 
 515         // record bidi category
 516         if (!nobidi) {
 517             int tmpBidi =
 518                 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
 519                     us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
 520             resultA |= tmpBidi;
 521         }
 522 
 523         // record mirrored property
 524         if (!nomirror) {
 525             resultA |= us.mirrored ? maskMirrored : 0;
 526         }
 527 
 528         if (identifiers) {
 529             long replacement = 0;
 530             if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
 531                 replacement |= bitJavaStart;
 532             }
 533             if ( ((resultA & nonzeroJavaPart) != 0)
 534                     && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
 535                 replacement |= bitJavaPart;
 536             }
 537             resultA = replacement;
 538         }
 539         return resultA;
 540     }
 541 
 542     /**
 543     * This is the heart of the table compression strategy.  The inputs are a map
 544     * and a number of bits (size).  The map is simply an array of long integer values;
 545     * the number of bits indicates how index values for that map are to be split.
 546     * The length of the given map must be a multiple of (1 << size).  The result is
 547     * a new map z and a compressed table t such that for every valid index value k
 548     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
 549     *
 550     * In other words, the index k can be split into two parts, namely the "size"
 551     * low-order bits and all the remaining high-order bits; the high-order bits are then
 552     * remapped by map z to produce an index into table t.  In effect, the data of the
 553     * original map m is broken up into blocks of size (1<<size); the compression relies
 554     * on the expectation that many of these blocks will be identical and therefore need
 555     * be represented only once in the compressed table t.
 556     *
 557     * This method is intended to be used iteratively.  The first map to be handed
 558     * to it is the one constructed by method buildMap.  After that, the first of the
 559     * two arrays returned by this method is fed back into it for further compression.
 560     * At the end of the iteration, one has a starter map and a sequence of tables.
 561     *
 562     * The algorithm used to implement this computation is straightforward and not
 563     * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
 564     * to locate identical blocks, so overall the time complexity of the algorithm
 565     * is quadratic in the length of the input map.  Fortunately, speed is not crucial
 566     * to this application.
 567     *
 568     * @param map                a map to be compressed
 569     * @param size       the number of index bits to be split off by the compression
 570     * @return   an array of length 2 containing two arrays; the first is a new map
 571     *           and the second is a compressed data table
 572     *
 573     * @see GenerateCharacter#buildMap
 574     */
 575 
 576     static long[][] buildTable(long[] map, int size) {
 577         int n = map.length;
 578         if (((n >> size) << size) != n) {
 579             FAIL("Length " + n + " is not a multiple of " + (1 << size));
 580         }
 581         int m = 1 << size;
 582         // We know the final length of the new map up front.
 583         long[] newmap = new long[n >> size];
 584         // The buffer is used temporarily to hold data for the compressed table
 585         // because we don't know its final length yet.
 586         long[] buffer = new long[n];
 587         int ptr = 0;
 588 OUTER:  for (int i = 0; i < n; i += m) {
 589             // For every block of size m in the original map...
 590     MIDDLE: for (int j = 0; j < ptr; j += m) {
 591             // Find out whether there is already a block just like it in the buffer.
 592                 for (int k = 0; k < m; k++) {
 593                     if (buffer[j+k] != map[i+k])
 594                         continue MIDDLE;
 595                 }
 596                 // There is a block just like it at position j, so just
 597                 // put its index into the new map (thereby sharing it).
 598                 newmap[i >> size] = (j >> size);
 599                 continue OUTER;
 600             } // end MIDDLE
 601             // There is no block just like it already, so add it to
 602             // the buffer and put its index into the new map.
 603             for (int k = 0; k < m; k++) {
 604                 buffer[ptr+k] = map[i+k];
 605             }
 606             newmap[i >> size] = (ptr >> size);
 607             ptr += m;
 608         } // end OUTER
 609         // Now we know how long the compressed table should be,
 610         // so create a new array and copy data from the temporary buffer.
 611         long[] newdata = new long[ptr];
 612         for (int j = 0; j < ptr; j++) {
 613             newdata[j] = buffer[j];
 614         }
 615         // Return the new map and the new data table.
 616         long[][] result = { newmap, newdata };
 617         return result;
 618     }
 619 
 620     /**
 621     * Once the compressed tables have been computed, this method reads in a
 622     * template file for the source code to be generated and writes out the final
 623     * source code by acting as a sort of specialized macro processor.
 624     *
 625     * The first output line is a comment saying that the file was automatically
 626     * generated; it includes a timestamp.  All other output is generated by
 627     * reading a line from the template file, performing macro replacements,
 628     * and then writing the resulting line or lines of code to the output file.
 629     *
 630     * This method handles the I/O, the timestamp comment, and the locating of
 631     * macro calls within each input line.  The method replaceCommand is called
 632     * to generate replacement text for each macro call.
 633     *
 634     * Macro calls to be replaced are indicated in the template file by
 635     * occurrences of the commandMarker "$$".  The rest of the call may consist
 636     * of Java letters (including the underscore "_") and also of balanced
 637     * parentheses.
 638     *
 639     * @param theTemplateFileName
 640     *           the file name for the template input file
 641     * @param theOutputFileName
 642     *           the file name for the source code output file
 643     *
 644     *     @see GenerateCharacter#replaceCommand
 645     */
 646 
 647     static void generateCharacterClass(String theTemplateFileName,
 648                      String theOutputFileName)
 649             throws FileNotFoundException, IOException {
 650         BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
 651         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
 652         out.println(commentStart +
 653             " This file was generated AUTOMATICALLY from a template file " +
 654             new java.util.Date() + commentEnd);
 655         int marklen = commandMarker.length();
 656         LOOP: while(true) {
 657             try {
 658                 String line = in.readLine();
 659                 if (line == null) break LOOP;
 660                 int pos = 0;
 661                 int depth = 0;
 662                 while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
 663                     int newpos = pos + marklen;
 664                     char ch = 'x';
 665                     SCAN: while (newpos < line.length() &&
 666                             (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
 667                             || ch == '(' || (ch == ')' && depth > 0))) {
 668                         ++newpos;
 669                         if (ch == '(') {
 670                             ++depth;
 671                         }
 672                         else if (ch == ')') {
 673                             --depth;
 674                             if (depth == 0)
 675                                 break SCAN;
 676                         }
 677                     }
 678                     String replacement = replaceCommand(line.substring(pos + marklen, newpos));
 679                     line = line.substring(0, pos) + replacement + line.substring(newpos);
 680                     pos += replacement.length();
 681                 }
 682                 out.println(line);
 683             }
 684             catch (IOException e) {
 685                 break LOOP;
 686             }
 687         }
 688         in.close();
 689         out.close();
 690     }
 691 
 692     /**
 693     * The replaceCommand method takes a command (a macro call without the
 694     * leading marker "$$") and computes replacement text for it.
 695     *
 696     * Most of the commands are simply names of integer constants that are defined
 697     * in the source code of this GenerateCharacter class.  The replacement text is
 698     * simply the value of the constant as an appropriately formatted integer literal.
 699     *
 700     * Two cases are more complicated, however.  The command "Tables" causes the
 701     * final map and compressed tables to be emitted, with elaborate comments
 702     * describing their contents.  (This is actually handled by method genTables.)
 703     * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
 704     * an expression that will return the character property data for the character
 705     * whose code is the value of the variable "xxx".  (this is handled by method
 706     * "genAccess".)
 707     *
 708     * @param x  a command from the template file to be replaced
 709     * @return   the replacement text, as a String
 710     *
 711     * @see GenerateCharacter#genTables
 712     * @see GenerateCharacter#genAccess
 713     * @see GenerateCharacter#generateCharacterClass
 714     */
 715 
 716     static String replaceCommand(String x) {
 717         if (x.equals("Tables")) return genTables();
 718         if (x.equals("Initializers")) return genInitializers();
 719         if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
 720                 x.substring(x.length()-1).equals(")") )
 721             return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
 722         if (x.equals("shiftType")) return Long.toString(shiftType);
 723         if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
 724         if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
 725         if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
 726         if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
 727         if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
 728         if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
 729         if (x.equals("maskCase")) return "0x" + hex8(maskCase);
 730         if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
 731         if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
 732         if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
 733         if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
 734         if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
 735         if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
 736         if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
 737         if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
 738         if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
 739         if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
 740         if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
 741         if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
 742         if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
 743         if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
 744         if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
 745         if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
 746         if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
 747         if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
 748         if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
 749         if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
 750         if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
 751         if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
 752         if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
 753         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 754         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 755         if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
 756         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 757         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 758         if (x.equals("maskType")) return "0x" + hex(maskType);
 759         if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
 760         if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
 761         if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
 762         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
 763             return Integer.toString(UnicodeSpec.UNASSIGNED);
 764         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
 765             return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
 766         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
 767             return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
 768         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
 769             return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
 770         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
 771              return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
 772         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
 773              return Integer.toString(UnicodeSpec.OTHER_LETTER);
 774         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
 775              return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
 776         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
 777              return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
 778         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
 779              return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
 780         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
 781              return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
 782         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
 783              return Integer.toString(UnicodeSpec.OTHER_NUMBER);
 784         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
 785              return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
 786         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
 787              return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
 788         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 789              return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
 790         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
 791             return Integer.toString(UnicodeSpec.CONTROL);
 792         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
 793             return Integer.toString(UnicodeSpec.FORMAT);
 794         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
 795             return Integer.toString(UnicodeSpec.PRIVATE_USE);
 796         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
 797             return Integer.toString(UnicodeSpec.SURROGATE);
 798         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
 799             return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
 800         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
 801             return Integer.toString(UnicodeSpec.START_PUNCTUATION);
 802         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
 803             return Integer.toString(UnicodeSpec.END_PUNCTUATION);
 804         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 805             return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
 806         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 807             return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
 808         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
 809             return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
 810         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
 811             return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
 812         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
 813             return Integer.toString(UnicodeSpec.LETTER_NUMBER);
 814         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
 815             return Integer.toString(UnicodeSpec.MATH_SYMBOL);
 816         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
 817             return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
 818         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
 819             return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
 820         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
 821             return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
 822         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
 823             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
 824         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
 825             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
 826         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
 827             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
 828         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
 829             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
 830         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
 831             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
 832         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
 833             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
 834         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
 835             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
 836         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
 837             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
 838         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
 839             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
 840         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 841             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
 842         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
 843             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
 844         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
 845             return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
 846         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 847             return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
 848         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
 849             return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
 850          if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
 851             return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
 852         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 853             return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
 854         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
 855             return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
 856         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
 857             return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
 858         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
 859             return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
 860         FAIL("Unknown text substitution marker " + commandMarker + x);
 861         return commandMarker + x;
 862     }
 863 
 864     /**
 865     * The genTables method generates source code for all the lookup tables
 866     * needed to represent the various Unicode character properties.
 867     * It simply calls the method genTable once for each table to be generated
 868     * and then generates a summary comment.
 869     *
 870     * @return   the replacement text for the "Tables" command, as a String
 871     *
 872     * @see GenerateCharacter#genTable
 873     * @see GenerateCharacter#replaceCommand
 874     */
 875     static String genTables() {
 876         int n = sizes.length;
 877         StringBuffer result = new StringBuffer();
 878         // liu : Add a comment showing the source of this table
 879         result.append(commentStart + " The following tables and code generated using:" +
 880                   commentEnd + "\n  ");
 881         result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
 882 
 883                 if (plane == 0 && bLatin1 == false) {
 884             genCaseMapTableDeclaration(result);
 885             genCaseMapTable(initializers, specialCaseMaps);
 886                 }
 887         int totalBytes = 0;
 888         for (int k = 0; k < n - 1; k++) {
 889             genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
 890                 sizes[k+1], false, false, k==0);
 891             int s = bytes[k];
 892             if (s == 1 && useCharForByte) {
 893                 s = 2;
 894             }
 895             totalBytes += tables[k].length * s;
 896         }
 897         genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
 898             sizes[n - 1], false, 0, true, !(identifiers), false);
 899 
 900         // If we ever need more than 32 bits to represent the character properties,
 901         // then a table "B" may be needed as well.
 902         //  genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
 903 
 904         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
 905         result.append(commentStart);
 906         result.append(" In all, the character property tables require ");
 907         result.append(totalBytes).append(" bytes.").append(commentEnd);
 908         if (verbose) {
 909             System.out.println("The character property tables require "
 910                  + totalBytes + " bytes.");
 911         }
 912         return result.toString();
 913     }
 914 
 915     /**
 916      * The genInitializers method generates the body of the
 917      * ensureInitted() method, which enables lazy initialization of
 918      * the case map table and other tables.
 919      */
 920     static String genInitializers() {
 921         return initializers.toString();
 922     }
 923 
 924     /**
 925      * Return the total number of bytes needed by all tables.  This is a stripped-
 926      * down copy of genTables().
 927      */
 928     static int getTotalBytes() {
 929         int n = sizes.length;
 930         int totalBytes = 0;
 931         for (int k = 0; k < n - 1; k++) {
 932             totalBytes += tables[k].length * bytes[k];
 933         }
 934         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
 935                          + 31) >> 5) << 2);
 936         return totalBytes;
 937     }
 938 
 939     static void appendEscapedStringFragment(StringBuffer result,
 940                                             char[] line,
 941                                             int length,
 942                                             boolean lastFragment) {
 943         result.append("    \"");
 944         for (int k=0; k<length; ++k) {
 945             result.append("\\u");
 946             result.append(hex4(line[k]));
 947         }
 948         result.append("\"");
 949         result.append(lastFragment ? ";" : "+");
 950         result.append("\n");
 951     }
 952 
    /**
     * Template for the initializer code of a table whose entries are smaller
     * than a char, so that several entries are packed into every char of
     * the table's _DATA string.  addInitializer selects this template when
     * entriesPerChar is greater than one and replaces the $$name, $$type,
     * $$entriesPerChar, $$bits and $$size tokens.
     *
     * The generated code takes $$entriesPerChar entries out of each char,
     * starting with the low-order bits and shifting right by $$bits after
     * each entry, and finally asserts that $$size entries were stored.
     */
    static String SMALL_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        // The array allocation below is intentionally not part of the template.
        // "            $$name = new $$type[$$size];\n"+
        "            int len = $$name_DATA.length();\n"+
        "            int j=0;\n"+
        "            for (int i=0; i<len; ++i) {\n"+
        "                int c = $$name_DATA.charAt(i);\n"+
        "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
        "                    $$name[j++] = ($$type)c;\n"+
        "                    c >>= $$bits;\n"+
        "                }\n"+
        "            }\n"+
        "            assert (j == $$size);\n"+
        "        }\n";
 967 
    /**
     * Template for the initializer code of a table that stores exactly one
     * entry in every char of its _DATA string.  addInitializer selects this
     * template when entriesPerChar is one and replaces the $$name, $$type
     * and $$size tokens.
     *
     * The generated code asserts that the string holds $$size chars and
     * copies each char, cast to $$type, into the table.
     */
    static String SAME_SIZE_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        "            assert ($$name_DATA.length() == $$size);\n"+
        // The array allocation below is intentionally not part of the template.
        // "            $$name = new $$type[$$size];\n"+
        "            for (int i=0; i<$$size; ++i)\n"+
        "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
        "        }\n";
 975 
    /**
     * Template for the initializer code of a table whose entries are wider
     * than a char, so that every entry is spread over several chars of the
     * table's _DATA string.  addInitializer selects this template when
     * entriesPerChar is zero or negative (other than -2, which selects
     * INT32_INITIALIZER) and replaces the $$name, $$type, $$charsPerEntry
     * and $$size tokens; $$charsPerEntry becomes the negated entriesPerChar.
     *
     * The generated code ORs $$charsPerEntry consecutive chars into one
     * entry, shifting the entry left by 16 bits between chars so that the
     * first char ends up in the high-order position, and finally asserts
     * that $$size entries were stored.
     */
    static String BIG_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        // The array allocation below is intentionally not part of the template.
        // "            $$name = new $$type[$$size];\n"+
        "            int len = $$name_DATA.length();\n"+
        "            int j=0;\n"+
        "            int charsInEntry=0;\n"+
        "            $$type entry=0;\n"+
        "            for (int i=0; i<len; ++i) {\n"+
        "                entry |= $$name_DATA.charAt(i);\n"+
        "                if (++charsInEntry == $$charsPerEntry) {\n"+
        "                    $$name[j++] = entry;\n"+
        "                    entry = 0;\n"+
        "                    charsInEntry = 0;\n"+
        "                }\n"+
        "                else {\n"+
        "                    entry <<= 16;\n"+
        "                }\n"+
        "            }\n"+
        "            assert (j == $$size);\n"+
        "        }\n";
 996 
    /**
     * Template for the initializer code of a table whose entries each take
     * two chars of the table's _DATA string.  addInitializer selects this
     * template when entriesPerChar is -2 and replaces the $$name and $$size
     * tokens.
     *
     * The generated code asserts that the string holds twice $$size chars
     * and builds every entry from a pair of chars, the first char supplying
     * the upper 16 bits and the second the lower 16 bits.
     */
    static String INT32_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        "            char[] data = $$name_DATA.toCharArray();\n"+
        "            assert (data.length == ($$size * 2));\n"+
        "            int i = 0, j = 0;\n"+
        "            while (i < ($$size * 2)) {\n"+
        "                int entry = data[i++] << 16;\n"+
        "                $$name[j++] = entry | data[i++];\n"+
        "            }\n"+
        "        }\n";
1007 
1008     static void addInitializer(String name, String type, int entriesPerChar,
1009                                int bits, int size) {
1010 
1011         String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1012                           ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1013         if (entriesPerChar == -2) {
1014             template = INT32_INITIALIZER;
1015         }
1016         int marklen = commandMarker.length();
1017         int pos = 0;
1018         while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1019             int newpos = pos + marklen;
1020             char ch = 'x';
1021             while (newpos < template.length() &&
1022                    Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1023                    ch != '_') // Don't allow this in token names
1024                 ++newpos;
1025             String token = template.substring(pos+marklen, newpos);
1026             String replacement = "ERROR";
1027 
1028             if (token.equals("name")) replacement = name;
1029             else if (token.equals("type")) replacement = type;
1030             else if (token.equals("bits")) replacement = ""+bits;
1031             else if (token.equals("size")) replacement = ""+size;
1032             else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1033             else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1034             else FAIL("Unrecognized token: " + token);
1035 
1036             template = template.substring(0, pos) + replacement + template.substring(newpos);
1037             pos += replacement.length();
1038         }
1039         initializers.append(template);
1040     }
1041 
1042     /**
1043     * The genTable method generates source code for one lookup table.
1044     * Most of the complexity stems from handling various options as to
1045     * the type of the array components, the precise representation of the
1046     * values, the format in which to render each value, the number of values
1047     * to emit on each line of source code, and the kinds of useful comments
1048     * to be generated.
1049     *
1050     * @param result     a StringBuffer, to which the generated source code
1051     *                   text is to be appended
1052     * @param name       the name of the table
1053     * @param table      the table data (an array of long values)
1054     * @param extract    a distance, in bits, by which each entry of the table
1055     *                   is to be right-shifted before it is processed
1056     * @param bits       the number of bits (not bytes) to be used to represent
1057     *                   each table entry
1058     * @param size       the table data is divided up into blocks of size (1<<size);
1059     *                   in this method, this information is used only to affect
1060     *                   how many table values are to be generated per line
1061     * @param preshifted if this flag is true, then the table entries are to be
1062     *                   emitted in a preshifted form; that is, each value should
1063     *                   be left-shifted by the amount "shift", so that this work
1064     *                   is built into the table and need not be performed by an
1065     *                   explicit shift operator at run time
1066     * @param shift      this is the shift amount for preshifting of table entries
1067     * @param hexFormat  if this flag is true, table entries should be emitted as
1068     *                   hexadecimal literals; otherwise decimal literals are used
1069     * @param properties if this flag is true, the table entries are encoded
1070     *                   character properties rather than indexes into yet other tables;
1071     *                   therefore comments describing the encoded properties should
1072     *                   be generated
1073     * @param hexComment if this flag is true, each line of output is labelled with
1074     *                   a hexadecimal comment indicating the character values to
1075     *                   which that line applies; otherwise, decimal values indicating
1076     *                   table indices are generated
1077     *
1078     * @see GenerateCharacter#genTables
1079     * @see GenerateCharacter#replaceCommand
1080     */
1081 
1082     static void genTable(StringBuffer result, String name,
1083              long[] table, int extract, int bits, int size,
1084              boolean preshifted, int shift, boolean hexFormat,
1085              boolean properties, boolean hexComment) {
1086 
1087         String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1088             bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1089             bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1090             bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1091             bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1092             bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1093             (Csyntax ? "int64" : "long");
1094         long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1095             bits == 2 ? Integer.MAX_VALUE :
1096             bits == 4 ? Integer.MAX_VALUE :
1097             bits == 8 ? Byte.MAX_VALUE :
1098             bits == 16 ? Short.MAX_VALUE :
1099             bits == 32 ? Integer.MAX_VALUE :
1100             Long.MAX_VALUE;
1101         int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1102         boolean shiftEntries = preshifted && shift != 0;
1103         if (bits == 8 && tableAsString && useCharForByte) {
1104             atype = "char";
1105             maxPosEntry = Character.MAX_VALUE;
1106             entriesPerChar = 1;
1107         }
1108         boolean noConversion = atype.equals("char");
1109 
1110         result.append(commentStart);
1111         result.append(" The ").append(name).append(" table has ").append(table.length);
1112         result.append(" entries for a total of ");
1113         int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1114         if (bits == 8 && useCharForByte) {
1115             sizeOfTable *= 2;
1116         }
1117         result.append(sizeOfTable);
1118         result.append(" bytes.").append(commentEnd).append("\n\n");
1119         if (Csyntax)
1120             result.append("  static ");
1121         else
1122             result.append("  static final ");
1123         result.append(atype);
1124         result.append(" ").append(name).append("[");
1125         if (Csyntax)
1126             result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1127         if (tableAsString) {
1128             if (noConversion) {
1129                 result.append("] = (\n");
1130             } else {
1131                 result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1132                 result.append("static final String ").append(name).append("_DATA =\n");
1133             }
1134             int CHARS_PER_LINE = 8;
1135             StringBuffer theString = new StringBuffer();
1136             int entriesInCharSoFar = 0;
1137             char ch = '\u0000';
1138             int charsPerEntry = -entriesPerChar;
1139             for (int j=0; j<table.length; ++j) {
1140                 long entry = table[j] >> extract;
1141                 if (shiftEntries) entry <<= shift;
1142                 if (entry >= (1L << bits)) {
1143                     FAIL("Entry too big");
1144                 }
1145                 if (entriesPerChar > 0) {
1146                     // Pack multiple entries into a character
1147                     ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1148                     ++entriesInCharSoFar;
1149                     if (entriesInCharSoFar == entriesPerChar) {
1150                         // Character is full
1151                         theString.append(ch);
1152                         entriesInCharSoFar = 0;
1153                         ch = '\u0000';
1154                     }
1155                 }
1156                 else {
1157                     // Use multiple characters per entry
1158                     for (int k=0; k<charsPerEntry; ++k) {
1159                         ch = (char)(entry >> ((charsPerEntry-1)*16));
1160                         entry <<= 16;
1161                         theString.append(ch);
1162                     }
1163                 }
1164             }
1165             if (entriesInCharSoFar > 0) {
1166                 while (entriesInCharSoFar < entriesPerChar) {
1167                     ch = (char)((int)ch >> bits);
1168                     ++entriesInCharSoFar;
1169                 }
1170                 theString.append(ch);
1171                 entriesInCharSoFar = 0;
1172             }
1173             result.append(Utility.formatForSource(theString.toString(), "    "));
1174             if (noConversion) {
1175                 result.append(").toCharArray()");
1176             }
1177             result.append(";\n\n  ");
1178 
1179             if (!noConversion) {
1180                 addInitializer(name, atype, entriesPerChar, bits, table.length);
1181             }
1182         }
1183         else {
1184             result.append("] = {");
1185             boolean castEntries = shiftEntries && (bits < 32);
1186             int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1187                 bits == 2 ? 16*4 :
1188                 bits == 4 ? 8*4 :
1189                 bits == 8 ? 8 :
1190                 bits == 16 ? 8 :
1191                 bits == 32 ? 4 : 2) :
1192                 (bits == 8 ? 8 :
1193                 bits == 16 ? 8 : 4);
1194             int printMask = properties ? 0 :
1195             Math.min(1 << size,
1196                 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1197             int commentShift = ((1 << size) == table.length) ? 0 : size;
1198             int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1199             long val = 0;
1200             for (int j = 0; j < table.length; j++) {
1201                 if ((j & printMask) == 0) {
1202                     while (result.charAt(result.length() - 1) == ' ')
1203                         result.setLength(result.length() - 1);
1204                     result.append("\n    ");
1205                 }
1206         PRINT:  {
1207                 if (castEntries)
1208                     result.append("(").append(atype).append(")(");
1209                 long entry = table[j] >> extract;
1210                 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1211                 int k = j & packMask;
1212                 if (bits >= 8)
1213                     val = entry;
1214                 else if (k == 0) {
1215                     val = entry;
1216                     break PRINT;
1217                 }
1218                 else {
1219                     val |= (entry << (k*bits));
1220                     if (k != packMask)
1221                         break PRINT;
1222                 }
1223                 if (val > maxPosEntry && !Csyntax) { // liu
1224                 // For values that are out of range, convert them to in-range negative values.
1225                 // Actually, output the '-' and convert them to the negative of the corresponding
1226                 // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1227                     result.append('-');
1228                     val = maxPosEntry + maxPosEntry + 2 - val;
1229                 }
1230                 if (hexFormat) {
1231                     result.append("0x");
1232                     if (bits == 8)
1233                         result.append(hex2((byte)val));
1234                     else if (bits == 16)
1235                         result.append(hex4((short)val));
1236                     else if (bits == 32 || bits < 8)
1237                         result.append(hex8((int)val));
1238                     else {
1239                         result.append(hex16((long)val));
1240                         if (!Csyntax)
1241                             result.append("L");
1242                     }
1243                 }
1244                 else {
1245                     if (bits == 8)
1246                         result.append(dec3(val));
1247                     else if (bits == 64) {
1248                         result.append(dec5(val));
1249                         if (!Csyntax)
1250                             result.append("L");
1251                     }
1252                     else
1253                         result.append(dec5(val));
1254                 }
1255                 if (shiftEntries)
1256                     result.append("<<").append(shift);
1257                 if (castEntries) result.append(")");
1258                 if (j < (table.length - 1))
1259                     result.append(", ");
1260                 else
1261                     result.append("  ");
1262                 if ((j & printMask) == printMask) {
1263                     result.append(" ").append(commentStart).append(" ");
1264                     if (hexComment)
1265                         result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1266                     else
1267                         result.append(dec3((j & ~commentMask) >> commentShift));
1268                     if (properties) propertiesComments(result, val);
1269                     result.append(commentEnd);
1270                 }
1271                 } // end PRINT
1272             }
1273             result.append("\n  };\n\n  ");
1274         }
1275     }
1276 
1277     static void genCaseMapTableDeclaration(StringBuffer result) {
1278         String myTab = "    ";
1279         result.append(myTab + "static final char[][][] charMap;\n");
1280     }
1281 
1282     static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1283         String myTab = "    ";
1284         int ch;
1285         char[] map;
1286         result.append(myTab + "charMap = new char[][][] {\n");
1287         for (int x = 0; x < specialCaseMaps.length; x++) {
1288             ch = specialCaseMaps[x].getCharSource();
1289             map = specialCaseMaps[x].getUpperCaseMap();
1290             result.append(myTab + myTab);
1291             result.append("{ ");
1292             result.append("{\'\\u"+hex4(ch)+"\'}, {");
1293             for (int y = 0; y < map.length; y++) {
1294                 result.append("\'\\u"+hex4(map[y])+"\', ");
1295             }
1296             result.append("} },\n");
1297         }
1298         result.append(myTab + "};\n");
1299 
1300     }
1301 
1302     /**
1303     * The propertiesComments method generates comments describing encoded
1304     * character properties.
1305     *
1306     * @param result     a StringBuffer, to which the generated source code
1307     *                   text is to be appended
1308     * @param val                encoded character properties
1309     *
1310     * @see GenerateCharacter#genTable
1311     */
1312 
1313     static void propertiesComments(StringBuffer result, long val) {
1314         result.append("   ");
1315         switch ((int)(val & maskType)) {
1316             case UnicodeSpec.CONTROL:
1317                 result.append("Cc");
1318                 break;
1319             case UnicodeSpec.FORMAT:
1320                 result.append("Cf");
1321                 break;
1322             case UnicodeSpec.PRIVATE_USE:
1323                 result.append("Co");
1324                 break;
1325             case UnicodeSpec.SURROGATE:
1326                 result.append("Cs");
1327                 break;
1328             case UnicodeSpec.LOWERCASE_LETTER:
1329                 result.append("Ll");
1330                 break;
1331             case UnicodeSpec.MODIFIER_LETTER:
1332                 result.append("Lm");
1333                 break;
1334             case UnicodeSpec.OTHER_LETTER:
1335                 result.append("Lo");
1336                 break;
1337             case UnicodeSpec.TITLECASE_LETTER:
1338                 result.append("Lt");
1339                 break;
1340             case UnicodeSpec.UPPERCASE_LETTER:
1341                 result.append("Lu");
1342                 break;
1343             case UnicodeSpec.COMBINING_SPACING_MARK:
1344                 result.append("Mc");
1345                 break;
1346             case UnicodeSpec.ENCLOSING_MARK:
1347                 result.append("Me");
1348                 break;
1349             case UnicodeSpec.NON_SPACING_MARK:
1350                 result.append("Mn");
1351                 break;
1352             case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1353                 result.append("Nd");
1354                 break;
1355             case UnicodeSpec.LETTER_NUMBER:
1356                 result.append("Nl");
1357                 break;
1358             case UnicodeSpec.OTHER_NUMBER:
1359                 result.append("No");
1360                 break;
1361             case UnicodeSpec.CONNECTOR_PUNCTUATION:
1362                 result.append("Pc");
1363                 break;
1364             case UnicodeSpec.DASH_PUNCTUATION:
1365                 result.append("Pd");
1366                 break;
1367             case UnicodeSpec.END_PUNCTUATION:
1368                 result.append("Pe");
1369                 break;
1370             case UnicodeSpec.OTHER_PUNCTUATION:
1371                 result.append("Po");
1372                 break;
1373             case UnicodeSpec.START_PUNCTUATION:
1374                 result.append("Ps");
1375                 break;
1376             case UnicodeSpec.CURRENCY_SYMBOL:
1377                 result.append("Sc");
1378                 break;
1379             case UnicodeSpec.MODIFIER_SYMBOL:
1380                 result.append("Sk");
1381                 break;
1382             case UnicodeSpec.MATH_SYMBOL:
1383                 result.append("Sm");
1384                 break;
1385             case UnicodeSpec.OTHER_SYMBOL:
1386                 result.append("So");
1387                 break;
1388             case UnicodeSpec.LINE_SEPARATOR:
1389                 result.append("Zl"); break;
1390             case UnicodeSpec.PARAGRAPH_SEPARATOR:
1391                 result.append("Zp");
1392                 break;
1393             case UnicodeSpec.SPACE_SEPARATOR:
1394                 result.append("Zs");
1395                 break;
1396             case UnicodeSpec.UNASSIGNED:
1397                 result.append("unassigned");
1398                 break;
1399         }
1400 
1401         switch ((int)((val & maskBidi) >> shiftBidi)) {
1402             case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1403                 result.append(", L");
1404                 break;
1405             case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1406                 result.append(", R");
1407                 break;
1408             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1409                 result.append(", EN");
1410                 break;
1411             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1412                 result.append(", ES");
1413                 break;
1414             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1415                 result.append(", ET");
1416                 break;
1417             case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1418                 result.append(", AN");
1419                 break;
1420             case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1421                 result.append(", CS");
1422                 break;
1423             case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1424                 result.append(", B");
1425                 break;
1426             case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1427                 result.append(", S");
1428                 break;
1429             case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1430                 result.append(", WS");
1431                 break;
1432             case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1433                 result.append(", ON");
1434                 break;
1435         }
1436         if ((val & maskUpperCase) != 0) {
1437             result.append(", hasUpper (subtract ");
1438             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1439         }
1440         if ((val & maskLowerCase) != 0) {
1441             result.append(", hasLower (add ");
1442             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1443         }
1444         if ((val & maskTitleCase) != 0) {
1445             result.append(", hasTitle");
1446         }
1447         if ((val & maskIdentifierInfo) == valueIgnorable) {
1448             result.append(", ignorable");
1449         }
1450         if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1451             result.append(", identifier part");
1452         }
1453         if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1454             result.append(", underscore");
1455         }
1456         if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1457             result.append(", whitespace");
1458         }
1459         if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1460             result.append(", currency");
1461         }
1462         if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1463             result.append(", identifier start");
1464         }
1465         if ((val & maskNumericType) == valueDigit) {
1466             result.append(", decimal ");
1467             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1468         }
1469         if ((val & maskNumericType) == valueStrangeNumeric) {
1470             result.append(", strange");
1471         }
1472         if ((val & maskNumericType) == valueJavaSupradecimal) {
1473             result.append(", supradecimal ");
1474             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1475         }
1476     }
1477 
1478     static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1479 
1480     static String tableName(int j) { return tableNames[j]; }
1481 
1482     /**
1483     * The genAccess method generates source code for one table access expression.
1484     *
1485     * Most of the complexity stems from handling various options as to
1486     * table representation, such as whether it contains values so large that
1487     * they are represented as negative values and whether the table values are
1488     * preshifted.  This method also avoids such "ugly" expressions as shifting
1489     * by distance zero, masking when no masking is necessary, and so on.
1490     * For clarity, it generates expressions that do not rely on operator
1491     * precedence, but otherwise it avoids generating redundant parentheses.
1492     *
1493     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1494     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1495     *
1496     * @param tbl                the name of the final table to be accessed
1497     * @param var                the variable name that appeared in parentheses in the
1498     *                           "Lookup" command
1499     * @param bits       the number of bits (not bytes) to be used to represent
1500     *                   the final table entry
1501     * @return   the replacement text for the "Lookup(xxx)" command, as a String
1502     *
1503     * @see GenerateCharacter#replaceCommand
1504     */
1505 
1506     static String genAccess(String tbl, String var, int bits) {
1507         String access = null;
1508         int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1509         for (int k = 0; k < sizes.length; k++) {
1510             int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1511             int shift = shifts[k] + offset;
1512             String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1513             int mask = (1 << (sizes[k] - offset)) - 1;
1514             String masked = (k == 0) ? shifted :
1515               "(" + shifted + "&0x" + hex(mask) + ")";
1516             String index = (k == 0) ? masked :
1517              (mask == 0) ? access : "(" + access + "|" + masked + ")";
1518             String indexNoParens = (index.charAt(0) != '(') ? index :
1519                  index.substring(1, index.length() - 1);
1520             String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1521             String fetched = tblname + "[" + indexNoParens + "]";
1522             String zeroextended = (zeroextend[k] == 0) ? fetched :
1523                 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1524             int adjustment = preshifted[k] ? 0 :
1525                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1526             String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1527                 "(" + zeroextended + "<<" + adjustment + ")";
1528             String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1529                 (bits == 2) ? "((" + var + "&0xF)<<1)" :
1530                 (bits == 4) ? "((" + var + "&7)<<2)" : null;
1531             String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1532                 "((" + adjusted + ">>" + bitshift + ")&" +
1533                 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1534             access = extracted;
1535         }
1536         return access;
1537     }
1538 
1539     /* The command line arguments are decoded and used to set the following
1540      global variables.
1541      */
1542 
1543     static boolean verbose = false;
1544     static boolean nobidi = false;
1545     static boolean nomirror = false;
1546     static boolean identifiers = false;
1547     static boolean Csyntax = false;
1548     static String TemplateFileName = null;
1549     static String OutputFileName = null;
1550     static String UnicodeSpecFileName = null; // liu
1551     static String SpecialCasingFileName = null;
1552     static boolean useCharForByte = false;
1553     static int[] sizes;
1554     static int bins = 0; // liu; if > 0, then perform search
1555     static boolean tableAsString = false;
1556     static boolean bLatin1 = false;
1557 
1558     static String commandLineDescription;
1559 
1560     /* Other global variables, equal in length to the "sizes" array. */
1561 
1562     static int[] shifts;
1563     static int[] zeroextend;
1564     static int[] bytes;
1565     static boolean[] preshifted;
1566     static long[][] tables;
1567 
1568 
1569     /* Other global variables */
1570     static String commentStart;
1571     static String commentEnd;
1572 
1573     static StringBuffer initializers = new StringBuffer();
1574 
1575     /* special casing rules for 1:M toUpperCase mappings */
1576     static SpecialCaseMap[] specialCaseMaps;
1577 
1578     /**
1579     * Process the command line arguments.
1580     *
1581     * The allowed flags in command line are:
1582     * <dl>
1583     * <dt> -verbose             <dd> Emit comments to standard output describing
1584     *                                   what's going on during the processing.
1585     * <dt> -nobidi              <dd> Do not include bidi categories in the
1586     *                                   encoded character properties.
1587     * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1588     *                        character properties.
1589     * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1590     * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1591     * <dt> -o filename          <dd> Specify output file name.
1592     * <dt> -template filename   <dd> Specify template input file name.
1593     * <dt> -spec filename        <dd> Specify Unicode spec file name.
1594     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1595     * <dt> -search bins          <dd> Try different partitions into the specified
1596     *                                    number of bins.  E.g., for 2 bins, try
1597     *                                    16 0, 15 1,..., 0 16.
1598     * <dt> -string               <dd> Create table as string.  Only valid with Java
1599     *                                    syntax.
1600     * <dt> -latin1          <dd> Create a latin 1 only property table.
1601     * </dl>
1602     * In addition, decimal literals may appear as command line arguments;
1603     * each one represents the number of bits of the character to be broken
1604     * off at each lookup step.  If present, they must add up to 16 (the number
1605     * of bits in a char value).  For smaller tables, the last value should
1606     * be 0; values other than the last one may not be zero.  If no such
1607     * numeric values are provided, default values are used.
1608     *
1609     * @param args       the command line arguments, as an array of String
1610     *
1611     * @see GenerateCharacter#main
1612     */
1613 
1614     static void processArgs(String[] args) {
1615         StringBuffer desc = new StringBuffer("java GenerateCharacter");
1616         for (int j=0; j<args.length; ++j) {
1617             desc.append(" " + args[j]);
1618         }
1619         for (int j = 0; j < args.length; j++) {
1620             if (args[j].equals("-verbose") || args[j].equals("-v"))
1621                 verbose = true;
1622             else if (args[j].equals("-nobidi"))
1623                 nobidi = true;
1624             else if (args[j].equals("-nomirror"))
1625                 nomirror = true;
1626             else if (args[j].equals("-identifiers"))
1627                 identifiers = true;
1628             else if (args[j].equals("-c"))
1629                 Csyntax = true;
1630             else if (args[j].equals("-string"))
1631                 tableAsString = true;
1632             else if (args[j].equals("-o")) {
1633                 if (j == args.length - 1) {
1634                     FAIL("File name missing after -o");
1635                 }
1636                 else {
1637                     OutputFileName = args[++j];
1638                 }
1639             }
1640             else if (args[j].equals("-search")) {
1641                 if (j == args.length - 1)
1642                     FAIL("Bin count missing after -search");
1643                 else {
1644                     bins = Integer.parseInt(args[++j]);
1645                     if (bins < 1 || bins > 10)
1646                         FAIL("Bin count must be >= 1 and <= 10");
1647                 }
1648             }
1649             else if (args[j].equals("-template")) {
1650                 if (j == args.length - 1)
1651                     FAIL("File name missing after -template");
1652                 else
1653                     TemplateFileName = args[++j];
1654             }
1655             else if (args[j].equals("-spec")) { // liu
1656                 if (j == args.length - 1) {
1657                     FAIL("File name missing after -spec");
1658                 }
1659                 else {
1660                     UnicodeSpecFileName = args[++j];
1661                 }
1662             }
1663             else if (args[j].equals("-specialcasing")) {
1664                 if (j == args.length -1) {
1665                     FAIL("File name missing after -specialcasing");
1666                 }
1667                 else {
1668                     SpecialCasingFileName = args[++j];
1669                 }
1670             }
1671                         else if (args[j].equals("-plane")) {
1672                                 if (j == args.length -1) {
1673                                         FAIL("Plane number missing after -plane");
1674                                 }
1675                                 else {
1676                                         plane = Integer.parseInt(args[++j]);
1677                                 }
1678                                 if (plane > 0) {
1679                                         bLatin1 = false;
1680                                 }
1681                         }
1682                         else if ("-usecharforbyte".equals(args[j])) {
1683                                 useCharForByte = true;
1684                         }
1685             else if (args[j].equals("-latin1")) {
1686                 bLatin1 = true;
1687                 plane = 0;
1688             }
1689             else {
1690                 try {
1691                     int val = Integer.parseInt(args[j]);
1692                     if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1693                     if (sizes == null)
1694                         sizes = new int[1];
1695                     else {
1696                         int[] newsizes = new int[sizes.length + 1];
1697                         System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1698                         sizes = newsizes;
1699                     }
1700                     sizes[sizes.length - 1] = val;
1701                 }
1702                 catch(NumberFormatException e) {
1703                     FAIL("Unknown switch: " + args[j]);
1704                 }
1705             }
1706         }
1707         if (Csyntax && tableAsString) {
1708             FAIL("Can't specify table as string with C syntax");
1709         }
1710         if (sizes == null) {
1711             desc.append(" [");
1712             if (identifiers) {
1713                 int[] newsizes = { 8, 4, 4 };           // Good default values
1714                 desc.append("8 4 4]");
1715                 sizes = newsizes;
1716             }
1717             else {
1718                 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1719                 desc.append("10 5 1]");
1720                 sizes = newsizes;
1721             }
1722         }
1723         if (UnicodeSpecFileName == null) { // liu
1724             UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1725             desc.append(" [-spec " + UnicodeSpecFileName + ']');
1726         }
1727         if (SpecialCasingFileName == null) {
1728             SpecialCasingFileName = DefaultSpecialCasingFileName;
1729             desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1730         }
1731         if (TemplateFileName == null) {
1732             TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1733                   : DefaultJavaTemplateFileName);
1734             desc.append(" [-template " + TemplateFileName + ']');
1735         }
1736         if (OutputFileName == null) {
1737             OutputFileName = (Csyntax ? DefaultCOutputFileName
1738                     : DefaultJavaOutputFileName);
1739             desc.append(" [-o " + OutputFileName + ']');
1740         }
1741         commentStart = (Csyntax ? "/*" : "//");
1742         commentEnd = (Csyntax ? " */" : "");
1743         commandLineDescription = desc.toString();
1744     }
1745 
1746     private static void searchBins(long[] map, int binsOccupied) throws Exception {
1747         int bitsFree = 16;
1748         for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1749         if (binsOccupied == (bins-1)) {
1750             sizes[binsOccupied] = bitsFree;
1751             generateForSizes(map);
1752         }
1753         else {
1754             for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1755                 sizes[binsOccupied] = i;
1756                 searchBins(map, binsOccupied+1);
1757             }
1758         }
1759     }
1760 
1761     private static void generateForSizes(long[] map) throws Exception {
1762         int sum = 0;
1763         shifts = new int[sizes.length];
1764         for (int k = sizes.length - 1; k >= 0; k--) {
1765             shifts[k] = sum;
1766             sum += sizes[k];
1767         }
1768         if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1769             FAIL("Bit field widths total to " + sum +
1770              ": wrong total for map of size " + map.length);
1771         }
1772         // need a table for each set of lookup bits in char
1773         tables = new long[sizes.length][];
1774         // the last table is the map
1775         tables[sizes.length - 1] = map;
1776         for (int j = sizes.length - 1; j > 0; j--) {
1777             if (verbose && bins==0)
1778                 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1779             long[][] temp = buildTable(tables[j], sizes[j]);
1780             tables[j-1] = temp[0];
1781             tables[j] = temp[1];
1782         }
1783         preshifted = new boolean[sizes.length];
1784         zeroextend = new int[sizes.length];
1785         bytes = new int[sizes.length];
1786         for (int j = 0; j < sizes.length - 1; j++) {
1787             int len = tables[j+1].length;
1788             int size = sizes[j+1];
1789             if (len > 0x100 && (len >> size) <= 0x100) {
1790                 len >>= size;
1791                 preshifted[j] = false;
1792             }
1793             else if (len > 0x10000 && (len >> size) <= 0x10000) {
1794                 len >>= size;
1795                 preshifted[j] = false;
1796             }
1797             else preshifted[j] = true;
1798             if (Csyntax)
1799                 zeroextend[j] = 0;
1800             else if (len > 0x7F && len <= 0xFF) {
1801                 if (!useCharForByte) {
1802                     zeroextend[j] = 0xFF;
1803                 }
1804             } else if (len > 0x7FFF && len <= 0xFFFF)
1805                 zeroextend[j] = 0xFFFF;
1806             else zeroextend[j] = 0;
1807             if (len <= 0x100) bytes[j] = 1;
1808             else if (len <= 0x10000) bytes[j] = 2;
1809             else bytes[j] = 4;
1810         }
1811         preshifted[sizes.length - 1] = true;
1812         zeroextend[sizes.length - 1] = 0;
1813         bytes[sizes.length - 1] = 0;
1814         if (bins > 0) {
1815             int totalBytes = getTotalBytes();
1816             String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1817             int accessComplexity = 0;
1818             for (int j=0; j<access.length(); ++j) {
1819                 char ch = access.charAt(j);
1820                 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1821                 if (ch == '<' || ch == '>') ++j;
1822             }
1823             System.out.print("(");
1824             for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1825             System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1826             return;
1827         }
1828         if (verbose) {
1829             System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1830             for (int j = 0; j < sizes.length; j++) {
1831                 System.out.println(dec5(j) + "\t" +
1832                     dec5(sizes[j]) + "\t" +
1833                     dec5(tables[j].length) + "\t" +
1834                     dec5(shifts[j]) + "\t" +
1835                     dec5(zeroextend[j]) + "\t" +
1836                     dec5(bytes[j]) + "\t " +
1837                     preshifted[j]);
1838             }
1839         }
1840         if (verbose) {
1841             System.out.println("Generating source code for class Character");
1842             System.out.println("A table access looks like " +
1843                          genAccess("A", "ch", (identifiers ? 2 : 32)));
1844         }
1845         generateCharacterClass(TemplateFileName, OutputFileName);
1846     }
1847 
1848     /**
1849     * The main program for generating source code for the Character class.
1850     * The basic outline of its operation is:
1851     * <ol>
1852     * <li> Process the command line arguments.  One result of this process
1853     *           is a list of sizes (measured in bits and summing to 16).
1854     * <li> Get the Unicode character property data from the specification file.
1855     * <li> From that, build a map that has, for each character code, its
1856     *           relevant properties encoded as a long integer value.
1857     * <li> Repeatedly compress the map, producing a compressed table and a
1858     *           new map.  This is done once for each size value in the list.
1859     *           When this is done, we have a set of tables.
1860     * <li> Make some decisions about table representation; record these
1861     *           decisions in arrays named preshifted, zeroextend, and bytes.
1862     * <li> Generate the source code for the class Character by performing
1863     *           macro processing on a template file.
1864     * </ol>
1865     *
1866     * @param args       the command line arguments, as an array of String
1867     *
1868     * @see GenerateCharacter#processArgs
1869     * @see UnicodeSpec@readSpecFile
1870     * @see GenerateCharacter#buildMap
1871     * @see GenerateCharacter#buildTable
1872     * @see GenerateCharacter#generateCharacterClass
1873     */
1874 
1875     public static void main(String[] args) {
1876         processArgs(args);
1877         try {
1878 
1879             UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1880 
1881             specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1882             if (verbose) {
1883                 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1884             }
1885             long[] map = buildMap(data, specialCaseMaps);
1886             if (verbose) {
1887                 System.err.println("Completed building of initial map");
1888             }
1889 
1890             if (bins == 0) {
1891                 generateForSizes(map);
1892             }
1893             else {
1894                 while (bins > 0) {
1895                     sizes = new int[bins];
1896                     searchBins(map, 0);
1897                     --bins;
1898                 }
1899             }
1900             if (verbose && false) {
1901                 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1902                              hex8(maxOffsetSeen));
1903                 System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1904                              hex8(maxOffset));
1905             }
1906         }
1907         catch (FileNotFoundException e) { FAIL(e.toString()); }
1908         catch (IOException e) { FAIL(e.toString()); }
1909         catch (Throwable e) {
1910             System.out.println("Unexpected exception:");
1911             e.printStackTrace();
1912             FAIL("Unexpected exception!");
1913         }
1914         if (verbose) { System.out.println("Done!");}
1915     }
1916 
1917 }   // end class