1 /*
   2  * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package build.tools.generatecharacter;
  27 
  28 import java.io.IOException;
  29 import java.io.FileNotFoundException;
  30 import java.io.BufferedReader;
  31 import java.io.FileReader;
  32 import java.io.PrintWriter;
  33 import java.io.BufferedWriter;
  34 import java.io.FileWriter;
  35 import java.io.File;
  36 import java.util.List;
  37 
  38 import build.tools.generatecharacter.CharacterName;
  39 
  40 /**
  41  * This program generates the source code for the class java.lang.Character.
  42  * It also generates native C code that can perform the same operations.
  43  * It requires two external input data files:
  44  * <ul>
  45  * <li> Unicode specification file
  46  * <li> Character class template file
  47  * </ul>
  48  * The Unicode specification file is available from the Unicode consortium.
  49  * It has character specification lines that look like this:
  50  * <listing>
  51  * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
  52  * </listing>
  53  * The Character class template file is filled in with additional
  54  * information to produce the file Character.java, which can then be
  55  * compiled by a Java compiler.  The template file contains certain
  56  * markers consisting of an alphabetic name string preceded by "$$".
  57  * Such markers are replaced with generated program text.  As a special
  58  * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
  59  * alphabetic characters constituting a variable name.  The character "_"
  60  * is considered alphabetic for these purposes.
  61  *
  62  * @author  Guy Steele
  63  * @author  Alan Liu
  64  * @author  John O'Conner
  65  */
  66 
  67 public class GenerateCharacter {
  68 
  69     final static boolean DEBUG = false;
  70 
  71     final static String commandMarker = "$$";
  72     static String ROOT                        = "";
  73     static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
  74     static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
  75     static String DefaultPropListFileName     = ROOT + "PropList.txt";
  76     static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
  77     static String DefaultJavaOutputFileName   = ROOT + "Character.java";
  78     static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
  79     static String DefaultCOutputFileName      = ROOT + "Character.c";
  80 
  81     static int plane = 0;
  82 
  83     /* The overall idea is that, in the generated Character class source code,
  84     most character property data is stored in a special multi-level table whose
  85     structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
  86     The integers must sum to 16 (the number of bits in a character).
  87     The first table is indexed by the k1 high-order bits of the character code.
  88     The result is concatenated to the next k2 bits of the character code to index
  89     the second table, and so on.  Eventually the kn low-order bits of the character
  90     code are concatenated and used to index one of two tables A and B; A contains
  91     32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
  92     can be thus obtained encode the properties for the character.
  93 
  94     The default specification is [9, 4, 3, 0].  This particular table format was
  95     designed by conducting an exhaustive search of table formats to minimize the
  96     space consumed by the tables: the first and third tables need have only byte
  97     values (the second table must have short values).  Another good choice is
  98     [10, 6, 0], which produces a larger table but allows particularly fast table
  99     lookup code.
 100 
 101     In each case, where the word "concatenated" is used, this may imply
 102     first a << and then a | operation, or perhaps just a | operation if
 103     the values in the table can be preshifted (generally possible if the table
 104     entries are short rather than byte).
 105     */
 106 
 107     /* The character properties are currently encoded in two parts:
 108        A (32 bits) and B (16 bits).
 109 
 110     A: the low 32 bits are defined  in the following manner:
 111 
 112     1 bit Mirrored property.
 113     4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
 114     9 bits      A signed offset used for converting case .
 115     1 bit       If 1, adding the signed offset converts the character to lowercase.
 116     1 bit       If 1, subtracting the signed offset converts the character to uppercase.
 117         Note: for a titlecase character, both of the preceding bits will be 1
 118         and the signed offset will be 1.
 119     1 bit   If 1, this character has a titlecase equivalent (possibly itself);
 120         in this case, the two bits before this bit can be used to decide
 121         whether this character is in fact uppercase, lowercase, or titlecase.
 122     3 bits      This field provides a quick way to lex identifiers.
 123         The eight possible values for this field are as follows:
 124         0  May not be part of an identifier
 125         1  Ignorable control; may continue a Unicode identifier or Java identifier
 126         2  May continue a Java identifier but not a Unicode identifier (unused)
 127         3  May continue a Unicode identifier or Java identifier
 128         4  Is a Java whitespace character
 129         5  May start or continue a Java identifier;
 130            may continue but not start a Unicode identifier
 131            (this value is used for connector punctuation such as _)
 132         6  May start or continue a Java identifier;
 133            may not occur in a Unicode identifier
 134            (this value is used for currency symbols such as $)
 135         7  May start or continue a Unicode identifier or Java identifier
 136         Thus:
 137            5, 6, 7 may start a Java identifier
 138            1, 2, 3, 5, 6, 7 may continue a Java identifier
 139            7 may start a Unicode identifier
 140            1, 3, 5, 7 may continue a Unicode identifier
 141            1 is ignorable within an identifier
 142            4 is Java whitespace
 143     2 bits      This field indicates whether the character has a numeric property.
 144         The four possible values for this field are as follows:
 145         0  This character has no numeric property.
 146         1  Adding the digit offset to the character code and then
 147            masking with 0x1F will produce the desired numeric value.
 148         2  This character has a "strange" numeric value.
 149         3  A Java supradecimal digit: adding the digit offset to the
 150            character code, then masking with 0x1F, then adding 10
 151            will produce the desired numeric value.
 152     5 bits  The digit offset (see description of previous field)
 153     5 bits      Character type (see below)
 154 
 155     B: the high 16 bits are defined as:
 156     1 bit Other_Lowercase property
 157     1 bit Other_Uppercase property
 158     1 bit Other_Alphabetic property
 159     1 bit Other_Math property
 160     1 bit Ideographic property
 161     1 bit Noncharacter codepoint property
 162     */
 163 
 164 
 165     // bit masks identify each component of a 32-bit property field described
 166     // above.
 167     // shift* indicates how many shifts right must happen to get the
 168     // indicated property value in the lowest bits of the 32-bit space.
 169     private static final int
 170         shiftType           = 0,        maskType            =       0x001F,
 171         shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
 172         shiftNumericType    = 10,       maskNumericType     =       0x0C00,
 173         shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
 174                                         maskUnicodePart     =       0x1000,
 175         shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
 176                                         maskLowerCase       =      0x20000,
 177                                         maskUpperCase       =      0x10000,
 178                                         maskTitleCase       =      0x08000,
 179         shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
 180         shiftCaseOffsetSign = 5,
 181                                         // used only when calculating and
 182                                         // storing digit offsets from char values
 183                                         maskDigit               =   0x001F,
 184                                         // case offset are 9 bits
 185                                         maskCase                =   0x01FF,
 186         shiftBidi           = 27,       maskBidi              = 0x78000000,
 187         shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
 188         shiftPlane          = 16,       maskPlane = 0xFF0000;
 189 
 190     // maskMirrored needs to be long, if up 16-bit
 191     private static final long maskMirrored          = 0x80000000L;
 192 
 193     // bit masks identify the 16-bit priperty field described above, in B
 194     // table
 195     private static final long
 196         maskOtherLowercase  = 0x100000000L,
 197         maskOtherUppercase  = 0x200000000L,
 198         maskOtherAlphabetic = 0x400000000L,
 199         maskOtherMath       = 0x800000000L,
 200         maskIdeographic     = 0x1000000000L,
 201         maskNoncharacterCP  = 0x2000000000L;
 202 
 203     // Can compare masked values with these to determine
 204     // numeric or lexical types.
 205     public static int
 206         valueNotNumeric             = 0x0000,
 207         valueDigit                  = 0x0400,
 208         valueStrangeNumeric         = 0x0800,
 209         valueJavaSupradecimal       = 0x0C00,
 210         valueIgnorable              = 0x1000,
 211         valueJavaOnlyPart           = 0x2000,
 212         valueJavaUnicodePart        = 0x3000,
 213         valueJavaWhitespace         = 0x4000,
 214         valueJavaStartUnicodePart   = 0x5000,
 215         valueJavaOnlyStart          = 0x6000,
 216         valueJavaUnicodeStart       = 0x7000,
 217         lowJavaStart                = 0x5000,
 218         nonzeroJavaPart             = 0x3000,
 219         valueUnicodeStart           = 0x7000;
 220 
 221     // these values are used when only identifier properties are generated
 222     // for use in verifier code. Shortens the property down to a single byte.
 223     private static final int
 224         bitJavaStart            = 0x02,
 225         bitJavaPart             = 0x01,
 226         maskIsJavaIdentifierPart = bitJavaPart,
 227         maskIsJavaIdentifierStart = bitJavaStart;
 228 
 229     static int maxOffset = maskCase/2 ;
 230     static int minOffset = -maxOffset;
 231 
 232     /* The following routines provide simple, concise formatting of long integer values.
 233      The number in the name of the method indicates the desired number of characters
 234      to be produced.  If the number of digits required to represent the integer value
 235      is less than that number, then the output is padded on the left  with zeros
 236      (for hex) or with spaces (for decimal).  If the number of digits required to
 237      represent the integer value is greater than the desired number, then all the digits
 238      that are required are actually produced.
 239     */
 240 
 241     static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
 242 
 243     static String hex2(long n) {
 244         String q = Long.toHexString(n & 0xFF).toUpperCase();
 245         return "00".substring(Math.min(2, q.length())) + q;
 246     }
 247 
 248     static String hex4(long n) {
 249         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
 250         return "0000".substring(Math.min(4, q.length())) + q;
 251     }
 252 
 253     static String hex8(long n) {
 254         String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
 255         return "00000000".substring(Math.min(8, q.length())) + q;
 256     }
 257 
 258     static String hex16(long n) {
 259         String q = Long.toHexString(n).toUpperCase();
 260         return "0000000000000000".substring(Math.min(16, q.length())) + q;
 261     }
 262 
 263     static String dec3(long n) {
 264         String q = Long.toString(n);
 265         return "   ".substring(Math.min(3, q.length())) + q;
 266     }
 267 
 268     static String dec5(long n) {
 269         String q = Long.toString(n);
 270         return "     ".substring(Math.min(5, q.length())) + q;
 271     }
 272 
 273     /* This routine is called when some failure occurs. */
 274 
 275     static void FAIL(String s) {
 276         System.out.println("** " + s);
 277     }
 278 
 279     /**
 280     * Given the data from the Unicode specification file, this routine builds a map.
 281     *
 282     * The specification file is assumed to contain its data in sorted order by
 283     * character code; as a result, the array passed as an argument to this method
 284     * has its components in the same sorted order, with one entry for each defined
 285     * Unicode character or character range.  (A range is indicated by two consecutive
 286     * entries, such that the name of the first entry begins with "<" and ends with
 287     * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
 288     * therefore a sparse representation of the character property data.
 289     *
 290     * The resulting map is dense representation of the character data.  It contains
 291     * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
 292     * of this long value are used, but type long is used rather than int to facilitate
 293     * future extensions of this source code generator that might require more than
 294     * 32 bits to encode relevant character properties.)  Entry k holds the encoded
 295     * properties for character k.
 296     *
 297     * Method buildMap manages the transformation from the sparse representation to
 298     * the dense representation.  It calls method buildOne to handle the encoding
 299     * of character property data from a single UnicodeSpec object into 32 bits.
 300     * For undefined characters, method buildOne is not called and the map entry for
 301     * that character is set to UnicodeSpec.UNASSIGNED.
 302     *
 303     * @param data       character property data from the Unicode specification file
 304     * @return   an array of length 65536 with one entry for every possible char value
 305     *
 306     * @see GenerateCharacter#buildOne
 307     */
 308 
 309     static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
 310     {
 311         long[] result;
 312         if (bLatin1 == true) {
 313             result = new long[256];
 314         } else {
 315             result = new long[1<<16];
 316         }
 317         int k=0;
 318         int codePoint = plane<<16;
 319         UnicodeSpec nonCharSpec = new UnicodeSpec();
 320         for (int j = 0; j < data.length && k < result.length; j++) {
 321             if (data[j].codePoint == codePoint) {
 322                 result[k] = buildOne(codePoint, data[j], specialMaps);
 323                 ++k;
 324                 ++codePoint;
 325             }
 326             else if(data[j].codePoint > codePoint) {
 327                 if (data[j].name.endsWith("Last>")) {
 328                     // build map data for all chars except last in range
 329                     while (codePoint < data[j].codePoint && k < result.length) {
 330                         result[k] = buildOne(codePoint, data[j], specialMaps);
 331                         ++k;
 332                         ++codePoint;
 333                     }
 334                 }
 335                 else {
 336                     // we have a few unassigned chars before data[j].codePoint
 337                     while (codePoint < data[j].codePoint && k < result.length) {
 338                         result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 339                         ++k;
 340                         ++codePoint;
 341                     }
 342                 }
 343                 k = data[j].codePoint & 0xFFFF;
 344                 codePoint = data[j].codePoint;
 345                 result[k] = buildOne(codePoint, data[j], specialMaps);
 346                 ++k;
 347                 ++codePoint;
 348             }
 349             else {
 350                 System.out.println("An error has occured during spec mapping.");
 351                 System.exit(0);
 352             }
 353         }
 354         // if there are still unprocessed chars, process them
 355         // as unassigned/undefined.
 356         codePoint = (plane<<16) | k;
 357         while (k < result.length) {
 358             result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 359             ++k;
 360             ++codePoint;
 361         }
 362         // now add all extra supported properties from PropList, to the
 363         // upper 16-bit
 364         addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
 365         addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
 366         addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
 367         addExProp(result, propList, "Ideographic", maskIdeographic);
 368         //addExProp(result, propList, "Other_Math", maskOtherMath);
 369         //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
 370 
 371         return result;
 372     }
 373 
 374     // Largest and smallest case offsets seen so far; buildOne updates them for each character it encodes
 375     static int maxOffsetSeen = 0;
 376     static int minOffsetSeen = 0;
 377 
 378     /**
 379      * Some Unicode separator characters are not considered Java whitespace.
 380      * @param c character to test
 381      * @return true if c in an invalid Java whitespace character, false otherwise.
 382      */
 383     static boolean isInvalidJavaWhiteSpace(int c) {
 384         int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
 385         boolean retValue = false;
 386         for(int x=0;x<exceptions.length;x++) {
 387             if(c == exceptions[x]) {
 388                 retValue = true;
 389                 break;
 390             }
 391         }
 392         return retValue;
 393 
 394     }
 395 
 396     /**
 397     * Given the character property data for one Unicode character, encode the data
 398     * of interest into a single long integer value.  (Right now only 32 bits
 399     * of this long value are used, but type long is used rather than int to facilitate
 400     * future extensions of this source code generator that might require more than
 401     * 32 bits to encode relevant character properties.)
 402     *
 403     * @param c   the character code for which to encode property data
 404     * @param us  property data record from the Unicode specification file
 405     *            (its character code might not be equal to c if it specifies data
 406     *            for a range of characters)
 407     * @return   an encoded long value that contains the properties for a single char
 408     *
 409     * @see GenerateCharacter#buildMap
 410     */
 411 
 412     static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
 413         long resultA = 0;
 414         // record the general category
 415         resultA |= us.generalCategory;
 416 
 417         // record the numeric properties
 418         NUMERIC: {
 419         STRANGE: {
 420             int val = 0;
 421             // c is A-Z
 422             if ((c >= 0x0041) && (c <= 0x005A)) {
 423                 val = c - 0x0041;
 424                 resultA |= valueJavaSupradecimal;
 425             // c is a-z
 426             } else if ((c >= 0x0061) && (c <= 0x007A)) {
 427                 val = c - 0x0061;
 428                 resultA |= valueJavaSupradecimal;
 429             // c is a full-width A-Z
 430             } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
 431                 val = c - 0xFF21;
 432                 resultA |= valueJavaSupradecimal;
 433             // c is a full-width a-z
 434             } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
 435                 val = c - 0xFF41;
 436                 resultA |= valueJavaSupradecimal;
 437             } else if (us.isDecimalValue()) {
 438                 val = us.decimalValue;
 439                 resultA |= valueDigit;
 440             } else if (us.isDigitValue()) {
 441                 val = us.digitValue;
 442                 resultA |= valueDigit;
 443             } else {
 444                 if (us.numericValue.length() == 0) {
 445                     break NUMERIC;                      // no numeric value at all
 446                 } else {
 447                     try {
 448                         val = Integer.parseInt(us.numericValue);
 449                         if (val >= 32 || val < 0) break STRANGE;
 450                         if (c == 0x215F) break STRANGE;
 451                     } catch(NumberFormatException e) {
 452                         break STRANGE;
 453                     }
 454                     resultA |= valueDigit;
 455                 }
 456             }
 457             if (val >= 32 || val < 0) break STRANGE;
 458             resultA |= ((val - c & maskDigit) << shiftDigitOffset);
 459             break NUMERIC;
 460         } // end STRANGE
 461         resultA |= valueStrangeNumeric;
 462         } // end NUMERIC
 463 
 464         // record case mapping
 465         int offset = 0;
 466         // might have a 1:M mapping
 467         int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
 468         boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
 469         if (bHasUpper) {
 470             resultA |= maskUpperCase;
 471         }
 472         if (specialMap != -1) {
 473             // has mapping, but cannot record the
 474             // proper offset; can only flag it and provide special case
 475             // code in Character.java
 476             offset = -1;
 477         }
 478         else if (us.hasUpperMap())  {
 479             offset = c - us.upperMap;
 480         }
 481 
 482         if (us.hasLowerMap()) {
 483             resultA |= maskLowerCase;
 484             if (offset == 0)
 485                 offset = us.lowerMap - c;
 486             else if (offset != (us.lowerMap - c)) {
 487                 if (DEBUG) {
 488                 FAIL("Character " + hex(c) +
 489                 " has incompatible lowercase and uppercase mappings");
 490                 }
 491             }
 492         }
 493         if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
 494             (bHasUpper && us.hasLowerMap())) {
 495             resultA |= maskTitleCase;
 496         }
 497         if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
 498             System.out.println("Warning: Character " + hex4(c) + " has upper but " +
 499                                "no title case; Java won't know this");
 500         }
 501         if (offset < minOffsetSeen) minOffsetSeen = offset;
 502         if (offset > maxOffsetSeen) maxOffsetSeen = offset;
 503         if (offset > maxOffset || offset < minOffset) {
 504             if (DEBUG) {
 505             FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
 506             }
 507             offset = maskCase;
 508         }
 509         resultA |= ((offset & maskCase) << shiftCaseOffset);
 510 
 511         // record lexical info about this character
 512         if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
 513                 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
 514                 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
 515                 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
 516                 || us.generalCategory == UnicodeSpec.OTHER_LETTER
 517                 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
 518             resultA |= valueJavaUnicodeStart;
 519         }
 520         else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
 521                 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
 522                 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
 523             resultA |= valueJavaUnicodePart;
 524         }
 525         else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
 526             resultA |= valueJavaStartUnicodePart;
 527         }
 528         else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
 529             resultA |= valueJavaOnlyStart;
 530         }
 531         else if (((c >= 0x0000) && (c <= 0x0008))
 532                 || ((c >= 0x000E) && (c <= 0x001B))
 533                 || ((c >= 0x007F) && (c <= 0x009F))
 534                 || us.generalCategory == UnicodeSpec.FORMAT) {
 535             resultA |= valueIgnorable;
 536         }
 537         else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
 538                 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
 539                 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
 540             if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
 541         }
 542         else if (((c >= 0x0009) && (c <= 0x000D))
 543                 || ((c >= 0x001C) && (c <= 0x001F))) {
 544             resultA |= valueJavaWhitespace;
 545         }
 546 
 547         // record bidi category
 548         if (!nobidi) {
 549             int tmpBidi =
 550                 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
 551                     us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
 552             resultA |= tmpBidi;
 553         }
 554 
 555         // record mirrored property
 556         if (!nomirror) {
 557             resultA |= us.mirrored ? maskMirrored : 0;
 558         }
 559 
 560         if (identifiers) {
 561             long replacement = 0;
 562             if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
 563                 replacement |= bitJavaStart;
 564             }
 565             if ( ((resultA & nonzeroJavaPart) != 0)
 566                     && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
 567                 replacement |= bitJavaPart;
 568             }
 569             resultA = replacement;
 570         }
 571         return resultA;
 572     }
 573 
 574     static void addExProp(long[] map, PropList propList, String prop, long mask) {
 575         List<Integer> cps = propList.codepoints(prop);
 576         if (cps != null) {
 577             for (Integer cp : cps) {
 578                 if (cp < map.length)
 579                     map[cp] |= mask;
 580             }
 581         }
 582     }
 583 
 584     /**
 585     * This is the heart of the table compression strategy.  The inputs are a map
 586     * and a number of bits (size).  The map is simply an array of long integer values;
 587     * the number of bits indicates how index values for that map are to be split.
 588     * The length of the given map must be a multiple of (1 << size).  The result is
 589     * a new map z and a compressed table t such that for every valid index value k
 590     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
 591     *
 592     * In other words, the index k can be split into two parts, namely the "size"
 593     * low-order bits and all the remaining high-order bits; the high-order bits are then
 594     * remapped by map z to produce an index into table t.  In effect, the data of the
 595     * original map m is broken up into blocks of size (1<<size); the compression relies
 596     * on the expectation that many of these blocks will be identical and therefore need
 597     * be represented only once in the compressed table t.
 598     *
 599     * This method is intended to be used iteratively.  The first map to be handed
 600     * to it is the one constructed by method buildMap.  After that, the first of the
 601     * two arrays returned by this method is fed back into it for further compression.
 602     * At the end of the iteration, one has a starter map and a sequence of tables.
 603     *
 604     * The algorithm used to implement this computation is straightforward and not
 605     * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
 606     * to locate identical blocks, so overall the time complexity of the algorithm
 607     * is quadratic in the length of the input map.  Fortunately, speed is not crucial
 608     * to this application.
 609     *
 610     * @param map                a map to be compressed
 611     * @param size       the number of index bits to be split off by the compression
 612     * @return   an array of length 2 containing two arrays; the first is a new map
 613     *           and the second is a compressed data table
 614     *
 615     * @see GenerateCharacter#buildMap
 616     */
 617 
 618     static long[][] buildTable(long[] map, int size) {
 619         int n = map.length;
 620         if (((n >> size) << size) != n) {
 621             FAIL("Length " + n + " is not a multiple of " + (1 << size));
 622         }
 623         int m = 1 << size;
 624         // We know the final length of the new map up front.
 625         long[] newmap = new long[n >> size];
 626         // The buffer is used temporarily to hold data for the compressed table
 627         // because we don't know its final length yet.
 628         long[] buffer = new long[n];
 629         int ptr = 0;
 630 OUTER:  for (int i = 0; i < n; i += m) {
 631             // For every block of size m in the original map...
 632     MIDDLE: for (int j = 0; j < ptr; j += m) {
 633             // Find out whether there is already a block just like it in the buffer.
 634                 for (int k = 0; k < m; k++) {
 635                     if (buffer[j+k] != map[i+k])
 636                         continue MIDDLE;
 637                 }
 638                 // There is a block just like it at position j, so just
 639                 // put its index into the new map (thereby sharing it).
 640                 newmap[i >> size] = (j >> size);
 641                 continue OUTER;
 642             } // end MIDDLE
 643             // There is no block just like it already, so add it to
 644             // the buffer and put its index into the new map.
 645             for (int k = 0; k < m; k++) {
 646                 buffer[ptr+k] = map[i+k];
 647             }
 648             newmap[i >> size] = (ptr >> size);
 649             ptr += m;
 650         } // end OUTER
 651         // Now we know how long the compressed table should be,
 652         // so create a new array and copy data from the temporary buffer.
 653         long[] newdata = new long[ptr];
 654         for (int j = 0; j < ptr; j++) {
 655             newdata[j] = buffer[j];
 656         }
 657         // Return the new map and the new data table.
 658         long[][] result = { newmap, newdata };
 659         return result;
 660     }
 661 
 662     /**
 663     * Once the compressed tables have been computed, this method reads in a
 664     * template file for the source code to be generated and writes out the final
 665     * source code by acting as a sort of specialized macro processor.
 666     *
 667     * The first output line is a comment saying that the file was automatically
 668     * generated; it includes a timestamp.  All other output is generated by
 669     * reading a line from the template file, performing macro replacements,
 670     * and then writing the resulting line or lines of code to the output file.
 671     *
 672     * This method handles the I/O, the timestamp comment, and the locating of
 673     * macro calls within each input line.  The method replaceCommand is called
 674     * to generate replacement text for each macro call.
 675     *
 676     * Macro calls to be replaced are indicated in the template file by
 677     * occurrences of the commandMarker "$$".  The rest of the call may consist
 678     * of Java letters (including the underscore "_") and also of balanced
 679     * parentheses.
 680     *
 681     * @param theTemplateFileName
 682     *           the file name for the template input file
 683     * @param theOutputFileName
 684     *           the file name for the source code output file
 685     *
 686     *     @see GenerateCharacter#replaceCommand
 687     */
 688 
 689     static void generateCharacterClass(String theTemplateFileName,
 690                                        String theOutputFileName)
 691         throws FileNotFoundException, IOException {
 692         BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
 693         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
 694         out.println(commentStart +
 695             " This file was generated AUTOMATICALLY from a template file " +
 696             new java.util.Date() + commentEnd);
 697         int marklen = commandMarker.length();
 698         LOOP: while(true) {
 699             try {
 700                 String line = in.readLine();
 701                 if (line == null) break LOOP;
 702                 int pos = 0;
 703                 int depth = 0;
 704                 while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
 705                     int newpos = pos + marklen;
 706                     char ch = 'x';
 707                     SCAN: while (newpos < line.length() &&
 708                             (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
 709                             || ch == '(' || (ch == ')' && depth > 0))) {
 710                         ++newpos;
 711                         if (ch == '(') {
 712                             ++depth;
 713                         }
 714                         else if (ch == ')') {
 715                             --depth;
 716                             if (depth == 0)
 717                                 break SCAN;
 718                         }
 719                     }
 720                     String replacement = replaceCommand(line.substring(pos + marklen, newpos));
 721                     line = line.substring(0, pos) + replacement + line.substring(newpos);
 722                     pos += replacement.length();
 723                 }
 724                 out.println(line);
 725             }
 726             catch (IOException e) {
 727                 break LOOP;
 728             }
 729         }
 730         in.close();
 731         out.close();
 732     }
 733 
 734     /**
 735     * The replaceCommand method takes a command (a macro call without the
 736     * leading marker "$$") and computes replacement text for it.
 737     *
 738     * Most of the commands are simply names of integer constants that are defined
 739     * in the source code of this GenerateCharacter class.  The replacement text is
 740     * simply the value of the constant as an appropriately formatted integer literal.
 741     *
 742     * Two cases are more complicated, however.  The command "Tables" causes the
 743     * final map and compressed tables to be emitted, with elaborate comments
 744     * describing their contents.  (This is actually handled by method genTables.)
    * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
    * an expression that will return the character property data for the character
    * whose code is the value of the variable "xxx".  The command "LookupEx(xxx)"
    * does the same against table "B".  (Both are handled by method "genAccess".)
 749     *
 750     * @param x  a command from the template file to be replaced
 751     * @return   the replacement text, as a String
 752     *
 753     * @see GenerateCharacter#genTables
 754     * @see GenerateCharacter#genAccess
 755     * @see GenerateCharacter#generateCharacterClass
 756     */
 757 
 758     static String replaceCommand(String x) {
 759         if (x.equals("Tables")) return genTables();
 760         if (x.equals("Initializers")) return genInitializers();
 761         if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
 762                 x.substring(x.length()-1).equals(")") )
 763             return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
 764         if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
 765                 x.substring(x.length()-1).equals(")") )
 766             return genAccess("B", x.substring(9, x.length()-1), 16);
 767         if (x.equals("shiftType")) return Long.toString(shiftType);
 768         if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
 769         if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
 770         if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
 771         if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
 772         if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
 773         if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
 774         if (x.equals("maskCase")) return "0x" + hex8(maskCase);
 775         if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
 776         if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
 777         if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
 778         if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
 779         if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
 780         if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
 781         if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
 782         if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
 783         if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
 784         if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
 785         if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
 786         if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
 787         if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
 788         if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
 789         if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
 790         if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
 791         if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
 792         if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
 793         if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
 794         if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
 795         if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
 796         if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
 797         if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
 798         if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
 799         if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
 800         if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
 801         if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
 802         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 803         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 804         if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
 805         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 806         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 807         if (x.equals("maskType")) return "0x" + hex(maskType);
 808         if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
 809         if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
 810         if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
 811         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
 812             return Integer.toString(UnicodeSpec.UNASSIGNED);
 813         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
 814             return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
 815         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
 816             return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
 817         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
 818             return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
 819         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
 820              return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
 821         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
 822              return Integer.toString(UnicodeSpec.OTHER_LETTER);
 823         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
 824              return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
 825         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
 826              return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
 827         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
 828              return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
 829         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
 830              return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
 831         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
 832              return Integer.toString(UnicodeSpec.OTHER_NUMBER);
 833         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
 834              return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
 835         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
 836              return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
 837         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 838              return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
 839         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
 840             return Integer.toString(UnicodeSpec.CONTROL);
 841         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
 842             return Integer.toString(UnicodeSpec.FORMAT);
 843         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
 844             return Integer.toString(UnicodeSpec.PRIVATE_USE);
 845         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
 846             return Integer.toString(UnicodeSpec.SURROGATE);
 847         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
 848             return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
 849         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
 850             return Integer.toString(UnicodeSpec.START_PUNCTUATION);
 851         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
 852             return Integer.toString(UnicodeSpec.END_PUNCTUATION);
 853         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 854             return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
 855         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 856             return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
 857         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
 858             return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
 859         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
 860             return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
 861         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
 862             return Integer.toString(UnicodeSpec.LETTER_NUMBER);
 863         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
 864             return Integer.toString(UnicodeSpec.MATH_SYMBOL);
 865         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
 866             return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
 867         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
 868             return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
 869         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
 870             return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
 871         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
 872             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
 873         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
 874             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
 875         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
 876             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
 877         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
 878             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
 879         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
 880             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
 881         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
 882             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
 883         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
 884             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
 885         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
 886             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
 887         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
 888             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
 889         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 890             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
 891         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
 892             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
 893         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
 894             return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
 895         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 896             return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
 897         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
 898             return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
 899          if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
 900             return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
 901         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 902             return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
 903         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
 904             return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
 905         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
 906             return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
 907         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
 908             return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
 909         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG]))
 910             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE);
 911         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG]))
 912             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE);
 913         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG]))
 914             return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE);
 915         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG]))
 916             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE);
 917         FAIL("Unknown text substitution marker " + commandMarker + x);
 918         return commandMarker + x;
 919     }
 920 
 921     /**
 922     * The genTables method generates source code for all the lookup tables
 923     * needed to represent the various Unicode character properties.
 924     * It simply calls the method genTable once for each table to be generated
 925     * and then generates a summary comment.
 926     *
 927     * @return   the replacement text for the "Tables" command, as a String
 928     *
 929     * @see GenerateCharacter#genTable
 930     * @see GenerateCharacter#replaceCommand
 931     */
 932     static String genTables() {
 933         int n = sizes.length;
 934         StringBuffer result = new StringBuffer();
 935         // liu : Add a comment showing the source of this table
 936         result.append(commentStart + " The following tables and code generated using:" +
 937                   commentEnd + "\n  ");
 938         result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
 939 
 940                 if (plane == 0 && bLatin1 == false) {
 941             genCaseMapTableDeclaration(result);
 942             genCaseMapTable(initializers, specialCaseMaps);
 943                 }
 944         int totalBytes = 0;
 945         for (int k = 0; k < n - 1; k++) {
 946             genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
 947                 sizes[k+1], false, false, k==0);
 948             int s = bytes[k];
 949             if (s == 1 && useCharForByte) {
 950                 s = 2;
 951             }
 952             totalBytes += tables[k].length * s;
 953         }
 954         genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
 955             sizes[n - 1], false, 0, true, !(identifiers), false);
 956 
 957         // If we ever need more than 32 bits to represent the character properties,
 958         // then a table "B" may be needed as well.
 959         genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
 960 
 961         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
 962         result.append(commentStart);
 963         result.append(" In all, the character property tables require ");
 964         result.append(totalBytes).append(" bytes.").append(commentEnd);
 965         if (verbose) {
 966             System.out.println("The character property tables require "
 967                  + totalBytes + " bytes.");
 968         }
 969         return result.toString();
 970     }
 971 
 972     /**
 973      * The genInitializers method generates the body of the
 974      * ensureInitted() method, which enables lazy initialization of
 975      * the case map table and other tables.
 976      */
 977     static String genInitializers() {
 978         return initializers.toString();
 979     }
 980 
 981     /**
 982      * Return the total number of bytes needed by all tables.  This is a stripped-
 983      * down copy of genTables().
 984      */
 985     static int getTotalBytes() {
 986         int n = sizes.length;
 987         int totalBytes = 0;
 988         for (int k = 0; k < n - 1; k++) {
 989             totalBytes += tables[k].length * bytes[k];
 990         }
 991         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
 992                          + 31) >> 5) << 2);
 993         return totalBytes;
 994     }
 995 
 996     static void appendEscapedStringFragment(StringBuffer result,
 997                                             char[] line,
 998                                             int length,
 999                                             boolean lastFragment) {
1000         result.append("    \"");
1001         for (int k=0; k<length; ++k) {
1002             result.append("\\u");
1003             result.append(hex4(line[k]));
1004         }
1005         result.append("\"");
1006         result.append(lastFragment ? ";" : "+");
1007         result.append("\n");
1008     }
1009 
1010     static String SMALL_INITIALIZER =
1011         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1012         // "            $$name = new $$type[$$size];\n"+
1013         "            int len = $$name_DATA.length();\n"+
1014         "            int j=0;\n"+
1015         "            for (int i=0; i<len; ++i) {\n"+
1016         "                int c = $$name_DATA.charAt(i);\n"+
1017         "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
1018         "                    $$name[j++] = ($$type)c;\n"+
1019         "                    c >>= $$bits;\n"+
1020         "                }\n"+
1021         "            }\n"+
1022         "            assert (j == $$size);\n"+
1023         "        }\n";
1024 
1025     static String SAME_SIZE_INITIALIZER =
1026         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1027         "            assert ($$name_DATA.length() == $$size);\n"+
1028         // "            $$name = new $$type[$$size];\n"+
1029         "            for (int i=0; i<$$size; ++i)\n"+
1030         "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
1031         "        }\n";
1032 
1033     static String BIG_INITIALIZER =
1034         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1035         // "            $$name = new $$type[$$size];\n"+
1036         "            int len = $$name_DATA.length();\n"+
1037         "            int j=0;\n"+
1038         "            int charsInEntry=0;\n"+
1039         "            $$type entry=0;\n"+
1040         "            for (int i=0; i<len; ++i) {\n"+
1041         "                entry |= $$name_DATA.charAt(i);\n"+
1042         "                if (++charsInEntry == $$charsPerEntry) {\n"+
1043         "                    $$name[j++] = entry;\n"+
1044         "                    entry = 0;\n"+
1045         "                    charsInEntry = 0;\n"+
1046         "                }\n"+
1047         "                else {\n"+
1048         "                    entry <<= 16;\n"+
1049         "                }\n"+
1050         "            }\n"+
1051         "            assert (j == $$size);\n"+
1052         "        }\n";
1053 
1054     static String INT32_INITIALIZER =
1055         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1056         "            char[] data = $$name_DATA.toCharArray();\n"+
1057         "            assert (data.length == ($$size * 2));\n"+
1058         "            int i = 0, j = 0;\n"+
1059         "            while (i < ($$size * 2)) {\n"+
1060         "                int entry = data[i++] << 16;\n"+
1061         "                $$name[j++] = entry | data[i++];\n"+
1062         "            }\n"+
1063         "        }\n";
1064 
1065     static void addInitializer(String name, String type, int entriesPerChar,
1066                                int bits, int size) {
1067 
1068         String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1069                           ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1070         if (entriesPerChar == -2) {
1071             template = INT32_INITIALIZER;
1072         }
1073         int marklen = commandMarker.length();
1074         int pos = 0;
1075         while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1076             int newpos = pos + marklen;
1077             char ch = 'x';
1078             while (newpos < template.length() &&
1079                    Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1080                    ch != '_') // Don't allow this in token names
1081                 ++newpos;
1082             String token = template.substring(pos+marklen, newpos);
1083             String replacement = "ERROR";
1084 
1085             if (token.equals("name")) replacement = name;
1086             else if (token.equals("type")) replacement = type;
1087             else if (token.equals("bits")) replacement = ""+bits;
1088             else if (token.equals("size")) replacement = ""+size;
1089             else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1090             else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1091             else FAIL("Unrecognized token: " + token);
1092 
1093             template = template.substring(0, pos) + replacement + template.substring(newpos);
1094             pos += replacement.length();
1095         }
1096         initializers.append(template);
1097     }
1098 
1099     /**
1100     * The genTable method generates source code for one lookup table.
1101     * Most of the complexity stems from handling various options as to
1102     * the type of the array components, the precise representation of the
1103     * values, the format in which to render each value, the number of values
1104     * to emit on each line of source code, and the kinds of useful comments
1105     * to be generated.
1106     *
1107     * @param result     a StringBuffer, to which the generated source code
1108     *                   text is to be appended
1109     * @param name       the name of the table
1110     * @param table      the table data (an array of long values)
1111     * @param extract    a distance, in bits, by which each entry of the table
1112     *                   is to be right-shifted before it is processed
1113     * @param bits       the number of bits (not bytes) to be used to represent
1114     *                   each table entry
1115     * @param size       the table data is divided up into blocks of size (1<<size);
1116     *                   in this method, this information is used only to affect
1117     *                   how many table values are to be generated per line
1118     * @param preshifted if this flag is true, then the table entries are to be
1119     *                   emitted in a preshifted form; that is, each value should
1120     *                   be left-shifted by the amount "shift", so that this work
1121     *                   is built into the table and need not be performed by an
1122     *                   explicit shift operator at run time
1123     * @param shift      this is the shift amount for preshifting of table entries
1124     * @param hexFormat  if this flag is true, table entries should be emitted as
1125     *                   hexadecimal literals; otherwise decimal literals are used
1126     * @param properties if this flag is true, the table entries are encoded
1127     *                   character properties rather than indexes into yet other tables;
1128     *                   therefore comments describing the encoded properties should
1129     *                   be generated
1130     * @param hexComment if this flag is true, each line of output is labelled with
1131     *                   a hexadecimal comment indicating the character values to
1132     *                   which that line applies; otherwise, decimal values indicating
1133     *                   table indices are generated
1134     *
1135     * @see GenerateCharacter#genTables
1136     * @see GenerateCharacter#replaceCommand
1137     */
1138 
1139     static void genTable(StringBuffer result, String name,
1140                          long[] table, int extract, int bits, int size,
1141                          boolean preshifted, int shift, boolean hexFormat,
1142                          boolean properties, boolean hexComment) {
1143 
1144         String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1145             bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1146             bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1147             bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1148             bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1149             bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1150             (Csyntax ? "int64" : "long");
1151         long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1152             bits == 2 ? Integer.MAX_VALUE :
1153             bits == 4 ? Integer.MAX_VALUE :
1154             bits == 8 ? Byte.MAX_VALUE :
1155             bits == 16 ? Short.MAX_VALUE :
1156             bits == 32 ? Integer.MAX_VALUE :
1157             Long.MAX_VALUE;
1158         int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1159         boolean shiftEntries = preshifted && shift != 0;
1160         if (bits == 8 && tableAsString && useCharForByte) {
1161             atype = "char";
1162             maxPosEntry = Character.MAX_VALUE;
1163             entriesPerChar = 1;
1164         }
1165         boolean noConversion = atype.equals("char");
1166 
1167         result.append(commentStart);
1168         result.append(" The ").append(name).append(" table has ").append(table.length);
1169         result.append(" entries for a total of ");
1170         int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1171         if (bits == 8 && useCharForByte) {
1172             sizeOfTable *= 2;
1173         }
1174         result.append(sizeOfTable);
1175         result.append(" bytes.").append(commentEnd).append("\n\n");
1176         if (Csyntax)
1177             result.append("  static ");
1178         else
1179             result.append("  static final ");
1180         result.append(atype);
1181         result.append(" ").append(name).append("[");
1182         if (Csyntax)
1183             result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1184         if (tableAsString) {
1185             if (noConversion) {
1186                 result.append("] = (\n");
1187             } else {
1188                 result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1189                 result.append("static final String ").append(name).append("_DATA =\n");
1190             }
1191             int CHARS_PER_LINE = 8;
1192             StringBuffer theString = new StringBuffer();
1193             int entriesInCharSoFar = 0;
1194             char ch = '\u0000';
1195             int charsPerEntry = -entriesPerChar;
1196             for (int j=0; j<table.length; ++j) {
1197                 //long entry = table[j] >> extract;
1198                 long entry;
1199                 if ("A".equals(name))
1200                     entry = (table[j] & 0xffffffffL) >> extract;
1201                 else
1202                     entry = (table[j] >> extract);
1203                 if (shiftEntries) entry <<= shift;
1204                 if (entry >= (1L << bits)) {
1205                     FAIL("Entry too big");
1206                 }
1207                 if (entriesPerChar > 0) {
1208                     // Pack multiple entries into a character
1209                     ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1210                     ++entriesInCharSoFar;
1211                     if (entriesInCharSoFar == entriesPerChar) {
1212                         // Character is full
1213                         theString.append(ch);
1214                         entriesInCharSoFar = 0;
1215                         ch = '\u0000';
1216                     }
1217                 }
1218                 else {
1219                     // Use multiple characters per entry
1220                     for (int k=0; k<charsPerEntry; ++k) {
1221                         ch = (char)(entry >> ((charsPerEntry-1)*16));
1222                         entry <<= 16;
1223                         theString.append(ch);
1224                     }
1225                 }
1226             }
1227             if (entriesInCharSoFar > 0) {
1228                 while (entriesInCharSoFar < entriesPerChar) {
1229                     ch = (char)((int)ch >> bits);
1230                     ++entriesInCharSoFar;
1231                 }
1232                 theString.append(ch);
1233                 entriesInCharSoFar = 0;
1234             }
1235             result.append(Utility.formatForSource(theString.toString(), "    "));
1236             if (noConversion) {
1237                 result.append(").toCharArray()");
1238             }
1239             result.append(";\n\n  ");
1240 
1241             if (!noConversion) {
1242                 addInitializer(name, atype, entriesPerChar, bits, table.length);
1243             }
1244         }
1245         else {
1246             result.append("] = {");
1247             boolean castEntries = shiftEntries && (bits < 32);
1248             int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1249                 bits == 2 ? 16*4 :
1250                 bits == 4 ? 8*4 :
1251                 bits == 8 ? 8 :
1252                 bits == 16 ? 8 :
1253                 bits == 32 ? 4 : 2) :
1254                 (bits == 8 ? 8 :
1255                 bits == 16 ? 8 : 4);
1256             int printMask = properties ? 0 :
1257             Math.min(1 << size,
1258                 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1259             int commentShift = ((1 << size) == table.length) ? 0 : size;
1260             int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1261             long val = 0;
1262             for (int j = 0; j < table.length; j++) {
1263                 if ((j & printMask) == 0) {
1264                     while (result.charAt(result.length() - 1) == ' ')
1265                         result.setLength(result.length() - 1);
1266                     result.append("\n    ");
1267                 }
1268         PRINT:  {
1269                 if (castEntries)
1270                     result.append("(").append(atype).append(")(");
1271                 long entry = table[j] >> extract;
1272                 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1273                 int k = j & packMask;
1274                 if (bits >= 8)
1275                     val = entry;
1276                 else if (k == 0) {
1277                     val = entry;
1278                     break PRINT;
1279                 }
1280                 else {
1281                     val |= (entry << (k*bits));
1282                     if (k != packMask)
1283                         break PRINT;
1284                 }
1285                 if (val > maxPosEntry && !Csyntax) { // liu
1286                 // For values that are out of range, convert them to in-range negative values.
1287                 // Actually, output the '-' and convert them to the negative of the corresponding
1288                 // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1289                     result.append('-');
1290                     val = maxPosEntry + maxPosEntry + 2 - val;
1291                 }
1292                 if (hexFormat) {
1293                     result.append("0x");
1294                     if (bits == 8)
1295                         result.append(hex2((byte)val));
1296                     else if (bits == 16)
1297                         result.append(hex4((short)val));
1298                     else if (bits == 32 || bits < 8)
1299                         result.append(hex8((int)val));
1300                     else {
1301                         result.append(hex16(val));
1302                         if (!Csyntax)
1303                             result.append("L");
1304                     }
1305                 }
1306                 else {
1307                     if (bits == 8)
1308                         result.append(dec3(val));
1309                     else if (bits == 64) {
1310                         result.append(dec5(val));
1311                         if (!Csyntax)
1312                             result.append("L");
1313                     }
1314                     else
1315                         result.append(dec5(val));
1316                 }
1317                 if (shiftEntries)
1318                     result.append("<<").append(shift);
1319                 if (castEntries) result.append(")");
1320                 if (j < (table.length - 1))
1321                     result.append(", ");
1322                 else
1323                     result.append("  ");
1324                 if ((j & printMask) == printMask) {
1325                     result.append(" ").append(commentStart).append(" ");
1326                     if (hexComment)
1327                         result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1328                     else
1329                         result.append(dec3((j & ~commentMask) >> commentShift));
1330                     if (properties) propertiesComments(result, val);
1331                     result.append(commentEnd);
1332                 }
1333                 } // end PRINT
1334             }
1335             result.append("\n  };\n\n  ");
1336         }
1337     }
1338 
1339     static void genCaseMapTableDeclaration(StringBuffer result) {
1340         String myTab = "    ";
1341         result.append(myTab + "static final char[][][] charMap;\n");
1342     }
1343 
1344     static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1345         String myTab = "    ";
1346         int ch;
1347         char[] map;
1348         result.append(myTab + "charMap = new char[][][] {\n");
1349         for (int x = 0; x < specialCaseMaps.length; x++) {
1350             ch = specialCaseMaps[x].getCharSource();
1351             map = specialCaseMaps[x].getUpperCaseMap();
1352             result.append(myTab + myTab);
1353             result.append("{ ");
1354             result.append("{\'\\u"+hex4(ch)+"\'}, {");
1355             for (int y = 0; y < map.length; y++) {
1356                 result.append("\'\\u"+hex4(map[y])+"\', ");
1357             }
1358             result.append("} },\n");
1359         }
1360         result.append(myTab + "};\n");
1361 
1362     }
1363 
1364     /**
1365     * The propertiesComments method generates comments describing encoded
1366     * character properties.
1367     *
1368     * @param result     a StringBuffer, to which the generated source code
1369     *                   text is to be appended
1370     * @param val                encoded character properties
1371     *
1372     * @see GenerateCharacter#genTable
1373     */
1374 
1375     static void propertiesComments(StringBuffer result, long val) {
1376         result.append("   ");
1377         switch ((int)(val & maskType)) {
1378             case UnicodeSpec.CONTROL:
1379                 result.append("Cc");
1380                 break;
1381             case UnicodeSpec.FORMAT:
1382                 result.append("Cf");
1383                 break;
1384             case UnicodeSpec.PRIVATE_USE:
1385                 result.append("Co");
1386                 break;
1387             case UnicodeSpec.SURROGATE:
1388                 result.append("Cs");
1389                 break;
1390             case UnicodeSpec.LOWERCASE_LETTER:
1391                 result.append("Ll");
1392                 break;
1393             case UnicodeSpec.MODIFIER_LETTER:
1394                 result.append("Lm");
1395                 break;
1396             case UnicodeSpec.OTHER_LETTER:
1397                 result.append("Lo");
1398                 break;
1399             case UnicodeSpec.TITLECASE_LETTER:
1400                 result.append("Lt");
1401                 break;
1402             case UnicodeSpec.UPPERCASE_LETTER:
1403                 result.append("Lu");
1404                 break;
1405             case UnicodeSpec.COMBINING_SPACING_MARK:
1406                 result.append("Mc");
1407                 break;
1408             case UnicodeSpec.ENCLOSING_MARK:
1409                 result.append("Me");
1410                 break;
1411             case UnicodeSpec.NON_SPACING_MARK:
1412                 result.append("Mn");
1413                 break;
1414             case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1415                 result.append("Nd");
1416                 break;
1417             case UnicodeSpec.LETTER_NUMBER:
1418                 result.append("Nl");
1419                 break;
1420             case UnicodeSpec.OTHER_NUMBER:
1421                 result.append("No");
1422                 break;
1423             case UnicodeSpec.CONNECTOR_PUNCTUATION:
1424                 result.append("Pc");
1425                 break;
1426             case UnicodeSpec.DASH_PUNCTUATION:
1427                 result.append("Pd");
1428                 break;
1429             case UnicodeSpec.END_PUNCTUATION:
1430                 result.append("Pe");
1431                 break;
1432             case UnicodeSpec.OTHER_PUNCTUATION:
1433                 result.append("Po");
1434                 break;
1435             case UnicodeSpec.START_PUNCTUATION:
1436                 result.append("Ps");
1437                 break;
1438             case UnicodeSpec.CURRENCY_SYMBOL:
1439                 result.append("Sc");
1440                 break;
1441             case UnicodeSpec.MODIFIER_SYMBOL:
1442                 result.append("Sk");
1443                 break;
1444             case UnicodeSpec.MATH_SYMBOL:
1445                 result.append("Sm");
1446                 break;
1447             case UnicodeSpec.OTHER_SYMBOL:
1448                 result.append("So");
1449                 break;
1450             case UnicodeSpec.LINE_SEPARATOR:
1451                 result.append("Zl"); break;
1452             case UnicodeSpec.PARAGRAPH_SEPARATOR:
1453                 result.append("Zp");
1454                 break;
1455             case UnicodeSpec.SPACE_SEPARATOR:
1456                 result.append("Zs");
1457                 break;
1458             case UnicodeSpec.UNASSIGNED:
1459                 result.append("unassigned");
1460                 break;
1461         }
1462 
1463         switch ((int)((val & maskBidi) >> shiftBidi)) {
1464             case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1465                 result.append(", L");
1466                 break;
1467             case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1468                 result.append(", R");
1469                 break;
1470             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1471                 result.append(", EN");
1472                 break;
1473             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1474                 result.append(", ES");
1475                 break;
1476             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1477                 result.append(", ET");
1478                 break;
1479             case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1480                 result.append(", AN");
1481                 break;
1482             case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1483                 result.append(", CS");
1484                 break;
1485             case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1486                 result.append(", B");
1487                 break;
1488             case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1489                 result.append(", S");
1490                 break;
1491             case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1492                 result.append(", WS");
1493                 break;
1494             case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1495                 result.append(", ON");
1496                 break;
1497         }
1498         if ((val & maskUpperCase) != 0) {
1499             result.append(", hasUpper (subtract ");
1500             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1501         }
1502         if ((val & maskLowerCase) != 0) {
1503             result.append(", hasLower (add ");
1504             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1505         }
1506         if ((val & maskTitleCase) != 0) {
1507             result.append(", hasTitle");
1508         }
1509         if ((val & maskIdentifierInfo) == valueIgnorable) {
1510             result.append(", ignorable");
1511         }
1512         if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1513             result.append(", identifier part");
1514         }
1515         if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1516             result.append(", underscore");
1517         }
1518         if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1519             result.append(", whitespace");
1520         }
1521         if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1522             result.append(", currency");
1523         }
1524         if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1525             result.append(", identifier start");
1526         }
1527         if ((val & maskNumericType) == valueDigit) {
1528             result.append(", decimal ");
1529             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1530         }
1531         if ((val & maskNumericType) == valueStrangeNumeric) {
1532             result.append(", strange");
1533         }
1534         if ((val & maskNumericType) == valueJavaSupradecimal) {
1535             result.append(", supradecimal ");
1536             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1537         }
1538     }
1539 
1540     static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1541 
1542     static String tableName(int j) { return tableNames[j]; }
1543 
1544     /**
1545     * The genAccess method generates source code for one table access expression.
1546     *
1547     * Most of the complexity stems from handling various options as to
1548     * table representation, such as whether it contains values so large that
1549     * they are represented as negative values and whether the table values are
1550     * preshifted.  This method also avoids such "ugly" expressions as shifting
1551     * by distance zero, masking when no masking is necessary, and so on.
1552     * For clarity, it generates expressions that do not rely on operator
1553     * precedence, but otherwise it avoids generating redundant parentheses.
1554     *
1555     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1556     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1557     *
1558     * @param tbl                the name of the final table to be accessed
1559     * @param var                the variable name that appeared in parentheses in the
1560     *                           "Lookup" command
1561     * @param bits       the number of bits (not bytes) to be used to represent
1562     *                   the final table entry
1563     * @return   the replacement text for the "Lookup(xxx)" command, as a String
1564     *
1565     * @see GenerateCharacter#replaceCommand
1566     */
1567 
1568     static String genAccess(String tbl, String var, int bits) {
1569         String access = null;
1570         int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1571         for (int k = 0; k < sizes.length; k++) {
1572             int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1573             int shift = shifts[k] + offset;
1574             String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1575             int mask = (1 << (sizes[k] - offset)) - 1;
1576             String masked = (k == 0) ? shifted :
1577               "(" + shifted + "&0x" + hex(mask) + ")";
1578             String index = (k == 0) ? masked :
1579              (mask == 0) ? access : "(" + access + "|" + masked + ")";
1580             String indexNoParens = (index.charAt(0) != '(') ? index :
1581                  index.substring(1, index.length() - 1);
1582             String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1583             String fetched = tblname + "[" + indexNoParens + "]";
1584             String zeroextended = (zeroextend[k] == 0) ? fetched :
1585                 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1586             int adjustment = preshifted[k] ? 0 :
1587                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1588             String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1589                 "(" + zeroextended + "<<" + adjustment + ")";
1590             String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1591                 (bits == 2) ? "((" + var + "&0xF)<<1)" :
1592                 (bits == 4) ? "((" + var + "&7)<<2)" : null;
1593             String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1594                 "((" + adjusted + ">>" + bitshift + ")&" +
1595                 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1596             access = extracted;
1597         }
1598         return access;
1599     }
1600 
    /* The command line arguments are decoded and used to set the following
     global variables.  The decoding is done by processArgs.
     */

    // -verbose or -v
    static boolean verbose = false;
    // -nobidi
    static boolean nobidi = false;
    // -nomirror
    static boolean nomirror = false;
    // -identifiers; also selects the default bit field widths 8 4 4
    static boolean identifiers = false;
    // -c: generate C syntax rather than Java syntax
    static boolean Csyntax = false;
    // -template filename; defaults to the C or Java template depending on Csyntax
    static String TemplateFileName = null;
    // -o filename; defaults to the C or Java output file depending on Csyntax
    static String OutputFileName = null;
    // -spec filename
    static String UnicodeSpecFileName = null; // liu
    // -specialcasing filename
    static String SpecialCasingFileName = null;
    // -proplist filename
    static String PropListFileName = null;
    // -usecharforbyte: genTable emits 8-bit string tables with element type char
    static boolean useCharForByte = false;
    // bit field widths, one per lookup step, taken from the numeric arguments
    static int[] sizes;
    static int bins = 0; // liu; if > 0, then perform search
    // -string: emit tables as strings (rejected together with -c)
    static boolean tableAsString = false;
    // -latin1; cleared again by a -plane value greater than 0
    static boolean bLatin1 = false;

    // the command line as given, followed by the defaults that were applied,
    // each default enclosed in square brackets
    static String commandLineDescription;

    /* Other global variables, equal in length to the "sizes" array. */

    // These arrays are allocated by generateForSizes.
    static int[] shifts;
    static int[] zeroextend;
    static int[] bytes;
    static boolean[] preshifted;
    static long[][] tables;


    /* Other global variables */
    // comment delimiters of the output language: the opening and closing
    // delimiters of a C comment (the closing one preceded by a blank) for
    // C syntax, "//" and the empty string for Java syntax
    static String commentStart;
    static String commentEnd;

    // NOTE(review): presumably collects the code produced by addInitializer
    // for tables generated as strings -- confirm against addInitializer
    static StringBuffer initializers = new StringBuffer();

    /* special casing rules for 1:M toUpperCase mappings */
    static SpecialCaseMap[] specialCaseMaps;
1640 
1641     /**
1642     * Process the command line arguments.
1643     *
1644     * The allowed flags in command line are:
1645     * <dl>
1646     * <dt> -verbose             <dd> Emit comments to standard output describing
1647     *                                   what's going on during the processing.
1648     * <dt> -nobidi              <dd> Do not include bidi categories in the
1649     *                                   encoded character properties.
1650     * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1651     *                        character properties.
1652     * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1653     * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1654     * <dt> -o filename          <dd> Specify output file name.
1655     * <dt> -template filename   <dd> Specify template input file name.
1656     * <dt> -spec filename        <dd> Specify Unicode spec file name.
1657     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1658     * <dt> -search bins          <dd> Try different partitions into the specified
1659     *                                    number of bins.  E.g., for 2 bins, try
1660     *                                    16 0, 15 1,..., 0 16.
1661     * <dt> -string               <dd> Create table as string.  Only valid with Java
1662     *                                    syntax.
1663     * <dt> -latin1          <dd> Create a latin 1 only property table.
1664     * </dl>
1665     * In addition, decimal literals may appear as command line arguments;
1666     * each one represents the number of bits of the character to be broken
1667     * off at each lookup step.  If present, they must add up to 16 (the number
1668     * of bits in a char value).  For smaller tables, the last value should
1669     * be 0; values other than the last one may not be zero.  If no such
1670     * numeric values are provided, default values are used.
1671     *
1672     * @param args       the command line arguments, as an array of String
1673     *
1674     * @see GenerateCharacter#main
1675     */
1676 
1677     static void processArgs(String[] args) {
1678         StringBuffer desc = new StringBuffer("java GenerateCharacter");
1679         for (int j=0; j<args.length; ++j) {
1680             desc.append(" " + args[j]);
1681         }
1682         for (int j = 0; j < args.length; j++) {
1683             if (args[j].equals("-verbose") || args[j].equals("-v"))
1684                 verbose = true;
1685             else if (args[j].equals("-nobidi"))
1686                 nobidi = true;
1687             else if (args[j].equals("-nomirror"))
1688                 nomirror = true;
1689             else if (args[j].equals("-identifiers"))
1690                 identifiers = true;
1691             else if (args[j].equals("-c"))
1692                 Csyntax = true;
1693             else if (args[j].equals("-string"))
1694                 tableAsString = true;
1695             else if (args[j].equals("-o")) {
1696                 if (j == args.length - 1) {
1697                     FAIL("File name missing after -o");
1698                 }
1699                 else {
1700                     OutputFileName = args[++j];
1701                 }
1702             }
1703             else if (args[j].equals("-search")) {
1704                 if (j == args.length - 1)
1705                     FAIL("Bin count missing after -search");
1706                 else {
1707                     bins = Integer.parseInt(args[++j]);
1708                     if (bins < 1 || bins > 10)
1709                         FAIL("Bin count must be >= 1 and <= 10");
1710                 }
1711             }
1712             else if (args[j].equals("-template")) {
1713                 if (j == args.length - 1)
1714                     FAIL("File name missing after -template");
1715                 else
1716                     TemplateFileName = args[++j];
1717             }
1718             else if (args[j].equals("-spec")) { // liu
1719                 if (j == args.length - 1) {
1720                     FAIL("File name missing after -spec");
1721                 }
1722                 else {
1723                     UnicodeSpecFileName = args[++j];
1724                 }
1725             }
1726             else if (args[j].equals("-specialcasing")) {
1727                 if (j == args.length -1) {
1728                     FAIL("File name missing after -specialcasing");
1729                 }
1730                 else {
1731                     SpecialCasingFileName = args[++j];
1732                 }
1733             }
1734             else if (args[j].equals("-proplist")) {
1735                 if (j == args.length -1) {
1736                     FAIL("File name missing after -proplist");
1737                 }
1738                 else {
1739                     PropListFileName = args[++j];
1740                 }
1741             }
1742             else if (args[j].equals("-plane")) {
1743                 if (j == args.length -1) {
1744                     FAIL("Plane number missing after -plane");
1745                 }
1746                 else {
1747                     plane = Integer.parseInt(args[++j]);
1748                 }
1749                 if (plane > 0) {
1750                     bLatin1 = false;
1751                 }
1752             }
1753             else if ("-usecharforbyte".equals(args[j])) {
1754                 useCharForByte = true;
1755             }
1756             else if (args[j].equals("-latin1")) {
1757                 bLatin1 = true;
1758                 plane = 0;
1759             }
1760             else {
1761                 try {
1762                     int val = Integer.parseInt(args[j]);
1763                     if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1764                     if (sizes == null)
1765                         sizes = new int[1];
1766                     else {
1767                         int[] newsizes = new int[sizes.length + 1];
1768                         System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1769                         sizes = newsizes;
1770                     }
1771                     sizes[sizes.length - 1] = val;
1772                 }
1773                 catch(NumberFormatException e) {
1774                     FAIL("Unknown switch: " + args[j]);
1775                 }
1776             }
1777         }
1778         if (Csyntax && tableAsString) {
1779             FAIL("Can't specify table as string with C syntax");
1780         }
1781         if (sizes == null) {
1782             desc.append(" [");
1783             if (identifiers) {
1784                 int[] newsizes = { 8, 4, 4 };           // Good default values
1785                 desc.append("8 4 4]");
1786                 sizes = newsizes;
1787             }
1788             else {
1789                 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1790                 desc.append("10 5 1]");
1791                 sizes = newsizes;
1792             }
1793         }
1794         if (UnicodeSpecFileName == null) { // liu
1795             UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1796             desc.append(" [-spec " + UnicodeSpecFileName + ']');
1797         }
1798         if (SpecialCasingFileName == null) {
1799             SpecialCasingFileName = DefaultSpecialCasingFileName;
1800             desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1801         }
1802         if (PropListFileName == null) {
1803             PropListFileName = DefaultPropListFileName;
1804             desc.append(" [-proplist " + PropListFileName + ']');
1805         }
1806         if (TemplateFileName == null) {
1807             TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1808                   : DefaultJavaTemplateFileName);
1809             desc.append(" [-template " + TemplateFileName + ']');
1810         }
1811         if (OutputFileName == null) {
1812             OutputFileName = (Csyntax ? DefaultCOutputFileName
1813                     : DefaultJavaOutputFileName);
1814             desc.append(" [-o " + OutputFileName + ']');
1815         }
1816         commentStart = (Csyntax ? "/*" : "//");
1817         commentEnd = (Csyntax ? " */" : "");
1818         commandLineDescription = desc.toString();
1819     }
1820 
1821     private static void searchBins(long[] map, int binsOccupied) throws Exception {
1822         int bitsFree = 16;
1823         for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1824         if (binsOccupied == (bins-1)) {
1825             sizes[binsOccupied] = bitsFree;
1826             generateForSizes(map);
1827         }
1828         else {
1829             for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1830                 sizes[binsOccupied] = i;
1831                 searchBins(map, binsOccupied+1);
1832             }
1833         }
1834     }
1835 
1836     private static void generateForSizes(long[] map) throws Exception {
1837         int sum = 0;
1838         shifts = new int[sizes.length];
1839         for (int k = sizes.length - 1; k >= 0; k--) {
1840             shifts[k] = sum;
1841             sum += sizes[k];
1842         }
1843         if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1844             FAIL("Bit field widths total to " + sum +
1845              ": wrong total for map of size " + map.length);
1846         }
1847         // need a table for each set of lookup bits in char
1848         tables = new long[sizes.length][];
1849         // the last table is the map
1850         tables[sizes.length - 1] = map;
1851         for (int j = sizes.length - 1; j > 0; j--) {
1852             if (verbose && bins==0)
1853                 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1854             long[][] temp = buildTable(tables[j], sizes[j]);
1855             tables[j-1] = temp[0];
1856             tables[j] = temp[1];
1857         }
1858         preshifted = new boolean[sizes.length];
1859         zeroextend = new int[sizes.length];
1860         bytes = new int[sizes.length];
1861         for (int j = 0; j < sizes.length - 1; j++) {
1862             int len = tables[j+1].length;
1863             int size = sizes[j+1];
1864             if (len > 0x100 && (len >> size) <= 0x100) {
1865                 len >>= size;
1866                 preshifted[j] = false;
1867             }
1868             else if (len > 0x10000 && (len >> size) <= 0x10000) {
1869                 len >>= size;
1870                 preshifted[j] = false;
1871             }
1872             else preshifted[j] = true;
1873             if (Csyntax)
1874                 zeroextend[j] = 0;
1875             else if (len > 0x7F && len <= 0xFF) {
1876                 if (!useCharForByte) {
1877                     zeroextend[j] = 0xFF;
1878                 }
1879             } else if (len > 0x7FFF && len <= 0xFFFF)
1880                 zeroextend[j] = 0xFFFF;
1881             else zeroextend[j] = 0;
1882             if (len <= 0x100) bytes[j] = 1;
1883             else if (len <= 0x10000) bytes[j] = 2;
1884             else bytes[j] = 4;
1885         }
1886         preshifted[sizes.length - 1] = true;
1887         zeroextend[sizes.length - 1] = 0;
1888         bytes[sizes.length - 1] = 0;
1889         if (bins > 0) {
1890             int totalBytes = getTotalBytes();
1891             String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1892             int accessComplexity = 0;
1893             for (int j=0; j<access.length(); ++j) {
1894                 char ch = access.charAt(j);
1895                 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1896                 if (ch == '<' || ch == '>') ++j;
1897             }
1898             System.out.print("(");
1899             for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1900             System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1901             return;
1902         }
1903         if (verbose) {
1904             System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1905             for (int j = 0; j < sizes.length; j++) {
1906                 System.out.println(dec5(j) + "\t" +
1907                     dec5(sizes[j]) + "\t" +
1908                     dec5(tables[j].length) + "\t" +
1909                     dec5(shifts[j]) + "\t" +
1910                     dec5(zeroextend[j]) + "\t" +
1911                     dec5(bytes[j]) + "\t " +
1912                     preshifted[j]);
1913             }
1914         }
1915         if (verbose) {
1916             System.out.println("Generating source code for class Character");
1917             System.out.println("A table access looks like " +
1918                          genAccess("A", "ch", (identifiers ? 2 : 32)));
1919         }
1920         generateCharacterClass(TemplateFileName, OutputFileName);
1921     }
1922 
1923     /**
1924     * The main program for generating source code for the Character class.
1925     * The basic outline of its operation is:
1926     * <ol>
1927     * <li> Process the command line arguments.  One result of this process
1928     *           is a list of sizes (measured in bits and summing to 16).
1929     * <li> Get the Unicode character property data from the specification file.
1930     * <li> From that, build a map that has, for each character code, its
1931     *           relevant properties encoded as a long integer value.
1932     * <li> Repeatedly compress the map, producing a compressed table and a
1933     *           new map.  This is done once for each size value in the list.
1934     *           When this is done, we have a set of tables.
1935     * <li> Make some decisions about table representation; record these
1936     *           decisions in arrays named preshifted, zeroextend, and bytes.
1937     * <li> Generate the source code for the class Character by performing
1938     *           macro processing on a template file.
1939     * </ol>
1940     *
1941     * @param args       the command line arguments, as an array of String
1942     *
1943     * @see GenerateCharacter#processArgs
1944     * @see UnicodeSpec@readSpecFile
1945     * @see GenerateCharacter#buildMap
1946     * @see GenerateCharacter#buildTable
1947     * @see GenerateCharacter#generateCharacterClass
1948     */
1949 
1950     public static void main(String[] args) {
1951         processArgs(args);
1952         try {
1953 
1954             UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1955             specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1956             PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1957 
1958             if (verbose) {
1959                 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1960             }
1961             long[] map = buildMap(data, specialCaseMaps, propList);
1962             if (verbose) {
1963                 System.err.println("Completed building of initial map");
1964             }
1965 
1966             if (bins == 0) {
1967                 generateForSizes(map);
1968             }
1969             else {
1970                 while (bins > 0) {
1971                     sizes = new int[bins];
1972                     searchBins(map, 0);
1973                     --bins;
1974                 }
1975             }
1976             if (verbose && false) {
1977                 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1978                              hex8(maxOffsetSeen));
1979                 System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1980                              hex8(maxOffset));
1981             }
1982         }
1983         catch (FileNotFoundException e) { FAIL(e.toString()); }
1984         catch (IOException e) { FAIL(e.toString()); }
1985         catch (Throwable e) {
1986             System.out.println("Unexpected exception:");
1987             e.printStackTrace();
1988             FAIL("Unexpected exception!");
1989         }
1990         if (verbose) { System.out.println("Done!");}
1991     }
1992 
1993 }   // end class