1 /*
   2  * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package build.tools.generatecharacter;
  27 
  28 import java.io.IOException;
  29 import java.io.FileNotFoundException;
  30 import java.io.BufferedReader;
  31 import java.io.FileReader;
  32 import java.io.PrintWriter;
  33 import java.io.BufferedWriter;
  34 import java.io.FileWriter;
  35 import java.io.File;
  36 import java.util.List;
  37 
  38 import build.tools.generatecharacter.CharacterName;
  39 
  40 /**
  41  * This program generates the source code for the class java.lang.Character.
  42  * It also generates native C code that can perform the same operations.
  43  * It requires two external input data files:
  44  * <ul>
  45  * <li> Unicode specification file
  46  * <li> Character class template file
  47  * </ul>
  48  * The Unicode specification file is available from the Unicode consortium.
  49  * It has character specification lines that look like this:
  50  * <listing>
  51  * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
  52  * </listing>
  53  * The Character class template file is filled in with additional
  54  * information to produce the file Character.java, which can then be
  55  * compiled by a Java compiler.  The template file contains certain
  56  * markers consisting of an alphabetic name string preceded by "$$".
  57  * Such markers are replaced with generated program text.  As a special
  58  * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
  59  * alphabetic characters constituting a variable name.  The character "_"
  60  * is considered alphabetic for these purposes.
  61  *
  62  * @author  Guy Steele
  63  * @author  Alan Liu
  64  * @author  John O'Conner
  65  */
  66 
  67 public class GenerateCharacter {
  68 
    // When true, buildOne reports case-mapping anomalies through FAIL.
    final static boolean DEBUG = false;

    // Prefix that introduces a macro marker in the template files.
    final static String commandMarker = "$$";
    // Prefix prepended to each of the default file names below (empty here).
    static String ROOT                        = "";
    // Default names of the input data files, the templates and the outputs.
    static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
    static String DefaultPropListFileName     = ROOT + "PropList.txt";
    static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
    static String DefaultJavaOutputFileName   = ROOT + "Character.java";
    static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
    static String DefaultCOutputFileName      = ROOT + "Character.c";

    // Plane being processed; buildMap starts mapping at code point (plane << 16).
    static int plane = 0;
  83 
  84     /* The overall idea is that, in the generated Character class source code,
  85     most character property data is stored in a special multi-level table whose
  86     structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
  87     The integers must sum to 16 (the number of bits in a character).
  88     The first table is indexed by the k1 high-order bits of the character code.
  89     The result is concatenated to the next k2 bits of the character code to index
  90     the second table, and so on.  Eventually the kn low-order bits of the character
  91     code are concatenated and used to index one of two tables A and B; A contains
  92     32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
  93     can be thus obtained encode the properties for the character.
  94 
  95     The default specification is [9, 4, 3, 0].  This particular table format was
  96     designed by conducting an exhaustive search of table formats to minimize the
  97     space consumed by the tables: the first and third tables need have only byte
  98     values (the second table must have short values).  Another good choice is
  99     [10, 6, 0], which produces a larger table but allows particularly fast table
 100     lookup code.
 101 
 102     In each case, where the word "concatenated" is used, this may imply
 103     first a << and then a | operation, or perhaps just a | operation if
 104     the values in the table can be preshifted (generally possible if the table
 105     entries are short rather than byte).
 106     */
 107 
    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).
 110 
 111     A: the low 32 bits are defined  in the following manner:
 112 
 113     1 bit Mirrored property.
 114     4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
    9 bits      A signed offset used for converting case.
 116     1 bit       If 1, adding the signed offset converts the character to lowercase.
 117     1 bit       If 1, subtracting the signed offset converts the character to uppercase.
 118         Note: for a titlecase character, both of the preceding bits will be 1
 119         and the signed offset will be 1.
 120     1 bit   If 1, this character has a titlecase equivalent (possibly itself);
 121         in this case, the two bits before this bit can be used to decide
 122         whether this character is in fact uppercase, lowercase, or titlecase.
 123     3 bits      This field provides a quick way to lex identifiers.
 124         The eight possible values for this field are as follows:
 125         0  May not be part of an identifier
 126         1  Ignorable control; may continue a Unicode identifier or Java identifier
 127         2  May continue a Java identifier but not a Unicode identifier (unused)
 128         3  May continue a Unicode identifier or Java identifier
 129         4  Is a Java whitespace character
 130         5  May start or continue a Java identifier;
 131            may continue but not start a Unicode identifier
 132            (this value is used for connector punctuation such as _)
 133         6  May start or continue a Java identifier;
 134            may not occur in a Unicode identifier
 135            (this value is used for currency symbols such as $)
 136         7  May start or continue a Unicode identifier or Java identifier
 137         Thus:
 138            5, 6, 7 may start a Java identifier
 139            1, 2, 3, 5, 6, 7 may continue a Java identifier
 140            7 may start a Unicode identifier
 141            1, 3, 5, 7 may continue a Unicode identifier
 142            1 is ignorable within an identifier
 143            4 is Java whitespace
 144     2 bits      This field indicates whether the character has a numeric property.
 145         The four possible values for this field are as follows:
 146         0  This character has no numeric property.
 147         1  Adding the digit offset to the character code and then
 148            masking with 0x1F will produce the desired numeric value.
 149         2  This character has a "strange" numeric value.
 150         3  A Java supradecimal digit: adding the digit offset to the
 151            character code, then masking with 0x1F, then adding 10
 152            will produce the desired numeric value.
 153     5 bits  The digit offset (see description of previous field)
 154     5 bits      Character type (see below)
 155 
 156     B: the high 16 bits are defined as:
 157     1 bit Other_Lowercase property
 158     1 bit Other_Uppercase property
 159     1 bit Other_Alphabetic property
 160     1 bit Other_Math property
 161     1 bit Ideographic property
 162     1 bit Noncharacter codepoint property
 163     1 bit ID_Start property
 164     1 bit ID_Continue property
 165     */
 166 
 167 
    // Bit masks identify each component of the 32-bit property field (part A)
    // described above.
    // shift* indicates how many bits a value must be shifted right to bring the
    // indicated property into the lowest bits of the 32-bit space.
    private static final int
        shiftType           = 0,        maskType            =       0x001F,
        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
                                        // lowest bit of the identifier info field
                                        maskUnicodePart     =       0x1000,
        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
                                        // the three single-bit case flags inside maskCaseInfo
                                        maskLowerCase       =      0x20000,
                                        maskUpperCase       =      0x10000,
                                        maskTitleCase       =      0x08000,
        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
        // NOTE(review): shiftCaseOffsetSign is not referenced in the visible part
        // of this file -- confirm whether it is still used.
        shiftCaseOffsetSign = 5,
                                        // used only when calculating and
                                        // storing digit offsets from char values
                                        maskDigit               =   0x001F,
                                        // case offsets are 9 bits wide (unshifted mask)
                                        maskCase                =   0x01FF,
        shiftBidi           = 27,       maskBidi              = 0x78000000,
        // maskMirrored is declared separately below because it must be a long
        shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
        // presumably selects the plane number of a code point -- TODO confirm usage
        shiftPlane          = 16,       maskPlane = 0xFF0000;
 192 
    // maskMirrored must be a long: as an int, 0x80000000 is negative and would
    // sign-extend into the upper 32 bits when OR-ed into the long property value.
    private static final long maskMirrored          = 0x80000000L;
 195 
    // Bit masks for the property bits of the B table described above.  They
    // occupy bits 32-39 of the long property value, directly above the 32-bit
    // A part.  buildMap currently sets all of them except maskOtherMath and
    // maskNoncharacterCP (those two calls are commented out there).
    private static final long
        maskOtherLowercase  = 0x100000000L,
        maskOtherUppercase  = 0x200000000L,
        maskOtherAlphabetic = 0x400000000L,
        maskOtherMath       = 0x800000000L,
        maskIdeographic     = 0x1000000000L,
        maskNoncharacterCP  = 0x2000000000L,
        maskIDStart         = 0x4000000000L,
        maskIDContinue      = 0x8000000000L;
 207 
    // Pre-shifted field values.  Compare (properties & maskNumericType) or
    // (properties & maskIdentifierInfo) with these to determine the numeric
    // or lexical type of a character.
    public static int
        // values of the 2-bit numeric type field (maskNumericType)
        valueNotNumeric             = 0x0000,
        valueDigit                  = 0x0400,
        valueStrangeNumeric         = 0x0800,
        valueJavaSupradecimal       = 0x0C00,
        // values 1..7 of the 3-bit identifier info field (maskIdentifierInfo)
        valueIgnorable              = 0x1000,
        valueJavaOnlyPart           = 0x2000,
        valueJavaUnicodePart        = 0x3000,
        valueJavaWhitespace         = 0x4000,
        valueJavaStartUnicodePart   = 0x5000,
        valueJavaOnlyStart          = 0x6000,
        valueJavaUnicodeStart       = 0x7000,
        // buildOne treats identifier info values >= lowJavaStart as Java identifier starts
        lowJavaStart                = 0x5000,
        // the two low bits of the identifier info field, tested by buildOne
        nonzeroJavaPart             = 0x3000,
        valueUnicodeStart           = 0x7000;
 225 
    // These values are used when only identifier properties are generated
    // for use in verifier code.  They shorten the property down to a single
    // byte: buildOne replaces the whole property value with a combination of
    // bitJavaStart and bitJavaPart when the identifiers flag is set.
    private static final int
        bitJavaStart            = 0x02,
        bitJavaPart             = 0x01,
        maskIsJavaIdentifierPart = bitJavaPart,
        maskIsJavaIdentifierStart = bitJavaStart;
 233 
    // Largest and smallest case offsets that buildOne stores as-is in the 9-bit
    // case offset field (maskCase / 2 == 255).  For offsets outside this range
    // buildOne stores maskCase instead.
    static int maxOffset = maskCase/2 ;
    static int minOffset = -maxOffset;
 236 
    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  hex2, hex4 and hex8 first mask the value
     to its low 8, 16 and 32 bits respectively, so their output is always exactly that
     wide; hex16 never needs more than 16 digits for a long.  For dec3 and dec5, if the
     number of characters required to represent the value is greater than the desired
     number, then all the characters that are required are actually produced.
    */
 245 
 246     static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
 247 
 248     static String hex2(long n) {
 249         String q = Long.toHexString(n & 0xFF).toUpperCase();
 250         return "00".substring(Math.min(2, q.length())) + q;
 251     }
 252 
 253     static String hex4(long n) {
 254         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
 255         return "0000".substring(Math.min(4, q.length())) + q;
 256     }
 257 
 258     static String hex8(long n) {
 259         String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
 260         return "00000000".substring(Math.min(8, q.length())) + q;
 261     }
 262 
 263     static String hex16(long n) {
 264         String q = Long.toHexString(n).toUpperCase();
 265         return "0000000000000000".substring(Math.min(16, q.length())) + q;
 266     }
 267 
 268     static String dec3(long n) {
 269         String q = Long.toString(n);
 270         return "   ".substring(Math.min(3, q.length())) + q;
 271     }
 272 
 273     static String dec5(long n) {
 274         String q = Long.toString(n);
 275         return "     ".substring(Math.min(5, q.length())) + q;
 276     }
 277 
 278     /* This routine is called when some failure occurs. */
 279 
 280     static void FAIL(String s) {
 281         System.out.println("** " + s);
 282     }
 283 
 284     /**
 285     * Given the data from the Unicode specification file, this routine builds a map.
 286     *
 287     * The specification file is assumed to contain its data in sorted order by
 288     * character code; as a result, the array passed as an argument to this method
 289     * has its components in the same sorted order, with one entry for each defined
 290     * Unicode character or character range.  (A range is indicated by two consecutive
 291     * entries, such that the name of the first entry begins with "<" and ends with
 292     * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
 293     * therefore a sparse representation of the character property data.
 294     *
 295     * The resulting map is dense representation of the character data.  It contains
 296     * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
 297     * of this long value are used, but type long is used rather than int to facilitate
 298     * future extensions of this source code generator that might require more than
 299     * 32 bits to encode relevant character properties.)  Entry k holds the encoded
 300     * properties for character k.
 301     *
 302     * Method buildMap manages the transformation from the sparse representation to
 303     * the dense representation.  It calls method buildOne to handle the encoding
 304     * of character property data from a single UnicodeSpec object into 32 bits.
 305     * For undefined characters, method buildOne is not called and the map entry for
 306     * that character is set to UnicodeSpec.UNASSIGNED.
 307     *
 308     * @param data       character property data from the Unicode specification file
 309     * @return   an array of length 65536 with one entry for every possible char value
 310     *
 311     * @see GenerateCharacter#buildOne
 312     */
 313 
 314     static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
 315     {
 316         long[] result;
 317         if (bLatin1 == true) {
 318             result = new long[256];
 319         } else {
 320             result = new long[1<<16];
 321         }
 322         int k=0;
 323         int codePoint = plane<<16;
 324         UnicodeSpec nonCharSpec = new UnicodeSpec();
 325         for (int j = 0; j < data.length && k < result.length; j++) {
 326             if (data[j].codePoint == codePoint) {
 327                 result[k] = buildOne(codePoint, data[j], specialMaps);
 328                 ++k;
 329                 ++codePoint;
 330             }
 331             else if(data[j].codePoint > codePoint) {
 332                 if (data[j].name.endsWith("Last>")) {
 333                     // build map data for all chars except last in range
 334                     while (codePoint < data[j].codePoint && k < result.length) {
 335                         result[k] = buildOne(codePoint, data[j], specialMaps);
 336                         ++k;
 337                         ++codePoint;
 338                     }
 339                 }
 340                 else {
 341                     // we have a few unassigned chars before data[j].codePoint
 342                     while (codePoint < data[j].codePoint && k < result.length) {
 343                         result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 344                         ++k;
 345                         ++codePoint;
 346                     }
 347                 }
 348                 k = data[j].codePoint & 0xFFFF;
 349                 codePoint = data[j].codePoint;
 350                 result[k] = buildOne(codePoint, data[j], specialMaps);
 351                 ++k;
 352                 ++codePoint;
 353             }
 354             else {
 355                 System.out.println("An error has occured during spec mapping.");
 356                 System.exit(0);
 357             }
 358         }
 359         // if there are still unprocessed chars, process them
 360         // as unassigned/undefined.
 361         codePoint = (plane<<16) | k;
 362         while (k < result.length) {
 363             result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
 364             ++k;
 365             ++codePoint;
 366         }
 367         // now add all extra supported properties from PropList, to the
 368         // upper 16-bit
 369         addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
 370         addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
 371         addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
 372         addExProp(result, propList, "Ideographic", maskIdeographic);
 373         //addExProp(result, propList, "Other_Math", maskOtherMath);
 374         //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
 375         addExProp(result, propList, "ID_Start", maskIDStart);
 376         addExProp(result, propList, "ID_Continue", maskIDContinue);
 377 
 378         return result;
 379     }
 380 
    // The maximum and minimum case offsets found while scanning the database.
    // buildOne updates them for every character it encodes, before it clamps
    // out-of-range offsets.
    static int maxOffsetSeen = 0;
    static int minOffsetSeen = 0;
 384 
 385     /**
 386      * Some Unicode separator characters are not considered Java whitespace.
 387      * @param c character to test
 388      * @return true if c in an invalid Java whitespace character, false otherwise.
 389      */
 390     static boolean isInvalidJavaWhiteSpace(int c) {
 391         int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
 392         boolean retValue = false;
 393         for(int x=0;x<exceptions.length;x++) {
 394             if(c == exceptions[x]) {
 395                 retValue = true;
 396                 break;
 397             }
 398         }
 399         return retValue;
 400 
 401     }
 402 
 403     /**
 404     * Given the character property data for one Unicode character, encode the data
 405     * of interest into a single long integer value.  (Right now only 32 bits
 406     * of this long value are used, but type long is used rather than int to facilitate
 407     * future extensions of this source code generator that might require more than
 408     * 32 bits to encode relevant character properties.)
 409     *
 410     * @param c   the character code for which to encode property data
 411     * @param us  property data record from the Unicode specification file
 412     *            (its character code might not be equal to c if it specifies data
 413     *            for a range of characters)
 414     * @return   an encoded long value that contains the properties for a single char
 415     *
 416     * @see GenerateCharacter#buildMap
 417     */
 418 
 419     static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
 420         long resultA = 0;
 421         // record the general category
 422         resultA |= us.generalCategory;
 423 
 424         // record the numeric properties
 425         NUMERIC: {
 426         STRANGE: {
 427             int val = 0;
 428             // c is A-Z
 429             if ((c >= 0x0041) && (c <= 0x005A)) {
 430                 val = c - 0x0041;
 431                 resultA |= valueJavaSupradecimal;
 432             // c is a-z
 433             } else if ((c >= 0x0061) && (c <= 0x007A)) {
 434                 val = c - 0x0061;
 435                 resultA |= valueJavaSupradecimal;
 436             // c is a full-width A-Z
 437             } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
 438                 val = c - 0xFF21;
 439                 resultA |= valueJavaSupradecimal;
 440             // c is a full-width a-z
 441             } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
 442                 val = c - 0xFF41;
 443                 resultA |= valueJavaSupradecimal;
 444             } else if (us.isDecimalValue()) {
 445                 val = us.decimalValue;
 446                 resultA |= valueDigit;
 447             } else if (us.isDigitValue()) {
 448                 val = us.digitValue;
 449                 resultA |= valueDigit;
 450             } else {
 451                 if (us.numericValue.length() == 0) {
 452                     break NUMERIC;                      // no numeric value at all
 453                 } else {
 454                     try {
 455                         val = Integer.parseInt(us.numericValue);
 456                         if (val >= 32 || val < 0) break STRANGE;
 457                         if (c == 0x215F) break STRANGE;
 458                     } catch(NumberFormatException e) {
 459                         break STRANGE;
 460                     }
 461                     resultA |= valueDigit;
 462                 }
 463             }
 464             if (val >= 32 || val < 0) break STRANGE;
 465             resultA |= ((val - c & maskDigit) << shiftDigitOffset);
 466             break NUMERIC;
 467         } // end STRANGE
 468         resultA |= valueStrangeNumeric;
 469         } // end NUMERIC
 470 
 471         // record case mapping
 472         int offset = 0;
 473         // might have a 1:M mapping
 474         int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
 475         boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
 476         if (bHasUpper) {
 477             resultA |= maskUpperCase;
 478         }
 479         if (specialMap != -1) {
 480             // has mapping, but cannot record the
 481             // proper offset; can only flag it and provide special case
 482             // code in Character.java
 483             offset = -1;
 484         }
 485         else if (us.hasUpperMap())  {
 486             offset = c - us.upperMap;
 487         }
 488 
 489         if (us.hasLowerMap()) {
 490             resultA |= maskLowerCase;
 491             if (offset == 0)
 492                 offset = us.lowerMap - c;
 493             else if (offset != (us.lowerMap - c)) {
 494                 if (DEBUG) {
 495                 FAIL("Character " + hex(c) +
 496                 " has incompatible lowercase and uppercase mappings");
 497                 }
 498             }
 499         }
 500         if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
 501             (bHasUpper && us.hasLowerMap())) {
 502             resultA |= maskTitleCase;
 503         }
 504         if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
 505             System.out.println("Warning: Character " + hex4(c) + " has upper but " +
 506                                "no title case; Java won't know this");
 507         }
 508         if (offset < minOffsetSeen) minOffsetSeen = offset;
 509         if (offset > maxOffsetSeen) maxOffsetSeen = offset;
 510         if (offset > maxOffset || offset < minOffset) {
 511             if (DEBUG) {
 512             FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
 513             }
 514             offset = maskCase;
 515         }
 516         resultA |= ((offset & maskCase) << shiftCaseOffset);
 517 
 518         // record lexical info about this character
 519         if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
 520                 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
 521                 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
 522                 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
 523                 || us.generalCategory == UnicodeSpec.OTHER_LETTER
 524                 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
 525             resultA |= valueJavaUnicodeStart;
 526         }
 527         else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
 528                 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
 529                 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
 530             resultA |= valueJavaUnicodePart;
 531         }
 532         else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
 533             resultA |= valueJavaStartUnicodePart;
 534         }
 535         else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
 536             resultA |= valueJavaOnlyStart;
 537         }
 538         else if (((c >= 0x0000) && (c <= 0x0008))
 539                 || ((c >= 0x000E) && (c <= 0x001B))
 540                 || ((c >= 0x007F) && (c <= 0x009F))
 541                 || us.generalCategory == UnicodeSpec.FORMAT) {
 542             resultA |= valueIgnorable;
 543         }
 544         else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
 545                 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
 546                 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
 547             if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
 548         }
 549         else if (((c >= 0x0009) && (c <= 0x000D))
 550                 || ((c >= 0x001C) && (c <= 0x001F))) {
 551             resultA |= valueJavaWhitespace;
 552         }
 553 
 554         // record bidi category
 555         if (!nobidi) {
 556             int tmpBidi =
 557                 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
 558                     us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
 559             resultA |= tmpBidi;
 560         }
 561 
 562         // record mirrored property
 563         if (!nomirror) {
 564             resultA |= us.mirrored ? maskMirrored : 0;
 565         }
 566 
 567         if (identifiers) {
 568             long replacement = 0;
 569             if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
 570                 replacement |= bitJavaStart;
 571             }
 572             if ( ((resultA & nonzeroJavaPart) != 0)
 573                     && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
 574                 replacement |= bitJavaPart;
 575             }
 576             resultA = replacement;
 577         }
 578         return resultA;
 579     }
 580 
 581     static void addExProp(long[] map, PropList propList, String prop, long mask) {
 582         List<Integer> cps = propList.codepoints(prop);
 583         if (cps != null) {
 584             for (Integer cp : cps) {
 585                 if (cp < map.length)
 586                     map[cp] |= mask;
 587             }
 588         }
 589     }
 590 
 591     /**
 592     * This is the heart of the table compression strategy.  The inputs are a map
 593     * and a number of bits (size).  The map is simply an array of long integer values;
 594     * the number of bits indicates how index values for that map are to be split.
 595     * The length of the given map must be a multiple of (1 << size).  The result is
 596     * a new map z and a compressed table t such that for every valid index value k
 597     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
 598     *
 599     * In other words, the index k can be split into two parts, namely the "size"
 600     * low-order bits and all the remaining high-order bits; the high-order bits are then
 601     * remapped by map z to produce an index into table t.  In effect, the data of the
 602     * original map m is broken up into blocks of size (1<<size); the compression relies
 603     * on the expectation that many of these blocks will be identical and therefore need
 604     * be represented only once in the compressed table t.
 605     *
 606     * This method is intended to be used iteratively.  The first map to be handed
 607     * to it is the one constructed by method buildMap.  After that, the first of the
 608     * two arrays returned by this method is fed back into it for further compression.
 609     * At the end of the iteration, one has a starter map and a sequence of tables.
 610     *
 611     * The algorithm used to implement this computation is straightforward and not
 612     * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
 613     * to locate identical blocks, so overall the time complexity of the algorithm
 614     * is quadratic in the length of the input map.  Fortunately, speed is not crucial
 615     * to this application.
 616     *
 617     * @param map                a map to be compressed
 618     * @param size       the number of index bits to be split off by the compression
 619     * @return   an array of length 2 containing two arrays; the first is a new map
 620     *           and the second is a compressed data table
 621     *
 622     * @see GenerateCharacter#buildMap
 623     */
 624 
 625     static long[][] buildTable(long[] map, int size) {
 626         int n = map.length;
 627         if (((n >> size) << size) != n) {
 628             FAIL("Length " + n + " is not a multiple of " + (1 << size));
 629         }
 630         int m = 1 << size;
 631         // We know the final length of the new map up front.
 632         long[] newmap = new long[n >> size];
 633         // The buffer is used temporarily to hold data for the compressed table
 634         // because we don't know its final length yet.
 635         long[] buffer = new long[n];
 636         int ptr = 0;
 637 OUTER:  for (int i = 0; i < n; i += m) {
 638             // For every block of size m in the original map...
 639     MIDDLE: for (int j = 0; j < ptr; j += m) {
 640             // Find out whether there is already a block just like it in the buffer.
 641                 for (int k = 0; k < m; k++) {
 642                     if (buffer[j+k] != map[i+k])
 643                         continue MIDDLE;
 644                 }
 645                 // There is a block just like it at position j, so just
 646                 // put its index into the new map (thereby sharing it).
 647                 newmap[i >> size] = (j >> size);
 648                 continue OUTER;
 649             } // end MIDDLE
 650             // There is no block just like it already, so add it to
 651             // the buffer and put its index into the new map.
 652             for (int k = 0; k < m; k++) {
 653                 buffer[ptr+k] = map[i+k];
 654             }
 655             newmap[i >> size] = (ptr >> size);
 656             ptr += m;
 657         } // end OUTER
 658         // Now we know how long the compressed table should be,
 659         // so create a new array and copy data from the temporary buffer.
 660         long[] newdata = new long[ptr];
 661         for (int j = 0; j < ptr; j++) {
 662             newdata[j] = buffer[j];
 663         }
 664         // Return the new map and the new data table.
 665         long[][] result = { newmap, newdata };
 666         return result;
 667     }
 668 
 669     /**
 670     * Once the compressed tables have been computed, this method reads in a
 671     * template file for the source code to be generated and writes out the final
 672     * source code by acting as a sort of specialized macro processor.
 673     *
 674     * The first output line is a comment saying that the file was automatically
 675     * generated; it includes a timestamp.  All other output is generated by
 676     * reading a line from the template file, performing macro replacements,
 677     * and then writing the resulting line or lines of code to the output file.
 678     *
 679     * This method handles the I/O, the timestamp comment, and the locating of
 680     * macro calls within each input line.  The method replaceCommand is called
 681     * to generate replacement text for each macro call.
 682     *
 683     * Macro calls to be replaced are indicated in the template file by
 684     * occurrences of the commandMarker "$$".  The rest of the call may consist
 685     * of Java letters (including the underscore "_") and also of balanced
 686     * parentheses.
 687     *
 688     * @param theTemplateFileName
 689     *           the file name for the template input file
 690     * @param theOutputFileName
 691     *           the file name for the source code output file
 692     *
 693     *     @see GenerateCharacter#replaceCommand
 694     */
 695 
 696     static void generateCharacterClass(String theTemplateFileName,
 697                                        String theOutputFileName)
 698         throws FileNotFoundException, IOException {
 699         BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
 700         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
 701         out.println(commentStart +
 702             " This file was generated AUTOMATICALLY from a template file " +
 703             new java.util.Date() + commentEnd);
 704         int marklen = commandMarker.length();
 705         LOOP: while(true) {
 706             try {
 707                 String line = in.readLine();
 708                 if (line == null) break LOOP;
 709                 int pos = 0;
 710                 int depth = 0;
 711                 while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
 712                     int newpos = pos + marklen;
 713                     char ch = 'x';
 714                     SCAN: while (newpos < line.length() &&
 715                             (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
 716                             || ch == '(' || (ch == ')' && depth > 0))) {
 717                         ++newpos;
 718                         if (ch == '(') {
 719                             ++depth;
 720                         }
 721                         else if (ch == ')') {
 722                             --depth;
 723                             if (depth == 0)
 724                                 break SCAN;
 725                         }
 726                     }
 727                     String replacement = replaceCommand(line.substring(pos + marklen, newpos));
 728                     line = line.substring(0, pos) + replacement + line.substring(newpos);
 729                     pos += replacement.length();
 730                 }
 731                 out.println(line);
 732             }
 733             catch (IOException e) {
 734                 break LOOP;
 735             }
 736         }
 737         in.close();
 738         out.close();
 739     }
 740 
 741     /**
 742     * The replaceCommand method takes a command (a macro call without the
 743     * leading marker "$$") and computes replacement text for it.
 744     *
 745     * Most of the commands are simply names of integer constants that are defined
 746     * in the source code of this GenerateCharacter class.  The replacement text is
 747     * simply the value of the constant as an appropriately formatted integer literal.
 748     *
 749     * Two cases are more complicated, however.  The command "Tables" causes the
 750     * final map and compressed tables to be emitted, with elaborate comments
 751     * describing their contents.  (This is actually handled by method genTables.)
 752     * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
 753     * an expression that will return the character property data for the character
 754     * whose code is the value of the variable "xxx".  (this is handled by method
 755     * "genAccess".)
 756     *
 757     * @param x  a command from the template file to be replaced
 758     * @return   the replacement text, as a String
 759     *
 760     * @see GenerateCharacter#genTables
 761     * @see GenerateCharacter#genAccess
 762     * @see GenerateCharacter#generateCharacterClass
 763     */
 764 
 765     static String replaceCommand(String x) {
 766         if (x.equals("Tables")) return genTables();
 767         if (x.equals("Initializers")) return genInitializers();
 768         if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
 769                 x.substring(x.length()-1).equals(")") )
 770             return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
 771         if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
 772                 x.substring(x.length()-1).equals(")") )
 773             return genAccess("B", x.substring(9, x.length()-1), 16);
 774         if (x.equals("shiftType")) return Long.toString(shiftType);
 775         if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
 776         if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
 777         if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
 778         if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
 779         if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
 780         if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
 781         if (x.equals("maskCase")) return "0x" + hex8(maskCase);
 782         if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
 783         if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
 784         if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
 785         if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
 786         if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
 787         if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
 788         if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
 789         if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
 790         if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
 791         if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
 792         if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
 793         if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
 794         if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
 795         if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
 796         if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
 797         if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
 798         if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
 799         if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
 800         if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
 801         if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
 802         if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
 803         if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
 804         if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
 805         if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
 806         if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
 807         if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
 808         if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
 809         if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
 810         if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
 811         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 812         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 813         if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
 814         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
 815         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
 816         if (x.equals("maskType")) return "0x" + hex(maskType);
 817         if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
 818         if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
 819         if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
 820         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
 821             return Integer.toString(UnicodeSpec.UNASSIGNED);
 822         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
 823             return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
 824         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
 825             return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
 826         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
 827             return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
 828         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
 829              return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
 830         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
 831              return Integer.toString(UnicodeSpec.OTHER_LETTER);
 832         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
 833              return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
 834         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
 835              return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
 836         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
 837              return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
 838         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
 839              return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
 840         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
 841              return Integer.toString(UnicodeSpec.OTHER_NUMBER);
 842         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
 843              return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
 844         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
 845              return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
 846         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 847              return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
 848         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
 849             return Integer.toString(UnicodeSpec.CONTROL);
 850         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
 851             return Integer.toString(UnicodeSpec.FORMAT);
 852         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
 853             return Integer.toString(UnicodeSpec.PRIVATE_USE);
 854         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
 855             return Integer.toString(UnicodeSpec.SURROGATE);
 856         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
 857             return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
 858         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
 859             return Integer.toString(UnicodeSpec.START_PUNCTUATION);
 860         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
 861             return Integer.toString(UnicodeSpec.END_PUNCTUATION);
 862         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 863             return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
 864         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
 865             return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
 866         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
 867             return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
 868         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
 869             return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
 870         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
 871             return Integer.toString(UnicodeSpec.LETTER_NUMBER);
 872         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
 873             return Integer.toString(UnicodeSpec.MATH_SYMBOL);
 874         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
 875             return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
 876         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
 877             return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
 878         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
 879             return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
 880         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
 881             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
 882         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
 883             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
 884         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
 885             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
 886         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
 887             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
 888         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
 889             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
 890         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
 891             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
 892         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
 893             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
 894         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
 895             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
 896         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
 897             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
 898         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 899             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
 900         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
 901             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
 902         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
 903             return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
 904         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
 905             return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
 906         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
 907             return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
 908          if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
 909             return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
 910         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
 911             return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
 912         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
 913             return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
 914         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
 915             return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
 916         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
 917             return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
 918         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG]))
 919             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE);
 920         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG]))
 921             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE);
 922         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG]))
 923             return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE);
 924         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG]))
 925             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE);
 926         FAIL("Unknown text substitution marker " + commandMarker + x);
 927         return commandMarker + x;
 928     }
 929 
 930     /**
 931     * The genTables method generates source code for all the lookup tables
 932     * needed to represent the various Unicode character properties.
 933     * It simply calls the method genTable once for each table to be generated
 934     * and then generates a summary comment.
 935     *
 936     * @return   the replacement text for the "Tables" command, as a String
 937     *
 938     * @see GenerateCharacter#genTable
 939     * @see GenerateCharacter#replaceCommand
 940     */
 941     static String genTables() {
 942         int n = sizes.length;
 943         StringBuffer result = new StringBuffer();
 944         // liu : Add a comment showing the source of this table
 945         result.append(commentStart + " The following tables and code generated using:" +
 946                   commentEnd + "\n  ");
 947         result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
 948 
 949                 if (plane == 0 && bLatin1 == false) {
 950             genCaseMapTableDeclaration(result);
 951             genCaseMapTable(initializers, specialCaseMaps);
 952                 }
 953         int totalBytes = 0;
 954         for (int k = 0; k < n - 1; k++) {
 955             genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
 956                 sizes[k+1], false, false, k==0);
 957             int s = bytes[k];
 958             if (s == 1 && useCharForByte) {
 959                 s = 2;
 960             }
 961             totalBytes += tables[k].length * s;
 962         }
 963         genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
 964             sizes[n - 1], false, 0, true, !(identifiers), false);
 965 
 966         // If we ever need more than 32 bits to represent the character properties,
 967         // then a table "B" may be needed as well.
 968         genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
 969 
 970         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
 971         result.append(commentStart);
 972         result.append(" In all, the character property tables require ");
 973         result.append(totalBytes).append(" bytes.").append(commentEnd);
 974         if (verbose) {
 975             System.out.println("The character property tables require "
 976                  + totalBytes + " bytes.");
 977         }
 978         return result.toString();
 979     }
 980 
 981     /**
 982      * The genInitializers method generates the body of the
 983      * ensureInitted() method, which enables lazy initialization of
 984      * the case map table and other tables.
 985      */
 986     static String genInitializers() {
 987         return initializers.toString();
 988     }
 989 
 990     /**
 991      * Return the total number of bytes needed by all tables.  This is a stripped-
 992      * down copy of genTables().
 993      */
 994     static int getTotalBytes() {
 995         int n = sizes.length;
 996         int totalBytes = 0;
 997         for (int k = 0; k < n - 1; k++) {
 998             totalBytes += tables[k].length * bytes[k];
 999         }
1000         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
1001                          + 31) >> 5) << 2);
1002         return totalBytes;
1003     }
1004 
1005     static void appendEscapedStringFragment(StringBuffer result,
1006                                             char[] line,
1007                                             int length,
1008                                             boolean lastFragment) {
1009         result.append("    \"");
1010         for (int k=0; k<length; ++k) {
1011             result.append("\\u");
1012             result.append(hex4(line[k]));
1013         }
1014         result.append("\"");
1015         result.append(lastFragment ? ";" : "+");
1016         result.append("\n");
1017     }
1018 
    /**
     * Template for the initializer code that addInitializer emits when
     * entriesPerChar is greater than 1, i.e. when several table entries are
     * packed into each char of the data string.  The generated loop reads each
     * char of the data string and stores $$entriesPerChar entries from it,
     * taking the low-order bits first and shifting right by $$bits after each
     * entry.
     *
     * The "$$" tokens are replaced by addInitializer.  Because token names
     * stop at an underscore, "$$name_DATA" becomes the table name followed by
     * "_DATA".
     */
    static String SMALL_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        // The array allocation below is disabled, so the generated block does
        // not allocate the array itself.
        // "            $$name = new $$type[$$size];\n"+
        "            int len = $$name_DATA.length();\n"+
        "            int j=0;\n"+
        "            for (int i=0; i<len; ++i) {\n"+
        "                int c = $$name_DATA.charAt(i);\n"+
        "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
        "                    $$name[j++] = ($$type)c;\n"+
        "                    c >>= $$bits;\n"+
        "                }\n"+
        "            }\n"+
        "            assert (j == $$size);\n"+
        "        }\n";
1033 
    /**
     * Template for the initializer code that addInitializer emits when
     * entriesPerChar is exactly 1, i.e. when each char of the data string
     * holds one table entry.  The generated loop copies the chars one by one,
     * casting each to $$type.
     *
     * The "$$" tokens are replaced by addInitializer.  Because token names
     * stop at an underscore, "$$name_DATA" becomes the table name followed by
     * "_DATA".
     */
    static String SAME_SIZE_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        "            assert ($$name_DATA.length() == $$size);\n"+
        // The array allocation below is disabled, so the generated block does
        // not allocate the array itself.
        // "            $$name = new $$type[$$size];\n"+
        "            for (int i=0; i<$$size; ++i)\n"+
        "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
        "        }\n";
1041 
    /**
     * Template for the initializer code that addInitializer emits when
     * entriesPerChar is zero or negative (other than -2, which selects
     * INT32_INITIALIZER), i.e. when one table entry spans several chars of the
     * data string.  The generated loop ORs $$charsPerEntry consecutive chars
     * into one entry, shifting left by 16 bits between chars, so the first
     * char of each group ends up in the most significant position.
     *
     * The "$$" tokens are replaced by addInitializer.  Because token names
     * stop at an underscore, "$$name_DATA" becomes the table name followed by
     * "_DATA".
     */
    static String BIG_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        // The array allocation below is disabled, so the generated block does
        // not allocate the array itself.
        // "            $$name = new $$type[$$size];\n"+
        "            int len = $$name_DATA.length();\n"+
        "            int j=0;\n"+
        "            int charsInEntry=0;\n"+
        "            $$type entry=0;\n"+
        "            for (int i=0; i<len; ++i) {\n"+
        "                entry |= $$name_DATA.charAt(i);\n"+
        "                if (++charsInEntry == $$charsPerEntry) {\n"+
        "                    $$name[j++] = entry;\n"+
        "                    entry = 0;\n"+
        "                    charsInEntry = 0;\n"+
        "                }\n"+
        "                else {\n"+
        "                    entry <<= 16;\n"+
        "                }\n"+
        "            }\n"+
        "            assert (j == $$size);\n"+
        "        }\n";
1062 
    /**
     * Template for the initializer code that addInitializer emits when
     * entriesPerChar is -2.  The generated loop converts the data string to a
     * char array and combines each pair of chars into one int entry, with the
     * first char of the pair in the upper 16 bits and the second in the lower
     * 16 bits.
     *
     * The "$$" tokens are replaced by addInitializer.  Because token names
     * stop at an underscore, "$$name_DATA" becomes the table name followed by
     * "_DATA".
     */
    static String INT32_INITIALIZER =
        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
        "            char[] data = $$name_DATA.toCharArray();\n"+
        "            assert (data.length == ($$size * 2));\n"+
        "            int i = 0, j = 0;\n"+
        "            while (i < ($$size * 2)) {\n"+
        "                int entry = data[i++] << 16;\n"+
        "                $$name[j++] = entry | data[i++];\n"+
        "            }\n"+
        "        }\n";
1073 
1074     static void addInitializer(String name, String type, int entriesPerChar,
1075                                int bits, int size) {
1076 
1077         String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1078                           ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1079         if (entriesPerChar == -2) {
1080             template = INT32_INITIALIZER;
1081         }
1082         int marklen = commandMarker.length();
1083         int pos = 0;
1084         while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1085             int newpos = pos + marklen;
1086             char ch = 'x';
1087             while (newpos < template.length() &&
1088                    Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1089                    ch != '_') // Don't allow this in token names
1090                 ++newpos;
1091             String token = template.substring(pos+marklen, newpos);
1092             String replacement = "ERROR";
1093 
1094             if (token.equals("name")) replacement = name;
1095             else if (token.equals("type")) replacement = type;
1096             else if (token.equals("bits")) replacement = ""+bits;
1097             else if (token.equals("size")) replacement = ""+size;
1098             else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1099             else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1100             else FAIL("Unrecognized token: " + token);
1101 
1102             template = template.substring(0, pos) + replacement + template.substring(newpos);
1103             pos += replacement.length();
1104         }
1105         initializers.append(template);
1106     }
1107 
1108     /**
1109     * The genTable method generates source code for one lookup table.
1110     * Most of the complexity stems from handling various options as to
1111     * the type of the array components, the precise representation of the
1112     * values, the format in which to render each value, the number of values
1113     * to emit on each line of source code, and the kinds of useful comments
1114     * to be generated.
1115     *
1116     * @param result     a StringBuffer, to which the generated source code
1117     *                   text is to be appended
1118     * @param name       the name of the table
1119     * @param table      the table data (an array of long values)
1120     * @param extract    a distance, in bits, by which each entry of the table
1121     *                   is to be right-shifted before it is processed
1122     * @param bits       the number of bits (not bytes) to be used to represent
1123     *                   each table entry
1124     * @param size       the table data is divided up into blocks of size (1<<size);
1125     *                   in this method, this information is used only to affect
1126     *                   how many table values are to be generated per line
1127     * @param preshifted if this flag is true, then the table entries are to be
1128     *                   emitted in a preshifted form; that is, each value should
1129     *                   be left-shifted by the amount "shift", so that this work
1130     *                   is built into the table and need not be performed by an
1131     *                   explicit shift operator at run time
1132     * @param shift      this is the shift amount for preshifting of table entries
1133     * @param hexFormat  if this flag is true, table entries should be emitted as
1134     *                   hexadecimal literals; otherwise decimal literals are used
1135     * @param properties if this flag is true, the table entries are encoded
1136     *                   character properties rather than indexes into yet other tables;
1137     *                   therefore comments describing the encoded properties should
1138     *                   be generated
1139     * @param hexComment if this flag is true, each line of output is labelled with
1140     *                   a hexadecimal comment indicating the character values to
1141     *                   which that line applies; otherwise, decimal values indicating
1142     *                   table indices are generated
1143     *
1144     * @see GenerateCharacter#genTables
1145     * @see GenerateCharacter#replaceCommand
1146     */
1147 
    static void genTable(StringBuffer result, String name,
                         long[] table, int extract, int bits, int size,
                         boolean preshifted, int shift, boolean hexFormat,
                         boolean properties, boolean hexComment) {

        // Name of the element type used in the emitted array declaration.
        // Entry widths of 1, 2 and 4 bits use the same 32-bit type as 32-bit
        // entries, because the array-literal branch below packs several such
        // entries into each emitted word.
        String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
            bits == 2 ? (Csyntax ? "unsigned long" : "int") :
            bits == 4 ? (Csyntax ? "unsigned long" : "int") :
            bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
            bits == 16 ? (Csyntax ? "unsigned short" : "char") :
            bits == 32 ? (Csyntax ? "unsigned long" : "int") :
            (Csyntax ? "int64" : "long");
        // Largest value that is emitted as a positive literal in Java syntax;
        // anything larger is written as a negated value (see below).
        long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
            bits == 2 ? Integer.MAX_VALUE :
            bits == 4 ? Integer.MAX_VALUE :
            bits == 8 ? Byte.MAX_VALUE :
            bits == 16 ? Short.MAX_VALUE :
            bits == 32 ? Integer.MAX_VALUE :
            Long.MAX_VALUE;
        // Positive: how many entries fit into one 16-bit char of the string
        // form.  Negative: minus the number of chars needed for one entry.
        int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
        boolean shiftEntries = preshifted && shift != 0;
        // With useCharForByte, an 8-bit string table is declared as a char
        // array and every entry occupies a whole char.
        if (bits == 8 && tableAsString && useCharForByte) {
            atype = "char";
            maxPosEntry = Character.MAX_VALUE;
            entriesPerChar = 1;
        }
        // A char table in string form is produced directly with
        // toCharArray(), so no separate initializer is registered for it.
        boolean noConversion = atype.equals("char");

        // Header comment: entry count and table size in bytes, where the
        // total bit count is rounded up to a whole number of 32-bit words.
        result.append(commentStart);
        result.append(" The ").append(name).append(" table has ").append(table.length);
        result.append(" entries for a total of ");
        int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
        if (bits == 8 && useCharForByte) {
            sizeOfTable *= 2;
        }
        result.append(sizeOfTable);
        result.append(" bytes.").append(commentEnd).append("\n\n");
        if (Csyntax)
            result.append("  static ");
        else
            result.append("  static final ");
        result.append(atype);
        result.append(" ").append(name).append("[");
        // C needs an explicit dimension: the number of emitted words after
        // packing of sub-byte entries.
        if (Csyntax)
            result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
        if (tableAsString) {
            // String form: the table data is encoded into the chars of a
            // string literal.
            if (noConversion) {
                result.append("] = (\n");
            } else {
                result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
                result.append("static final String ").append(name).append("_DATA =\n");
            }
            int CHARS_PER_LINE = 8;
            StringBuffer theString = new StringBuffer();
            int entriesInCharSoFar = 0;
            char ch = '\u0000';
            int charsPerEntry = -entriesPerChar;
            for (int j=0; j<table.length; ++j) {
                //long entry = table[j] >> extract;
                long entry;
                // For the table named "A" only the low 32 bits of each entry
                // are considered before the extract shift is applied.
                if ("A".equals(name))
                    entry = (table[j] & 0xffffffffL) >> extract;
                else
                    entry = (table[j] >> extract);
                if (shiftEntries) entry <<= shift;
                if (entry >= (1L << bits)) {
                    FAIL("Entry too big");
                }
                if (entriesPerChar > 0) {
                    // Pack multiple entries into a character
                    // Each new entry enters at the high end while earlier
                    // ones move down, so the first entry of a group ends up
                    // in the low-order bits of the char.
                    ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
                    ++entriesInCharSoFar;
                    if (entriesInCharSoFar == entriesPerChar) {
                        // Character is full
                        theString.append(ch);
                        entriesInCharSoFar = 0;
                        ch = '\u0000';
                    }
                }
                else {
                    // Use multiple characters per entry
                    // The most significant 16 bits are emitted first.
                    for (int k=0; k<charsPerEntry; ++k) {
                        ch = (char)(entry >> ((charsPerEntry-1)*16));
                        entry <<= 16;
                        theString.append(ch);
                    }
                }
            }
            // Flush a partially filled char: shift the entries collected so
            // far down to the position they would have in a full char.
            if (entriesInCharSoFar > 0) {
                while (entriesInCharSoFar < entriesPerChar) {
                    ch = (char)((int)ch >> bits);
                    ++entriesInCharSoFar;
                }
                theString.append(ch);
                entriesInCharSoFar = 0;
            }
            result.append(Utility.formatForSource(theString.toString(), "    "));
            if (noConversion) {
                result.append(").toCharArray()");
            }
            result.append(";\n\n  ");

            if (!noConversion) {
                addInitializer(name, atype, entriesPerChar, bits, table.length);
            }
        }
        else {
            // Array-literal form: entries are written as numeric literals.
            result.append("] = {");
            boolean castEntries = shiftEntries && (bits < 32);
            int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
                bits == 2 ? 16*4 :
                bits == 4 ? 8*4 :
                bits == 8 ? 8 :
                bits == 16 ? 8 :
                bits == 32 ? 4 : 2) :
                (bits == 8 ? 8 :
                bits == 16 ? 8 : 4);
            // (j & printMask) == 0 starts a new output line; with properties
            // the mask is 0, so every entry gets a line of its own.
            int printMask = properties ? 0 :
            Math.min(1 << size,
                printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
            int commentShift = ((1 << size) == table.length) ? 0 : size;
            int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
            // Accumulates the current word; for bits < 8 it is built up from
            // several consecutive entries.
            long val = 0;
            for (int j = 0; j < table.length; j++) {
                if ((j & printMask) == 0) {
                    // Drop trailing blanks before breaking the line.
                    while (result.charAt(result.length() - 1) == ' ')
                        result.setLength(result.length() - 1);
                    result.append("\n    ");
                }
        // Leaving this labelled block early ("break PRINT") skips all output
        // for entry j; that happens while a packed word is still incomplete.
        PRINT:  {
                // NOTE(review): the cast prefix is appended before the packing
                // check below, so with bits < 8 it would also be emitted for
                // entries that break out early -- confirm that castEntries is
                // never true for packed tables.
                if (castEntries)
                    result.append("(").append(atype).append(")(");
                long entry = table[j] >> extract;
                // Position of entry j within its word is j & packMask.
                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
                int k = j & packMask;
                if (bits >= 8)
                    val = entry;
                else if (k == 0) {
                    // First entry of a packed word: start accumulating.
                    val = entry;
                    break PRINT;
                }
                else {
                    val |= (entry << (k*bits));
                    // Only the last entry of a word falls through to output.
                    if (k != packMask)
                        break PRINT;
                }
                if (val > maxPosEntry && !Csyntax) { // liu
                // For values that are out of range, convert them to in-range negative values.
                // Actually, output the '-' and convert them to the negative of the corresponding
                // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
                    result.append('-');
                    val = maxPosEntry + maxPosEntry + 2 - val;
                }
                if (hexFormat) {
                    result.append("0x");
                    if (bits == 8)
                        result.append(hex2((byte)val));
                    else if (bits == 16)
                        result.append(hex4((short)val));
                    else if (bits == 32 || bits < 8)
                        result.append(hex8((int)val));
                    else {
                        result.append(hex16(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                }
                else {
                    if (bits == 8)
                        result.append(dec3(val));
                    else if (bits == 64) {
                        result.append(dec5(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                    else
                        result.append(dec5(val));
                }
                // Preshifted entries are written as "<value><<shift" rather
                // than as an already shifted literal.
                if (shiftEntries)
                    result.append("<<").append(shift);
                if (castEntries) result.append(")");
                if (j < (table.length - 1))
                    result.append(", ");
                else
                    result.append("  ");
                // At the end of an output line, add the trailing comment
                // that labels the line.
                if ((j & printMask) == printMask) {
                    result.append(" ").append(commentStart).append(" ");
                    if (hexComment)
                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
                    else
                        result.append(dec3((j & ~commentMask) >> commentShift));
                    if (properties) propertiesComments(result, val);
                    result.append(commentEnd);
                }
                } // end PRINT
            }
            result.append("\n  };\n\n  ");
        }
    }
1347 
1348     static void genCaseMapTableDeclaration(StringBuffer result) {
1349         String myTab = "    ";
1350         result.append(myTab + "static final char[][][] charMap;\n");
1351     }
1352 
1353     static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1354         String myTab = "    ";
1355         int ch;
1356         char[] map;
1357         result.append(myTab + "charMap = new char[][][] {\n");
1358         for (int x = 0; x < specialCaseMaps.length; x++) {
1359             ch = specialCaseMaps[x].getCharSource();
1360             map = specialCaseMaps[x].getUpperCaseMap();
1361             result.append(myTab + myTab);
1362             result.append("{ ");
1363             result.append("{\'\\u"+hex4(ch)+"\'}, {");
1364             for (int y = 0; y < map.length; y++) {
1365                 result.append("\'\\u"+hex4(map[y])+"\', ");
1366             }
1367             result.append("} },\n");
1368         }
1369         result.append(myTab + "};\n");
1370 
1371     }
1372 
1373     /**
1374     * The propertiesComments method generates comments describing encoded
1375     * character properties.
1376     *
1377     * @param result     a StringBuffer, to which the generated source code
1378     *                   text is to be appended
1379     * @param val                encoded character properties
1380     *
1381     * @see GenerateCharacter#genTable
1382     */
1383 
1384     static void propertiesComments(StringBuffer result, long val) {
1385         result.append("   ");
1386         switch ((int)(val & maskType)) {
1387             case UnicodeSpec.CONTROL:
1388                 result.append("Cc");
1389                 break;
1390             case UnicodeSpec.FORMAT:
1391                 result.append("Cf");
1392                 break;
1393             case UnicodeSpec.PRIVATE_USE:
1394                 result.append("Co");
1395                 break;
1396             case UnicodeSpec.SURROGATE:
1397                 result.append("Cs");
1398                 break;
1399             case UnicodeSpec.LOWERCASE_LETTER:
1400                 result.append("Ll");
1401                 break;
1402             case UnicodeSpec.MODIFIER_LETTER:
1403                 result.append("Lm");
1404                 break;
1405             case UnicodeSpec.OTHER_LETTER:
1406                 result.append("Lo");
1407                 break;
1408             case UnicodeSpec.TITLECASE_LETTER:
1409                 result.append("Lt");
1410                 break;
1411             case UnicodeSpec.UPPERCASE_LETTER:
1412                 result.append("Lu");
1413                 break;
1414             case UnicodeSpec.COMBINING_SPACING_MARK:
1415                 result.append("Mc");
1416                 break;
1417             case UnicodeSpec.ENCLOSING_MARK:
1418                 result.append("Me");
1419                 break;
1420             case UnicodeSpec.NON_SPACING_MARK:
1421                 result.append("Mn");
1422                 break;
1423             case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1424                 result.append("Nd");
1425                 break;
1426             case UnicodeSpec.LETTER_NUMBER:
1427                 result.append("Nl");
1428                 break;
1429             case UnicodeSpec.OTHER_NUMBER:
1430                 result.append("No");
1431                 break;
1432             case UnicodeSpec.CONNECTOR_PUNCTUATION:
1433                 result.append("Pc");
1434                 break;
1435             case UnicodeSpec.DASH_PUNCTUATION:
1436                 result.append("Pd");
1437                 break;
1438             case UnicodeSpec.END_PUNCTUATION:
1439                 result.append("Pe");
1440                 break;
1441             case UnicodeSpec.OTHER_PUNCTUATION:
1442                 result.append("Po");
1443                 break;
1444             case UnicodeSpec.START_PUNCTUATION:
1445                 result.append("Ps");
1446                 break;
1447             case UnicodeSpec.CURRENCY_SYMBOL:
1448                 result.append("Sc");
1449                 break;
1450             case UnicodeSpec.MODIFIER_SYMBOL:
1451                 result.append("Sk");
1452                 break;
1453             case UnicodeSpec.MATH_SYMBOL:
1454                 result.append("Sm");
1455                 break;
1456             case UnicodeSpec.OTHER_SYMBOL:
1457                 result.append("So");
1458                 break;
1459             case UnicodeSpec.LINE_SEPARATOR:
1460                 result.append("Zl"); break;
1461             case UnicodeSpec.PARAGRAPH_SEPARATOR:
1462                 result.append("Zp");
1463                 break;
1464             case UnicodeSpec.SPACE_SEPARATOR:
1465                 result.append("Zs");
1466                 break;
1467             case UnicodeSpec.UNASSIGNED:
1468                 result.append("unassigned");
1469                 break;
1470         }
1471 
1472         switch ((int)((val & maskBidi) >> shiftBidi)) {
1473             case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1474                 result.append(", L");
1475                 break;
1476             case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1477                 result.append(", R");
1478                 break;
1479             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1480                 result.append(", EN");
1481                 break;
1482             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1483                 result.append(", ES");
1484                 break;
1485             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1486                 result.append(", ET");
1487                 break;
1488             case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1489                 result.append(", AN");
1490                 break;
1491             case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1492                 result.append(", CS");
1493                 break;
1494             case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1495                 result.append(", B");
1496                 break;
1497             case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1498                 result.append(", S");
1499                 break;
1500             case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1501                 result.append(", WS");
1502                 break;
1503             case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1504                 result.append(", ON");
1505                 break;
1506         }
1507         if ((val & maskUpperCase) != 0) {
1508             result.append(", hasUpper (subtract ");
1509             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1510         }
1511         if ((val & maskLowerCase) != 0) {
1512             result.append(", hasLower (add ");
1513             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1514         }
1515         if ((val & maskTitleCase) != 0) {
1516             result.append(", hasTitle");
1517         }
1518         if ((val & maskIdentifierInfo) == valueIgnorable) {
1519             result.append(", ignorable");
1520         }
1521         if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1522             result.append(", identifier part");
1523         }
1524         if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1525             result.append(", underscore");
1526         }
1527         if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1528             result.append(", whitespace");
1529         }
1530         if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1531             result.append(", currency");
1532         }
1533         if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1534             result.append(", identifier start");
1535         }
1536         if ((val & maskNumericType) == valueDigit) {
1537             result.append(", decimal ");
1538             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1539         }
1540         if ((val & maskNumericType) == valueStrangeNumeric) {
1541             result.append(", strange");
1542         }
1543         if ((val & maskNumericType) == valueJavaSupradecimal) {
1544             result.append(", supradecimal ");
1545             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1546         }
1547     }
1548 
1549     static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1550 
1551     static String tableName(int j) { return tableNames[j]; }
1552 
1553     /**
1554     * The genAccess method generates source code for one table access expression.
1555     *
1556     * Most of the complexity stems from handling various options as to
1557     * table representation, such as whether it contains values so large that
1558     * they are represented as negative values and whether the table values are
1559     * preshifted.  This method also avoids such "ugly" expressions as shifting
1560     * by distance zero, masking when no masking is necessary, and so on.
1561     * For clarity, it generates expressions that do not rely on operator
1562     * precedence, but otherwise it avoids generating redundant parentheses.
1563     *
1564     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1565     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1566     *
1567     * @param tbl                the name of the final table to be accessed
1568     * @param var                the variable name that appeared in parentheses in the
1569     *                           "Lookup" command
1570     * @param bits       the number of bits (not bytes) to be used to represent
1571     *                   the final table entry
1572     * @return   the replacement text for the "Lookup(xxx)" command, as a String
1573     *
1574     * @see GenerateCharacter#replaceCommand
1575     */
1576 
1577     static String genAccess(String tbl, String var, int bits) {
1578         String access = null;
1579         int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1580         for (int k = 0; k < sizes.length; k++) {
1581             int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1582             int shift = shifts[k] + offset;
1583             String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1584             int mask = (1 << (sizes[k] - offset)) - 1;
1585             String masked = (k == 0) ? shifted :
1586               "(" + shifted + "&0x" + hex(mask) + ")";
1587             String index = (k == 0) ? masked :
1588              (mask == 0) ? access : "(" + access + "|" + masked + ")";
1589             String indexNoParens = (index.charAt(0) != '(') ? index :
1590                  index.substring(1, index.length() - 1);
1591             String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1592             String fetched = tblname + "[" + indexNoParens + "]";
1593             String zeroextended = (zeroextend[k] == 0) ? fetched :
1594                 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1595             int adjustment = preshifted[k] ? 0 :
1596                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1597             String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1598                 "(" + zeroextended + "<<" + adjustment + ")";
1599             String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1600                 (bits == 2) ? "((" + var + "&0xF)<<1)" :
1601                 (bits == 4) ? "((" + var + "&7)<<2)" : null;
1602             String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1603                 "((" + adjusted + ">>" + bitshift + ")&" +
1604                 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1605             access = extracted;
1606         }
1607         return access;
1608     }
1609 
    /* The command line arguments are decoded by processArgs and used to set
     the following global variables.
     */

    // Set by -verbose or -v.
    static boolean verbose = false;
    // Set by -nobidi.
    static boolean nobidi = false;
    // Set by -nomirror.
    static boolean nomirror = false;
    // Set by -identifiers; also selects the default bit field widths 8 4 4.
    static boolean identifiers = false;
    // Set by -c; selects C comment delimiters and the C default file names.
    static boolean Csyntax = false;
    // Set by -template; processArgs substitutes a default when still null.
    static String TemplateFileName = null;
    // Set by -o; processArgs substitutes a default when still null.
    static String OutputFileName = null;
    // Set by -spec; processArgs substitutes a default when still null.
    static String UnicodeSpecFileName = null; // liu
    // Set by -specialcasing; processArgs substitutes a default when still null.
    static String SpecialCasingFileName = null;
    // Set by -proplist; processArgs substitutes a default when still null.
    static String PropListFileName = null;
    // Set by -derivedprops; processArgs substitutes a default when still null.
    static String DerivedPropsFileName = null;
    // Set by -usecharforbyte.
    static boolean useCharForByte = false;
    // Bit field widths, one per numeric command line argument, in order;
    // processArgs installs defaults when none are given.
    static int[] sizes;
    // Set by -search (processArgs reports values outside 1..10).
    static int bins = 0; // liu; if > 0, then perform search
    // Set by -string; processArgs reports the combination with -c.
    static boolean tableAsString = false;
    // Set by -latin1; cleared again by -plane with a positive plane number.
    static boolean bLatin1 = false;

    // The command line as reconstructed by processArgs, with every default
    // that was filled in appended in square brackets.
    static String commandLineDescription;

    /* Other global variables, equal in length to the "sizes" array. */

    // shifts[k] is the sum of sizes[k+1..]; computed by generateForSizes.
    static int[] shifts;
    // A nonzero zeroextend[k] is a mask that genAccess applies to the value
    // fetched from table k.
    static int[] zeroextend;
    // NOTE(review): presumably the number of bytes per entry of each table;
    // not used in the code near these declarations -- confirm.
    static int[] bytes;
    // When preshifted[k] is true, genAccess emits no left shift for the
    // value fetched from table k.
    static boolean[] preshifted;
    // One table per lookup level; generateForSizes stores the map itself in
    // the last slot.
    static long[][] tables;


    /* Other global variables */
    // Comment delimiters for the generated code, chosen by processArgs:
    // "/*" and " */" for C syntax, "//" and "" for Java syntax.
    static String commentStart;
    static String commentEnd;

    // NOTE(review): presumably collects the code registered through
    // addInitializer for string-encoded tables -- confirm.
    static StringBuffer initializers = new StringBuffer();

    /* special casing rules for 1:M toUpperCase mappings */
    static SpecialCaseMap[] specialCaseMaps;
1650 
1651     /**
1652     * Process the command line arguments.
1653     *
1654     * The allowed flags in command line are:
1655     * <dl>
1656     * <dt> -verbose             <dd> Emit comments to standard output describing
1657     *                                   what's going on during the processing.
1658     * <dt> -nobidi              <dd> Do not include bidi categories in the
1659     *                                   encoded character properties.
1660     * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1661     *                        character properties.
1662     * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1663     * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1664     * <dt> -o filename          <dd> Specify output file name.
1665     * <dt> -template filename   <dd> Specify template input file name.
1666     * <dt> -spec filename        <dd> Specify Unicode spec file name.
1667     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1668     * <dt> -search bins          <dd> Try different partitions into the specified
1669     *                                    number of bins.  E.g., for 2 bins, try
1670     *                                    16 0, 15 1,..., 0 16.
1671     * <dt> -string               <dd> Create table as string.  Only valid with Java
1672     *                                    syntax.
1673     * <dt> -latin1          <dd> Create a latin 1 only property table.
1674     * </dl>
1675     * In addition, decimal literals may appear as command line arguments;
1676     * each one represents the number of bits of the character to be broken
1677     * off at each lookup step.  If present, they must add up to 16 (the number
1678     * of bits in a char value).  For smaller tables, the last value should
1679     * be 0; values other than the last one may not be zero.  If no such
1680     * numeric values are provided, default values are used.
1681     *
1682     * @param args       the command line arguments, as an array of String
1683     *
1684     * @see GenerateCharacter#main
1685     */
1686 
1687     static void processArgs(String[] args) {
1688         StringBuffer desc = new StringBuffer("java GenerateCharacter");
1689         for (int j=0; j<args.length; ++j) {
1690             desc.append(" " + args[j]);
1691         }
1692         for (int j = 0; j < args.length; j++) {
1693             if (args[j].equals("-verbose") || args[j].equals("-v"))
1694                 verbose = true;
1695             else if (args[j].equals("-nobidi"))
1696                 nobidi = true;
1697             else if (args[j].equals("-nomirror"))
1698                 nomirror = true;
1699             else if (args[j].equals("-identifiers"))
1700                 identifiers = true;
1701             else if (args[j].equals("-c"))
1702                 Csyntax = true;
1703             else if (args[j].equals("-string"))
1704                 tableAsString = true;
1705             else if (args[j].equals("-o")) {
1706                 if (j == args.length - 1) {
1707                     FAIL("File name missing after -o");
1708                 }
1709                 else {
1710                     OutputFileName = args[++j];
1711                 }
1712             }
1713             else if (args[j].equals("-search")) {
1714                 if (j == args.length - 1)
1715                     FAIL("Bin count missing after -search");
1716                 else {
1717                     bins = Integer.parseInt(args[++j]);
1718                     if (bins < 1 || bins > 10)
1719                         FAIL("Bin count must be >= 1 and <= 10");
1720                 }
1721             }
1722             else if (args[j].equals("-template")) {
1723                 if (j == args.length - 1)
1724                     FAIL("File name missing after -template");
1725                 else
1726                     TemplateFileName = args[++j];
1727             }
1728             else if (args[j].equals("-spec")) { // liu
1729                 if (j == args.length - 1) {
1730                     FAIL("File name missing after -spec");
1731                 }
1732                 else {
1733                     UnicodeSpecFileName = args[++j];
1734                 }
1735             }
1736             else if (args[j].equals("-specialcasing")) {
1737                 if (j == args.length -1) {
1738                     FAIL("File name missing after -specialcasing");
1739                 }
1740                 else {
1741                     SpecialCasingFileName = args[++j];
1742                 }
1743             }
1744             else if (args[j].equals("-proplist")) {
1745                 if (j == args.length -1) {
1746                     FAIL("File name missing after -proplist");
1747                 }
1748                 else {
1749                     PropListFileName = args[++j];
1750                 }
1751             }
1752             else if (args[j].equals("-derivedprops")) {
1753                 if (j == args.length -1) {
1754                     FAIL("File name missing after -derivedprops");
1755                 }
1756                 else {
1757                     DerivedPropsFileName = args[++j];
1758                 }
1759             }
1760             else if (args[j].equals("-plane")) {
1761                 if (j == args.length -1) {
1762                     FAIL("Plane number missing after -plane");
1763                 }
1764                 else {
1765                     plane = Integer.parseInt(args[++j]);
1766                 }
1767                 if (plane > 0) {
1768                     bLatin1 = false;
1769                 }
1770             }
1771             else if ("-usecharforbyte".equals(args[j])) {
1772                 useCharForByte = true;
1773             }
1774             else if (args[j].equals("-latin1")) {
1775                 bLatin1 = true;
1776                 plane = 0;
1777             }
1778             else {
1779                 try {
1780                     int val = Integer.parseInt(args[j]);
1781                     if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1782                     if (sizes == null)
1783                         sizes = new int[1];
1784                     else {
1785                         int[] newsizes = new int[sizes.length + 1];
1786                         System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1787                         sizes = newsizes;
1788                     }
1789                     sizes[sizes.length - 1] = val;
1790                 }
1791                 catch(NumberFormatException e) {
1792                     FAIL("Unknown switch: " + args[j]);
1793                 }
1794             }
1795         }
1796         if (Csyntax && tableAsString) {
1797             FAIL("Can't specify table as string with C syntax");
1798         }
1799         if (sizes == null) {
1800             desc.append(" [");
1801             if (identifiers) {
1802                 int[] newsizes = { 8, 4, 4 };           // Good default values
1803                 desc.append("8 4 4]");
1804                 sizes = newsizes;
1805             }
1806             else {
1807                 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1808                 desc.append("10 5 1]");
1809                 sizes = newsizes;
1810             }
1811         }
1812         if (UnicodeSpecFileName == null) { // liu
1813             UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1814             desc.append(" [-spec " + UnicodeSpecFileName + ']');
1815         }
1816         if (SpecialCasingFileName == null) {
1817             SpecialCasingFileName = DefaultSpecialCasingFileName;
1818             desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1819         }
1820         if (PropListFileName == null) {
1821             PropListFileName = DefaultPropListFileName;
1822             desc.append(" [-proplist " + PropListFileName + ']');
1823         }
1824         if (DerivedPropsFileName == null) {
1825             DerivedPropsFileName = DefaultDerivedPropsFileName;
1826             desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
1827         }
1828         if (TemplateFileName == null) {
1829             TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1830                   : DefaultJavaTemplateFileName);
1831             desc.append(" [-template " + TemplateFileName + ']');
1832         }
1833         if (OutputFileName == null) {
1834             OutputFileName = (Csyntax ? DefaultCOutputFileName
1835                     : DefaultJavaOutputFileName);
1836             desc.append(" [-o " + OutputFileName + ']');
1837         }
1838         commentStart = (Csyntax ? "/*" : "//");
1839         commentEnd = (Csyntax ? " */" : "");
1840         commandLineDescription = desc.toString();
1841     }
1842 
1843     private static void searchBins(long[] map, int binsOccupied) throws Exception {
1844         int bitsFree = 16;
1845         for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1846         if (binsOccupied == (bins-1)) {
1847             sizes[binsOccupied] = bitsFree;
1848             generateForSizes(map);
1849         }
1850         else {
1851             for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1852                 sizes[binsOccupied] = i;
1853                 searchBins(map, binsOccupied+1);
1854             }
1855         }
1856     }
1857 
1858     private static void generateForSizes(long[] map) throws Exception {
1859         int sum = 0;
1860         shifts = new int[sizes.length];
1861         for (int k = sizes.length - 1; k >= 0; k--) {
1862             shifts[k] = sum;
1863             sum += sizes[k];
1864         }
1865         if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1866             FAIL("Bit field widths total to " + sum +
1867              ": wrong total for map of size " + map.length);
1868         }
1869         // need a table for each set of lookup bits in char
1870         tables = new long[sizes.length][];
1871         // the last table is the map
1872         tables[sizes.length - 1] = map;
1873         for (int j = sizes.length - 1; j > 0; j--) {
1874             if (verbose && bins==0)
1875                 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1876             long[][] temp = buildTable(tables[j], sizes[j]);
1877             tables[j-1] = temp[0];
1878             tables[j] = temp[1];
1879         }
1880         preshifted = new boolean[sizes.length];
1881         zeroextend = new int[sizes.length];
1882         bytes = new int[sizes.length];
1883         for (int j = 0; j < sizes.length - 1; j++) {
1884             int len = tables[j+1].length;
1885             int size = sizes[j+1];
1886             if (len > 0x100 && (len >> size) <= 0x100) {
1887                 len >>= size;
1888                 preshifted[j] = false;
1889             }
1890             else if (len > 0x10000 && (len >> size) <= 0x10000) {
1891                 len >>= size;
1892                 preshifted[j] = false;
1893             }
1894             else preshifted[j] = true;
1895             if (Csyntax)
1896                 zeroextend[j] = 0;
1897             else if (len > 0x7F && len <= 0xFF) {
1898                 if (!useCharForByte) {
1899                     zeroextend[j] = 0xFF;
1900                 }
1901             } else if (len > 0x7FFF && len <= 0xFFFF)
1902                 zeroextend[j] = 0xFFFF;
1903             else zeroextend[j] = 0;
1904             if (len <= 0x100) bytes[j] = 1;
1905             else if (len <= 0x10000) bytes[j] = 2;
1906             else bytes[j] = 4;
1907         }
1908         preshifted[sizes.length - 1] = true;
1909         zeroextend[sizes.length - 1] = 0;
1910         bytes[sizes.length - 1] = 0;
1911         if (bins > 0) {
1912             int totalBytes = getTotalBytes();
1913             String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1914             int accessComplexity = 0;
1915             for (int j=0; j<access.length(); ++j) {
1916                 char ch = access.charAt(j);
1917                 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1918                 if (ch == '<' || ch == '>') ++j;
1919             }
1920             System.out.print("(");
1921             for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1922             System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1923             return;
1924         }
1925         if (verbose) {
1926             System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1927             for (int j = 0; j < sizes.length; j++) {
1928                 System.out.println(dec5(j) + "\t" +
1929                     dec5(sizes[j]) + "\t" +
1930                     dec5(tables[j].length) + "\t" +
1931                     dec5(shifts[j]) + "\t" +
1932                     dec5(zeroextend[j]) + "\t" +
1933                     dec5(bytes[j]) + "\t " +
1934                     preshifted[j]);
1935             }
1936         }
1937         if (verbose) {
1938             System.out.println("Generating source code for class Character");
1939             System.out.println("A table access looks like " +
1940                          genAccess("A", "ch", (identifiers ? 2 : 32)));
1941         }
1942         generateCharacterClass(TemplateFileName, OutputFileName);
1943     }
1944 
1945     /**
1946     * The main program for generating source code for the Character class.
1947     * The basic outline of its operation is:
1948     * <ol>
1949     * <li> Process the command line arguments.  One result of this process
1950     *           is a list of sizes (measured in bits and summing to 16).
1951     * <li> Get the Unicode character property data from the specification file.
1952     * <li> From that, build a map that has, for each character code, its
1953     *           relevant properties encoded as a long integer value.
1954     * <li> Repeatedly compress the map, producing a compressed table and a
1955     *           new map.  This is done once for each size value in the list.
1956     *           When this is done, we have a set of tables.
1957     * <li> Make some decisions about table representation; record these
1958     *           decisions in arrays named preshifted, zeroextend, and bytes.
1959     * <li> Generate the source code for the class Character by performing
1960     *           macro processing on a template file.
1961     * </ol>
1962     *
1963     * @param args       the command line arguments, as an array of String
1964     *
1965     * @see GenerateCharacter#processArgs
1966     * @see UnicodeSpec@readSpecFile
1967     * @see GenerateCharacter#buildMap
1968     * @see GenerateCharacter#buildTable
1969     * @see GenerateCharacter#generateCharacterClass
1970     */
1971 
1972     public static void main(String[] args) {
1973         processArgs(args);
1974         try {
1975 
1976             UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1977             specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1978             PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1979             propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
1980 
1981             if (verbose) {
1982                 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1983             }
1984             long[] map = buildMap(data, specialCaseMaps, propList);
1985             if (verbose) {
1986                 System.err.println("Completed building of initial map");
1987             }
1988 
1989             if (bins == 0) {
1990                 generateForSizes(map);
1991             }
1992             else {
1993                 while (bins > 0) {
1994                     sizes = new int[bins];
1995                     searchBins(map, 0);
1996                     --bins;
1997                 }
1998             }
1999             if (verbose && false) {
2000                 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
2001                              hex8(maxOffsetSeen));
2002                 System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
2003                              hex8(maxOffset));
2004             }
2005         }
2006         catch (FileNotFoundException e) { FAIL(e.toString()); }
2007         catch (IOException e) { FAIL(e.toString()); }
2008         catch (Throwable e) {
2009             System.out.println("Unexpected exception:");
2010             e.printStackTrace();
2011             FAIL("Unexpected exception!");
2012         }
2013         if (verbose) { System.out.println("Done!");}
2014     }
2015 
2016 }   // end class