1 /*
   2  * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package org.openjdk.buildtools.generatecharacter;
  27 
  28 import java.io.BufferedReader;
  29 import java.io.FileReader;
  30 import java.io.FileNotFoundException;
  31 import java.io.IOException;
  32 import java.util.StringTokenizer;
  33 import java.io.File;
  34 import java.util.regex.Pattern;
  35 import java.util.ArrayList;
  36 
  37 /**
  38  * The UnicodeSpec class provides a way to read in Unicode character
  39  * properties from a Unicode data file.  One instance of class UnicodeSpec
  40  * holds a decoded version of one line of the data file.  The file may
  41  * be obtained from www.unicode.org.  The method readSpecFile returns an array
  42  * of UnicodeSpec objects.
  43  * @author      Guy Steele
  44  * @author  John O'Conner
  45  */
  46 
  47 public class UnicodeSpec {
  48 
  49     private static final int MAP_UNDEFINED = 0xFFFFFFFF;
  50 
  51         /**
  52          * Construct a default UnicodeSpec object, with a default
  53          * code point value 0xFFFF.
  54          *
  55          */
  56     public UnicodeSpec() {
  57                 this(0xffff);
  58     }
  59 
  60         /**
  61          * Construct a UnicodeSpec object for the given <code>codePoint<code>
  62          * argument. Provide default properties.
  63          * @param codePoint a Unicode code point between 0x0000 and 0x10FFFF
  64          */
  65     public UnicodeSpec(int codePoint) {
  66         this.codePoint = codePoint;
  67         generalCategory = UNASSIGNED;
  68         bidiCategory = DIRECTIONALITY_UNDEFINED;
  69         mirrored = false;
  70         titleMap = MAP_UNDEFINED;
  71         upperMap = MAP_UNDEFINED;
  72         lowerMap = MAP_UNDEFINED;
  73         decimalValue = -1;
  74         digitValue = -1;
  75         numericValue = "";
  76                 oldName = null;
  77                 comment = null;
  78                 name = null;
  79     }
  80 
  81         /**
  82          * Create a String representation of this UnicodeSpec object.
  83          * The string will contain the code point and all its case mappings
  84          * if available.
  85          */
  86     public String toString() {
  87         StringBuffer result = new StringBuffer(hex6(codePoint));
  88         if (getUpperMap() != MAP_UNDEFINED) {
  89             result.append(", upper=").append(hex6(upperMap));
  90         }
  91         if (getLowerMap() != MAP_UNDEFINED) {
  92             result.append(", lower=").append(hex6(lowerMap));
  93         }
  94         if (getTitleMap() != MAP_UNDEFINED) {
  95             result.append(", title=").append(hex6(titleMap));
  96         }
  97         return result.toString();
  98     }
  99 
 100     static String hex4(int n) {
 101         String q = Integer.toHexString(n & 0xFFFF).toUpperCase();
 102         return "0000".substring(Math.min(4, q.length())) + q;
 103     }
 104 
 105         static String hex6(int n) {
 106                 String str = Integer.toHexString(n & 0xFFFFFF).toUpperCase();
 107                 return "000000".substring(Math.min(6, str.length())) + str;
 108 
 109         }
 110 
 111 
 112     /**
 113     * Given one line of a Unicode data file as a String, parse the line
 114     * and return a UnicodeSpec object that contains the same character information.
 115     *
 116     * @param s a line of the Unicode data file to be parsed
 117     * @return a UnicodeSpec object, or null if the parsing process failed for some reason
 118     */
 119     public static UnicodeSpec parse(String s) {
 120         UnicodeSpec spec = null;
 121         String[] tokens = null;
 122 
 123         try {
 124             tokens = tokenSeparator.split(s, REQUIRED_FIELDS);
 125             spec = new UnicodeSpec();
 126             spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE]));
 127             spec.setName(parseName(tokens[FIELD_NAME]));
 128             spec.setGeneralCategory(parseGeneralCategory(tokens[FIELD_CATEGORY]));
 129             spec.setBidiCategory(parseBidiCategory(tokens[FIELD_BIDI]));
 130             spec.setCombiningClass(parseCombiningClass(tokens[FIELD_CLASS]));
 131             spec.setDecomposition(parseDecomposition(tokens[FIELD_DECOMPOSITION]));
 132             spec.setDecimalValue(parseDecimalValue(tokens[FIELD_DECIMAL]));
 133             spec.setDigitValue(parseDigitValue(tokens[FIELD_DIGIT]));
 134             spec.setNumericValue(parseNumericValue(tokens[FIELD_NUMERIC]));
 135             spec.setMirrored(parseMirrored(tokens[FIELD_MIRRORED]));
 136             spec.setOldName(parseOldName(tokens[FIELD_OLDNAME]));
 137             spec.setComment(parseComment(tokens[FIELD_COMMENT]));
 138             spec.setUpperMap(parseUpperMap(tokens[FIELD_UPPERCASE]));
 139             spec.setLowerMap(parseLowerMap(tokens[FIELD_LOWERCASE]));
 140             spec.setTitleMap(parseTitleMap(tokens[FIELD_TITLECASE]));
 141         }
 142 
 143         catch(Exception e) {
 144             spec = null;
 145             System.out.println("Error parsing spec line.");
 146         }
 147         return spec;
 148     }
 149 
 150     /**
 151     * Parse the codePoint attribute for a Unicode character.  If the parse succeeds,
 152     * the codePoint field of this UnicodeSpec object is updated and false is returned.
 153     *
 154     * The codePoint attribute should be a four to six digit hexadecimal integer.
 155     *
 156     * @param s   the codePoint attribute extracted from a line of the Unicode data file
 157     * @return   code point if successful
 158     * @exception NumberFormatException if unable to parse argument
 159     */
 160     public static int parseCodePoint(String s) throws NumberFormatException {
 161         return Integer.parseInt(s, 16);
 162     }
 163 
 164     public static String parseName(String s) throws Exception {
 165         if (s==null) throw new Exception("Cannot parse name.");
 166         return s;
 167     }
 168 
 169     public static byte parseGeneralCategory(String s) throws Exception {
 170         byte category = GENERAL_CATEGORY_COUNT;
 171 
 172         for (byte x=0; x<generalCategoryList.length; x++) {
 173             if (s.equals(generalCategoryList[x][SHORT])) {
 174                 category = x;
 175                 break;
 176             }
 177         }
 178         if (category >= GENERAL_CATEGORY_COUNT) {
 179             throw new Exception("Could not parse general category.");
 180         }
 181         return category;
 182     }
 183 
 184     public static byte parseBidiCategory(String s) throws Exception {
 185         byte category = DIRECTIONALITY_CATEGORY_COUNT;
 186 
 187         for (byte x=0; x<bidiCategoryList.length; x++) {
 188             if (s.equals(bidiCategoryList[x][SHORT])) {
 189                 category = x;
 190                 break;
 191             }
 192         }
 193         if (category >= DIRECTIONALITY_CATEGORY_COUNT) {
 194             throw new Exception("Could not parse bidi category.");
 195         }
 196         return category;
 197     }
 198 
 199 
 200     /**
 201     * Parse the combining attribute for a Unicode character.  If there is a combining
 202     * attribute and the parse succeeds, then the hasCombining field is set to true,
 203     * the combining field of this UnicodeSpec object is updated, and false is returned.
 204     * If the combining attribute is an empty string, the parse succeeds but the
 205     * hasCombining field is set to false. (and false is returned).
 206     *
 207     * The combining attribute, if any, should be a nonnegative decimal integer.
 208     *
 209     * @param s   the combining attribute extracted from a line of the Unicode data file
 210     * @return   the combining class value if any, -1 if property not defined
 211     * @exception Exception if can't parse the combining class
 212     */
 213 
 214     public static int parseCombiningClass(String s) throws Exception {
 215         int combining = -1;
 216         if (s.length()>0) {
 217             combining = Integer.parseInt(s, 10);
 218         }
 219         return combining;
 220     }
 221 
 222     /**
 223     * Parse the decomposition attribute for a Unicode character.  If the parse succeeds,
 224     * the decomposition field of this UnicodeSpec object is updated and false is returned.
 225     *
 226     * The decomposition attribute is complicated; for now, it is treated as a string.
 227     *
 228     * @param s   the decomposition attribute extracted from a line of the Unicode data file
 229     * @return   true if the parse failed; otherwise false
 230     */
 231 
 232     public static String parseDecomposition(String s) throws Exception {
 233         if (s==null) throw new Exception("Cannot parse decomposition.");
 234         return s;
 235     }
 236 
 237 
 238     /**
 239     * Parse the decimal value attribute for a Unicode character.  If there is a decimal value
 240     * attribute and the parse succeeds, then the hasDecimalValue field is set to true,
 241     * the decimalValue field of this UnicodeSpec object is updated, and false is returned.
 242     * If the decimal value attribute is an empty string, the parse succeeds but the
 243     * hasDecimalValue field is set to false. (and false is returned).
 244     *
 245     * The decimal value attribute, if any, should be a nonnegative decimal integer.
 246     *
 247     * @param s   the decimal value attribute extracted from a line of the Unicode data file
 248     * @return   the decimal value as an int, -1 if no decimal value defined
 249     * @exception NumberFormatException if the parse fails
 250     */
 251     public static int parseDecimalValue(String s) throws NumberFormatException {
 252         int value = -1;
 253 
 254         if (s.length() > 0) {
 255             value = Integer.parseInt(s, 10);
 256         }
 257         return value;
 258     }
 259 
 260     /**
 261     * Parse the digit value attribute for a Unicode character.  If there is a digit value
 262     * attribute and the parse succeeds, then the hasDigitValue field is set to true,
 263     * the digitValue field of this UnicodeSpec object is updated, and false is returned.
 264     * If the digit value attribute is an empty string, the parse succeeds but the
 265     * hasDigitValue field is set to false. (and false is returned).
 266     *
 267     * The digit value attribute, if any, should be a nonnegative decimal integer.
 268     *
 269     * @param s   the digit value attribute extracted from a line of the Unicode data file
 270     * @return   the digit value as an non-negative int, or -1 if no digit property defined
 271     * @exception NumberFormatException if the parse fails
 272     */
 273     public static int parseDigitValue(String s) throws NumberFormatException {
 274         int value = -1;
 275 
 276         if (s.length() > 0) {
 277             value = Integer.parseInt(s, 10);
 278         }
 279         return value;
 280     }
 281 
 282     public static String parseNumericValue(String s) throws Exception {
 283         if (s == null) throw new Exception("Cannot parse numeric value.");
 284         return s;
 285     }
 286 
 287     public static String parseComment(String s) throws Exception {
 288         if (s == null) throw new Exception("Cannot parse comment.");
 289         return s;
 290     }
 291 
 292     public static boolean parseMirrored(String s) throws Exception {
 293         boolean mirrored;
 294         if (s.length() == 1) {
 295             if (s.charAt(0) == 'Y') {mirrored = true;}
 296             else if (s.charAt(0) == 'N') {mirrored = false;}
 297             else {throw new Exception("Cannot parse mirrored property.");}
 298         }
 299         else { throw new Exception("Cannot parse mirrored property.");}
 300         return mirrored;
 301     }
 302 
 303     public static String parseOldName(String s) throws Exception {
 304         if (s == null) throw new Exception("Cannot parse old name");
 305         return s;
 306     }
 307 
 308     /**
 309     * Parse the uppercase mapping attribute for a Unicode character.  If there is a uppercase
 310     * mapping attribute and the parse succeeds, then the hasUpperMap field is set to true,
 311     * the upperMap field of this UnicodeSpec object is updated, and false is returned.
 312     * If the uppercase mapping attribute is an empty string, the parse succeeds but the
 313     * hasUpperMap field is set to false. (and false is returned).
 314     *
 315     * The uppercase mapping attribute should be a four to six digit hexadecimal integer.
 316     *
 317     * @param s   the uppercase mapping attribute extracted from a line of the Unicode data file
 318     * @return   simple uppercase character mapping if defined, MAP_UNDEFINED otherwise
 319     * @exception NumberFormatException if parse fails
 320     */
 321     public static int parseUpperMap(String s) throws NumberFormatException {
 322         int upperCase = MAP_UNDEFINED;
 323 
 324                 int length = s.length();
 325         if (length >= 4 && length <=6) {
 326             upperCase = Integer.parseInt(s, 16);
 327         }
 328         else if (s.length() != 0) {
 329             throw new NumberFormatException();
 330         }
 331         return upperCase;
 332     }
 333 
 334     /**
 335     * Parse the lowercase mapping attribute for a Unicode character.  If there is a lowercase
 336     * mapping attribute and the parse succeeds, then the hasLowerMap field is set to true,
 337     * the lowerMap field of this UnicodeSpec object is updated, and false is returned.
 338     * If the lowercase mapping attribute is an empty string, the parse succeeds but the
 339      * hasLowerMap field is set to false. (and false is returned).
 340     *
 341     * The lowercase mapping attribute should be a four to six digit hexadecimal integer.
 342     *
 343     * @param s   the lowercase mapping attribute extracted from a line of the Unicode data file
 344     * @return   simple lowercase character mapping if defined, MAP_UNDEFINED otherwise
 345     * @exception NumberFormatException if parse fails
 346     */
 347     public static int parseLowerMap(String s) throws NumberFormatException {
 348         int lowerCase = MAP_UNDEFINED;
 349                 int length = s.length();
 350         if (length >= 4 && length <= 6) {
 351             lowerCase = Integer.parseInt(s, 16);
 352         }
 353         else if (s.length() != 0) {
 354             throw new NumberFormatException();
 355         }
 356         return lowerCase;
 357     }
 358 
 359     /**
 360     * Parse the titlecase mapping attribute for a Unicode character.  If there is a titlecase
 361     * mapping attribute and the parse succeeds, then the hasTitleMap field is set to true,
 362     * the titleMap field of this UnicodeSpec object is updated, and false is returned.
 363     * If the titlecase mapping attribute is an empty string, the parse succeeds but the
 364     * hasTitleMap field is set to false. (and false is returned).
 365     *
 366     * The titlecase mapping attribute should be a four to six digit hexadecimal integer.
 367     *
 368     * @param s   the titlecase mapping attribute extracted from a line of the Unicode data file
 369     * @return   simple title case char mapping if defined, MAP_UNDEFINED otherwise
 370     * @exception NumberFormatException if parse fails
 371     */
 372     public static int parseTitleMap(String s) throws NumberFormatException {
 373         int titleCase = MAP_UNDEFINED;
 374                 int length = s.length();
 375         if (length >= 4 && length <= 6) {
 376             titleCase = Integer.parseInt(s, 16);
 377         }
 378         else if (s.length() != 0) {
 379             throw new NumberFormatException();
 380         }
 381         return titleCase;
 382     }
 383 
 384     /**
 385     * Read and parse a Unicode data file.
 386     *
 387     * @param file   a file specifying the Unicode data file to be read
 388     * @return   an array of UnicodeSpec objects, one for each line of the
 389     *           Unicode data file that could be successfully parsed as
 390     *           specifying Unicode character attributes
 391     */
 392 
 393     public static UnicodeSpec[] readSpecFile(File file, int plane) throws FileNotFoundException {
 394         ArrayList<UnicodeSpec> list = new ArrayList<>(3000);
 395         UnicodeSpec[] result = null;
 396         int count = 0;
 397         BufferedReader f = new BufferedReader(new FileReader(file));
 398         String line = null;
 399         loop:
 400         while(true) {
 401             try {
 402                 line = f.readLine();
 403             }
 404             catch (IOException e) {
 405                 break loop;
 406             }
 407             if (line == null) break loop;
 408             UnicodeSpec item = parse(line.trim());
 409             int specPlane = item.getCodePoint() >>> 16;
 410             if (specPlane < plane) continue;
 411             if (specPlane > plane) break;
 412 
 413             if (item != null) {
 414                 list.add(item);
 415             }
 416         }
 417         result = new UnicodeSpec[list.size()];
 418         list.toArray(result);
 419         return result;
 420     }
 421 
 422     void setCodePoint(int value) {
 423         codePoint = value;
 424     }
 425 
 426     /**
 427      * Return the code point in this Unicode specification
 428      * @return the char code point representing by the specification
 429      */
 430     public int getCodePoint() {
 431         return codePoint;
 432     }
 433 
 434     void setName(String name) {
 435         this.name = name;
 436     }
 437 
 438     public String getName() {
 439         return name;
 440     }
 441 
 442     void setGeneralCategory(byte category) {
 443         generalCategory = category;
 444     }
 445 
 446     public byte getGeneralCategory() {
 447         return generalCategory;
 448     }
 449 
 450     void setBidiCategory(byte category) {
 451         bidiCategory = category;
 452     }
 453 
 454     public byte getBidiCategory() {
 455         return bidiCategory;
 456     }
 457 
 458     void setCombiningClass(int combiningClass) {
 459         this.combiningClass = combiningClass;
 460     }
 461 
 462     public int getCombiningClass() {
 463         return combiningClass;
 464     }
 465 
 466     void setDecomposition(String decomposition) {
 467         this.decomposition = decomposition;
 468     }
 469 
 470     public String getDecomposition() {
 471          return decomposition;
 472     }
 473 
 474     void setDecimalValue(int value) {
 475         decimalValue = value;
 476     }
 477 
 478     public int getDecimalValue() {
 479         return decimalValue;
 480     }
 481 
 482     public boolean isDecimalValue() {
 483         return decimalValue != -1;
 484     }
 485 
 486     void setDigitValue(int value) {
 487         digitValue = value;
 488     }
 489 
 490     public int getDigitValue() {
 491         return digitValue;
 492     }
 493 
 494     public boolean isDigitValue() {
 495         return digitValue != -1;
 496     }
 497 
 498     void setNumericValue(String value) {
 499         numericValue = value;
 500     }
 501 
 502     public String getNumericValue() {
 503         return numericValue;
 504     }
 505 
 506     public boolean isNumericValue() {
 507         return numericValue.length() > 0;
 508     }
 509 
 510     void setMirrored(boolean value) {
 511         mirrored = value;
 512     }
 513 
 514     public boolean isMirrored() {
 515         return mirrored;
 516     }
 517 
 518     void setOldName(String name) {
 519         oldName = name;
 520     }
 521 
 522     public String getOldName() {
 523         return oldName;
 524     }
 525 
 526     void setComment(String comment) {
 527         this.comment = comment;
 528     }
 529 
 530     public String getComment() {
 531         return comment;
 532     }
 533 
 534     void setUpperMap(int ch) {
 535         upperMap = ch;
 536     };
 537 
 538     public int getUpperMap() {
 539         return upperMap;
 540     }
 541 
 542     public boolean hasUpperMap() {
 543         return upperMap != MAP_UNDEFINED;
 544     }
 545 
 546     void setLowerMap(int ch) {
 547         lowerMap = ch;
 548     }
 549 
 550     public int getLowerMap() {
 551         return lowerMap;
 552     }
 553 
 554     public boolean hasLowerMap() {
 555         return lowerMap != MAP_UNDEFINED;
 556     }
 557 
 558     void setTitleMap(int ch) {
 559         titleMap = ch;
 560     }
 561 
 562     public int getTitleMap() {
 563         return titleMap;
 564     }
 565 
 566     public boolean hasTitleMap() {
 567         return titleMap != MAP_UNDEFINED;
 568     }
 569 
 570     int codePoint;         // the characters UTF-32 code value
 571     String name;            // the ASCII name
 572     byte generalCategory;   // general category, available via Characte.getType()
 573     byte bidiCategory;      // available via Character.getBidiType()
 574     int combiningClass;     // not used in Character
 575     String decomposition;   // not used in Character
 576     int decimalValue;       // decimal digit value
 577     int digitValue;         // not all digits are decimal
 578     String numericValue;    // numeric value if digit or non-digit
 579     boolean mirrored;       //
 580     String oldName;
 581     String comment;
 582     int upperMap;
 583     int lowerMap;
 584     int titleMap;
 585 
 586     // this is the number of fields in one line of the UnicodeData.txt file
 587     // each field is separated by a semicolon (a token)
 588     static final int REQUIRED_FIELDS = 15;
 589 
 590     /**
 591      * General category types
 592      * To preserve compatibility, these values cannot be changed
 593      */
 594     public static final byte
 595         UNASSIGNED                  =  0, // Cn normative
 596         UPPERCASE_LETTER            =  1, // Lu normative
 597         LOWERCASE_LETTER            =  2, // Ll normative
 598         TITLECASE_LETTER            =  3, // Lt normative
 599         MODIFIER_LETTER             =  4, // Lm normative
 600         OTHER_LETTER                =  5, // Lo normative
 601         NON_SPACING_MARK            =  6, // Mn informative
 602         ENCLOSING_MARK              =  7, // Me informative
 603         COMBINING_SPACING_MARK      =  8, // Mc normative
 604         DECIMAL_DIGIT_NUMBER        =  9, // Nd normative
 605         LETTER_NUMBER               = 10, // Nl normative
 606         OTHER_NUMBER                = 11, // No normative
 607         SPACE_SEPARATOR             = 12, // Zs normative
 608         LINE_SEPARATOR              = 13, // Zl normative
 609         PARAGRAPH_SEPARATOR         = 14, // Zp normative
 610         CONTROL                     = 15, // Cc normative
 611         FORMAT                      = 16, // Cf normative
 612         // 17 is unused for no apparent reason,
 613         // but must preserve forward compatibility
 614         PRIVATE_USE                 = 18, // Co normative
 615         SURROGATE                   = 19, // Cs normative
 616         DASH_PUNCTUATION            = 20, // Pd informative
 617         START_PUNCTUATION           = 21, // Ps informative
 618         END_PUNCTUATION             = 22, // Pe informative
 619         CONNECTOR_PUNCTUATION       = 23, // Pc informative
 620         OTHER_PUNCTUATION           = 24, // Po informative
 621         MATH_SYMBOL                 = 25, // Sm informative
 622         CURRENCY_SYMBOL             = 26, // Sc informative
 623         MODIFIER_SYMBOL             = 27, // Sk informative
 624         OTHER_SYMBOL                = 28, // So informative
 625         INITIAL_QUOTE_PUNCTUATION   = 29, // Pi informative
 626         FINAL_QUOTE_PUNCTUATION     = 30, // Pf informative
 627 
 628         // this value is only used in the character generation tool
 629         // it can change to accommodate the addition of new categories.
 630         GENERAL_CATEGORY_COUNT      = 31; // sentinel value
 631 
 632     static final byte SHORT = 0, LONG = 1;
 633     // general category type strings
 634     // NOTE: The order of this category array is dependent on the assignment of
 635     // category constants above. We want to access this array using constants above.
 636     // [][SHORT] is the SHORT name, [][LONG] is the LONG name
 637     static final String[][] generalCategoryList = {
 638         {"Cn", "UNASSIGNED"},
 639         {"Lu", "UPPERCASE_LETTER"},
 640         {"Ll", "LOWERCASE_LETTER"},
 641         {"Lt", "TITLECASE_LETTER"},
 642         {"Lm", "MODIFIER_LETTER"},
 643         {"Lo", "OTHER_LETTER"},
 644         {"Mn", "NON_SPACING_MARK"},
 645         {"Me", "ENCLOSING_MARK"},
 646         {"Mc", "COMBINING_SPACING_MARK"},
 647         {"Nd", "DECIMAL_DIGIT_NUMBER"},
 648         {"Nl", "LETTER_NUMBER"},
 649         {"No", "OTHER_NUMBER"},
 650         {"Zs", "SPACE_SEPARATOR"},
 651         {"Zl", "LINE_SEPARATOR"},
 652         {"Zp", "PARAGRAPH_SEPARATOR"},
 653         {"Cc", "CONTROL"},
 654         {"Cf", "FORMAT"},
 655         {"xx", "unused"},
 656         {"Co", "PRIVATE_USE"},
 657         {"Cs", "SURROGATE"},
 658         {"Pd", "DASH_PUNCTUATION"},
 659         {"Ps", "START_PUNCTUATION"},
 660         {"Pe", "END_PUNCTUATION"},
 661         {"Pc", "CONNECTOR_PUNCTUATION"},
 662         {"Po", "OTHER_PUNCTUATION"},
 663         {"Sm", "MATH_SYMBOL"},
 664         {"Sc", "CURRENCY_SYMBOL"},
 665         {"Sk", "MODIFIER_SYMBOL"},
 666         {"So", "OTHER_SYMBOL"},
 667         {"Pi", "INITIAL_QUOTE_PUNCTUATION"},
 668         {"Pf", "FINAL_QUOTE_PUNCTUATION"}
 669     };
 670 
 671     /**
 672      * Bidirectional categories
 673      */
 674     public static final byte
 675         DIRECTIONALITY_UNDEFINED                  = -1,
 676 
 677         // Strong category
 678         DIRECTIONALITY_LEFT_TO_RIGHT              =  0, // L
 679         DIRECTIONALITY_RIGHT_TO_LEFT              =  1, // R
 680         DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC       =  2, // AL
 681         // Weak category
 682         DIRECTIONALITY_EUROPEAN_NUMBER            =  3, // EN
 683         DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR  =  4, // ES
 684         DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR =  5, // ET
 685         DIRECTIONALITY_ARABIC_NUMBER              =  6, // AN
 686         DIRECTIONALITY_COMMON_NUMBER_SEPARATOR    =  7, // CS
 687         DIRECTIONALITY_NONSPACING_MARK            =  8, // NSM
 688         DIRECTIONALITY_BOUNDARY_NEUTRAL           =  9, // BN
 689         // Neutral category
 690         DIRECTIONALITY_PARAGRAPH_SEPARATOR        = 10, // B
 691         DIRECTIONALITY_SEGMENT_SEPARATOR          = 11, // S
 692         DIRECTIONALITY_WHITESPACE                 = 12, // WS
 693         DIRECTIONALITY_OTHER_NEUTRALS             = 13, // ON
 694         // Explicit Formatting category
 695         DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING    = 14, // LRE
 696         DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE     = 15, // LRO
 697         DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING    = 16, // RLE
 698         DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE     = 17, // RLO
 699         DIRECTIONALITY_POP_DIRECTIONAL_FORMAT     = 18, // PDF
 700         DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE      = 19, // LRI
 701         DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE      = 20, // RLI
 702         DIRECTIONALITY_FIRST_STRONG_ISOLATE       = 21, // FSI
 703         DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE    = 22, // PDI
 704 
 705         DIRECTIONALITY_CATEGORY_COUNT             = 23; // sentinel value
 706 
 707     // If changes are made to the above bidi category assignments, this
 708     // list of bidi category names must be changed to keep their order in synch.
 709     // Access this list using the bidi category constants above.
 710     static final String[][] bidiCategoryList = {
 711         {"L", "DIRECTIONALITY_LEFT_TO_RIGHT"},
 712         {"R", "DIRECTIONALITY_RIGHT_TO_LEFT"},
 713         {"AL", "DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC"},
 714         {"EN", "DIRECTIONALITY_EUROPEAN_NUMBER"},
 715         {"ES", "DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR"},
 716         {"ET", "DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR"},
 717         {"AN", "DIRECTIONALITY_ARABIC_NUMBER"},
 718         {"CS", "DIRECTIONALITY_COMMON_NUMBER_SEPARATOR"},
 719         {"NSM", "DIRECTIONALITY_NONSPACING_MARK"},
 720         {"BN", "DIRECTIONALITY_BOUNDARY_NEUTRAL"},
 721         {"B", "DIRECTIONALITY_PARAGRAPH_SEPARATOR"},
 722         {"S", "DIRECTIONALITY_SEGMENT_SEPARATOR"},
 723         {"WS", "DIRECTIONALITY_WHITESPACE"},
 724         {"ON", "DIRECTIONALITY_OTHER_NEUTRALS"},
 725         {"LRE", "DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING"},
 726         {"LRO", "DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE"},
 727         {"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"},
 728         {"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"},
 729         {"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"},
 730         {"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"},
 731         {"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"},
 732         {"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"},
 733         {"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"},
 734     };
 735 
 736     // Unicode specification lines have fields in this order.
 737     static final byte
 738         FIELD_VALUE         = 0,
 739         FIELD_NAME          = 1,
 740         FIELD_CATEGORY      = 2,
 741         FIELD_CLASS         = 3,
 742         FIELD_BIDI          = 4,
 743         FIELD_DECOMPOSITION = 5,
 744         FIELD_DECIMAL       = 6,
 745         FIELD_DIGIT         = 7,
 746         FIELD_NUMERIC       = 8,
 747         FIELD_MIRRORED      = 9,
 748         FIELD_OLDNAME       = 10,
 749         FIELD_COMMENT       = 11,
 750         FIELD_UPPERCASE     = 12,
 751         FIELD_LOWERCASE     = 13,
 752         FIELD_TITLECASE     = 14;
 753 
 754         static final Pattern tokenSeparator = Pattern.compile(";");
 755 
 756         public static void main(String[] args) {
 757                 UnicodeSpec[] spec = null;
 758                 if (args.length == 2 ) {
 759                         try {
 760                                 File file = new File(args[0]);
 761                                 int plane = Integer.parseInt(args[1]);
 762                                 spec = UnicodeSpec.readSpecFile(file, plane);
 763                                 System.out.println("UnicodeSpec[" + spec.length + "]:");
 764                                 for (int x=0; x<spec.length; x++) {
 765                                         System.out.println(spec[x].toString());
 766                                 }
 767                         }
 768                         catch(Exception e) {
 769                                 e.printStackTrace();
 770                         }
 771                 }
 772 
 773         }
 774 
 775 }