1 /*
   2  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 
  27 import java.io.BufferedReader;
  28 import java.io.FileReader;
  29 import java.io.FileNotFoundException;
  30 import java.io.IOException;
  31 import java.io.File;
  32 import java.util.regex.Pattern;
  33 import java.util.ArrayList;
  34 
  35 /**
  36  * The UnicodeSpec class provides a way to read in Unicode character
  37  * properties from a Unicode data file.  One instance of class UnicodeSpec
  38  * holds a decoded version of one line of the data file.  The file may
  39  * be obtained from www.unicode.org.  The method readSpecFile returns an array
  40  * of UnicodeSpec objects.
  41  *
  42  * @author      Guy Steele
  43  * @author  John O'Conner
  44  */
  45 
  46 public class UnicodeSpec {
  47 
  48     public UnicodeSpec() {
  49         this(0xffff);
  50     }
  51 
  52     public UnicodeSpec(int codePoint) {
  53         this.codePoint = codePoint;
  54         generalCategory = UNASSIGNED;
  55         bidiCategory = DIRECTIONALITY_UNDEFINED;
  56         mirrored = false;
  57         titleMap = 0xFFFF;
  58         upperMap = 0xFFFF;
  59         lowerMap = 0xFFFF;
  60         decimalValue = -1;
  61         digitValue = -1;
  62         numericValue = "";
  63                 oldName = null;
  64                 comment = null;
  65                 name = null;
  66     }
  67 
  68     public String toString() {
  69         StringBuffer result = new StringBuffer(hex6(codePoint));
  70         if (getUpperMap() != 0xffff) {
  71             result.append(", upper=").append(hex6(upperMap));
  72         }
  73         if (getLowerMap() != 0xffff) {
  74             result.append(", lower=").append(hex6(lowerMap));
  75         }
  76         if (getTitleMap() != 0xffff) {
  77             result.append(", title=").append(hex6(titleMap));
  78         }
  79         return result.toString();
  80     }
  81 
  82     static String hex4(int n) {
  83         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
  84         return "0000".substring(Math.min(4, q.length())) + q;
  85     }
  86 
  87         static String hex6(int n) {
  88                 String str = Integer.toHexString(n & 0xFFFFFF).toUpperCase();
  89                 return "000000".substring(Math.min(6, str.length())) + str;
  90 
  91         }
  92 
  93 
  94     /**
  95     * Given one line of a Unicode data file as a String, parse the line
  96     * and return a UnicodeSpec object that contains the same character information.
  97     *
  98     * @param s a line of the Unicode data file to be parsed
  99     * @return a UnicodeSpec object, or null if the parsing process failed for some reason
 100     */
 101     public static UnicodeSpec parse(String s) {
 102         UnicodeSpec spec = null;
 103         String[] tokens = null;
 104 
 105         try {
 106                         tokens = tokenSeparator.split(s, REQUIRED_FIELDS);
 107             spec = new UnicodeSpec();
 108             spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE]));
 109             spec.setName(parseName(tokens[FIELD_NAME]));
 110             spec.setGeneralCategory(parseGeneralCategory(tokens[FIELD_CATEGORY]));
 111             spec.setBidiCategory(parseBidiCategory(tokens[FIELD_BIDI]));
 112             spec.setCombiningClass(parseCombiningClass(tokens[FIELD_CLASS]));
 113             spec.setDecomposition(parseDecomposition(tokens[FIELD_DECOMPOSITION]));
 114             spec.setDecimalValue(parseDecimalValue(tokens[FIELD_DECIMAL]));
 115             spec.setDigitValue(parseDigitValue(tokens[FIELD_DIGIT]));
 116             spec.setNumericValue(parseNumericValue(tokens[FIELD_NUMERIC]));
 117             spec.setMirrored(parseMirrored(tokens[FIELD_MIRRORED]));
 118             spec.setOldName(parseOldName(tokens[FIELD_OLDNAME]));
 119             spec.setComment(parseComment(tokens[FIELD_COMMENT]));
 120             spec.setUpperMap(parseUpperMap(tokens[FIELD_UPPERCASE]));
 121             spec.setLowerMap(parseLowerMap(tokens[FIELD_LOWERCASE]));
 122             spec.setTitleMap(parseTitleMap(tokens[FIELD_TITLECASE]));
 123         }
 124         catch(Exception e) {
 125             spec = null;
 126             System.out.println("Error parsing spec line.");
 127         }
 128         return spec;
 129     }
 130 
 131     /**
 132     * Parse the codePoint attribute for a Unicode character.  If the parse succeeds,
 133     * the codePoint field of this UnicodeSpec object is updated and false is returned.
 134     *
 135     * The codePoint attribute should be a four-digit hexadecimal integer.
 136     *
 137     * @param s   the codePoint attribute extracted from a line of the Unicode data file
 138     * @return   code point if successful
 139     * @exception NumberFormatException if unable to parse argument
 140     */
 141     public static int parseCodePoint(String s) throws NumberFormatException {
 142         return Integer.parseInt(s, 16);
 143     }
 144 
 145     public static String parseName(String s) throws Exception {
 146         if (s==null) throw new Exception("Cannot parse name.");
 147         return s;
 148     }
 149 
 150     public static byte parseGeneralCategory(String s) throws Exception {
 151         byte category = GENERAL_CATEGORY_COUNT;
 152 
 153         for (byte x=0; x<generalCategoryList.length; x++) {
 154             if (s.equals(generalCategoryList[x][SHORT])) {
 155                 category = x;
 156                 break;
 157             }
 158         }
 159         if (category >= GENERAL_CATEGORY_COUNT) {
 160             throw new Exception("Could not parse general category.");
 161         }
 162         return category;
 163     }
 164 
 165     public static byte parseBidiCategory(String s) throws Exception {
 166         byte category = DIRECTIONALITY_CATEGORY_COUNT;
 167 
 168         for (byte x=0; x<bidiCategoryList.length; x++) {
 169             if (s.equals(bidiCategoryList[x][SHORT])) {
 170                 category = x;
 171                 break;
 172             }
 173         }
 174         if (category >= DIRECTIONALITY_CATEGORY_COUNT) {
 175             throw new Exception("Could not parse bidi category.");
 176         }
 177         return category;
 178     }
 179 
 180 
 181     /**
 182     * Parse the combining attribute for a Unicode character.  If there is a combining
 183     * attribute and the parse succeeds, then the hasCombining field is set to true,
 184     * the combining field of this UnicodeSpec object is updated, and false is returned.
 185     * If the combining attribute is an empty string, the parse succeeds but the
 186     * hasCombining field is set to false. (and false is returned).
 187     *
 188     * The combining attribute, if any, should be a nonnegative decimal integer.
 189     *
 190     * @param s   the combining attribute extracted from a line of the Unicode data file
 191     * @return   the combining class value if any, -1 if property not defined
 192     * @exception Exception if can't parse the combining class
 193     */
 194 
 195     public static int parseCombiningClass(String s) throws Exception {
 196         int combining = -1;
 197         if (s.length()>0) {
 198             combining = Integer.parseInt(s, 10);
 199         }
 200         return combining;
 201     }
 202 
 203     /**
 204     * Parse the decomposition attribute for a Unicode character.  If the parse succeeds,
 205     * the decomposition field of this UnicodeSpec object is updated and false is returned.
 206     *
 207     * The decomposition attribute is complicated; for now, it is treated as a string.
 208     *
 209     * @param s   the decomposition attribute extracted from a line of the Unicode data file
 210     * @return   true if the parse failed; otherwise false
 211     */
 212 
 213     public static String parseDecomposition(String s) throws Exception {
 214         if (s==null) throw new Exception("Cannot parse decomposition.");
 215         return s;
 216     }
 217 
 218 
 219     /**
 220     * Parse the decimal value attribute for a Unicode character.  If there is a decimal value
 221     * attribute and the parse succeeds, then the hasDecimalValue field is set to true,
 222     * the decimalValue field of this UnicodeSpec object is updated, and false is returned.
 223     * If the decimal value attribute is an empty string, the parse succeeds but the
 224     * hasDecimalValue field is set to false. (and false is returned).
 225     *
 226     * The decimal value attribute, if any, should be a nonnegative decimal integer.
 227     *
 228     * @param s   the decimal value attribute extracted from a line of the Unicode data file
 229     * @return   the decimal value as an int, -1 if no decimal value defined
 230     * @exception NumberFormatException if the parse fails
 231     */
 232     public static int parseDecimalValue(String s) throws NumberFormatException {
 233         int value = -1;
 234 
 235         if (s.length() > 0) {
 236             value = Integer.parseInt(s, 10);
 237         }
 238         return value;
 239     }
 240 
 241     /**
 242     * Parse the digit value attribute for a Unicode character.  If there is a digit value
 243     * attribute and the parse succeeds, then the hasDigitValue field is set to true,
 244     * the digitValue field of this UnicodeSpec object is updated, and false is returned.
 245     * If the digit value attribute is an empty string, the parse succeeds but the
 246     * hasDigitValue field is set to false. (and false is returned).
 247     *
 248     * The digit value attribute, if any, should be a nonnegative decimal integer.
 249     *
 250     * @param s   the digit value attribute extracted from a line of the Unicode data file
 251     * @return   the digit value as an non-negative int, or -1 if no digit property defined
 252     * @exception NumberFormatException if the parse fails
 253     */
 254     public static int parseDigitValue(String s) throws NumberFormatException {
 255         int value = -1;
 256 
 257         if (s.length() > 0) {
 258             value = Integer.parseInt(s, 10);
 259         }
 260         return value;
 261     }
 262 
 263     public static String parseNumericValue(String s) throws Exception {
 264         if (s == null) throw new Exception("Cannot parse numeric value.");
 265         return s;
 266     }
 267 
 268     public static String parseComment(String s) throws Exception {
 269         if (s == null) throw new Exception("Cannot parse comment.");
 270         return s;
 271     }
 272 
 273     public static boolean parseMirrored(String s) throws Exception {
 274         boolean mirrored;
 275         if (s.length() == 1) {
 276             if (s.charAt(0) == 'Y') {mirrored = true;}
 277             else if (s.charAt(0) == 'N') {mirrored = false;}
 278             else {throw new Exception("Cannot parse mirrored property.");}
 279         }
 280         else { throw new Exception("Cannot parse mirrored property.");}
 281         return mirrored;
 282     }
 283 
 284     public static String parseOldName(String s) throws Exception {
 285         if (s == null) throw new Exception("Cannot parse old name");
 286         return s;
 287     }
 288 
 289     /**
 290     * Parse the uppercase mapping attribute for a Unicode character.  If there is a uppercase
 291     * mapping attribute and the parse succeeds, then the hasUpperMap field is set to true,
 292     * the upperMap field of this UnicodeSpec object is updated, and false is returned.
 293     * If the uppercase mapping attribute is an empty string, the parse succeeds but the
 294     * hasUpperMap field is set to false. (and false is returned).
 295     *
 296     * The uppercase mapping attribute should be a four-digit hexadecimal integer.
 297     *
 298     * @param s   the uppercase mapping attribute extracted from a line of the Unicode data file
 299     * @return   uppercase char if defined, \uffff otherwise
 300     * @exception NumberFormatException if parse fails
 301     */
 302     public static int parseUpperMap(String s) throws NumberFormatException {
 303         int upperCase = 0xFFFF;
 304 
 305         if (s.length() >= 4) {
 306             upperCase = Integer.parseInt(s, 16);
 307         }
 308         else if (s.length() != 0) {
 309             throw new NumberFormatException();
 310         }
 311         return upperCase;
 312     }
 313 
 314     /**
 315     * Parse the lowercase mapping attribute for a Unicode character.  If there is a lowercase
 316     * mapping attribute and the parse succeeds, then the hasLowerMap field is set to true,
 317     * the lowerMap field of this UnicodeSpec object is updated, and false is returned.
 318     * If the lowercase mapping attribute is an empty string, the parse succeeds but the
 319      * hasLowerMap field is set to false. (and false is returned).
 320     *
 321     * The lowercase mapping attribute should be a four-digit hexadecimal integer.
 322     *
 323     * @param s   the lowercase mapping attribute extracted from a line of the Unicode data file
 324     * @return   lowercase char mapping if defined, \uFFFF otherwise
 325     * @exception NumberFormatException if parse fails
 326     */
 327     public static int parseLowerMap(String s) throws NumberFormatException {
 328         int lowerCase = 0xFFFF;
 329 
 330         if (s.length() >= 4) {
 331             lowerCase = Integer.parseInt(s, 16);
 332         }
 333         else if (s.length() != 0) {
 334             throw new NumberFormatException();
 335         }
 336         return lowerCase;
 337     }
 338 
 339     /**
 340     * Parse the titlecase mapping attribute for a Unicode character.  If there is a titlecase
 341     * mapping attribute and the parse succeeds, then the hasTitleMap field is set to true,
 342     * the titleMap field of this UnicodeSpec object is updated, and false is returned.
 343     * If the titlecase mapping attribute is an empty string, the parse succeeds but the
 344     * hasTitleMap field is set to false. (and false is returned).
 345     *
 346     * The titlecase mapping attribute should be a four-digit hexadecimal integer.
 347     *
 348     * @param s   the titlecase mapping attribute extracted from a line of the Unicode data file
 349     * @return   title case char mapping if defined, \uFFFF otherwise
 350     * @exception NumberFormatException if parse fails
 351     */
 352     public static int parseTitleMap(String s) throws NumberFormatException {
 353         int titleCase = 0xFFFF;
 354 
 355         if (s.length() >= 4) {
 356             titleCase = Integer.parseInt(s, 16);
 357         }
 358         else if (s.length() != 0) {
 359             throw new NumberFormatException();
 360         }
 361         return titleCase;
 362     }
 363 
 364     /**
 365     * Read and parse a Unicode data file.
 366     *
 367     * @param file   a file specifying the Unicode data file to be read
 368     * @return   an array of UnicodeSpec objects, one for each line of the
 369     *           Unicode data file that could be successfully parsed as
 370     *           specifying Unicode character attributes
 371     */
 372 
 373     public static UnicodeSpec[] readSpecFile(File file, int plane) throws FileNotFoundException {
 374                 ArrayList<UnicodeSpec> list = new ArrayList<>(3000);
 375         UnicodeSpec[] result = null;
 376         int count = 0;
 377         BufferedReader f = new BufferedReader(new FileReader(file));
 378                 String line = null;
 379         loop:
 380         while(true) {
 381             try {
 382                 line = f.readLine();
 383             }
 384             catch (IOException e) {
 385                                 break loop;
 386                         }
 387             if (line == null) break loop;
 388             UnicodeSpec item = parse(line.trim());
 389                         int specPlane = item.getCodePoint() >>> 16;
 390                         if (specPlane < plane) continue;
 391                         if (specPlane > plane) break;
 392 
 393             if (item != null) {
 394                                 list.add(item);
 395             }
 396         }
 397                 result = new UnicodeSpec[list.size()];
 398                 list.toArray(result);
 399         return result;
 400     }
 401 
 402     void setCodePoint(int value) {
 403         codePoint = value;
 404     }
 405 
 406     /**
 407      * Return the code point in this Unicode specification
 408      * @return the char code point representing by the specification
 409      */
 410     public int getCodePoint() {
 411         return codePoint;
 412     }
 413 
 414     void setName(String name) {
 415         this.name = name;
 416     }
 417 
 418     public String getName() {
 419         return name;
 420     }
 421 
 422     void setGeneralCategory(byte category) {
 423         generalCategory = category;
 424     }
 425 
 426     public byte getGeneralCategory() {
 427         return generalCategory;
 428     }
 429 
 430     void setBidiCategory(byte category) {
 431         bidiCategory = category;
 432     }
 433 
 434     public byte getBidiCategory() {
 435         return bidiCategory;
 436     }
 437 
 438     void setCombiningClass(int combiningClass) {
 439         this.combiningClass = combiningClass;
 440     }
 441 
 442     public int getCombiningClass() {
 443         return combiningClass;
 444     }
 445 
 446     void setDecomposition(String decomposition) {
 447         this.decomposition = decomposition;
 448     }
 449 
 450     public String getDecomposition() {
 451          return decomposition;
 452     }
 453 
 454     void setDecimalValue(int value) {
 455         decimalValue = value;
 456     }
 457 
 458     public int getDecimalValue() {
 459         return decimalValue;
 460     }
 461 
 462     public boolean isDecimalValue() {
 463         return decimalValue != -1;
 464     }
 465 
 466     void setDigitValue(int value) {
 467         digitValue = value;
 468     }
 469 
 470     public int getDigitValue() {
 471         return digitValue;
 472     }
 473 
 474     public boolean isDigitValue() {
 475         return digitValue != -1;
 476     }
 477 
 478     void setNumericValue(String value) {
 479         numericValue = value;
 480     }
 481 
 482     public String getNumericValue() {
 483         return numericValue;
 484     }
 485 
 486     public boolean isNumericValue() {
 487         return numericValue.length() > 0;
 488     }
 489 
 490     void setMirrored(boolean value) {
 491         mirrored = value;
 492     }
 493 
 494     public boolean isMirrored() {
 495         return mirrored;
 496     }
 497 
 498     void setOldName(String name) {
 499         oldName = name;
 500     }
 501 
 502     public String getOldName() {
 503         return oldName;
 504     }
 505 
 506     void setComment(String comment) {
 507         this.comment = comment;
 508     }
 509 
 510     public String getComment() {
 511         return comment;
 512     }
 513 
 514     void setUpperMap(int ch) {
 515         upperMap = ch;
 516     };
 517 
 518     public int getUpperMap() {
 519         return upperMap;
 520     }
 521 
 522     public boolean hasUpperMap() {
 523         return upperMap != 0xffff;
 524     }
 525 
 526     void setLowerMap(int ch) {
 527         lowerMap = ch;
 528     }
 529 
 530     public int getLowerMap() {
 531         return lowerMap;
 532     }
 533 
 534     public boolean hasLowerMap() {
 535         return lowerMap != 0xffff;
 536     }
 537 
 538     void setTitleMap(int ch) {
 539         titleMap = ch;
 540     }
 541 
 542     public int getTitleMap() {
 543         return titleMap;
 544     }
 545 
 546     public boolean hasTitleMap() {
 547         return titleMap != 0xffff;
 548     }
 549 
 550     int codePoint;         // the characters UTF-32 code value
 551     String name;            // the ASCII name
 552     byte generalCategory;   // general category, available via Characte.getType()
 553     byte bidiCategory;      // available via Character.getBidiType()
 554     int combiningClass;     // not used in Character
 555     String decomposition;   // not used in Character
 556     int decimalValue;       // decimal digit value
 557     int digitValue;         // not all digits are decimal
 558     String numericValue;    // numeric value if digit or non-digit
 559     boolean mirrored;       //
 560     String oldName;
 561     String comment;
 562     int upperMap;
 563     int lowerMap;
 564     int titleMap;
 565 
 566     // this is the number of fields in one line of the UnicodeData.txt file
 567     // each field is separated by a semicolon (a token)
 568     static final int REQUIRED_FIELDS = 15;
 569 
 570     /**
 571      * General category types
 572      * To preserve compatibility, these values cannot be changed
 573      */
 574     public static final byte
 575         UNASSIGNED                  =  0, // Cn normative
 576         UPPERCASE_LETTER            =  1, // Lu normative
 577         LOWERCASE_LETTER            =  2, // Ll normative
 578         TITLECASE_LETTER            =  3, // Lt normative
 579         MODIFIER_LETTER             =  4, // Lm normative
 580         OTHER_LETTER                =  5, // Lo normative
 581         NON_SPACING_MARK            =  6, // Mn informative
 582         ENCLOSING_MARK              =  7, // Me informative
 583         COMBINING_SPACING_MARK      =  8, // Mc normative
 584         DECIMAL_DIGIT_NUMBER        =  9, // Nd normative
 585         LETTER_NUMBER               = 10, // Nl normative
 586         OTHER_NUMBER                = 11, // No normative
 587         SPACE_SEPARATOR             = 12, // Zs normative
 588         LINE_SEPARATOR              = 13, // Zl normative
 589         PARAGRAPH_SEPARATOR         = 14, // Zp normative
 590         CONTROL                     = 15, // Cc normative
 591         FORMAT                      = 16, // Cf normative
 592         // 17 is unused for no apparent reason,
 593         // but must preserve forward compatibility
 594         PRIVATE_USE                 = 18, // Co normative
 595         SURROGATE                   = 19, // Cs normative
 596         DASH_PUNCTUATION            = 20, // Pd informative
 597         START_PUNCTUATION           = 21, // Ps informative
 598         END_PUNCTUATION             = 22, // Pe informative
 599         CONNECTOR_PUNCTUATION       = 23, // Pc informative
 600         OTHER_PUNCTUATION           = 24, // Po informative
 601         MATH_SYMBOL                 = 25, // Sm informative
 602         CURRENCY_SYMBOL             = 26, // Sc informative
 603         MODIFIER_SYMBOL             = 27, // Sk informative
 604         OTHER_SYMBOL                = 28, // So informative
 605         INITIAL_QUOTE_PUNCTUATION   = 29, // Pi informative
 606         FINAL_QUOTE_PUNCTUATION     = 30, // Pf informative
 607 
 608         // this value is only used in the character generation tool
 609         // it can change to accommodate the addition of new categories.
 610         GENERAL_CATEGORY_COUNT      = 31; // sentinel value
 611 
 612     static final byte SHORT = 0, LONG = 1;
 613     // general category type strings
 614     // NOTE: The order of this category array is dependent on the assignment of
 615     // category constants above. We want to access this array using constants above.
 616     // [][SHORT] is the SHORT name, [][LONG] is the LONG name
 617     static final String[][] generalCategoryList = {
 618         {"Cn", "UNASSIGNED"},
 619         {"Lu", "UPPERCASE_LETTER"},
 620         {"Ll", "LOWERCASE_LETTER"},
 621         {"Lt", "TITLECASE_LETTER"},
 622         {"Lm", "MODIFIER_LETTER"},
 623         {"Lo", "OTHER_LETTER"},
 624         {"Mn", "NON_SPACING_MARK"},
 625         {"Me", "ENCLOSING_MARK"},
 626         {"Mc", "COMBINING_SPACING_MARK"},
 627         {"Nd", "DECIMAL_DIGIT_NUMBER"},
 628         {"Nl", "LETTER_NUMBER"},
 629         {"No", "OTHER_NUMBER"},
 630         {"Zs", "SPACE_SEPARATOR"},
 631         {"Zl", "LINE_SEPARATOR"},
 632         {"Zp", "PARAGRAPH_SEPARATOR"},
 633         {"Cc", "CONTROL"},
 634         {"Cf", "FORMAT"},
 635         {"xx", "unused"},
 636         {"Co", "PRIVATE_USE"},
 637         {"Cs", "SURROGATE"},
 638         {"Pd", "DASH_PUNCTUATION"},
 639         {"Ps", "START_PUNCTUATION"},
 640         {"Pe", "END_PUNCTUATION"},
 641         {"Pc", "CONNECTOR_PUNCTUATION"},
 642         {"Po", "OTHER_PUNCTUATION"},
 643         {"Sm", "MATH_SYMBOL"},
 644         {"Sc", "CURRENCY_SYMBOL"},
 645         {"Sk", "MODIFIER_SYMBOL"},
 646         {"So", "OTHER_SYMBOL"},
 647         {"Pi", "INITIAL_QUOTE_PUNCTUATION"},
 648         {"Pf", "FINAL_QUOTE_PUNCTUATION"}
 649     };
 650 
 651     /**
 652      * Bidirectional categories
 653      */
 654     public static final byte
 655                 DIRECTIONALITY_UNDEFINED                  = -1,
 656         // Strong category
 657         DIRECTIONALITY_LEFT_TO_RIGHT              =  0, // L
 658         DIRECTIONALITY_RIGHT_TO_LEFT              =  1, // R
 659         DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC       =  2, // AL
 660         // Weak category
 661         DIRECTIONALITY_EUROPEAN_NUMBER            =  3, // EN
 662         DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR  =  4, // ES
 663         DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR =  5, // ET
 664         DIRECTIONALITY_ARABIC_NUMBER              =  6, // AN
 665         DIRECTIONALITY_COMMON_NUMBER_SEPARATOR    =  7, // CS
 666         DIRECTIONALITY_NONSPACING_MARK            =  8, // NSM
 667         DIRECTIONALITY_BOUNDARY_NEUTRAL           =  9, // BN
 668         // Neutral category
 669         DIRECTIONALITY_PARAGRAPH_SEPARATOR        = 10, // B
 670         DIRECTIONALITY_SEGMENT_SEPARATOR          = 11, // S
 671         DIRECTIONALITY_WHITESPACE                 = 12, // WS
 672         DIRECTIONALITY_OTHER_NEUTRALS              = 13, // ON
 673 
 674         DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING    = 14, // LRE
 675         DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE     = 15, // LRO
 676         DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING    = 16, // RLE
 677         DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE     = 17, // RLO
 678         DIRECTIONALITY_POP_DIRECTIONAL_FORMAT     = 18, // PDF
 679 
 680         DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE      = 19, // LRI
 681         DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE      = 20, // RLI
 682         DIRECTIONALITY_FIRST_STRONG_ISOLATE       = 21, // FSI
 683         DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE    = 22, // PDI
 684 
 685         DIRECTIONALITY_CATEGORY_COUNT             = 23; // sentinel value
 686 
 687     // If changes are made to the above bidi category assignments, this
 688     // list of bidi category names must be changed to keep their order in synch.
 689     // Access this list using the bidi category constants above.
 690     static final String[][] bidiCategoryList = {
 691         {"L", "DIRECTIONALITY_LEFT_TO_RIGHT"},
 692         {"R", "DIRECTIONALITY_RIGHT_TO_LEFT"},
 693         {"AL", "DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC"},
 694         {"EN", "DIRECTIONALITY_EUROPEAN_NUMBER"},
 695         {"ES", "DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR"},
 696         {"ET", "DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR"},
 697         {"AN", "DIRECTIONALITY_ARABIC_NUMBER"},
 698         {"CS", "DIRECTIONALITY_COMMON_NUMBER_SEPARATOR"},
 699         {"NSM", "DIRECTIONALITY_NONSPACING_MARK"},
 700         {"BN", "DIRECTIONALITY_BOUNDARY_NEUTRAL"},
 701         {"B", "DIRECTIONALITY_PARAGRAPH_SEPARATOR"},
 702         {"S", "DIRECTIONALITY_SEGMENT_SEPARATOR"},
 703         {"WS", "DIRECTIONALITY_WHITESPACE"},
 704         {"ON", "DIRECTIONALITY_OTHER_NEUTRALS"},
 705         {"LRE", "DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING"},
 706         {"LRO", "DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE"},
 707         {"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"},
 708         {"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"},
 709         {"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"},
 710         {"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"},
 711         {"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"},
 712         {"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"},
 713         {"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"},
 714 
 715     };
 716 
 717     // Unicode specification lines have fields in this order.
 718     static final byte
 719         FIELD_VALUE         = 0,
 720         FIELD_NAME          = 1,
 721         FIELD_CATEGORY      = 2,
 722         FIELD_CLASS         = 3,
 723         FIELD_BIDI          = 4,
 724         FIELD_DECOMPOSITION = 5,
 725         FIELD_DECIMAL       = 6,
 726         FIELD_DIGIT         = 7,
 727         FIELD_NUMERIC       = 8,
 728         FIELD_MIRRORED      = 9,
 729         FIELD_OLDNAME       = 10,
 730         FIELD_COMMENT       = 11,
 731         FIELD_UPPERCASE     = 12,
 732         FIELD_LOWERCASE     = 13,
 733         FIELD_TITLECASE     = 14;
 734 
 735         static final Pattern tokenSeparator = Pattern.compile(";");
 736 
 737         public static void main(String[] args) {
 738                 UnicodeSpec[] spec = null;
 739                 if (args.length == 2 ) {
 740                         try {
 741                                 File file = new File(args[0]);
 742                                 int plane = Integer.parseInt(args[1]);
 743                                 spec = UnicodeSpec.readSpecFile(file, plane);
 744                                 System.out.println("UnicodeSpec[" + spec.length + "]:");
 745                                 for (int x=0; x<spec.length; x++) {
 746                                         System.out.println(spec[x].toString());
 747                                 }
 748                         }
 749                         catch(Exception e) {
 750                                 e.printStackTrace();
 751                         }
 752                 }
 753 
 754         }
 755 
 756 }