1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 1996-2014, International Business Machines Corporation and
  28  * others. All Rights Reserved.
  29  *******************************************************************************
  30  */
  31 
  32 package sun.text.normalizer;
  33 
  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 import java.util.Iterator;
  37 import java.util.MissingResourceException;
  38 
  39 import sun.text.normalizer.UCharacter.HangulSyllableType;
  40 import sun.text.normalizer.UCharacter.NumericType;
  41 
  42 /**
  43 * <p>Internal class used for Unicode character property database.</p>
  44 * <p>This classes store binary data read from uprops.icu.
  45 * It does not have the capability to parse the data into more high-level
  46 * information. It only returns bytes of information when required.</p>
  47 * <p>Due to the form most commonly used for retrieval, array of char is used
  48 * to store the binary data.</p>
  49 * <p>UCharacterPropertyDB also contains information on accessing indexes to
  50 * significant points in the binary data.</p>
  51 * <p>Responsibility for molding the binary data into more meaning form lies on
  52 * <a href=UCharacter.html>UCharacter</a>.</p>
  53 * @author Syn Wee Quek
  54 * @since release 2.1, february 1st 2002
  55 */
  56 
  57 final class UCharacterProperty
  58 {
  59     // public data members -----------------------------------------------
  60 
  61     /*
  62      * public singleton instance
  63      */
  64     public static final UCharacterProperty INSTANCE;
  65 
  66     /**
  67     * Trie data
  68     */
  69     public Trie2_16 m_trie_;
  70 
  71     /**
  72     * Unicode version
  73     */
  74     public VersionInfo m_unicodeVersion_;
  75 
  76     /**
  77     * Character type mask
  78     */
  79     public static final int TYPE_MASK = 0x1F;
  80 
  81     // uprops.h enum UPropertySource --------------------------------------- ***
  82 
  83     /** From uchar.c/uprops.icu main trie */
  84     public static final int SRC_CHAR=1;
  85     /** From uchar.c/uprops.icu properties vectors trie */
  86     public static final int SRC_PROPSVEC=2;
  87     /** From ubidi_props.c/ubidi.icu */
  88     public static final int SRC_BIDI=5;
  89     /** From normalizer2impl.cpp/nfc.nrm */
  90     public static final int SRC_NFC=8;
  91     /** From normalizer2impl.cpp/nfkc.nrm */
  92     public static final int SRC_NFKC=9;
  93 
  94     // public methods ----------------------------------------------------
  95 
  96     /**
  97     * Gets the main property value for code point ch.
  98     * @param ch code point whose property value is to be retrieved
  99     * @return property value of code point
 100     */
 101     public final int getProperty(int ch)
 102     {
 103         return m_trie_.get(ch);
 104     }
 105 
 106     /**
 107      * Gets the unicode additional properties.
 108      * Java version of C u_getUnicodeProperties().
 109      * @param codepoint codepoint whose additional properties is to be
 110      *                  retrieved
 111      * @param column The column index.
 112      * @return unicode properties
 113      */
 114     public int getAdditional(int codepoint, int column) {
 115         assert column >= 0;
 116         if (column >= m_additionalColumnsCount_) {
 117             return 0;
 118         }
 119         return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
 120     }
 121 
 122     /**
 123      * <p>Get the "age" of the code point.</p>
 124      * <p>The "age" is the Unicode version when the code point was first
 125      * designated (as a non-character or for Private Use) or assigned a
 126      * character.</p>
 127      * <p>This can be useful to avoid emitting code points to receiving
 128      * processes that do not accept newer characters.</p>
 129      * <p>The data is from the UCD file DerivedAge.txt.</p>
 130      * <p>This API does not check the validity of the codepoint.</p>
 131      * @param codepoint The code point.
 132      * @return the Unicode version number
 133      */
 134     public VersionInfo getAge(int codepoint)
 135     {
 136         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
 137         return VersionInfo.getInstance(
 138                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
 139                            version & LAST_NIBBLE_MASK_, 0, 0);
 140     }
 141 
 142     // int-value and enumerated properties --------------------------------- ***
 143 
 144     public int getType(int c) {
 145         return getProperty(c)&TYPE_MASK;
 146     }
 147 
 148     /*
 149      * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 150      * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 151      */
 152     private static final int[] /* UHangulSyllableType */ gcbToHst={
 153         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
 154         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
 155         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
 156         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
 157         HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
 158         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
 159         HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
 160         HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
 161         HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
 162         HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
 163         /*
 164          * Omit GCB values beyond what we need for hst.
 165          * The code below checks for the array length.
 166          */
 167     };
 168 
 169     private class IntProperty {
 170         int column;  // SRC_PROPSVEC column, or "source" if mask==0
 171         int mask;
 172         int shift;
 173 
 174         IntProperty(int column, int mask, int shift) {
 175             this.column=column;
 176             this.mask=mask;
 177             this.shift=shift;
 178         }
 179 
 180         IntProperty(int source) {
 181             this.column=source;
 182             this.mask=0;
 183         }
 184 
 185         int getValue(int c) {
 186             // systematic, directly stored properties
 187             return (getAdditional(c, column)&mask)>>>shift;
 188         }
 189     }
 190 
 191     private class BiDiIntProperty extends IntProperty {
 192         BiDiIntProperty() {
 193             super(SRC_BIDI);
 194         }
 195     }
 196 
 197     private class CombiningClassIntProperty extends IntProperty {
 198         CombiningClassIntProperty(int source) {
 199             super(source);
 200         }
 201     }
 202 
 203     private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
 204         int which;
 205         int max;
 206 
 207         NormQuickCheckIntProperty(int source, int which, int max) {
 208             super(source);
 209             this.which=which;
 210             this.max=max;
 211         }
 212     }
 213 
 214     private IntProperty intProp =  new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
 215         int getValue(int c) {
 216             return UBiDiProps.INSTANCE.getPairedBracketType(c);
 217         }
 218     };
 219 
 220     public int getIntPropertyValue(int c, int which) {
 221         if (which == BIDI_PAIRED_BRACKET_TYPE) {
 222             return intProp.getValue(c);
 223         }
 224         return 0; // undefined
 225     }
 226 
 227     /**
 228     * Forms a supplementary code point from the argument character<br>
 229     * Note this is for internal use hence no checks for the validity of the
 230     * surrogate characters are done
 231     * @param lead lead surrogate character
 232     * @param trail trailing surrogate character
 233     * @return code point of the supplementary character
 234     */
 235     public static int getRawSupplementary(char lead, char trail)
 236     {
 237         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
 238     }
 239 
 240     /**
 241      * Gets the type mask
 242      * @param type character type
 243      * @return mask
 244      */
 245     public static final int getMask(int type)
 246     {
 247         return 1 << type;
 248     }
 249 
 250     /**
 251      * Returns the digit values of characters like 'A' - 'Z', normal,
 252      * half-width and full-width. This method assumes that the other digit
 253      * characters are checked by the calling method.
 254      * @param ch character to test
 255      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
 256      *         its corresponding digit will be returned.
 257      */
 258     public static int getEuropeanDigit(int ch) {
 259         if ((ch > 0x7a && ch < 0xff21)
 260             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
 261             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
 262             return -1;
 263         }
 264         if (ch <= 0x7a) {
 265             // ch >= 0x41 or ch < 0x61
 266             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
 267         }
 268         // ch >= 0xff21
 269         if (ch <= 0xff3a) {
 270             return ch + 10 - 0xff21;
 271         }
 272         // ch >= 0xff41 && ch <= 0xff5a
 273         return ch + 10 - 0xff41;
 274     }
 275 
 276     public int digit(int c) {
 277         int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
 278         if(value<=9) {
 279             return value;
 280         } else {
 281             return -1;
 282         }
 283     }
 284 
 285     // protected variables -----------------------------------------------
 286 
 287     /**
 288      * Extra property trie
 289      */
 290     Trie2_16 m_additionalTrie_;
 291     /**
 292      * Extra property vectors, 1st column for age and second for binary
 293      * properties.
 294      */
 295     int[] m_additionalVectors_;
 296     /**
 297      * Number of additional columns
 298      */
 299     int m_additionalColumnsCount_;
 300     /**
 301      * Maximum values for block, bits used as in vector word
 302      * 0
 303      */
 304     int m_maxBlockScriptValue_;
 305     /**
 306      * Maximum values for script, bits used as in vector word
 307      * 0
 308      */
 309      int m_maxJTGValue_;
 310     /**
 311      * Script_Extensions data
 312      */
 313     public char[] m_scriptExtensions_;
 314 
 315     // private variables -------------------------------------------------
 316 
 317     /**
 318     * Default name of the datafile
 319     */
 320     private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
 321 
 322     /**
 323     * Shift value for lead surrogate to form a supplementary character.
 324     */
 325     private static final int LEAD_SURROGATE_SHIFT_ = 10;
 326     /**
 327     * Offset to add to combined surrogate pair to avoid masking.
 328     */
 329     private static final int SURROGATE_OFFSET_ =
 330                            UTF16.SUPPLEMENTARY_MIN_VALUE -
 331                            (UTF16.SURROGATE_MIN_VALUE <<
 332                            LEAD_SURROGATE_SHIFT_) -
 333                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
 334 
 335 
 336     // property data constants -------------------------------------------------
 337 
 338     /**
 339      * Numeric types and values in the main properties words.
 340      */
 341     private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
 342     private static final int getNumericTypeValue(int props) {
 343         return props >> NUMERIC_TYPE_VALUE_SHIFT_;
 344     }
 345 
 346     /* constants for the storage form of numeric types and values */
 347     /** No numeric value. */
 348     private static final int NTV_NONE_ = 0;
 349     /** Decimal digits: nv=0..9 */
 350     private static final int NTV_DECIMAL_START_ = 1;
 351     /** Other digits: nv=0..9 */
 352     private static final int NTV_DIGIT_START_ = 11;
 353     /** Small integers: nv=0..154 */
 354     private static final int NTV_NUMERIC_START_ = 21;
 355 
 356     private static final int ntvGetType(int ntv) {
 357         return
 358             (ntv==NTV_NONE_) ? NumericType.NONE :
 359             (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
 360             (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
 361             NumericType.NUMERIC;
 362     }
 363 
 364     /*
 365      * Properties in vector word 0
 366      * Bits
 367      * 31..24   DerivedAge version major/minor one nibble each
 368      * 23..22   3..1: Bits 7..0 = Script_Extensions index
 369      *             3: Script value from Script_Extensions
 370      *             2: Script=Inherited
 371      *             1: Script=Common
 372      *             0: Script=bits 7..0
 373      * 21..20   reserved
 374      * 19..17   East Asian Width
 375      * 16.. 8   UBlockCode
 376      *  7.. 0   UScriptCode
 377      */
 378     /**
 379      * Script_Extensions: mask includes Script
 380      */
 381     public static final int SCRIPT_X_MASK = 0x00c000ff;
 382     //private static final int SCRIPT_X_SHIFT = 22;
 383     /**
 384      * Integer properties mask and shift values for East Asian cell width.
 385      * Equivalent to icu4c UPROPS_EA_MASK
 386      */
 387     private static final int EAST_ASIAN_MASK_ = 0x000e0000;
 388     /**
 389      * Integer properties mask and shift values for East Asian cell width.
 390      * Equivalent to icu4c UPROPS_EA_SHIFT
 391      */
 392     private static final int EAST_ASIAN_SHIFT_ = 17;
 393     /**
 394      * Integer properties mask and shift values for blocks.
 395      * Equivalent to icu4c UPROPS_BLOCK_MASK
 396      */
 397     private static final int BLOCK_MASK_ = 0x0001ff00;
 398     /**
 399      * Integer properties mask and shift values for blocks.
 400      * Equivalent to icu4c UPROPS_BLOCK_SHIFT
 401      */
 402     private static final int BLOCK_SHIFT_ = 8;
 403     /**
 404      * Integer properties mask and shift values for scripts.
 405      * Equivalent to icu4c UPROPS_SHIFT_MASK
 406      */
 407     public static final int SCRIPT_MASK_ = 0x000000ff;
 408 
 409     /**
 410      * Additional properties used in internal trie data
 411      */
 412     /*
 413      * Properties in vector word 1
 414      * Each bit encodes one binary property.
 415      * The following constants represent the bit number, use 1<<UPROPS_XYZ.
 416      * UPROPS_BINARY_1_TOP<=32!
 417      *
 418      * Keep this list of property enums in sync with
 419      * propListNames[] in icu/source/tools/genprops/props2.c!
 420      *
 421      * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 422      */
 423     private static final int WHITE_SPACE_PROPERTY_ = 0;
 424     private static final int DASH_PROPERTY_ = 1;
 425     private static final int HYPHEN_PROPERTY_ = 2;
 426     private static final int QUOTATION_MARK_PROPERTY_ = 3;
 427     private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
 428     private static final int MATH_PROPERTY_ = 5;
 429     private static final int HEX_DIGIT_PROPERTY_ = 6;
 430     private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
 431     private static final int ALPHABETIC_PROPERTY_ = 8;
 432     private static final int IDEOGRAPHIC_PROPERTY_ = 9;
 433     private static final int DIACRITIC_PROPERTY_ = 10;
 434     private static final int EXTENDER_PROPERTY_ = 11;
 435     private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
 436     private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
 437     private static final int GRAPHEME_LINK_PROPERTY_ = 14;
 438     private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
 439     private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
 440     private static final int RADICAL_PROPERTY_ = 17;
 441     private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
 442     private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
 443     private static final int DEPRECATED_PROPERTY_ = 20;
 444     private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
 445     private static final int XID_START_PROPERTY_ = 22;
 446     private static final int XID_CONTINUE_PROPERTY_ = 23;
 447     private static final int ID_START_PROPERTY_    = 24;
 448     private static final int ID_CONTINUE_PROPERTY_ = 25;
 449     private static final int GRAPHEME_BASE_PROPERTY_ = 26;
 450     private static final int S_TERM_PROPERTY_ = 27;
 451     private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
 452     private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
 453     private static final int PATTERN_WHITE_SPACE = 30;
 454 
 455     /*
 456      * Properties in vector word 2
 457      * Bits
 458      * 31..26   reserved
 459      * 25..20   Line Break
 460      * 19..15   Sentence Break
 461      * 14..10   Word Break
 462      *  9.. 5   Grapheme Cluster Break
 463      *  4.. 0   Decomposition Type
 464      */
 465     private static final int LB_MASK          = 0x03f00000;
 466     private static final int LB_SHIFT         = 20;
 467 
 468     private static final int SB_MASK          = 0x000f8000;
 469     private static final int SB_SHIFT         = 15;
 470 
 471     private static final int WB_MASK          = 0x00007c00;
 472     private static final int WB_SHIFT         = 10;
 473 
 474     private static final int GCB_MASK         = 0x000003e0;
 475     private static final int GCB_SHIFT        = 5;
 476 
 477     /**
 478      * Integer properties mask for decomposition type.
 479      * Equivalent to icu4c UPROPS_DT_MASK.
 480      */
 481     private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
 482 
 483     /**
 484      * First nibble shift
 485      */
 486     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
 487     /**
 488      * Second nibble mask
 489      */
 490     private static final int LAST_NIBBLE_MASK_ = 0xF;
 491     /**
 492      * Age value shift
 493      */
 494     private static final int AGE_SHIFT_ = 24;
 495 
 496     // private constructors --------------------------------------------------
 497 
 498     /**
 499      * Constructor
 500      * @exception IOException thrown when data reading fails or data corrupted
 501      */
 502     private UCharacterProperty() throws IOException
 503     {
 504         // jar access
 505         ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
 506         m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
 507         // Read or skip the 16 indexes.
 508         int propertyOffset = bytes.getInt();
 509         /* exceptionOffset = */ bytes.getInt();
 510         /* caseOffset = */ bytes.getInt();
 511         int additionalOffset = bytes.getInt();
 512         int additionalVectorsOffset = bytes.getInt();
 513         m_additionalColumnsCount_ = bytes.getInt();
 514         int scriptExtensionsOffset = bytes.getInt();
 515         int reservedOffset7 = bytes.getInt();
 516         /* reservedOffset8 = */ bytes.getInt();
 517         /* dataTopOffset = */ bytes.getInt();
 518         m_maxBlockScriptValue_ = bytes.getInt();
 519         m_maxJTGValue_ = bytes.getInt();
 520         ICUBinary.skipBytes(bytes, (16 - 12) << 2);
 521 
 522         // read the main properties trie
 523         m_trie_ = Trie2_16.createFromSerialized(bytes);
 524         int expectedTrieLength = (propertyOffset - 16) * 4;
 525         int trieLength = m_trie_.getSerializedLength();
 526         if(trieLength > expectedTrieLength) {
 527             throw new IOException("uprops.icu: not enough bytes for main trie");
 528         }
 529         // skip padding after trie bytes
 530         ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
 531 
 532         // skip unused intervening data structures
 533         ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
 534 
 535         if(m_additionalColumnsCount_ > 0) {
 536             // reads the additional property block
 537             m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
 538             expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
 539             trieLength = m_additionalTrie_.getSerializedLength();
 540             if(trieLength > expectedTrieLength) {
 541                 throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
 542             }
 543             // skip padding after trie bytes
 544             ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
 545 
 546             // additional properties
 547             int size = scriptExtensionsOffset - additionalVectorsOffset;
 548             m_additionalVectors_ = new int[size];
 549             for (int i = 0; i < size; i ++) {
 550                 m_additionalVectors_[i] = bytes.getInt();
 551             }
 552         }
 553 
 554         // Script_Extensions
 555         int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
 556         if(numChars > 0) {
 557             m_scriptExtensions_ = new char[numChars];
 558             for(int i = 0; i < numChars; ++i) {
 559                 m_scriptExtensions_[i] = bytes.getChar();
 560             }
 561         }
 562     }
 563 
 564     private static final class IsAcceptable implements ICUBinary.Authenticate {
 565         // @Override when we switch to Java 6
 566         public boolean isDataVersionAcceptable(byte[] version) {
 567             return version[0] == 7;
 568         }
 569     }
 570 
 571     private static final int DATA_FORMAT = 0x5550726F;  // "UPro"
 572 
 573     public void upropsvec_addPropertyStarts(UnicodeSet set) {
 574         /* add the start code point of each same-value range of the properties vectors trie */
 575         if(m_additionalColumnsCount_>0) {
 576             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
 577             Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
 578             Trie2.Range range;
 579             while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
 580                 set.add(range.startCodePoint);
 581             }
 582         }
 583     }
 584 
 585     // This static initializer block must be placed after
 586     // other static member initialization
 587     static {
 588         try {
 589             INSTANCE = new UCharacterProperty();
 590         }
 591         catch (IOException e) {
 592             throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
 593         }
 594     }
 595 
 596 
 597     // Moved from UProperty.java
 598     /**
 599      * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
 600      * Used in UAX #9: Unicode Bidirectional Algorithm
 601      * (http://www.unicode.org/reports/tr9/)
 602      * Returns UCharacter.BidiPairedBracketType values.
 603      * @stable ICU 52
 604      */
 605     public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
 606 
 607 }