1 /* 2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * Copyright (C) 1996-2014, International Business Machines Corporation and 28 * others. All Rights Reserved. 29 ******************************************************************************* 30 */ 31 32 package sun.text.normalizer; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 import java.util.Iterator; 37 import java.util.MissingResourceException; 38 39 import sun.text.normalizer.UCharacter.HangulSyllableType; 40 import sun.text.normalizer.UCharacter.NumericType; 41 42 /** 43 * <p>Internal class used for Unicode character property database.</p> 44 * <p>This classes store binary data read from uprops.icu. 45 * It does not have the capability to parse the data into more high-level 46 * information. It only returns bytes of information when required.</p> 47 * <p>Due to the form most commonly used for retrieval, array of char is used 48 * to store the binary data.</p> 49 * <p>UCharacterPropertyDB also contains information on accessing indexes to 50 * significant points in the binary data.</p> 51 * <p>Responsibility for molding the binary data into more meaning form lies on 52 * <a href=UCharacter.html>UCharacter</a>.</p> 53 * @author Syn Wee Quek 54 * @since release 2.1, february 1st 2002 55 */ 56 57 final class UCharacterProperty 58 { 59 // public data members ----------------------------------------------- 60 61 /* 62 * public singleton instance 63 */ 64 public static final UCharacterProperty INSTANCE; 65 66 /** 67 * Trie data 68 */ 69 public Trie2_16 m_trie_; 70 71 /** 72 * Unicode version 73 */ 74 public VersionInfo m_unicodeVersion_; 75 76 /** 77 * Character type mask 78 */ 79 public static final int TYPE_MASK = 0x1F; 80 81 // uprops.h enum UPropertySource --------------------------------------- *** 82 83 /** From uchar.c/uprops.icu main trie */ 84 public static final int SRC_CHAR=1; 85 /** From uchar.c/uprops.icu properties vectors trie */ 86 public static final int SRC_PROPSVEC=2; 87 /** From ubidi_props.c/ubidi.icu */ 88 public static final int SRC_BIDI=5; 89 /** From normalizer2impl.cpp/nfc.nrm */ 90 public static final int SRC_NFC=8; 91 /** From normalizer2impl.cpp/nfkc.nrm */ 92 public static final int SRC_NFKC=9; 93 94 // public methods ---------------------------------------------------- 95 96 /** 97 * Gets the main property value for code point ch. 98 * @param ch code point whose property value is to be retrieved 99 * @return property value of code point 100 */ 101 public final int getProperty(int ch) 102 { 103 return m_trie_.get(ch); 104 } 105 106 /** 107 * Gets the unicode additional properties. 108 * Java version of C u_getUnicodeProperties(). 109 * @param codepoint codepoint whose additional properties is to be 110 * retrieved 111 * @param column The column index. 112 * @return unicode properties 113 */ 114 public int getAdditional(int codepoint, int column) { 115 assert column >= 0; 116 if (column >= m_additionalColumnsCount_) { 117 return 0; 118 } 119 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 120 } 121 122 /** 123 * <p>Get the "age" of the code point.</p> 124 * <p>The "age" is the Unicode version when the code point was first 125 * designated (as a non-character or for Private Use) or assigned a 126 * character.</p> 127 * <p>This can be useful to avoid emitting code points to receiving 128 * processes that do not accept newer characters.</p> 129 * <p>The data is from the UCD file DerivedAge.txt.</p> 130 * <p>This API does not check the validity of the codepoint.</p> 131 * @param codepoint The code point. 132 * @return the Unicode version number 133 */ 134 public VersionInfo getAge(int codepoint) 135 { 136 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 137 return VersionInfo.getInstance( 138 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 139 version & LAST_NIBBLE_MASK_, 0, 0); 140 } 141 142 // int-value and enumerated properties --------------------------------- *** 143 144 public int getType(int c) { 145 return getProperty(c)&TYPE_MASK; 146 } 147 148 /* 149 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 150 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 151 */ 152 private static final int[] /* UHangulSyllableType */ gcbToHst={ 153 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ 154 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ 155 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ 156 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ 157 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ 158 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ 159 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ 160 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ 161 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ 162 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ 163 /* 164 * Omit GCB values beyond what we need for hst. 165 * The code below checks for the array length. 166 */ 167 }; 168 169 private class IntProperty { 170 int column; // SRC_PROPSVEC column, or "source" if mask==0 171 int mask; 172 int shift; 173 174 IntProperty(int column, int mask, int shift) { 175 this.column=column; 176 this.mask=mask; 177 this.shift=shift; 178 } 179 180 IntProperty(int source) { 181 this.column=source; 182 this.mask=0; 183 } 184 185 int getValue(int c) { 186 // systematic, directly stored properties 187 return (getAdditional(c, column)&mask)>>>shift; 188 } 189 } 190 191 private class BiDiIntProperty extends IntProperty { 192 BiDiIntProperty() { 193 super(SRC_BIDI); 194 } 195 } 196 197 private class CombiningClassIntProperty extends IntProperty { 198 CombiningClassIntProperty(int source) { 199 super(source); 200 } 201 } 202 203 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties 204 int which; 205 int max; 206 207 NormQuickCheckIntProperty(int source, int which, int max) { 208 super(source); 209 this.which=which; 210 this.max=max; 211 } 212 } 213 214 private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE 215 int getValue(int c) { 216 return UBiDiProps.INSTANCE.getPairedBracketType(c); 217 } 218 }; 219 220 public int getIntPropertyValue(int c, int which) { 221 if (which == BIDI_PAIRED_BRACKET_TYPE) { 222 return intProp.getValue(c); 223 } 224 return 0; // undefined 225 } 226 227 /** 228 * Forms a supplementary code point from the argument character<br> 229 * Note this is for internal use hence no checks for the validity of the 230 * surrogate characters are done 231 * @param lead lead surrogate character 232 * @param trail trailing surrogate character 233 * @return code point of the supplementary character 234 */ 235 public static int getRawSupplementary(char lead, char trail) 236 { 237 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 238 } 239 240 /** 241 * Gets the type mask 242 * @param type character type 243 * @return mask 244 */ 245 public static final int getMask(int type) 246 { 247 return 1 << type; 248 } 249 250 /** 251 * Returns the digit values of characters like 'A' - 'Z', normal, 252 * half-width and full-width. This method assumes that the other digit 253 * characters are checked by the calling method. 254 * @param ch character to test 255 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 256 * its corresponding digit will be returned. 257 */ 258 public static int getEuropeanDigit(int ch) { 259 if ((ch > 0x7a && ch < 0xff21) 260 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 261 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 262 return -1; 263 } 264 if (ch <= 0x7a) { 265 // ch >= 0x41 or ch < 0x61 266 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 267 } 268 // ch >= 0xff21 269 if (ch <= 0xff3a) { 270 return ch + 10 - 0xff21; 271 } 272 // ch >= 0xff41 && ch <= 0xff5a 273 return ch + 10 - 0xff41; 274 } 275 276 public int digit(int c) { 277 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 278 if(value<=9) { 279 return value; 280 } else { 281 return -1; 282 } 283 } 284 285 // protected variables ----------------------------------------------- 286 287 /** 288 * Extra property trie 289 */ 290 Trie2_16 m_additionalTrie_; 291 /** 292 * Extra property vectors, 1st column for age and second for binary 293 * properties. 294 */ 295 int[] m_additionalVectors_; 296 /** 297 * Number of additional columns 298 */ 299 int m_additionalColumnsCount_; 300 /** 301 * Maximum values for block, bits used as in vector word 302 * 0 303 */ 304 int m_maxBlockScriptValue_; 305 /** 306 * Maximum values for script, bits used as in vector word 307 * 0 308 */ 309 int m_maxJTGValue_; 310 /** 311 * Script_Extensions data 312 */ 313 public char[] m_scriptExtensions_; 314 315 // private variables ------------------------------------------------- 316 317 /** 318 * Default name of the datafile 319 */ 320 private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; 321 322 /** 323 * Shift value for lead surrogate to form a supplementary character. 324 */ 325 private static final int LEAD_SURROGATE_SHIFT_ = 10; 326 /** 327 * Offset to add to combined surrogate pair to avoid masking. 328 */ 329 private static final int SURROGATE_OFFSET_ = 330 UTF16.SUPPLEMENTARY_MIN_VALUE - 331 (UTF16.SURROGATE_MIN_VALUE << 332 LEAD_SURROGATE_SHIFT_) - 333 UTF16.TRAIL_SURROGATE_MIN_VALUE; 334 335 336 // property data constants ------------------------------------------------- 337 338 /** 339 * Numeric types and values in the main properties words. 340 */ 341 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; 342 private static final int getNumericTypeValue(int props) { 343 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 344 } 345 346 /* constants for the storage form of numeric types and values */ 347 /** No numeric value. */ 348 private static final int NTV_NONE_ = 0; 349 /** Decimal digits: nv=0..9 */ 350 private static final int NTV_DECIMAL_START_ = 1; 351 /** Other digits: nv=0..9 */ 352 private static final int NTV_DIGIT_START_ = 11; 353 /** Small integers: nv=0..154 */ 354 private static final int NTV_NUMERIC_START_ = 21; 355 356 private static final int ntvGetType(int ntv) { 357 return 358 (ntv==NTV_NONE_) ? NumericType.NONE : 359 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 360 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : 361 NumericType.NUMERIC; 362 } 363 364 /* 365 * Properties in vector word 0 366 * Bits 367 * 31..24 DerivedAge version major/minor one nibble each 368 * 23..22 3..1: Bits 7..0 = Script_Extensions index 369 * 3: Script value from Script_Extensions 370 * 2: Script=Inherited 371 * 1: Script=Common 372 * 0: Script=bits 7..0 373 * 21..20 reserved 374 * 19..17 East Asian Width 375 * 16.. 8 UBlockCode 376 * 7.. 0 UScriptCode 377 */ 378 /** 379 * Script_Extensions: mask includes Script 380 */ 381 public static final int SCRIPT_X_MASK = 0x00c000ff; 382 //private static final int SCRIPT_X_SHIFT = 22; 383 /** 384 * Integer properties mask and shift values for East Asian cell width. 385 * Equivalent to icu4c UPROPS_EA_MASK 386 */ 387 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 388 /** 389 * Integer properties mask and shift values for East Asian cell width. 390 * Equivalent to icu4c UPROPS_EA_SHIFT 391 */ 392 private static final int EAST_ASIAN_SHIFT_ = 17; 393 /** 394 * Integer properties mask and shift values for blocks. 395 * Equivalent to icu4c UPROPS_BLOCK_MASK 396 */ 397 private static final int BLOCK_MASK_ = 0x0001ff00; 398 /** 399 * Integer properties mask and shift values for blocks. 400 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 401 */ 402 private static final int BLOCK_SHIFT_ = 8; 403 /** 404 * Integer properties mask and shift values for scripts. 405 * Equivalent to icu4c UPROPS_SHIFT_MASK 406 */ 407 public static final int SCRIPT_MASK_ = 0x000000ff; 408 409 /** 410 * Additional properties used in internal trie data 411 */ 412 /* 413 * Properties in vector word 1 414 * Each bit encodes one binary property. 415 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 416 * UPROPS_BINARY_1_TOP<=32! 417 * 418 * Keep this list of property enums in sync with 419 * propListNames[] in icu/source/tools/genprops/props2.c! 420 * 421 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 422 */ 423 private static final int WHITE_SPACE_PROPERTY_ = 0; 424 private static final int DASH_PROPERTY_ = 1; 425 private static final int HYPHEN_PROPERTY_ = 2; 426 private static final int QUOTATION_MARK_PROPERTY_ = 3; 427 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; 428 private static final int MATH_PROPERTY_ = 5; 429 private static final int HEX_DIGIT_PROPERTY_ = 6; 430 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; 431 private static final int ALPHABETIC_PROPERTY_ = 8; 432 private static final int IDEOGRAPHIC_PROPERTY_ = 9; 433 private static final int DIACRITIC_PROPERTY_ = 10; 434 private static final int EXTENDER_PROPERTY_ = 11; 435 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; 436 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; 437 private static final int GRAPHEME_LINK_PROPERTY_ = 14; 438 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; 439 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; 440 private static final int RADICAL_PROPERTY_ = 17; 441 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; 442 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; 443 private static final int DEPRECATED_PROPERTY_ = 20; 444 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; 445 private static final int XID_START_PROPERTY_ = 22; 446 private static final int XID_CONTINUE_PROPERTY_ = 23; 447 private static final int ID_START_PROPERTY_ = 24; 448 private static final int ID_CONTINUE_PROPERTY_ = 25; 449 private static final int GRAPHEME_BASE_PROPERTY_ = 26; 450 private static final int S_TERM_PROPERTY_ = 27; 451 private static final int VARIATION_SELECTOR_PROPERTY_ = 28; 452 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ 453 private static final int PATTERN_WHITE_SPACE = 30; 454 455 /* 456 * Properties in vector word 2 457 * Bits 458 * 31..26 reserved 459 * 25..20 Line Break 460 * 19..15 Sentence Break 461 * 14..10 Word Break 462 * 9.. 5 Grapheme Cluster Break 463 * 4.. 0 Decomposition Type 464 */ 465 private static final int LB_MASK = 0x03f00000; 466 private static final int LB_SHIFT = 20; 467 468 private static final int SB_MASK = 0x000f8000; 469 private static final int SB_SHIFT = 15; 470 471 private static final int WB_MASK = 0x00007c00; 472 private static final int WB_SHIFT = 10; 473 474 private static final int GCB_MASK = 0x000003e0; 475 private static final int GCB_SHIFT = 5; 476 477 /** 478 * Integer properties mask for decomposition type. 479 * Equivalent to icu4c UPROPS_DT_MASK. 480 */ 481 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 482 483 /** 484 * First nibble shift 485 */ 486 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 487 /** 488 * Second nibble mask 489 */ 490 private static final int LAST_NIBBLE_MASK_ = 0xF; 491 /** 492 * Age value shift 493 */ 494 private static final int AGE_SHIFT_ = 24; 495 496 // private constructors -------------------------------------------------- 497 498 /** 499 * Constructor 500 * @exception IOException thrown when data reading fails or data corrupted 501 */ 502 private UCharacterProperty() throws IOException 503 { 504 // jar access 505 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 506 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 507 // Read or skip the 16 indexes. 508 int propertyOffset = bytes.getInt(); 509 /* exceptionOffset = */ bytes.getInt(); 510 /* caseOffset = */ bytes.getInt(); 511 int additionalOffset = bytes.getInt(); 512 int additionalVectorsOffset = bytes.getInt(); 513 m_additionalColumnsCount_ = bytes.getInt(); 514 int scriptExtensionsOffset = bytes.getInt(); 515 int reservedOffset7 = bytes.getInt(); 516 /* reservedOffset8 = */ bytes.getInt(); 517 /* dataTopOffset = */ bytes.getInt(); 518 m_maxBlockScriptValue_ = bytes.getInt(); 519 m_maxJTGValue_ = bytes.getInt(); 520 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 521 522 // read the main properties trie 523 m_trie_ = Trie2_16.createFromSerialized(bytes); 524 int expectedTrieLength = (propertyOffset - 16) * 4; 525 int trieLength = m_trie_.getSerializedLength(); 526 if(trieLength > expectedTrieLength) { 527 throw new IOException("uprops.icu: not enough bytes for main trie"); 528 } 529 // skip padding after trie bytes 530 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 531 532 // skip unused intervening data structures 533 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 534 535 if(m_additionalColumnsCount_ > 0) { 536 // reads the additional property block 537 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 538 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 539 trieLength = m_additionalTrie_.getSerializedLength(); 540 if(trieLength > expectedTrieLength) { 541 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 542 } 543 // skip padding after trie bytes 544 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 545 546 // additional properties 547 int size = scriptExtensionsOffset - additionalVectorsOffset; 548 m_additionalVectors_ = new int[size]; 549 for (int i = 0; i < size; i ++) { 550 m_additionalVectors_[i] = bytes.getInt(); 551 } 552 } 553 554 // Script_Extensions 555 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 556 if(numChars > 0) { 557 m_scriptExtensions_ = new char[numChars]; 558 for(int i = 0; i < numChars; ++i) { 559 m_scriptExtensions_[i] = bytes.getChar(); 560 } 561 } 562 } 563 564 private static final class IsAcceptable implements ICUBinary.Authenticate { 565 // @Override when we switch to Java 6 566 public boolean isDataVersionAcceptable(byte[] version) { 567 return version[0] == 7; 568 } 569 } 570 571 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 572 573 public void upropsvec_addPropertyStarts(UnicodeSet set) { 574 /* add the start code point of each same-value range of the properties vectors trie */ 575 if(m_additionalColumnsCount_>0) { 576 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 577 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 578 Trie2.Range range; 579 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 580 set.add(range.startCodePoint); 581 } 582 } 583 } 584 585 // This static initializer block must be placed after 586 // other static member initialization 587 static { 588 try { 589 INSTANCE = new UCharacterProperty(); 590 } 591 catch (IOException e) { 592 throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); 593 } 594 } 595 596 597 // Moved from UProperty.java 598 /** 599 * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). 600 * Used in UAX #9: Unicode Bidirectional Algorithm 601 * (http://www.unicode.org/reports/tr9/) 602 * Returns UCharacter.BidiPairedBracketType values. 603 * @stable ICU 52 604 */ 605 public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; 606 607 }