/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 1996-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package jdk.internal.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;

import jdk.internal.icu.lang.UCharacter.HangulSyllableType;
import jdk.internal.icu.lang.UCharacter.NumericType;
import jdk.internal.icu.text.UTF16;
import jdk.internal.icu.text.UnicodeSet;
import jdk.internal.icu.util.VersionInfo;

/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaningful form lies on
 * <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /*
     * public singleton instance
     */
    public static final UCharacterProperty INSTANCE;

    /**
     * Trie data
     */
    public Trie2_16 m_trie_;

    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;

    /**
     * Character type mask
     */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=5;
    /** From normalizer2impl.cpp/nfc.nrm */
    public static final int SRC_NFC=8;
    /** From normalizer2impl.cpp/nfkc.nrm */
    public static final int SRC_NFKC=9;

    // public methods ----------------------------------------------------

    /**
     * Gets the main property value for code point ch.
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point
     */
    public final int getProperty(int ch)
    {
        return m_trie_.get(ch);
    }

    /**
     * Gets the unicode additional properties.
     * Java version of C u_getUnicodeProperties().
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column The column index; must be non-negative (checked by
     *               an assertion only).
     * @return unicode properties, or 0 if {@code column} is not less than
     *         the number of additional columns read from the data file
     */
    public int getAdditional(int codepoint, int column) {
        assert column >= 0;
        if (column >= m_additionalColumnsCount_) {
            return 0;
        }
        // The trie value is the index of the code point's vector (row start)
        // inside m_additionalVectors_; the column selects the word in that row.
        return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    }

    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        // Bits 31..24 of vector word 0 hold the age: major version in the
        // upper nibble, minor version in the lower nibble.
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Gets the character type of a code point: the main property value
     * masked with {@link #TYPE_MASK}.
     * @param c code point
     * @return the low five bits of the main property value
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * Any user of this table must check the index against the array
         * length; no code in this class currently reads the table.
         */
    };

    /**
     * Integer-valued property that is read from a properties vector column
     * of the enclosing instance (hence a non-static inner class).
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;

        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }

        /** Creates a property identified by its source only; mask and shift stay 0. */
        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }

        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
    }

    /** Integer property whose source is {@link #SRC_BIDI}. */
    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
    }

    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
    }

    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;
        int max;

        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
    }

    private IntProperty intProp =  new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
        @Override
        int getValue(int c) {
            return UBiDiProps.INSTANCE.getPairedBracketType(c);
        }
    };

    /**
     * Gets the value of an integer property for a code point.
     * Only {@link #BIDI_PAIRED_BRACKET_TYPE} is supported by this class.
     * @param c code point
     * @param which property selector
     * @return the paired bracket type of {@code c} when {@code which} is
     *         {@code BIDI_PAIRED_BRACKET_TYPE}, otherwise 0
     */
    public int getIntPropertyValue(int c, int which) {
        if (which == BIDI_PAIRED_BRACKET_TYPE) {
            return intProp.getValue(c);
        }
        return 0; // undefined
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Gets the type mask
     * @param type character type
     * @return mask, that is {@code 1 << type}
     */
    public static final int getMask(int type)
    {
        return 1 << type;
    }

    /**
     * Returns the digit values of characters like 'A' - 'Z', normal,
     * half-width and full-width. This method assumes that the other digit
     * characters are checked by the calling method.
     * @param ch character to test
     * @return -1 if ch is not a character of the form 'A' - 'Z' (either
     *         case, ASCII or full-width), otherwise its corresponding
     *         digit value in the range 10..35 will be returned.
     */
    public static int getEuropeanDigit(int ch) {
        // Reject everything outside the four accepted ranges:
        // U+0041..U+005A, U+0061..U+007A, U+FF21..U+FF3A, U+FF41..U+FF5A.
        if ((ch > 0x7a && ch < 0xff21)
            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
            return -1;
        }
        if (ch <= 0x7a) {
            // ch is in 0x41..0x5a or in 0x61..0x7a
            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
        }
        // ch >= 0xff21
        if (ch <= 0xff3a) {
            return ch + 10 - 0xff21;
        }
        // ch >= 0xff41 && ch <= 0xff5a
        return ch + 10 - 0xff41;
    }

    /**
     * Gets the decimal digit value of a code point.
     * @param c code point
     * @return the stored numeric type-value minus the decimal start value
     *         when that difference is at most 9, otherwise -1. A code point
     *         without numeric value (stored value 0) also yields -1.
     */
    public int digit(int c) {
        int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
        if(value<=9) {
            return value;
        } else {
            return -1;
        }
    }

    // protected variables -----------------------------------------------

    /**
     * Extra property trie
     */
    Trie2_16 m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     * NOTE(review): the field name suggests other properties than
     * "script"; confirm against the uprops.icu format description.
     */
    int m_maxJTGValue_;
    /**
     * Script_Extensions data; stays null when the data file contains no
     * Script_Extensions section.
     */
    public char[] m_scriptExtensions_;

    // private variables -------------------------------------------------

    /**
     * Default name of the datafile
     */
    @SuppressWarnings("deprecation")
    private static final String DATA_FILE_NAME_ =
            "/jdk/internal/icu/impl/data/icudt" +
            VersionInfo.ICU_DATA_VERSION_PATH +
            "/uprops.icu";

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;


    // property data constants -------------------------------------------------

    /**
     * Numeric types and values in the main properties words.
     */
    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
    private static final int getNumericTypeValue(int props) {
        return props >> NUMERIC_TYPE_VALUE_SHIFT_;
    }

    /* constants for the storage form of numeric types and values */
    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 */
    private static final int NTV_NUMERIC_START_ = 21;

    private static final int ntvGetType(int ntv) {
        return
            (ntv==NTV_NONE_) ? NumericType.NONE :
            (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
            (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
            NumericType.NUMERIC;
    }

    /*
     * Properties in vector word 0
     * Bits
     * 31..24   DerivedAge version major/minor one nibble each
     * 23..22   3..1: Bits 7..0 = Script_Extensions index
     *             3: Script value from Script_Extensions
     *             2: Script=Inherited
     *             1: Script=Common
     *             0: Script=bits 7..0
     * 21..20   reserved
     * 19..17   East Asian Width
     * 16.. 8   UBlockCode
     *  7.. 0   UScriptCode
     */
    /**
     * Script_Extensions: mask includes Script
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask and shift values for scripts.
     * Equivalent to icu4c UPROPS_SHIFT_MASK
     * NOTE(review): presumably UPROPS_SCRIPT_MASK is meant; confirm
     * against icu4c uprops.h.
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_    = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;

    /*
     * Properties in vector word 2
     * Bits
     * 31..26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type.
     * Equivalent to icu4c UPROPS_DT_MASK.
     */
    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Mask that keeps the lowest nibble (four bits) of a value
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
     * Constructor. Reads the header, the index array, the main properties
     * trie, the additional properties trie with its vectors, and the
     * Script_Extensions data from the uprops.icu data file.
     * Offsets in the index array are counted in 32-bit units, which is why
     * they are multiplied by 4 to obtain byte counts.
     * @exception IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // jar access
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
        // Read or skip the 16 indexes.
        int propertyOffset = bytes.getInt();
        /* exceptionOffset = */ bytes.getInt();
        /* caseOffset = */ bytes.getInt();
        int additionalOffset = bytes.getInt();
        int additionalVectorsOffset = bytes.getInt();
        m_additionalColumnsCount_ = bytes.getInt();
        int scriptExtensionsOffset = bytes.getInt();
        int reservedOffset7 = bytes.getInt();
        /* reservedOffset8 = */ bytes.getInt();
        /* dataTopOffset = */ bytes.getInt();
        m_maxBlockScriptValue_ = bytes.getInt();
        m_maxJTGValue_ = bytes.getInt();
        // 12 of the 16 indexes have been consumed; skip the remaining ones.
        ICUBinary.skipBytes(bytes, (16 - 12) << 2);

        // read the main properties trie
        m_trie_ = Trie2_16.createFromSerialized(bytes);
        int expectedTrieLength = (propertyOffset - 16) * 4;
        int trieLength = m_trie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = new int[size];
            for (int i = 0; i < size; i ++) {
                m_additionalVectors_[i] = bytes.getInt();
            }
        }

        // Script_Extensions
        // Two 16-bit chars per 32-bit unit between the two offsets.
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = new char[numChars];
            for(int i = 0; i < numChars; ++i) {
                m_scriptExtensions_[i] = bytes.getChar();
            }
        }
    }

    /**
     * Data version check handed to the ICUBinary header reader: only data
     * whose first version byte is 7 is accepted.
     */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 7;
        }
    }

    private static final int DATA_FORMAT = 0x5550726F;  // "UPro"

    /**
     * Adds to the given set the start code point of each same-value range
     * of the properties vectors trie. Iteration stops at the first range
     * that is flagged as a lead-surrogate range, or when the trie has no
     * more ranges. Does nothing when there are no additional columns.
     * @param set set that receives the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                set.add(range.startCodePoint);
            }
        }
    }

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UCharacterProperty();
        }
        catch (IOException e) {
            // MissingResourceException has no public constructor taking a
            // cause, so chain the I/O failure explicitly to keep its stack
            // trace available for diagnosis.
            MissingResourceException mre =
                new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
            mre.initCause(e);
            throw mre;
        }
    }


    // Moved from UProperty.java
    /**
     * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
     * Used in UAX #9: Unicode Bidirectional Algorithm
     * (http://www.unicode.org/reports/tr9/)
     * Returns UCharacter.BidiPairedBracketType values.
     * @stable ICU 52
     */
    public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;

}