1 /* 2 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * Copyright (C) 1996-2014, International Business Machines Corporation and 28 * others. All Rights Reserved. 29 ******************************************************************************* 30 */ 31 32 package jdk.internal.icu.impl; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 import java.util.Iterator; 37 import java.util.MissingResourceException; 38 39 import jdk.internal.icu.lang.UCharacter.HangulSyllableType; 40 import jdk.internal.icu.lang.UCharacter.NumericType; 41 import jdk.internal.icu.text.UTF16; 42 import jdk.internal.icu.text.UnicodeSet; 43 import jdk.internal.icu.util.VersionInfo; 44 45 /** 46 * <p>Internal class used for Unicode character property database.</p> 47 * <p>This classes store binary data read from uprops.icu. 48 * It does not have the capability to parse the data into more high-level 49 * information. It only returns bytes of information when required.</p> 50 * <p>Due to the form most commonly used for retrieval, array of char is used 51 * to store the binary data.</p> 52 * <p>UCharacterPropertyDB also contains information on accessing indexes to 53 * significant points in the binary data.</p> 54 * <p>Responsibility for molding the binary data into more meaning form lies on 55 * <a href=UCharacter.html>UCharacter</a>.</p> 56 * @author Syn Wee Quek 57 * @since release 2.1, february 1st 2002 58 */ 59 60 public final class UCharacterProperty 61 { 62 // public data members ----------------------------------------------- 63 64 /* 65 * public singleton instance 66 */ 67 public static final UCharacterProperty INSTANCE; 68 69 /** 70 * Trie data 71 */ 72 public Trie2_16 m_trie_; 73 74 /** 75 * Unicode version 76 */ 77 public VersionInfo m_unicodeVersion_; 78 79 /** 80 * Character type mask 81 */ 82 public static final int TYPE_MASK = 0x1F; 83 84 // uprops.h enum UPropertySource --------------------------------------- *** 85 86 /** From uchar.c/uprops.icu main trie */ 87 public static final int SRC_CHAR=1; 88 /** From uchar.c/uprops.icu properties vectors trie */ 89 public static final int SRC_PROPSVEC=2; 90 /** From ubidi_props.c/ubidi.icu */ 91 public static final int SRC_BIDI=5; 92 /** From normalizer2impl.cpp/nfc.nrm */ 93 public static final int SRC_NFC=8; 94 /** From normalizer2impl.cpp/nfkc.nrm */ 95 public static final int SRC_NFKC=9; 96 97 // public methods ---------------------------------------------------- 98 99 /** 100 * Gets the main property value for code point ch. 101 * @param ch code point whose property value is to be retrieved 102 * @return property value of code point 103 */ 104 public final int getProperty(int ch) 105 { 106 return m_trie_.get(ch); 107 } 108 109 /** 110 * Gets the unicode additional properties. 111 * Java version of C u_getUnicodeProperties(). 112 * @param codepoint codepoint whose additional properties is to be 113 * retrieved 114 * @param column The column index. 115 * @return unicode properties 116 */ 117 public int getAdditional(int codepoint, int column) { 118 assert column >= 0; 119 if (column >= m_additionalColumnsCount_) { 120 return 0; 121 } 122 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 123 } 124 125 /** 126 * <p>Get the "age" of the code point.</p> 127 * <p>The "age" is the Unicode version when the code point was first 128 * designated (as a non-character or for Private Use) or assigned a 129 * character.</p> 130 * <p>This can be useful to avoid emitting code points to receiving 131 * processes that do not accept newer characters.</p> 132 * <p>The data is from the UCD file DerivedAge.txt.</p> 133 * <p>This API does not check the validity of the codepoint.</p> 134 * @param codepoint The code point. 135 * @return the Unicode version number 136 */ 137 public VersionInfo getAge(int codepoint) 138 { 139 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 140 return VersionInfo.getInstance( 141 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 142 version & LAST_NIBBLE_MASK_, 0, 0); 143 } 144 145 // int-value and enumerated properties --------------------------------- *** 146 147 public int getType(int c) { 148 return getProperty(c)&TYPE_MASK; 149 } 150 151 /* 152 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 153 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 154 */ 155 private static final int /* UHangulSyllableType */ gcbToHst[]={ 156 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ 157 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ 158 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ 159 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ 160 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ 161 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ 162 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ 163 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ 164 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ 165 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ 166 /* 167 * Omit GCB values beyond what we need for hst. 168 * The code below checks for the array length. 169 */ 170 }; 171 172 private class IntProperty { 173 int column; // SRC_PROPSVEC column, or "source" if mask==0 174 int mask; 175 int shift; 176 177 IntProperty(int column, int mask, int shift) { 178 this.column=column; 179 this.mask=mask; 180 this.shift=shift; 181 } 182 183 IntProperty(int source) { 184 this.column=source; 185 this.mask=0; 186 } 187 188 int getValue(int c) { 189 // systematic, directly stored properties 190 return (getAdditional(c, column)&mask)>>>shift; 191 } 192 } 193 194 private class BiDiIntProperty extends IntProperty { 195 BiDiIntProperty() { 196 super(SRC_BIDI); 197 } 198 } 199 200 private class CombiningClassIntProperty extends IntProperty { 201 CombiningClassIntProperty(int source) { 202 super(source); 203 } 204 } 205 206 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties 207 int which; 208 int max; 209 210 NormQuickCheckIntProperty(int source, int which, int max) { 211 super(source); 212 this.which=which; 213 this.max=max; 214 } 215 } 216 217 private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE 218 int getValue(int c) { 219 return UBiDiProps.INSTANCE.getPairedBracketType(c); 220 } 221 }; 222 223 public int getIntPropertyValue(int c, int which) { 224 if (which == BIDI_PAIRED_BRACKET_TYPE) { 225 return intProp.getValue(c); 226 } 227 return 0; // undefined 228 } 229 230 /** 231 * Forms a supplementary code point from the argument character<br> 232 * Note this is for internal use hence no checks for the validity of the 233 * surrogate characters are done 234 * @param lead lead surrogate character 235 * @param trail trailing surrogate character 236 * @return code point of the supplementary character 237 */ 238 public static int getRawSupplementary(char lead, char trail) 239 { 240 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 241 } 242 243 /** 244 * Gets the type mask 245 * @param type character type 246 * @return mask 247 */ 248 public static final int getMask(int type) 249 { 250 return 1 << type; 251 } 252 253 /** 254 * Returns the digit values of characters like 'A' - 'Z', normal, 255 * half-width and full-width. This method assumes that the other digit 256 * characters are checked by the calling method. 257 * @param ch character to test 258 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 259 * its corresponding digit will be returned. 260 */ 261 public static int getEuropeanDigit(int ch) { 262 if ((ch > 0x7a && ch < 0xff21) 263 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 264 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 265 return -1; 266 } 267 if (ch <= 0x7a) { 268 // ch >= 0x41 or ch < 0x61 269 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 270 } 271 // ch >= 0xff21 272 if (ch <= 0xff3a) { 273 return ch + 10 - 0xff21; 274 } 275 // ch >= 0xff41 && ch <= 0xff5a 276 return ch + 10 - 0xff41; 277 } 278 279 public int digit(int c) { 280 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 281 if(value<=9) { 282 return value; 283 } else { 284 return -1; 285 } 286 } 287 288 // protected variables ----------------------------------------------- 289 290 /** 291 * Extra property trie 292 */ 293 Trie2_16 m_additionalTrie_; 294 /** 295 * Extra property vectors, 1st column for age and second for binary 296 * properties. 297 */ 298 int m_additionalVectors_[]; 299 /** 300 * Number of additional columns 301 */ 302 int m_additionalColumnsCount_; 303 /** 304 * Maximum values for block, bits used as in vector word 305 * 0 306 */ 307 int m_maxBlockScriptValue_; 308 /** 309 * Maximum values for script, bits used as in vector word 310 * 0 311 */ 312 int m_maxJTGValue_; 313 /** 314 * Script_Extensions data 315 */ 316 public char[] m_scriptExtensions_; 317 318 // private variables ------------------------------------------------- 319 320 /** 321 * Default name of the datafile 322 */ 323 @SuppressWarnings("deprecation") 324 private static final String DATA_FILE_NAME_ = 325 "/jdk/internal/icu/impl/data/icudt" + 326 VersionInfo.ICU_DATA_VERSION_PATH + 327 "/uprops.icu"; 328 329 /** 330 * Shift value for lead surrogate to form a supplementary character. 331 */ 332 private static final int LEAD_SURROGATE_SHIFT_ = 10; 333 /** 334 * Offset to add to combined surrogate pair to avoid masking. 335 */ 336 private static final int SURROGATE_OFFSET_ = 337 UTF16.SUPPLEMENTARY_MIN_VALUE - 338 (UTF16.SURROGATE_MIN_VALUE << 339 LEAD_SURROGATE_SHIFT_) - 340 UTF16.TRAIL_SURROGATE_MIN_VALUE; 341 342 343 // property data constants ------------------------------------------------- 344 345 /** 346 * Numeric types and values in the main properties words. 347 */ 348 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; 349 private static final int getNumericTypeValue(int props) { 350 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 351 } 352 353 /* constants for the storage form of numeric types and values */ 354 /** No numeric value. */ 355 private static final int NTV_NONE_ = 0; 356 /** Decimal digits: nv=0..9 */ 357 private static final int NTV_DECIMAL_START_ = 1; 358 /** Other digits: nv=0..9 */ 359 private static final int NTV_DIGIT_START_ = 11; 360 /** Small integers: nv=0..154 */ 361 private static final int NTV_NUMERIC_START_ = 21; 362 363 private static final int ntvGetType(int ntv) { 364 return 365 (ntv==NTV_NONE_) ? NumericType.NONE : 366 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 367 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : 368 NumericType.NUMERIC; 369 } 370 371 /* 372 * Properties in vector word 0 373 * Bits 374 * 31..24 DerivedAge version major/minor one nibble each 375 * 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 376 * 3: Script value from Script_Extensions 377 * 2: Script=Inherited 378 * 1: Script=Common 379 * 0: Script=bits 21..20 & 7..0 380 * 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions 381 * 19..17 East Asian Width 382 * 16.. 8 UBlockCode 383 * 7.. 0 UScriptCode, or index to Script_Extensions 384 */ 385 386 /** 387 * Script_Extensions: mask includes Script 388 */ 389 public static final int SCRIPT_X_MASK = 0x00f000ff; 390 //private static final int SCRIPT_X_SHIFT = 22; 391 392 // The UScriptCode or Script_Extensions index is split across two bit fields. 393 // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) 394 // Shift the high bits right by 12 to assemble the full value. 395 public static final int SCRIPT_HIGH_MASK = 0x00300000; 396 public static final int SCRIPT_HIGH_SHIFT = 12; 397 public static final int MAX_SCRIPT = 0x3ff; 398 399 /** 400 * Integer properties mask and shift values for East Asian cell width. 401 * Equivalent to icu4c UPROPS_EA_MASK 402 */ 403 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 404 /** 405 * Integer properties mask and shift values for East Asian cell width. 406 * Equivalent to icu4c UPROPS_EA_SHIFT 407 */ 408 private static final int EAST_ASIAN_SHIFT_ = 17; 409 /** 410 * Integer properties mask and shift values for blocks. 411 * Equivalent to icu4c UPROPS_BLOCK_MASK 412 */ 413 private static final int BLOCK_MASK_ = 0x0001ff00; 414 /** 415 * Integer properties mask and shift values for blocks. 416 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 417 */ 418 private static final int BLOCK_SHIFT_ = 8; 419 /** 420 * Integer properties mask and shift values for scripts. 421 * Equivalent to icu4c UPROPS_SHIFT_LOW_MASK. 422 */ 423 public static final int SCRIPT_LOW_MASK = 0x000000ff; 424 425 public static final int mergeScriptCodeOrIndex(int scriptX) { 426 return 427 ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | 428 (scriptX & SCRIPT_LOW_MASK); 429 } 430 431 /** 432 * Additional properties used in internal trie data 433 */ 434 /* 435 * Properties in vector word 1 436 * Each bit encodes one binary property. 437 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 438 * UPROPS_BINARY_1_TOP<=32! 439 * 440 * Keep this list of property enums in sync with 441 * propListNames[] in icu/source/tools/genprops/props2.c! 442 * 443 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 444 */ 445 private static final int WHITE_SPACE_PROPERTY_ = 0; 446 private static final int DASH_PROPERTY_ = 1; 447 private static final int HYPHEN_PROPERTY_ = 2; 448 private static final int QUOTATION_MARK_PROPERTY_ = 3; 449 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; 450 private static final int MATH_PROPERTY_ = 5; 451 private static final int HEX_DIGIT_PROPERTY_ = 6; 452 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; 453 private static final int ALPHABETIC_PROPERTY_ = 8; 454 private static final int IDEOGRAPHIC_PROPERTY_ = 9; 455 private static final int DIACRITIC_PROPERTY_ = 10; 456 private static final int EXTENDER_PROPERTY_ = 11; 457 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; 458 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; 459 private static final int GRAPHEME_LINK_PROPERTY_ = 14; 460 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; 461 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; 462 private static final int RADICAL_PROPERTY_ = 17; 463 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; 464 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; 465 private static final int DEPRECATED_PROPERTY_ = 20; 466 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; 467 private static final int XID_START_PROPERTY_ = 22; 468 private static final int XID_CONTINUE_PROPERTY_ = 23; 469 private static final int ID_START_PROPERTY_ = 24; 470 private static final int ID_CONTINUE_PROPERTY_ = 25; 471 private static final int GRAPHEME_BASE_PROPERTY_ = 26; 472 private static final int S_TERM_PROPERTY_ = 27; 473 private static final int VARIATION_SELECTOR_PROPERTY_ = 28; 474 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ 475 private static final int PATTERN_WHITE_SPACE = 30; 476 477 /* 478 * Properties in vector word 2 479 * Bits 480 * 31..26 reserved 481 * 25..20 Line Break 482 * 19..15 Sentence Break 483 * 14..10 Word Break 484 * 9.. 5 Grapheme Cluster Break 485 * 4.. 0 Decomposition Type 486 */ 487 private static final int LB_MASK = 0x03f00000; 488 private static final int LB_SHIFT = 20; 489 490 private static final int SB_MASK = 0x000f8000; 491 private static final int SB_SHIFT = 15; 492 493 private static final int WB_MASK = 0x00007c00; 494 private static final int WB_SHIFT = 10; 495 496 private static final int GCB_MASK = 0x000003e0; 497 private static final int GCB_SHIFT = 5; 498 499 /** 500 * Integer properties mask for decomposition type. 501 * Equivalent to icu4c UPROPS_DT_MASK. 502 */ 503 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 504 505 /** 506 * First nibble shift 507 */ 508 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 509 /** 510 * Second nibble mask 511 */ 512 private static final int LAST_NIBBLE_MASK_ = 0xF; 513 /** 514 * Age value shift 515 */ 516 private static final int AGE_SHIFT_ = 24; 517 518 // private constructors -------------------------------------------------- 519 520 /** 521 * Constructor 522 * @exception IOException thrown when data reading fails or data corrupted 523 */ 524 private UCharacterProperty() throws IOException 525 { 526 // jar access 527 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 528 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 529 // Read or skip the 16 indexes. 530 int propertyOffset = bytes.getInt(); 531 /* exceptionOffset = */ bytes.getInt(); 532 /* caseOffset = */ bytes.getInt(); 533 int additionalOffset = bytes.getInt(); 534 int additionalVectorsOffset = bytes.getInt(); 535 m_additionalColumnsCount_ = bytes.getInt(); 536 int scriptExtensionsOffset = bytes.getInt(); 537 int reservedOffset7 = bytes.getInt(); 538 /* reservedOffset8 = */ bytes.getInt(); 539 /* dataTopOffset = */ bytes.getInt(); 540 m_maxBlockScriptValue_ = bytes.getInt(); 541 m_maxJTGValue_ = bytes.getInt(); 542 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 543 544 // read the main properties trie 545 m_trie_ = Trie2_16.createFromSerialized(bytes); 546 int expectedTrieLength = (propertyOffset - 16) * 4; 547 int trieLength = m_trie_.getSerializedLength(); 548 if(trieLength > expectedTrieLength) { 549 throw new IOException("uprops.icu: not enough bytes for main trie"); 550 } 551 // skip padding after trie bytes 552 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 553 554 // skip unused intervening data structures 555 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 556 557 if(m_additionalColumnsCount_ > 0) { 558 // reads the additional property block 559 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 560 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 561 trieLength = m_additionalTrie_.getSerializedLength(); 562 if(trieLength > expectedTrieLength) { 563 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 564 } 565 // skip padding after trie bytes 566 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 567 568 // additional properties 569 int size = scriptExtensionsOffset - additionalVectorsOffset; 570 m_additionalVectors_ = new int[size]; 571 for (int i = 0; i < size; i ++) { 572 m_additionalVectors_[i] = bytes.getInt(); 573 } 574 } 575 576 // Script_Extensions 577 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 578 if(numChars > 0) { 579 m_scriptExtensions_ = new char[numChars]; 580 for(int i = 0; i < numChars; ++i) { 581 m_scriptExtensions_[i] = bytes.getChar(); 582 } 583 } 584 } 585 586 private static final class IsAcceptable implements ICUBinary.Authenticate { 587 // @Override when we switch to Java 6 588 public boolean isDataVersionAcceptable(byte version[]) { 589 return version[0] == 7; 590 } 591 } 592 593 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 594 595 public void upropsvec_addPropertyStarts(UnicodeSet set) { 596 /* add the start code point of each same-value range of the properties vectors trie */ 597 if(m_additionalColumnsCount_>0) { 598 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 599 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 600 Trie2.Range range; 601 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 602 set.add(range.startCodePoint); 603 } 604 } 605 } 606 607 // This static initializer block must be placed after 608 // other static member initialization 609 static { 610 try { 611 INSTANCE = new UCharacterProperty(); 612 } 613 catch (IOException e) { 614 throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); 615 } 616 } 617 618 619 // Moved from UProperty.java 620 /** 621 * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). 622 * Used in UAX #9: Unicode Bidirectional Algorithm 623 * (http://www.unicode.org/reports/tr9/) 624 * Returns UCharacter.BidiPairedBracketType values. 625 * @stable ICU 52 626 */ 627 public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; 628 629 }