/* * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package jdk.internal.icu.impl; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; import java.util.MissingResourceException; import jdk.internal.icu.lang.UCharacter.HangulSyllableType; import jdk.internal.icu.lang.UCharacter.NumericType; import jdk.internal.icu.text.UTF16; import jdk.internal.icu.text.UnicodeSet; import jdk.internal.icu.util.VersionInfo; /** *

Internal class used for Unicode character property database.

*

This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into higher-level
 * information. It only returns bytes of information when required.

*

Due to the form most commonly used for retrieval, array of char is used * to store the binary data.

*

UCharacterPropertyDB also contains information on accessing indexes to * significant points in the binary data.

*

Responsibility for molding the binary data into a more meaningful form lies
 * with UCharacter.

* @author Syn Wee Quek * @since release 2.1, february 1st 2002 */ public final class UCharacterProperty { // public data members ----------------------------------------------- /* * public singleton instance */ public static final UCharacterProperty INSTANCE; /** * Trie data */ public Trie2_16 m_trie_; /** * Unicode version */ public VersionInfo m_unicodeVersion_; /** * Character type mask */ public static final int TYPE_MASK = 0x1F; // uprops.h enum UPropertySource --------------------------------------- *** /** From uchar.c/uprops.icu main trie */ public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; /** From ubidi_props.c/ubidi.icu */ public static final int SRC_BIDI=5; /** From normalizer2impl.cpp/nfc.nrm */ public static final int SRC_NFC=8; /** From normalizer2impl.cpp/nfkc.nrm */ public static final int SRC_NFKC=9; // public methods ---------------------------------------------------- /** * Gets the main property value for code point ch. * @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { return m_trie_.get(ch); } /** * Gets the unicode additional properties. * Java version of C u_getUnicodeProperties(). * @param codepoint codepoint whose additional properties is to be * retrieved * @param column The column index. * @return unicode properties */ public int getAdditional(int codepoint, int column) { assert column >= 0; if (column >= m_additionalColumnsCount_) { return 0; } return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; } /** *

Get the "age" of the code point.

*

The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character.

*

This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.

*

The data is from the UCD file DerivedAge.txt.

*

This API does not check the validity of the codepoint.

* @param codepoint The code point. * @return the Unicode version number */ public VersionInfo getAge(int codepoint) { int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; return VersionInfo.getInstance( (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0); } // int-value and enumerated properties --------------------------------- *** public int getType(int c) { return getProperty(c)&TYPE_MASK; } /* * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. */ private static final int /* UHangulSyllableType */ gcbToHst[]={ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ /* * Omit GCB values beyond what we need for hst. * The code below checks for the array length. 
*/ }; private class IntProperty { int column; // SRC_PROPSVEC column, or "source" if mask==0 int mask; int shift; IntProperty(int column, int mask, int shift) { this.column=column; this.mask=mask; this.shift=shift; } IntProperty(int source) { this.column=source; this.mask=0; } int getValue(int c) { // systematic, directly stored properties return (getAdditional(c, column)&mask)>>>shift; } } private class BiDiIntProperty extends IntProperty { BiDiIntProperty() { super(SRC_BIDI); } } private class CombiningClassIntProperty extends IntProperty { CombiningClassIntProperty(int source) { super(source); } } private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties int which; int max; NormQuickCheckIntProperty(int source, int which, int max) { super(source); this.which=which; this.max=max; } } private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE int getValue(int c) { return UBiDiProps.INSTANCE.getPairedBracketType(c); } }; public int getIntPropertyValue(int c, int which) { if (which == BIDI_PAIRED_BRACKET_TYPE) { return intProp.getValue(c); } return 0; // undefined } /** * Forms a supplementary code point from the argument character
* Note this is for internal use hence no checks for the validity of the * surrogate characters are done * @param lead lead surrogate character * @param trail trailing surrogate character * @return code point of the supplementary character */ public static int getRawSupplementary(char lead, char trail) { return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; } /** * Gets the type mask * @param type character type * @return mask */ public static final int getMask(int type) { return 1 << type; } /** * Returns the digit values of characters like 'A' - 'Z', normal, * half-width and full-width. This method assumes that the other digit * characters are checked by the calling method. * @param ch character to test * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise * its corresponding digit will be returned. */ public static int getEuropeanDigit(int ch) { if ((ch > 0x7a && ch < 0xff21) || ch < 0x41 || (ch > 0x5a && ch < 0x61) || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { return -1; } if (ch <= 0x7a) { // ch >= 0x41 or ch < 0x61 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); } // ch >= 0xff21 if (ch <= 0xff3a) { return ch + 10 - 0xff21; } // ch >= 0xff41 && ch <= 0xff5a return ch + 10 - 0xff41; } public int digit(int c) { int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; if(value<=9) { return value; } else { return -1; } } // protected variables ----------------------------------------------- /** * Extra property trie */ Trie2_16 m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary * properties. 
*/
    int m_additionalVectors_[];

    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;

    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;

    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
    int m_maxJTGValue_;

    /**
     * Script_Extensions data
     */
    public char[] m_scriptExtensions_;

    // private variables -------------------------------------------------

    /**
     * Default name of the datafile
     */
    @SuppressWarnings("deprecation")
    private static final String DATA_FILE_NAME_ =
            "/jdk/internal/icu/impl/data/icudt" +
            VersionInfo.ICU_DATA_VERSION_PATH +
            "/uprops.icu";

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;

    // property data constants -------------------------------------------------

    /**
     * Numeric types and values in the main properties words.
     */
    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;

    /**
     * Extracts the numeric type/value field from a main properties word
     * by shifting it right by NUMERIC_TYPE_VALUE_SHIFT_.
     */
    private static final int getNumericTypeValue(int props) {
        return props >> NUMERIC_TYPE_VALUE_SHIFT_;
    }

    /* constants for the storage form of numeric types and values */

    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 */
    private static final int NTV_NUMERIC_START_ = 21;

    private static final int ntvGetType(int ntv) {
        // NOTE(review): the text after "(ntv" below appears to have been lost,
        // presumably stripped as an HTML-like "<...>" span. The rest of this
        // method, any declarations that followed it, and the start of the
        // constructor are missing from this copy. Restore them from the
        // upstream file; do not reconstruct by hand.
        return
            (ntv==NTV_NONE_) ? NumericType.NONE :
            (ntv expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties: one int is read per vector entry
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = new int[size];
            for (int i = 0; i < size; i ++) {
                m_additionalVectors_[i] = bytes.getInt();
            }
        }

        // Script_Extensions: two chars are read per offset unit; the array
        // stays unassigned when the computed count is not positive
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = new char[numChars];
            for(int i = 0; i < numChars; ++i) {
                m_scriptExtensions_[i] = bytes.getChar();
            }
        }
    }

    /**
     * Data version check: accepts data whose first version byte is 7.
     */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        // @Override when we switch to Java 6
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 7;
        }
    }

    private static final int DATA_FORMAT = 0x5550726F;     // "UPro"

    /**
     * Adds to {@code set} the start code point of each range returned by the
     * additional-properties trie iterator, stopping at the first range that
     * is flagged as a lead-surrogate range. Does nothing when there are no
     * additional columns.
     *
     * @param set the set that receives the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            // NOTE(review): raw Iterator. next() is assigned to a Trie2.Range
            // below, so a type argument was presumably lost here along with
            // the other stripped "<...>" text. Confirm against upstream.
            Iterator trieIterator = m_additionalTrie_.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                set.add(range.startCodePoint);
            }
        }
    }

    // This static initializer block must be placed
after // other static member initialization static { try { INSTANCE = new UCharacterProperty(); } catch (IOException e) { throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); } } // Moved from UProperty.java /** * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). * Used in UAX #9: Unicode Bidirectional Algorithm * (http://www.unicode.org/reports/tr9/) * Returns UCharacter.BidiPairedBracketType values. * @stable ICU 52 */ public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; }