--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,23 +24,21 @@ */ /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.InputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; import java.util.MissingResourceException; +import sun.text.normalizer.UCharacter.HangulSyllableType; +import sun.text.normalizer.UCharacter.NumericType; + /** *
Internal class used for Unicode character property database.
*This classes store binary data read from uprops.icu. @@ -56,134 +54,72 @@ * @since release 2.1, february 1st 2002 */ -public final class UCharacterProperty +final class UCharacterProperty { // public data members ----------------------------------------------- + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + /** * Trie data */ - public CharTrie m_trie_; - /** - * Optimization - * CharTrie index array - */ - public char[] m_trieIndex_; - /** - * Optimization - * CharTrie data array - */ - public char[] m_trieData_; - /** - * Optimization - * CharTrie data offset - */ - public int m_trieInitialValue_; + public Trie2_16 m_trie_; + /** * Unicode version */ public VersionInfo m_unicodeVersion_; + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + // uprops.h enum UPropertySource --------------------------------------- *** + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; - /** One more than the highest UPropertySource (SRC_) constant. */ - public static final int SRC_COUNT=9; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; // public methods ---------------------------------------------------- /** - * Java friends implementation - */ - public void setIndexData(CharTrie.FriendAgent friendagent) - { - m_trieIndex_ = friendagent.getPrivateIndex(); - m_trieData_ = friendagent.getPrivateData(); - m_trieInitialValue_ = friendagent.getPrivateInitialValue(); - } - - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. + * Gets the main property value for code point ch. 
* @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - // optimized - try { // using try for ch < 0 is faster than using an if statement - return m_trieData_[ - (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } catch (ArrayIndexOutOfBoundsException e) { - return m_trieInitialValue_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return m_trieData_[ - (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ - + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. - return m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & Trie.SURROGATE_MASK_)); - } - // ch is out of bounds - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. - return m_trieInitialValue_; - - // this all is an inlined form of return m_trie_.getCodePointValue(ch); - } - - /** - * Getting the unsigned numeric value of a character embedded in the property - * argument - * @param prop the character - * @return unsigned numberic value - */ - public static int getUnsignedValue(int prop) - { - return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; + return m_trie_.get(ch); } /** * Gets the unicode additional properties. - * C version getUnicodeProperties. + * Java version of C u_getUnicodeProperties(). 
* @param codepoint codepoint whose additional properties is to be * retrieved - * @param column + * @param column The column index. * @return unicode properties */ - public int getAdditional(int codepoint, int column) { - if (column == -1) { - return getProperty(codepoint); - } - if (column < 0 || column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[ - m_additionalTrie_.getCodePointValue(codepoint) + column]; - } + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } - /** + /** *
Get the "age" of the code point.
*The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
@@ -203,6 +139,91 @@
version & LAST_NIBBLE_MASK_, 0, 0);
}
+ // int-value and enumerated properties --------------------------------- ***
+
+ public int getType(int c) { // character type code of code point c
+ return getProperty(c)&TYPE_MASK; // keep only the TYPE_MASK (0x1F) bits of the main property value
+ }
+
+ /*
+ * Map some of the Grapheme Cluster Break (GCB) values to Hangul Syllable Types: one entry per GCB value, in the order named by the trailing U_GCB_* comments.
+ * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
+ */
+ private static final int /* UHangulSyllableType */ gcbToHst[]={
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
+ HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
+ HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
+ HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
+ HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
+ HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
+ /*
+ * GCB values beyond the last one needed for hst are omitted, so the table is shorter than the GCB value range.
+ * Lookup code is expected to check the array length first (NOTE(review): no lookup of this table is visible in this hunk).
+ */
+ };
+
+ private class IntProperty { // inner (non-static) class: getValue() calls the enclosing instance's getAdditional()
+ int column; // SRC_PROPSVEC column, or "source" if mask==0
+ int mask; // bits kept from the value returned by getAdditional(); 0 marks a source-only property
+ int shift; // unsigned right shift applied after masking
+
+ IntProperty(int column, int mask, int shift) {
+ this.column=column;
+ this.mask=mask;
+ this.shift=shift;
+ }
+
+ IntProperty(int source) { // source-only form: column holds the source id (e.g. SRC_BIDI)
+ this.column=source;
+ this.mask=0; // shift is left at its default of 0
+ }
+
+ int getValue(int c) {
+ // systematic, directly stored properties
+ return (getAdditional(c, column)&mask)>>>shift; // evaluates to 0 whenever mask==0
+ }
+ }
+
+ private class BiDiIntProperty extends IntProperty { // source-only property tagged with SRC_BIDI
+ BiDiIntProperty() {
+ super(SRC_BIDI); // uses the single-argument constructor, so mask is 0
+ }
+ }
+
+ private class CombiningClassIntProperty extends IntProperty { // source-only property; adds no behavior of its own in this hunk
+ CombiningClassIntProperty(int source) {
+ super(source); // uses the single-argument constructor, so mask is 0
+ }
+ }
+
+ private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
+ int which; // stored by the constructor; not read anywhere in this hunk
+ int max; // stored by the constructor; not read anywhere in this hunk
+
+ NormQuickCheckIntProperty(int source, int which, int max) {
+ super(source); // source-only form, so mask is 0
+ this.which=which;
+ this.max=max;
+ }
+ }
+
+ private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
+ int getValue(int c) { // replaces the inherited column/mask/shift lookup with a UBiDiProps query
+ return UBiDiProps.INSTANCE.getPairedBracketType(c);
+ }
+ };
+
+ public int getIntPropertyValue(int c, int which) { // value of property 'which' for code point c; only BIDI_PAIRED_BRACKET_TYPE is handled
+ if (which == BIDI_PAIRED_BRACKET_TYPE) { // NOTE(review): this constant is declared outside the visible hunks
+ return intProp.getValue(c);
+ }
+ return 0; // undefined
+ }
+
/**
* Forms a supplementary code point from the argument character
* Note this is for internal use hence no checks for the validity of the
@@ -217,42 +238,48 @@
}
/**
- * Loads the property data and initialize the UCharacterProperty instance.
- * @throws MissingResourceException when data is missing or data has been corrupted
- */
- public static UCharacterProperty getInstance()
+ * Gets the single-bit mask for a character type
+ * @param type character type, used as the bit index
+ * @return mask with only bit number type set (1 << type)
+ */
+ public static final int getMask(int type)
{
- if(INSTANCE_ == null) {
- try {
- INSTANCE_ = new UCharacterProperty();
- }
- catch (Exception e) {
- throw new MissingResourceException(e.getMessage(),"","");
- }
- }
- return INSTANCE_;
+ return 1 << type;
}
/**
- * Checks if the argument c is to be treated as a white space in ICU
- * rules. Usually ICU rule white spaces are ignored unless quoted.
- * Equivalent to test for Pattern_White_Space Unicode property.
- * Stable set of characters, won't change.
- * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- * @param c codepoint to check
- * @return true if c is a ICU white space
- */
- public static boolean isRuleWhiteSpace(int c)
- {
- /* "white space" in the sense of ICU rule parsers
- This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
- See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
- Equivalent to test for Pattern_White_Space Unicode property.
- */
- return (c >= 0x0009 && c <= 0x2029 &&
- (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
- c == 0x200E || c == 0x200F || c >= 0x2028));
+ * Returns the digit value (10..35) of a Latin letter 'A' - 'Z' or 'a' - 'z',
+ * in its ASCII or full-width (U+FF21..U+FF3A, U+FF41..U+FF5A) form. This method
+ * assumes that the other digit characters are checked by the calling method.
+ * @param ch character to test
+ * @return -1 if ch is not one of those letters, otherwise its corresponding
+ * digit value: 10 for 'A'/'a' up to 35 for 'Z'/'z'.
+ */
+ public static int getEuropeanDigit(int ch) {
+ if ((ch > 0x7a && ch < 0xff21) // reject everything outside the four letter ranges
+ || ch < 0x41 || (ch > 0x5a && ch < 0x61)
+ || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
+ return -1;
+ }
+ if (ch <= 0x7a) {
+ // ch is in 0x41..0x5a ('A' - 'Z') or 0x61..0x7a ('a' - 'z')
+ return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
+ }
+ // ch >= 0xff21
+ if (ch <= 0xff3a) {
+ return ch + 10 - 0xff21; // full-width upper-case range
+ }
+ // ch >= 0xff41 && ch <= 0xff5a
+ return ch + 10 - 0xff41; // full-width lower-case range
+ }
+
+ public int digit(int c) { // decimal digit value 0..9 of c, or -1 when c is not stored as a decimal digit
+ int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; // 0..9 when the stored value lies in the decimal-digit range
+ if(value<=9) { // no lower-bound check: NTV_NONE_ (0) gives -1, which is also the "not a digit" result. NOTE(review): assumes getProperty(c) is never negative - confirm
+ return value;
+ } else {
+ return -1;
+ }
}
// protected variables -----------------------------------------------
@@ -260,7 +287,7 @@
/**
* Extra property trie
*/
- CharTrie m_additionalTrie_;
+ Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
@@ -280,40 +307,24 @@
* 0
*/
int m_maxJTGValue_;
+ /**
+ * Script_Extensions data
+ */
+ public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
- /**
- * UnicodeData.txt property object
- */
- private static UCharacterProperty INSTANCE_ = null;
-
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
- * Default buffer size of datafile
- */
- private static final int DATA_BUFFER_SIZE_ = 25000;
-
- /**
- * Numeric value shift
- */
- private static final int VALUE_SHIFT_ = 8;
-
- /**
- * Mask to be applied after shifting to obtain an unsigned numeric value
- */
- private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
-
- /**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
- * Offset to add to combined surrogate pair to avoid msking.
+ * Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
@@ -321,7 +332,153 @@
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
- // additional properties ----------------------------------------------
+
+ // property data constants -------------------------------------------------
+
+ /**
+ * Numeric types and values in the main properties words.
+ */
+ private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; // number of low bits of the props word dropped to reach the numeric type/value
+ private static final int getNumericTypeValue(int props) { // extracts the stored numeric type/value from a main properties word
+ return props >> NUMERIC_TYPE_VALUE_SHIFT_; // signed shift: a negative props would give a negative result
+ }
+
+ /* constants for the storage form of numeric types and values, as returned by getNumericTypeValue() */
+ /** No numeric value. */
+ private static final int NTV_NONE_ = 0;
+ /** Decimal digits: nv=0..9; digit() recovers nv as (stored value - NTV_DECIMAL_START_) */
+ private static final int NTV_DECIMAL_START_ = 1;
+ /** Other digits: nv=0..9 (NOTE(review): presumably stored as NTV_DIGIT_START_+nv, by analogy with decimal digits - confirm) */
+ private static final int NTV_DIGIT_START_ = 11;
+ /** Small integers: nv=0..154 (NOTE(review): presumably stored as NTV_NUMERIC_START_+nv - confirm) */
+ private static final int NTV_NUMERIC_START_ = 21;
+
+ private static final int ntvGetType(int ntv) {
+ return
+ (ntv==NTV_NONE_) ? NumericType.NONE :
+ (ntv