--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,23 +24,21 @@ */ /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.InputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; import java.util.MissingResourceException; +import sun.text.normalizer.UCharacter.HangulSyllableType; +import sun.text.normalizer.UCharacter.NumericType; + /** *

Internal class used for Unicode character property database.

*

This classes store binary data read from uprops.icu. @@ -56,134 +54,72 @@ * @since release 2.1, february 1st 2002 */ -public final class UCharacterProperty +final class UCharacterProperty { // public data members ----------------------------------------------- + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + /** * Trie data */ - public CharTrie m_trie_; - /** - * Optimization - * CharTrie index array - */ - public char[] m_trieIndex_; - /** - * Optimization - * CharTrie data array - */ - public char[] m_trieData_; - /** - * Optimization - * CharTrie data offset - */ - public int m_trieInitialValue_; + public Trie2_16 m_trie_; + /** * Unicode version */ public VersionInfo m_unicodeVersion_; + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + // uprops.h enum UPropertySource --------------------------------------- *** + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; - /** One more than the highest UPropertySource (SRC_) constant. */ - public static final int SRC_COUNT=9; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; // public methods ---------------------------------------------------- /** - * Java friends implementation - */ - public void setIndexData(CharTrie.FriendAgent friendagent) - { - m_trieIndex_ = friendagent.getPrivateIndex(); - m_trieData_ = friendagent.getPrivateData(); - m_trieInitialValue_ = friendagent.getPrivateInitialValue(); - } - - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. + * Gets the main property value for code point ch. * @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - // optimized - try { // using try for ch < 0 is faster than using an if statement - return m_trieData_[ - (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } catch (ArrayIndexOutOfBoundsException e) { - return m_trieInitialValue_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return m_trieData_[ - (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ - + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. - return m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & Trie.SURROGATE_MASK_)); - } - // ch is out of bounds - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. - return m_trieInitialValue_; - - // this all is an inlined form of return m_trie_.getCodePointValue(ch); - } - - /** - * Getting the unsigned numeric value of a character embedded in the property - * argument - * @param prop the character - * @return unsigned numberic value - */ - public static int getUnsignedValue(int prop) - { - return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; + return m_trie_.get(ch); } /** * Gets the unicode additional properties. - * C version getUnicodeProperties. + * Java version of C u_getUnicodeProperties(). * @param codepoint codepoint whose additional properties is to be * retrieved - * @param column + * @param column The column index. * @return unicode properties */ - public int getAdditional(int codepoint, int column) { - if (column == -1) { - return getProperty(codepoint); - } - if (column < 0 || column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[ - m_additionalTrie_.getCodePointValue(codepoint) + column]; - } + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } - /** + /** *

Get the "age" of the code point.

*

The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a @@ -203,6 +139,91 @@ version & LAST_NIBBLE_MASK_, 0, 0); } + // int-value and enumerated properties --------------------------------- *** + + public int getType(int c) { + return getProperty(c)&TYPE_MASK; + } + + /* + * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. + * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. + */ + private static final int /* UHangulSyllableType */ gcbToHst[]={ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ + HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ + HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ + HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ + HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ + HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ + /* + * Omit GCB values beyond what we need for hst. + * The code below checks for the array length. + */ + }; + + private class IntProperty { + int column; // SRC_PROPSVEC column, or "source" if mask==0 + int mask; + int shift; + + IntProperty(int column, int mask, int shift) { + this.column=column; + this.mask=mask; + this.shift=shift; + } + + IntProperty(int source) { + this.column=source; + this.mask=0; + } + + int getValue(int c) { + // systematic, directly stored properties + return (getAdditional(c, column)&mask)>>>shift; + } + } + + private class BiDiIntProperty extends IntProperty { + BiDiIntProperty() { + super(SRC_BIDI); + } + } + + private class CombiningClassIntProperty extends IntProperty { + CombiningClassIntProperty(int source) { + super(source); + } + } + + private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties + int which; + int max; + + NormQuickCheckIntProperty(int source, int which, int max) { + super(source); + this.which=which; + this.max=max; + } + } + + private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE + int getValue(int c) { + return UBiDiProps.INSTANCE.getPairedBracketType(c); + } + }; + + public int getIntPropertyValue(int c, int which) { + if (which == BIDI_PAIRED_BRACKET_TYPE) { + return intProp.getValue(c); + } + return 0; // undefined + } + /** * Forms a supplementary code point from the argument character
* Note this is for internal use hence no checks for the validity of the @@ -217,42 +238,48 @@ } /** - * Loads the property data and initialize the UCharacterProperty instance. - * @throws MissingResourceException when data is missing or data has been corrupted - */ - public static UCharacterProperty getInstance() + * Gets the type mask + * @param type character type + * @return mask + */ + public static final int getMask(int type) { - if(INSTANCE_ == null) { - try { - INSTANCE_ = new UCharacterProperty(); - } - catch (Exception e) { - throw new MissingResourceException(e.getMessage(),"",""); - } - } - return INSTANCE_; + return 1 << type; } /** - * Checks if the argument c is to be treated as a white space in ICU - * rules. Usually ICU rule white spaces are ignored unless quoted. - * Equivalent to test for Pattern_White_Space Unicode property. - * Stable set of characters, won't change. - * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ - * @param c codepoint to check - * @return true if c is a ICU white space - */ - public static boolean isRuleWhiteSpace(int c) - { - /* "white space" in the sense of ICU rule parsers - This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. - See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ - U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 - Equivalent to test for Pattern_White_Space Unicode property. - */ - return (c >= 0x0009 && c <= 0x2029 && - (c <= 0x000D || c == 0x0020 || c == 0x0085 || - c == 0x200E || c == 0x200F || c >= 0x2028)); + * Returns the digit values of characters like 'A' - 'Z', normal, + * half-width and full-width. This method assumes that the other digit + * characters are checked by the calling method. + * @param ch character to test + * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise + * its corresponding digit will be returned. + */ + public static int getEuropeanDigit(int ch) { + if ((ch > 0x7a && ch < 0xff21) + || ch < 0x41 || (ch > 0x5a && ch < 0x61) + || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { + return -1; + } + if (ch <= 0x7a) { + // ch >= 0x41 or ch < 0x61 + return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); + } + // ch >= 0xff21 + if (ch <= 0xff3a) { + return ch + 10 - 0xff21; + } + // ch >= 0xff41 && ch <= 0xff5a + return ch + 10 - 0xff41; + } + + public int digit(int c) { + int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; + if(value<=9) { + return value; + } else { + return -1; + } } // protected variables ----------------------------------------------- @@ -260,7 +287,7 @@ /** * Extra property trie */ - CharTrie m_additionalTrie_; + Trie2_16 m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary * properties. @@ -280,40 +307,24 @@ * 0 */ int m_maxJTGValue_; + /** + * Script_Extensions data + */ + public char[] m_scriptExtensions_; // private variables ------------------------------------------------- - /** - * UnicodeData.txt property object - */ - private static UCharacterProperty INSTANCE_ = null; - /** * Default name of the datafile */ private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; /** - * Default buffer size of datafile - */ - private static final int DATA_BUFFER_SIZE_ = 25000; - - /** - * Numeric value shift - */ - private static final int VALUE_SHIFT_ = 8; - - /** - * Mask to be applied after shifting to obtain an unsigned numeric value - */ - private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF; - - /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** - * Offset to add to combined surrogate pair to avoid msking. + * Offset to add to combined surrogate pair to avoid masking. */ private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE - @@ -321,7 +332,153 @@ LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE; - // additional properties ---------------------------------------------- + + // property data constants ------------------------------------------------- + + /** + * Numeric types and values in the main properties words. + */ + private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; + private static final int getNumericTypeValue(int props) { + return props >> NUMERIC_TYPE_VALUE_SHIFT_; + } + + /* constants for the storage form of numeric types and values */ + /** No numeric value. */ + private static final int NTV_NONE_ = 0; + /** Decimal digits: nv=0..9 */ + private static final int NTV_DECIMAL_START_ = 1; + /** Other digits: nv=0..9 */ + private static final int NTV_DIGIT_START_ = 11; + /** Small integers: nv=0..154 */ + private static final int NTV_NUMERIC_START_ = 21; + + private static final int ntvGetType(int ntv) { + return + (ntv==NTV_NONE_) ? NumericType.NONE : + (ntv expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for main trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // skip unused intervening data structures + ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); + + if(m_additionalColumnsCount_ > 0) { + // reads the additional property block + m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); + expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; + trieLength = m_additionalTrie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // additional properties + int size = scriptExtensionsOffset - additionalVectorsOffset; + m_additionalVectors_ = new int[size]; + for (int i = 0; i < size; i ++) { + m_additionalVectors_[i] = bytes.getInt(); + } + } - m_trie_.putIndexData(this); + // Script_Extensions + int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; + if(numChars > 0) { + m_scriptExtensions_ = new char[numChars]; + for(int i = 0; i < numChars; ++i) { + m_scriptExtensions_[i] = bytes.getChar(); + } + } } + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 7; + } + } + + private static final int DATA_FORMAT = 0x5550726F; // "UPro" + public void upropsvec_addPropertyStarts(UnicodeSet set) { /* add the start code point of each same-value range of the properties vectors trie */ if(m_additionalColumnsCount_>0) { /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ - TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); - RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); - while(propsVectorsIter.next(propsVectorsResult)){ - set.add(propsVectorsResult.start); + Iterator trieIterator = m_additionalTrie_.iterator(); + Trie2.Range range; + while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { + set.add(range.startCodePoint); } } } + // This static initializer block must be placed after + // other static member initialization + static { + try { + INSTANCE = new UCharacterProperty(); + } + catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); + } + } + + + // Moved from UProperty.java + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + }