< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java

Print this page


   1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.BufferedInputStream;
  40 import java.io.InputStream;
  41 import java.io.IOException;


  42 import java.util.MissingResourceException;
  43 



  44 /**
  45 * <p>Internal class used for Unicode character property database.</p>
   46 * <p>This class stores binary data read from uprops.icu.
  47 * It does not have the capability to parse the data into more high-level
  48 * information. It only returns bytes of information when required.</p>
  49 * <p>Due to the form most commonly used for retrieval, array of char is used
  50 * to store the binary data.</p>
  51 * <p>UCharacterPropertyDB also contains information on accessing indexes to
  52 * significant points in the binary data.</p>
  53 * <p>Responsibility for molding the binary data into more meaning form lies on
  54 * <a href=UCharacter.html>UCharacter</a>.</p>
  55 * @author Syn Wee Quek
  56 * @since release 2.1, february 1st 2002
  57 */
  58 
  59 public final class UCharacterProperty
  60 {
  61     // public data members -----------------------------------------------
  62 
  63     /**
  64     * Trie data
  65     */
  66     public CharTrie m_trie_;
  67     /**
  68      * Optimization
  69      * CharTrie index array
  70      */
  71     public char[] m_trieIndex_;
  72     /**
  73      * Optimization
  74      * CharTrie data array
  75      */
  76     public char[] m_trieData_;

  77     /**
  78      * Optimization
  79      * CharTrie data offset
  80      */
  81     public int m_trieInitialValue_;

  82     /**
  83     * Unicode version
  84     */
  85     public VersionInfo m_unicodeVersion_;
  86 





  87     // uprops.h enum UPropertySource --------------------------------------- ***
  88 


  89     /** From uchar.c/uprops.icu properties vectors trie */
  90     public static final int SRC_PROPSVEC=2;
  91     /** One more than the highest UPropertySource (SRC_) constant. */
  92     public static final int SRC_COUNT=9;




  93 
  94     // public methods ----------------------------------------------------
  95 
  96     /**
  97      * Java friends implementation
  98      */
  99     public void setIndexData(CharTrie.FriendAgent friendagent)
 100     {
 101         m_trieIndex_ = friendagent.getPrivateIndex();
 102         m_trieData_ = friendagent.getPrivateData();
 103         m_trieInitialValue_ = friendagent.getPrivateInitialValue();
 104     }
 105 
 106     /**
 107     * Gets the property value at the index.
 108     * This is optimized.
 109     * Note this is alittle different from CharTrie the index m_trieData_
 110     * is never negative.
 111     * @param ch code point whose property value is to be retrieved
 112     * @return property value of code point
 113     */
 114     public final int getProperty(int ch)
 115     {
 116         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
 117             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
 118                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
 119             // BMP codepoint 0000..D7FF or DC00..FFFF
 120             // optimized
 121             try { // using try for ch < 0 is faster than using an if statement
 122                 return m_trieData_[
 123                     (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
 124                           << Trie.INDEX_STAGE_2_SHIFT_)
 125                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
 126             } catch (ArrayIndexOutOfBoundsException e) {
 127                 return m_trieInitialValue_;
 128             }
 129         }
 130         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 131             // lead surrogate D800..DBFF
 132             return m_trieData_[
 133                     (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
 134                                   + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
 135                           << Trie.INDEX_STAGE_2_SHIFT_)
 136                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
 137         }
 138         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
 139             // supplementary code point 10000..10FFFF
 140             // look at the construction of supplementary characters
 141             // trail forms the ends of it.
 142             return m_trie_.getSurrogateValue(
 143                                           UTF16.getLeadSurrogate(ch),
 144                                           (char)(ch & Trie.SURROGATE_MASK_));
 145         }
 146         // ch is out of bounds
 147         // return m_dataOffset_ if there is an error, in this case we return
 148         // the default value: m_initialValue_
 149         // we cannot assume that m_initialValue_ is at offset 0
 150         // this is for optimization.
 151         return m_trieInitialValue_;
 152 
 153         // this all is an inlined form of return m_trie_.getCodePointValue(ch);
 154     }
 155 
 156     /**
 157     * Getting the unsigned numeric value of a character embedded in the property
 158     * argument
 159     * @param prop the character
 160     * @return unsigned numberic value
 161     */
 162     public static int getUnsignedValue(int prop)
 163     {
 164         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
 165     }
 166 
 167     /**
 168      * Gets the unicode additional properties.
 169      * C version getUnicodeProperties.
 170      * @param codepoint codepoint whose additional properties is to be
 171      *                  retrieved
 172      * @param column
 173      * @return unicode properties
 174      */
 175        public int getAdditional(int codepoint, int column) {
 176         if (column == -1) {
 177             return getProperty(codepoint);
 178         }
 179            if (column < 0 || column >= m_additionalColumnsCount_) {
 180            return 0;
 181        }
 182        return m_additionalVectors_[
 183                      m_additionalTrie_.getCodePointValue(codepoint) + column];
 184        }
 185 
 186        /**
 187      * <p>Get the "age" of the code point.</p>
 188      * <p>The "age" is the Unicode version when the code point was first
 189      * designated (as a non-character or for Private Use) or assigned a
 190      * character.</p>
 191      * <p>This can be useful to avoid emitting code points to receiving
 192      * processes that do not accept newer characters.</p>
 193      * <p>The data is from the UCD file DerivedAge.txt.</p>
 194      * <p>This API does not check the validity of the codepoint.</p>
 195      * @param codepoint The code point.
 196      * @return the Unicode version number
 197      */
 198     public VersionInfo getAge(int codepoint)
 199     {
 200         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
 201         return VersionInfo.getInstance(
 202                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
 203                            version & LAST_NIBBLE_MASK_, 0, 0);
 204     }
 205 





















































































 206     /**
 207     * Forms a supplementary code point from the argument character<br>
 208     * Note this is for internal use hence no checks for the validity of the
 209     * surrogate characters are done
 210     * @param lead lead surrogate character
 211     * @param trail trailing surrogate character
 212     * @return code point of the supplementary character
 213     */
 214     public static int getRawSupplementary(char lead, char trail)
 215     {
 216         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
 217     }
 218 
 219     /**
 220     * Loads the property data and initialize the UCharacterProperty instance.
 221     * @throws MissingResourceException when data is missing or data has been corrupted

 222     */
 223     public static UCharacterProperty getInstance()
 224     {
 225         if(INSTANCE_ == null) {
 226             try {
 227                 INSTANCE_ = new UCharacterProperty();
 228             }
 229             catch (Exception e) {
 230                 throw new MissingResourceException(e.getMessage(),"","");
















 231             }



 232         }
 233         return INSTANCE_;

 234     }
 235 
 236     /**
 237      * Checks if the argument c is to be treated as a white space in ICU
 238      * rules. Usually ICU rule white spaces are ignored unless quoted.
 239      * Equivalent to test for Pattern_White_Space Unicode property.
 240      * Stable set of characters, won't change.
 241      * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 242      * @param c codepoint to check
 243      * @return true if c is a ICU white space
 244      */
 245     public static boolean isRuleWhiteSpace(int c)
 246     {
 247         /* "white space" in the sense of ICU rule parsers
 248            This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 249            See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 250            U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
 251            Equivalent to test for Pattern_White_Space Unicode property.
 252         */
 253         return (c >= 0x0009 && c <= 0x2029 &&
 254                 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
 255                  c == 0x200E || c == 0x200F || c >= 0x2028));
 256     }
 257 
 258     // protected variables -----------------------------------------------
 259 
 260     /**
 261      * Extra property trie
 262      */
 263     CharTrie m_additionalTrie_;
 264     /**
 265      * Extra property vectors, 1st column for age and second for binary
 266      * properties.
 267      */
 268     int m_additionalVectors_[];
 269     /**
 270      * Number of additional columns
 271      */
 272     int m_additionalColumnsCount_;
 273     /**
 274      * Maximum values for block, bits used as in vector word
 275      * 0
 276      */
 277     int m_maxBlockScriptValue_;
 278     /**
 279      * Maximum values for script, bits used as in vector word
 280      * 0
 281      */
 282      int m_maxJTGValue_;
 283 
 284     // private variables -------------------------------------------------
 285 
 286       /**
 287      * UnicodeData.txt property object
 288      */
 289     private static UCharacterProperty INSTANCE_ = null;


 290 
 291     /**
 292     * Default name of the datafile
 293     */
 294     private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
 295 
 296     /**
 297     * Default buffer size of datafile
 298     */
 299     private static final int DATA_BUFFER_SIZE_ = 25000;
 300 
 301     /**
 302     * Numeric value shift
 303     */
 304     private static final int VALUE_SHIFT_ = 8;







 305 
 306     /**
 307     * Mask to be applied after shifting to obtain an unsigned numeric value
 308     */
 309     private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;



 310 
































 311     /**
 312     * Shift value for lead surrogate to form a supplementary character.
 313     */
 314     private static final int LEAD_SURROGATE_SHIFT_ = 10;

 315     /**
 316     * Offset to add to combined surrogate pair to avoid masking.

 317     */
 318     private static final int SURROGATE_OFFSET_ =
 319                            UTF16.SUPPLEMENTARY_MIN_VALUE -
 320                            (UTF16.SURROGATE_MIN_VALUE <<
 321                            LEAD_SURROGATE_SHIFT_) -
 322                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
















 323 
 324     // additional properties ----------------------------------------------








































































 325 
 326     /**
 327      * First nibble shift
 328      */
 329     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
 330     /**
 331      * Second nibble mask
 332      */
 333     private static final int LAST_NIBBLE_MASK_ = 0xF;
 334     /**
 335      * Age value shift
 336      */
 337     private static final int AGE_SHIFT_ = 24;
 338 
 339     // private constructors --------------------------------------------------
 340 
 341     /**
 342     * Constructor
 343     * @exception IOException thrown when data reading fails or data corrupted
 344     */
 345     private UCharacterProperty() throws IOException
 346     {
 347         // jar access
 348         InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
 349         BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
 350         UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
 351         reader.read(this);
 352         b.close();
 353 
 354         m_trie_.putIndexData(this);

























































 355     }
 356 


 357     public void upropsvec_addPropertyStarts(UnicodeSet set) {
 358         /* add the start code point of each same-value range of the properties vectors trie */
 359         if(m_additionalColumnsCount_>0) {
 360             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
 361             TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
 362             RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
 363             while(propsVectorsIter.next(propsVectorsResult)){
 364                 set.add(propsVectorsResult.start);


 365             }






 366         }


 367     }












 368 
 369 }
   1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 1996-2014, International Business Machines Corporation and
  28  * others. All Rights Reserved.





  29  *******************************************************************************
  30  */
  31 
  32 package sun.text.normalizer;
  33 


  34 import java.io.IOException;
  35 import java.nio.ByteBuffer;
  36 import java.util.Iterator;
  37 import java.util.MissingResourceException;
  38 
  39 import sun.text.normalizer.UCharacter.HangulSyllableType;
  40 import sun.text.normalizer.UCharacter.NumericType;
  41 
  42 /**
  43 * <p>Internal class used for Unicode character property database.</p>
   44 * <p>This class stores binary data read from uprops.icu.
  45 * It does not have the capability to parse the data into more high-level
  46 * information. It only returns bytes of information when required.</p>
  47 * <p>Due to the form most commonly used for retrieval, array of char is used
  48 * to store the binary data.</p>
  49 * <p>UCharacterPropertyDB also contains information on accessing indexes to
  50 * significant points in the binary data.</p>
  51 * <p>Responsibility for molding the binary data into more meaning form lies on
  52 * <a href=UCharacter.html>UCharacter</a>.</p>
  53 * @author Syn Wee Quek
  54 * @since release 2.1, february 1st 2002
  55 */
  56 
  57 final class UCharacterProperty
  58 {
  59     // public data members -----------------------------------------------
  60 
  61     /*
  62      * public singleton instance










  63      */
  64     public static final UCharacterProperty INSTANCE;
  65 
  66     /**
  67     * Trie data

  68     */
  69     public Trie2_16 m_trie_;
  70 
  71     /**
  72     * Unicode version
  73     */
  74     public VersionInfo m_unicodeVersion_;
  75 
  76     /**
  77     * Character type mask
  78     */
  79     public static final int TYPE_MASK = 0x1F;
  80 
  81     // uprops.h enum UPropertySource --------------------------------------- ***
  82 
  83     /** From uchar.c/uprops.icu main trie */
  84     public static final int SRC_CHAR=1;
  85     /** From uchar.c/uprops.icu properties vectors trie */
  86     public static final int SRC_PROPSVEC=2;
  87     /** From ubidi_props.c/ubidi.icu */
  88     public static final int SRC_BIDI=5;
  89     /** From normalizer2impl.cpp/nfc.nrm */
  90     public static final int SRC_NFC=8;
  91     /** From normalizer2impl.cpp/nfkc.nrm */
  92     public static final int SRC_NFKC=9;
  93 
  94     // public methods ----------------------------------------------------
  95 
  96     /**
  97     * Gets the main property value for code point ch.













  98     * @param ch code point whose property value is to be retrieved
  99     * @return property value of code point
 100     */
 101     public final int getProperty(int ch)
 102     {
 103         return m_trie_.get(ch);
















































 104     }
 105 
 106     /**
 107      * Gets the unicode additional properties.
 108      * Java version of C u_getUnicodeProperties().
 109      * @param codepoint codepoint whose additional properties is to be
 110      *                  retrieved
 111      * @param column The column index.
 112      * @return unicode properties
 113      */
 114     public int getAdditional(int codepoint, int column) {
 115         assert column >= 0;
 116         if (column >= m_additionalColumnsCount_) {


 117             return 0;
 118         }
 119         return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];

 120     }
 121 
 122     /**
 123      * <p>Get the "age" of the code point.</p>
 124      * <p>The "age" is the Unicode version when the code point was first
 125      * designated (as a non-character or for Private Use) or assigned a
 126      * character.</p>
 127      * <p>This can be useful to avoid emitting code points to receiving
 128      * processes that do not accept newer characters.</p>
 129      * <p>The data is from the UCD file DerivedAge.txt.</p>
 130      * <p>This API does not check the validity of the codepoint.</p>
 131      * @param codepoint The code point.
 132      * @return the Unicode version number
 133      */
 134     public VersionInfo getAge(int codepoint)
 135     {
 136         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
 137         return VersionInfo.getInstance(
 138                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
 139                            version & LAST_NIBBLE_MASK_, 0, 0);
 140     }
 141 
 142     // int-value and enumerated properties --------------------------------- ***
 143 
 144     public int getType(int c) {
 145         return getProperty(c)&TYPE_MASK;
 146     }
 147 
 148     /*
 149      * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 150      * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 151      */
 152     private static final int /* UHangulSyllableType */ gcbToHst[]={
 153         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
 154         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
 155         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
 156         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
 157         HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
 158         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
 159         HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
 160         HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
 161         HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
 162         HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
 163         /*
 164          * Omit GCB values beyond what we need for hst.
 165          * The code below checks for the array length.
 166          */
 167     };
 168 
 169     private class IntProperty {
 170         int column;  // SRC_PROPSVEC column, or "source" if mask==0
 171         int mask;
 172         int shift;
 173 
 174         IntProperty(int column, int mask, int shift) {
 175             this.column=column;
 176             this.mask=mask;
 177             this.shift=shift;
 178         }
 179 
 180         IntProperty(int source) {
 181             this.column=source;
 182             this.mask=0;
 183         }
 184 
 185         int getValue(int c) {
 186             // systematic, directly stored properties
 187             return (getAdditional(c, column)&mask)>>>shift;
 188         }
 189     }
 190 
 191     private class BiDiIntProperty extends IntProperty {
 192         BiDiIntProperty() {
 193             super(SRC_BIDI);
 194         }
 195     }
 196 
 197     private class CombiningClassIntProperty extends IntProperty {
 198         CombiningClassIntProperty(int source) {
 199             super(source);
 200         }
 201     }
 202 
 203     private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
 204         int which;
 205         int max;
 206 
 207         NormQuickCheckIntProperty(int source, int which, int max) {
 208             super(source);
 209             this.which=which;
 210             this.max=max;
 211         }
 212     }
 213 
 214     private IntProperty intProp =  new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
 215         int getValue(int c) {
 216             return UBiDiProps.INSTANCE.getPairedBracketType(c);
 217         }
 218     };
 219 
 220     public int getIntPropertyValue(int c, int which) {
 221         if (which == BIDI_PAIRED_BRACKET_TYPE) {
 222             return intProp.getValue(c);
 223         }
 224         return 0; // undefined
 225     }
 226 
 227     /**
 228     * Forms a supplementary code point from the argument character<br>
 229     * Note this is for internal use hence no checks for the validity of the
 230     * surrogate characters are done
 231     * @param lead lead surrogate character
 232     * @param trail trailing surrogate character
 233     * @return code point of the supplementary character
 234     */
 235     public static int getRawSupplementary(char lead, char trail)
 236     {
 237         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
 238     }
 239 
 240     /**
 241      * Gets the type mask
 242      * @param type character type
 243      * @return mask
 244      */
 245     public static final int getMask(int type)
 246     {
 247         return 1 << type;


 248     }
 249 
 250     /**
 251      * Returns the digit values of characters like 'A' - 'Z', normal,
 252      * half-width and full-width. This method assumes that the other digit
 253      * characters are checked by the calling method.
 254      * @param ch character to test
 255      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
 256      *         its corresponding digit will be returned.
 257      */
 258     public static int getEuropeanDigit(int ch) {
 259         if ((ch > 0x7a && ch < 0xff21)
 260             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
 261             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
 262             return -1;
 263         }
 264         if (ch <= 0x7a) {
 265             // ch >= 0x41 or ch < 0x61
 266             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
 267         }
 268         // ch >= 0xff21
 269         if (ch <= 0xff3a) {
 270             return ch + 10 - 0xff21;
 271         }
 272         // ch >= 0xff41 && ch <= 0xff5a
 273         return ch + 10 - 0xff41;
 274     }
 275 
 276     public int digit(int c) {
 277         int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
 278         if(value<=9) {
 279             return value;
 280         } else {
 281             return -1;
 282         }













 283     }
 284 
 285     // protected variables -----------------------------------------------
 286 
 287     /**
 288      * Extra property trie
 289      */
 290     Trie2_16 m_additionalTrie_;
 291     /**
 292      * Extra property vectors, 1st column for age and second for binary
 293      * properties.
 294      */
 295     int m_additionalVectors_[];
 296     /**
 297      * Number of additional columns
 298      */
 299     int m_additionalColumnsCount_;
 300     /**
 301      * Maximum values for block, bits used as in vector word
 302      * 0
 303      */
 304     int m_maxBlockScriptValue_;
 305     /**
 306      * Maximum values for script, bits used as in vector word
 307      * 0
 308      */
 309      int m_maxJTGValue_;



 310     /**
 311      * Script_Extensions data
 312      */
 313     public char[] m_scriptExtensions_;
 314 
 315     // private variables -------------------------------------------------
 316 
 317     /**
 318     * Default name of the datafile
 319     */
 320     private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
 321 
 322     /**
 323     * Shift value for lead surrogate to form a supplementary character.
 324     */
 325     private static final int LEAD_SURROGATE_SHIFT_ = 10;

 326     /**
 327     * Offset to add to combined surrogate pair to avoid masking.
 328     */
 329     private static final int SURROGATE_OFFSET_ =
 330                            UTF16.SUPPLEMENTARY_MIN_VALUE -
 331                            (UTF16.SURROGATE_MIN_VALUE <<
 332                            LEAD_SURROGATE_SHIFT_) -
 333                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
 334 
 335 
 336     // property data constants -------------------------------------------------
 337 
 338     /**
 339      * Numeric types and values in the main properties words.
 340      */
 341     private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
 342     private static final int getNumericTypeValue(int props) {
 343         return props >> NUMERIC_TYPE_VALUE_SHIFT_;
 344     }
 345 
 346     /* constants for the storage form of numeric types and values */
 347     /** No numeric value. */
 348     private static final int NTV_NONE_ = 0;
 349     /** Decimal digits: nv=0..9 */
 350     private static final int NTV_DECIMAL_START_ = 1;
 351     /** Other digits: nv=0..9 */
 352     private static final int NTV_DIGIT_START_ = 11;
 353     /** Small integers: nv=0..154 */
 354     private static final int NTV_NUMERIC_START_ = 21;
 355 
 356     private static final int ntvGetType(int ntv) {
 357         return
 358             (ntv==NTV_NONE_) ? NumericType.NONE :
 359             (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
 360             (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
 361             NumericType.NUMERIC;
 362     }
 363 
 364     /*
 365      * Properties in vector word 0
 366      * Bits
 367      * 31..24   DerivedAge version major/minor one nibble each
 368      * 23..22   3..1: Bits 7..0 = Script_Extensions index
 369      *             3: Script value from Script_Extensions
 370      *             2: Script=Inherited
 371      *             1: Script=Common
 372      *             0: Script=bits 7..0
 373      * 21..20   reserved
 374      * 19..17   East Asian Width
 375      * 16.. 8   UBlockCode
 376      *  7.. 0   UScriptCode
 377      */
 378     /**
 379      * Script_Extensions: mask includes Script
 380      */
 381     public static final int SCRIPT_X_MASK = 0x00c000ff;
 382     //private static final int SCRIPT_X_SHIFT = 22;
 383     /**
 384      * Integer properties mask and shift values for East Asian cell width.
 385      * Equivalent to icu4c UPROPS_EA_MASK
 386      */
 387     private static final int EAST_ASIAN_MASK_ = 0x000e0000;
 388     /**
 389      * Integer properties mask and shift values for East Asian cell width.
 390      * Equivalent to icu4c UPROPS_EA_SHIFT
 391      */
 392     private static final int EAST_ASIAN_SHIFT_ = 17;
 393     /**
 394      * Integer properties mask and shift values for blocks.
 395      * Equivalent to icu4c UPROPS_BLOCK_MASK
 396      */
 397     private static final int BLOCK_MASK_ = 0x0001ff00;
 398     /**
 399      * Integer properties mask and shift values for blocks.
 400      * Equivalent to icu4c UPROPS_BLOCK_SHIFT
 401      */
 402     private static final int BLOCK_SHIFT_ = 8;
 403     /**
 404      * Integer properties mask and shift values for scripts.
 405      * Equivalent to icu4c UPROPS_SHIFT_MASK
 406      */
 407     public static final int SCRIPT_MASK_ = 0x000000ff;
 408 
 409     /**
 410      * Additional properties used in internal trie data: bit numbers of the binary properties.
 411      */
 412     /*
 413      * Properties in vector word 1
 414      * Each bit of the word encodes one binary property.
 415      * The constants below are bit numbers, not masks: build the mask with 1<<UPROPS_XYZ.
 416      * Every bit number has to fit the word, hence UPROPS_BINARY_1_TOP<=32!
 417      *
 418      * Keep this list of property enums in sync with
 419      * propListNames[] in icu/source/tools/genprops/props2.c!
 420      *
 421      * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 422      */
 423     private static final int WHITE_SPACE_PROPERTY_ = 0;
 424     private static final int DASH_PROPERTY_ = 1;
 425     private static final int HYPHEN_PROPERTY_ = 2;
 426     private static final int QUOTATION_MARK_PROPERTY_ = 3;
 427     private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
 428     private static final int MATH_PROPERTY_ = 5;
 429     private static final int HEX_DIGIT_PROPERTY_ = 6;
 430     private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
 431     private static final int ALPHABETIC_PROPERTY_ = 8;
 432     private static final int IDEOGRAPHIC_PROPERTY_ = 9;
 433     private static final int DIACRITIC_PROPERTY_ = 10;
 434     private static final int EXTENDER_PROPERTY_ = 11;
 435     private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
 436     private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
 437     private static final int GRAPHEME_LINK_PROPERTY_ = 14;
 438     private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
 439     private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
 440     private static final int RADICAL_PROPERTY_ = 17;
 441     private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
 442     private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
 443     private static final int DEPRECATED_PROPERTY_ = 20;
 444     private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
 445     private static final int XID_START_PROPERTY_ = 22;
 446     private static final int XID_CONTINUE_PROPERTY_ = 23;
 447     private static final int ID_START_PROPERTY_    = 24;
 448     private static final int ID_CONTINUE_PROPERTY_ = 25;
 449     private static final int GRAPHEME_BASE_PROPERTY_ = 26;
 450     private static final int S_TERM_PROPERTY_ = 27;
 451     private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
 452     private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
 453     private static final int PATTERN_WHITE_SPACE = 30;              /* highest bit number in this list */
 454 
 455     /*
 456      * Vector word 2 packs five fields; bit ranges from high to low:
 457      *   31..26   reserved
 458      *   25..20   Line Break
 459      *   19..15   Sentence Break
 460      *   14..10   Word Break
 461      *    9.. 5   Grapheme Cluster Break
 462      *    4.. 0   Decomposition Type
 463      * Every mask below is the field's all-ones value moved up to the field's lowest bit.
 464      */
 465     private static final int LB_SHIFT         = 20;
 466     private static final int LB_MASK          = 0x3f << LB_SHIFT;
 467 
 468     private static final int SB_SHIFT         = 15;
 469     private static final int SB_MASK          = 0x1f << SB_SHIFT;
 470 
 471     private static final int WB_SHIFT         = 10;
 472     private static final int WB_MASK          = 0x1f << WB_SHIFT;
 473 
 474     private static final int GCB_SHIFT        = 5;
 475     private static final int GCB_MASK         = 0x1f << GCB_SHIFT;
 476 
 477     /**
 478      * Mask for the decomposition type, which occupies the lowest five bits and so needs no shift.
 479      * Equivalent to icu4c UPROPS_DT_MASK.
 480      */
 481     private static final int DECOMPOSITION_TYPE_MASK_ = 0x1f;
 482 
 483     /**
 484      * First nibble shift: four bits, the width of one nibble (presumably used to reach the upper nibble of a byte - confirm at the use site).
 485      */
 486     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
 487     /**
 488      * Second (last) nibble mask: selects the lowest four bits of a value.
 489      */
 490     private static final int LAST_NIBBLE_MASK_ = 0xF;
 491     /**
 492      * Age value shift: the age field starts at bit 24; the word it is applied to is not visible in this part of the file.
 493      */
 494     private static final int AGE_SHIFT_ = 24;
 495 
 496     // private constructors --------------------------------------------------
 497 
 498     /**
 499      * Constructor. Reads the uprops data: header, index, main trie, optional additional trie with its vectors, Script_Extensions.
 500      * @exception IOException thrown when data reading fails or data corrupted
 501      */
 502     private UCharacterProperty() throws IOException
 503     {
 504         // jar access
 505         ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
 506         m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
 507         // Read or skip the 16 indexes: twelve ints are consumed below, the remaining four are skipped.
 508         int propertyOffset = bytes.getInt();               // offsets are counted in 4-byte units (see the "* 4" conversions below)
 509         /* exceptionOffset = */ bytes.getInt();
 510         /* caseOffset = */ bytes.getInt();
 511         int additionalOffset = bytes.getInt();
 512         int additionalVectorsOffset = bytes.getInt();
 513         m_additionalColumnsCount_ = bytes.getInt();
 514         int scriptExtensionsOffset = bytes.getInt();
 515         int reservedOffset7 = bytes.getInt();              // only used as the end of the Script_Extensions data
 516         /* reservedOffset8 = */ bytes.getInt();
 517         /* dataTopOffset = */ bytes.getInt();
 518         m_maxBlockScriptValue_ = bytes.getInt();
 519         m_maxJTGValue_ = bytes.getInt();
 520         ICUBinary.skipBytes(bytes, (16 - 12) << 2);       // four unread index ints, 4 bytes each
 521 
 522         // read the main properties trie
 523         m_trie_ = Trie2_16.createFromSerialized(bytes);
 524         int expectedTrieLength = (propertyOffset - 16) * 4; // bytes between the end of the 16-int index and propertyOffset
 525         int trieLength = m_trie_.getSerializedLength();
 526         if(trieLength > expectedTrieLength) {               // the trie claims more bytes than the index leaves room for
 527             throw new IOException("uprops.icu: not enough bytes for main trie");
 528         }
 529         // skip padding after trie bytes
 530         ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
 531 
 532         // skip unused intervening data structures
 533         ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
 534 
 535         if(m_additionalColumnsCount_ > 0) {
 536             // reads the additional property block: a second trie followed by the property vectors
 537             m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
 538             expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
 539             trieLength = m_additionalTrie_.getSerializedLength();
 540             if(trieLength > expectedTrieLength) {
 541                 throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
 542             }
 543             // skip padding after trie bytes
 544             ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
 545 
 546             // additional properties: one int per 4-byte unit up to the Script_Extensions offset
 547             int size = scriptExtensionsOffset - additionalVectorsOffset;
 548             m_additionalVectors_ = new int[size];
 549             for (int i = 0; i < size; i ++) {
 550                 m_additionalVectors_[i] = bytes.getInt();
 551             }
 552         }
 553 
 554         // Script_Extensions, read from the current buffer position. NOTE(review): if the branch above is skipped nothing is consumed for it - presumably the offsets coincide then; confirm.
 555         int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; // two chars per 4-byte unit
 556         if(numChars > 0) {
 557             m_scriptExtensions_ = new char[numChars];
 558             for(int i = 0; i < numChars; ++i) {
 559                 m_scriptExtensions_[i] = bytes.getChar();
 560             }
 561         }
 562     }
 563 
 564     private static final class IsAcceptable implements ICUBinary.Authenticate { // version check handed to ICUBinary.readHeaderAndDataVersion
 565         @Override
 566         public boolean isDataVersionAcceptable(byte[] version) {
 567             return version[0] == 7; // only the first version byte is examined; it must be 7
 568         }
 569     }
 570 
 571     private static final int DATA_FORMAT = ('U' << 24) | ('P' << 16) | ('r' << 8) | 'o';  // "UPro" == 0x5550726F
 572 
 573     public void upropsvec_addPropertyStarts(UnicodeSet set) {
 574         if (m_additionalColumnsCount_ <= 0) {
 575             return; // with no additional columns the properties vectors trie may not be there at all
 576         }
 577         // Add the start code point of each same-value range of the trie, stopping at the first lead-surrogate range.
 578         for (Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator(); ranges.hasNext(); ) {
 579             Trie2.Range current = ranges.next();
 580             if (current.leadSurrogate) { break; }
 581             set.add(current.startCodePoint);
 582         }
 583     }
 584 
 585     // This static initializer block must be placed after
 586     // other static member initialization
 587     static {
 588         try {
 589             INSTANCE = new UCharacterProperty();
 590         } catch (IOException e) {
 591             // MissingResourceException has no cause-taking constructor, so the IOException is chained via initCause (which returns this).
 592             throw (MissingResourceException) new MissingResourceException(e.getMessage(), DATA_FILE_NAME_, "").initCause(e);
 593         }
 594     }
 595 
 596 
 597     // Moved from UProperty.java
 598     /**
 599      * Identifier of the enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
 600      * Used in UAX #9: Unicode Bidirectional Algorithm
 601      * (http://www.unicode.org/reports/tr9/)
 602      * Lookups of this property yield UCharacter.BidiPairedBracketType values.
 603      * @stable ICU 52
 604      */
 605     public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
 606 
 607 }
< prev index next >