1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.BufferedInputStream;
  40 import java.io.InputStream;
  41 import java.io.IOException;
  42 import java.util.MissingResourceException;
  43 
/**
* <p>Internal class used for the Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, an array of char is
* used to store the binary data.</p>
* <p>UCharacterProperty also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form
* lies with <a href="UCharacter.html">UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
  58 
  59 public final class UCharacterProperty
  60 {
  61     // public data members -----------------------------------------------
  62 
    /**
    * Trie data.
    * The constructor passes this object to <code>m_trie_.putIndexData</code>,
    * and getProperty queries the trie directly for supplementary code points.
    */
    public CharTrie m_trie_;
    /**
     * Optimization
     * CharTrie index array, stored by setIndexData so that getProperty can
     * index it directly.
     */
    public char[] m_trieIndex_;
    /**
     * Optimization
     * CharTrie data array, stored by setIndexData so that getProperty can
     * index it directly.
     */
    public char[] m_trieData_;
    /**
     * Optimization
     * CharTrie initial value, stored by setIndexData. getProperty returns it
     * when a code point cannot be looked up (index out of bounds in the BMP
     * path, or a code point above UTF16.CODEPOINT_MAX_VALUE).
     */
    public int m_trieInitialValue_;
    /**
    * Unicode version.
    * Not assigned anywhere in this class; presumably filled in while the
    * data file is read (TODO confirm against UCharacterPropertyReader).
    */
    public VersionInfo m_unicodeVersion_;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=9;
  93 
  94     // public methods ----------------------------------------------------
  95 
  96     /**
  97      * Java friends implementation
  98      */
  99     public void setIndexData(CharTrie.FriendAgent friendagent)
 100     {
 101         m_trieIndex_ = friendagent.getPrivateIndex();
 102         m_trieData_ = friendagent.getPrivateData();
 103         m_trieInitialValue_ = friendagent.getPrivateInitialValue();
 104     }
 105 
 106     /**
 107     * Gets the property value at the index.
 108     * This is optimized.
 109     * Note this is alittle different from CharTrie the index m_trieData_
 110     * is never negative.
 111     * @param ch code point whose property value is to be retrieved
 112     * @return property value of code point
 113     */
 114     public final int getProperty(int ch)
 115     {
 116         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
 117             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
 118                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
 119             // BMP codepoint 0000..D7FF or DC00..FFFF
 120             // optimized
 121             try { // using try for ch < 0 is faster than using an if statement
 122                 return m_trieData_[
 123                     (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
 124                           << Trie.INDEX_STAGE_2_SHIFT_)
 125                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
 126             } catch (ArrayIndexOutOfBoundsException e) {
 127                 return m_trieInitialValue_;
 128             }
 129         }
 130         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 131             // lead surrogate D800..DBFF
 132             return m_trieData_[
 133                     (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
 134                                   + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
 135                           << Trie.INDEX_STAGE_2_SHIFT_)
 136                     + (ch & Trie.INDEX_STAGE_3_MASK_)];
 137         }
 138         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
 139             // supplementary code point 10000..10FFFF
 140             // look at the construction of supplementary characters
 141             // trail forms the ends of it.
 142             return m_trie_.getSurrogateValue(
 143                                           UTF16.getLeadSurrogate(ch),
 144                                           (char)(ch & Trie.SURROGATE_MASK_));
 145         }
 146         // ch is out of bounds
 147         // return m_dataOffset_ if there is an error, in this case we return
 148         // the default value: m_initialValue_
 149         // we cannot assume that m_initialValue_ is at offset 0
 150         // this is for optimization.
 151         return m_trieInitialValue_;
 152 
 153         // this all is an inlined form of return m_trie_.getCodePointValue(ch);
 154     }
 155 
 156     /**
 157     * Getting the unsigned numeric value of a character embedded in the property
 158     * argument
 159     * @param prop the character
 160     * @return unsigned numberic value
 161     */
 162     public static int getUnsignedValue(int prop)
 163     {
 164         return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
 165     }
 166 
 167     /**
 168      * Gets the unicode additional properties.
 169      * C version getUnicodeProperties.
 170      * @param codepoint codepoint whose additional properties is to be
 171      *                  retrieved
 172      * @param column
 173      * @return unicode properties
 174      */
 175        public int getAdditional(int codepoint, int column) {
 176         if (column == -1) {
 177             return getProperty(codepoint);
 178         }
 179            if (column < 0 || column >= m_additionalColumnsCount_) {
 180            return 0;
 181        }
 182        return m_additionalVectors_[
 183                      m_additionalTrie_.getCodePointValue(codepoint) + column];
 184        }
 185 
 186        /**
 187      * <p>Get the "age" of the code point.</p>
 188      * <p>The "age" is the Unicode version when the code point was first
 189      * designated (as a non-character or for Private Use) or assigned a
 190      * character.</p>
 191      * <p>This can be useful to avoid emitting code points to receiving
 192      * processes that do not accept newer characters.</p>
 193      * <p>The data is from the UCD file DerivedAge.txt.</p>
 194      * <p>This API does not check the validity of the codepoint.</p>
 195      * @param codepoint The code point.
 196      * @return the Unicode version number
 197      */
 198     public VersionInfo getAge(int codepoint)
 199     {
 200         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
 201         return VersionInfo.getInstance(
 202                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
 203                            version & LAST_NIBBLE_MASK_, 0, 0);
 204     }
 205 
 206     /**
 207     * Forms a supplementary code point from the argument character<br>
 208     * Note this is for internal use hence no checks for the validity of the
 209     * surrogate characters are done
 210     * @param lead lead surrogate character
 211     * @param trail trailing surrogate character
 212     * @return code point of the supplementary character
 213     */
 214     public static int getRawSupplementary(char lead, char trail)
 215     {
 216         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
 217     }
 218 
 219     /**
 220     * Loads the property data and initialize the UCharacterProperty instance.
 221     * @throws MissingResourceException when data is missing or data has been corrupted
 222     */
 223     public static UCharacterProperty getInstance()
 224     {
 225         if(INSTANCE_ == null) {
 226             try {
 227                 INSTANCE_ = new UCharacterProperty();
 228             }
 229             catch (Exception e) {
 230                 throw new MissingResourceException(e.getMessage(),"","");
 231             }
 232         }
 233         return INSTANCE_;
 234     }
 235 
 236     /**
 237      * Checks if the argument c is to be treated as a white space in ICU
 238      * rules. Usually ICU rule white spaces are ignored unless quoted.
 239      * Equivalent to test for Pattern_White_Space Unicode property.
 240      * Stable set of characters, won't change.
 241      * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 242      * @param c codepoint to check
 243      * @return true if c is a ICU white space
 244      */
 245     public static boolean isRuleWhiteSpace(int c)
 246     {
 247         /* "white space" in the sense of ICU rule parsers
 248            This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
 249            See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
 250            U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
 251            Equivalent to test for Pattern_White_Space Unicode property.
 252         */
 253         return (c >= 0x0009 && c <= 0x2029 &&
 254                 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
 255                  c == 0x200E || c == 0x200F || c >= 0x2028));
 256     }
 257 
    // package-private variables -----------------------------------------
 259 
    /**
     * Extra property trie.
     * getAdditional uses the trie value of a code point as the base index
     * into m_additionalVectors_; upropsvec_addPropertyStarts iterates over
     * its ranges.
     */
    CharTrie m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     * getAdditional reads the element at (trie value + column).
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns.
     * getAdditional returns 0 for a column that is not below this count;
     * upropsvec_addPropertyStarts does nothing unless it is positive.
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     * NOTE(review): this wording mirrors the comment on
     * m_maxBlockScriptValue_, while the field name refers to "JTG";
     * confirm the intended meaning against the code that fills the field.
     */
     int m_maxJTGValue_;
 283 
 284     // private variables -------------------------------------------------
 285 
    /**
     * Shared UCharacterProperty instance, created on the first call to
     * getInstance and returned by every later call.
     */
    private static UCharacterProperty INSTANCE_ = null;

    /**
    * Default name of the datafile, passed to ICUData.getRequiredStream by
    * the constructor.
    */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
    * Default buffer size of datafile; used as the buffer size of the
    * BufferedInputStream wrapped around the data stream.
    */
    private static final int DATA_BUFFER_SIZE_ = 25000;

    /**
    * Numeric value shift, applied by getUnsignedValue before masking.
    */
    private static final int VALUE_SHIFT_ = 8;

    /**
    * Mask to be applied after shifting to obtain an unsigned numeric value
    */
    private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;

    /**
    * Shift value for lead surrogate to form a supplementary character.
    */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
    * Offset to add to combined surrogate pair to avoid masking.
    * Adding it to (lead << LEAD_SURROGATE_SHIFT_) + trail, as
    * getRawSupplementary does, removes the surrogate base values and adds
    * UTF16.SUPPLEMENTARY_MIN_VALUE in a single step.
    */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;

    // additional properties ----------------------------------------------

    /**
     * First nibble shift; getAge shifts the age value right by this amount
     * to reach its upper nibble.
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Last nibble mask; keeps the lowest four bits of a value.
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift; getAge shifts column 0 of the additional properties
     * right by this amount to obtain the age value.
     */
    private static final int AGE_SHIFT_ = 24;
 338 
 339     // private constructors --------------------------------------------------
 340 
 341     /**
 342     * Constructor
 343     * @exception IOException thrown when data reading fails or data corrupted
 344     */
 345     private UCharacterProperty() throws IOException
 346     {
 347         // jar access
 348         InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
 349         BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
 350         UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
 351         reader.read(this);
 352         b.close();
 353 
 354         m_trie_.putIndexData(this);
 355     }
 356 
 357     public void upropsvec_addPropertyStarts(UnicodeSet set) {
 358         /* add the start code point of each same-value range of the properties vectors trie */
 359         if(m_additionalColumnsCount_>0) {
 360             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
 361             TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
 362             RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
 363             while(propsVectorsIter.next(propsVectorsResult)){
 364                 set.add(propsVectorsResult.start);
 365             }
 366         }
 367     }
 368 
 369 }