< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java

Print this page


   1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */

  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.DataInputStream;
  40 import java.io.InputStream;
  41 import java.io.IOException;
  42 
  43 /**
  44  * <p>A trie is a kind of compressed, serializable table of values
  45  * associated with Unicode code points (0..0x10ffff).</p>
  46  * <p>This class defines the basic structure of a trie and provides methods
  47  * to <b>retrieve the offsets to the actual data</b>.</p>
  48  * <p>Data will be the form of an array of basic types, char or int.</p>
  49  * <p>The actual data format will have to be specified by the user in the
  50  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
  51  * <p>This trie implementation is optimized for getting offset while walking
  52  * forward through a UTF-16 string.
  53  * Therefore, the simplest and fastest access macros are the
  54  * fromLead() and fromOffsetTrail() methods.


 118         // Magic number to authenticate the data.
 119         int signature = input.readInt();
 120         m_options_    = input.readInt();
 121 
 122         if (!checkHeader(signature)) {
 123             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
 124         }
 125 
 126         if(dataManipulate != null) {
 127             m_dataManipulate_ = dataManipulate;
 128         } else {
 129             m_dataManipulate_ = new DefaultGetFoldingOffset();
 130         }
 131         m_isLatin1Linear_ = (m_options_ &
 132                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 133         m_dataOffset_     = input.readInt();
 134         m_dataLength_     = input.readInt();
 135         unserialize(inputStream);
 136     }
 137 
 138     /**
 139     * Trie constructor
 140     * @param index array to be used for index
 141     * @param options used by the trie
 142     * @param dataManipulate object containing the information to parse the
 143     *                       trie data
 144     */
 145     protected Trie(char index[], int options, DataManipulate dataManipulate)
 146     {
 147         m_options_ = options;
 148         if(dataManipulate != null) {
 149             m_dataManipulate_ = dataManipulate;
 150         } else {
 151             m_dataManipulate_ = new DefaultGetFoldingOffset();
 152         }
 153         m_isLatin1Linear_ = (m_options_ &
 154                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 155         m_index_ = index;
 156         m_dataOffset_ = m_index_.length;
 157     }
 158 
 159     // protected data members ------------------------------------------
 160 
 161     /**
 162     * Lead surrogate code points' index displacement in the index array.
 163     * <pre>{@code
 164     * 0x10000-0xd800=0x2800
 165     * 0x2800 >> INDEX_STAGE_1_SHIFT_
 166     * }</pre>
 167     */
 168     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
 169     /**
 170     * Shift size for shifting right the input index. 1..9
 171     */
 172     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
 173     /**
 174     * Shift size for shifting left the index array values.
 175     * Increases possible data size with 16-bit index values at the cost
 176     * of compactability.
 177     * This requires blocks of stage 2 data to be aligned by
 178     * DATA_GRANULARITY.
 179     * 0..INDEX_STAGE_1_SHIFT
 180     */
 181     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
 182     /**
 183      * Number of data values in a stage 2 (data array) block.
 184      */
 185     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
 186     /**
 187     * Mask for getting the lower bits from the input index.
 188     * DATA_BLOCK_LENGTH - 1.
 189     */
 190     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
 191     /** Number of bits of a trail surrogate that are used in index table lookups. */
 192     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
 193     /**
 194      * Number of index (stage 1) entries per lead surrogate.
 195      * Same as number of index entries for 1024 trail surrogates,
 196      * {@code ==0x400>>INDEX_STAGE_1_SHIFT_}
 197      */
 198     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
 199     /** Length of the BMP portion of the index (stage 1) array. */
 200     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
 201     /**
 202     * Surrogate mask to use when shifting offset to retrieve supplementary
 203     * values
 204     */
 205     protected static final int SURROGATE_MASK_ = 0x3FF;
 206     /**
 207     * Index or UTF16 characters
 208     */
 209     protected char m_index_[];
 210     /**
 211     * Internal TrieValue which handles the parsing of the data value.
 212     * This class is to be implemented by the user
 213     */
 214     protected DataManipulate m_dataManipulate_;
 215     /**
 216     * Start index of the data portion of the trie. CharTrie combines
 217     * index and data into a char array, so this is used to indicate the
 218     * initial offset to the data portion.
 219     * Note this index always points to the initial value.
 220     */
 221     protected int m_dataOffset_;
 222     /**
 223     * Length of the data array
 224     */
 225     protected int m_dataLength_;
 226 
 227     // protected methods -----------------------------------------------
 228 
 229     /**
 230     * Gets the offset to the data which the surrogate pair points to.
 231     * @param lead lead surrogate
 232     * @param trail trailing surrogate
 233     * @return offset to data
 234     */
 235     protected abstract int getSurrogateOffset(char lead, char trail);
 236 
 237     /**
 238     * Gets the value at the argument index
 239     * @param index value at index will be retrieved
 240     * @return 32 bit value
 241     */
 242     protected abstract int getValue(int index);
 243 
 244     /**
 245     * Gets the default initial value
 246     * @return 32 bit value
 247     */
 248     protected abstract int getInitialValue();
 249 
 250     /**
 251     * Gets the offset to the data which the index ch after variable offset
 252     * points to.
 253     * Note for locating a non-supplementary character data offset, calling
 254     * <p>
 255     * getRawOffset(0, ch);
 256     * </p>
 257     * will do. Otherwise if it is a supplementary character formed by
 258     * surrogates lead and trail. Then we would have to call getRawOffset()
 259     * with getFoldingIndexOffset(). See getSurrogateOffset().
 260     * @param offset index offset which ch is to start from
 261     * @param ch index to be used after offset
 262     * @return offset to the data
 263     */
 264     protected final int getRawOffset(int offset, char ch)
 265     {
 266         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
 267                 << INDEX_STAGE_2_SHIFT_)
 268                 + (ch & INDEX_STAGE_3_MASK_);
 269     }
 270 


 304     * @param ch codepoint
 305     * @return offset to data
 306     */
 307     protected final int getCodePointOffset(int ch)
 308     {
 309         // if ((ch >> 16) == 0) slower
 310         if (ch < 0) {
 311             return -1;
 312         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 313             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
 314             return getRawOffset(0, (char)ch);
 315         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
 316             // BMP codepoint
 317             return getBMPOffset((char)ch);
 318         } else if (ch <= UCharacter.MAX_VALUE) {
 319             // look at the construction of supplementary characters
 320             // trail forms the ends of it.
 321             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
 322                                       (char)(ch & SURROGATE_MASK_));
 323         } else {
 324             // return -1 // if there is an error, in this case we return
 325             return -1;
 326         }
 327     }
 328 
 329     /**
 330     * <p>Parses the inputstream and creates the trie index with it.</p>
 331     * <p>This is overwritten by the child classes.
 332     * @param inputStream input stream containing the trie information
 333     * @exception IOException thrown when data reading fails.
 334     */
 335     protected void unserialize(InputStream inputStream) throws IOException
 336     {
 337         //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
 338         m_index_              = new char[m_dataOffset_];
 339         DataInputStream input = new DataInputStream(inputStream);
 340         for (int i = 0; i < m_dataOffset_; i ++) {
 341              m_index_[i] = input.readChar();
 342         }
 343     }
 344 
 345     /**
 346     * Determines if this is a 32 bit trie
 347     * @return true if options specifies this is a 32 bit trie
 348     */
 349     protected final boolean isIntTrie()
 350     {
 351         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
 352     }
 353 
 354     /**
 355     * Determines if this is a 16 bit trie
 356     * @return true if this is a 16 bit trie
 357     */
 358     protected final boolean isCharTrie()
 359     {
 360         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
 361     }
 362 
 363     // private data members --------------------------------------------
 364 
 365     /**
 366     * Latin 1 option mask
 367     */
 368     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
 369     /**
 370     * Constant number to authenticate the byte block
 371     */
 372     protected static final int HEADER_SIGNATURE_ = 0x54726965;
 373     /**
 374     * Header option formatting


   1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  * Copyright (C) 1996-2014, International Business Machines Corporation and
  29  * others. All Rights Reserved.
  30  ******************************************************************************





  31  */
  32 
  33 package sun.text.normalizer;
  34 
  35 import java.io.DataInputStream;
  36 import java.io.InputStream;
  37 import java.io.IOException;
  38 
  39 /**
  40  * <p>A trie is a kind of compressed, serializable table of values
  41  * associated with Unicode code points (0..0x10ffff).</p>
  42  * <p>This class defines the basic structure of a trie and provides methods
  43  * to <b>retrieve the offsets to the actual data</b>.</p>
  44  * <p>Data will be the form of an array of basic types, char or int.</p>
  45  * <p>The actual data format will have to be specified by the user in the
  46  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
  47  * <p>This trie implementation is optimized for getting offset while walking
  48  * forward through a UTF-16 string.
  49  * Therefore, the simplest and fastest access macros are the
  50  * fromLead() and fromOffsetTrail() methods.


 114         // Magic number to authenticate the data.
 115         int signature = input.readInt();
 116         m_options_    = input.readInt();
 117 
 118         if (!checkHeader(signature)) {
 119             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
 120         }
 121 
 122         if(dataManipulate != null) {
 123             m_dataManipulate_ = dataManipulate;
 124         } else {
 125             m_dataManipulate_ = new DefaultGetFoldingOffset();
 126         }
 127         m_isLatin1Linear_ = (m_options_ &
 128                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 129         m_dataOffset_     = input.readInt();
 130         m_dataLength_     = input.readInt();
 131         unserialize(inputStream);
 132     }
 133 





















 134     // protected data members ------------------------------------------
 135 
 136     /**
 137      * Lead surrogate code points' index displacement in the index array.
 138      * <pre>{@code
 139      * 0x10000-0xd800=0x2800
 140      * 0x2800 >> INDEX_STAGE_1_SHIFT_
 141      * }</pre>
 142      */
 143     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
 144     /**
 145      * Shift size for shifting right the input index. 1..9
 146      */
 147     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
 148     /**
 149      * Shift size for shifting left the index array values.
 150      * Increases possible data size with 16-bit index values at the cost
 151      * of compactability.
 152      * This requires blocks of stage 2 data to be aligned by
 153      * DATA_GRANULARITY.
 154      * 0..INDEX_STAGE_1_SHIFT
 155      */
 156     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
 157     /**
 158      * Number of data values in a stage 2 (data array) block.
 159      */
 160     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
 161     /**
 162      * Mask for getting the lower bits from the input index.
 163      * DATA_BLOCK_LENGTH - 1.
 164      */
 165     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;










 166     /**
 167      * Surrogate mask to use when shifting offset to retrieve supplementary
 168      * values
 169      */
 170     protected static final int SURROGATE_MASK_ = 0x3FF;
 171     /**
 172      * Index or UTF16 characters
 173      */
 174     protected char m_index_[];
 175     /**
 176      * Internal TrieValue which handles the parsing of the data value.
 177      * This class is to be implemented by the user
 178      */
 179     protected DataManipulate m_dataManipulate_;
 180     /**
 181      * Start index of the data portion of the trie. CharTrie combines
 182      * index and data into a char array, so this is used to indicate the
 183      * initial offset to the data portion.
 184      * Note this index always points to the initial value.
 185      */
 186     protected int m_dataOffset_;
 187     /**
 188      * Length of the data array
 189      */
 190     protected int m_dataLength_;
 191 
 192     // protected methods -----------------------------------------------
 193 
 194     /**
 195     * Gets the offset to the data which the surrogate pair points to.
 196     * @param lead lead surrogate
 197     * @param trail trailing surrogate
 198     * @return offset to data
 199     */
 200     protected abstract int getSurrogateOffset(char lead, char trail);
 201 
 202     /**













 203     * Gets the offset to the data which the index ch after variable offset
 204     * points to.
 205     * Note for locating a non-supplementary character data offset, calling
 206     * <p>
 207     * getRawOffset(0, ch);
 208     * </p>
 209     * will do. Otherwise if it is a supplementary character formed by
 210     * surrogates lead and trail. Then we would have to call getRawOffset()
 211     * with getFoldingIndexOffset(). See getSurrogateOffset().
 212     * @param offset index offset which ch is to start from
 213     * @param ch index to be used after offset
 214     * @return offset to the data
 215     */
 216     protected final int getRawOffset(int offset, char ch)
 217     {
 218         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
 219                 << INDEX_STAGE_2_SHIFT_)
 220                 + (ch & INDEX_STAGE_3_MASK_);
 221     }
 222 


 256      * @param ch codepoint
 257      * @return offset to data
 258      */
 259     protected final int getCodePointOffset(int ch)
 260     {
 261         // if ((ch >> 16) == 0) slower
 262         if (ch < 0) {
 263             return -1;
 264         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 265             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
 266             return getRawOffset(0, (char)ch);
 267         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
 268             // BMP codepoint
 269             return getBMPOffset((char)ch);
 270         } else if (ch <= UCharacter.MAX_VALUE) {
 271             // look at the construction of supplementary characters
 272             // trail forms the ends of it.
 273             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
 274                                       (char)(ch & SURROGATE_MASK_));
 275         } else {
 276             // return -1 if there is an error, in this case we return
 277             return -1;
 278         }
 279     }
 280 
 281     /**
 282     * <p>Parses the inputstream and creates the trie index with it.</p>
 283     * <p>This is overwritten by the child classes.
 284     * @param inputStream input stream containing the trie information
 285     * @exception IOException thrown when data reading fails.
 286     */
 287     protected void unserialize(InputStream inputStream) throws IOException
 288     {
 289         //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
 290         m_index_              = new char[m_dataOffset_];
 291         DataInputStream input = new DataInputStream(inputStream);
 292         for (int i = 0; i < m_dataOffset_; i ++) {
 293              m_index_[i] = input.readChar();
 294         }
 295     }
 296 
 297     /**









 298     * Determines if this is a 16 bit trie
 299     * @return true if this is a 16 bit trie
 300     */
 301     protected final boolean isCharTrie()
 302     {
 303         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
 304     }
 305 
 306     // private data members --------------------------------------------
 307 
 308     /**
 309      * Latin 1 option mask
 310      */
 311     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
 312     /**
 313     * Constant number to authenticate the byte block
 314     */
 315     protected static final int HEADER_SIGNATURE_ = 0x54726965;
 316     /**
 317     * Header option formatting


< prev index next >