1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.DataInputStream;
  40 import java.io.InputStream;
  41 import java.io.IOException;
  42 
  43 /**
  44  * <p>A trie is a kind of compressed, serializable table of values
  45  * associated with Unicode code points (0..0x10ffff).</p>
  46  * <p>This class defines the basic structure of a trie and provides methods
  47  * to <b>retrieve the offsets to the actual data</b>.</p>
  48  * <p>Data will be the form of an array of basic types, char or int.</p>
  49  * <p>The actual data format will have to be specified by the user in the
  50  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
  51  * <p>This trie implementation is optimized for getting offset while walking
  52  * forward through a UTF-16 string.
  53  * Therefore, the simplest and fastest access macros are the
  54  * fromLead() and fromOffsetTrail() methods.
  55  * The fromBMP() method are a little more complicated; they get offsets even
  56  * for lead surrogate codepoints, while the fromLead() method get special
  57  * "folded" offsets for lead surrogate code units if there is relevant data
  58  * associated with them.
  59  * From such a folded offsets, an offset needs to be extracted to supply
  60  * to the fromOffsetTrail() methods.
  61  * To handle such supplementary codepoints, some offset information are kept
  62  * in the data.</p>
  63  * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
  64  * that offset from the folded value for the lead surrogate unit.</p>
  65  * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
  66  * com.ibm.icu.impl.IntTrie.</p>
  67  * @author synwee
  68  * @see com.ibm.icu.impl.CharTrie
  69  * @see com.ibm.icu.impl.IntTrie
  70  * @since release 2.1, Jan 01 2002
  71  */
  72 public abstract class Trie
  73 {
  74     // public class declaration ----------------------------------------
  75 
  76     /**
  77     * Character data in com.ibm.impl.Trie have different user-specified format
  78     * for different purposes.
  79     * This interface specifies methods to be implemented in order for
  80     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
  81     * the data.
  82     */
  83     public static interface DataManipulate
  84     {
  85         /**
  86         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
  87         * data
  88         * the index array offset of the indexes for that lead surrogate.
  89         * @param value data value for a surrogate from the trie, including the
  90         *        folding offset
  91         * @return data offset or 0 if there is no data for the lead surrogate
  92         */
  93         public int getFoldingOffset(int value);
  94     }
  95 
  96     // default implementation
  97     private static class DefaultGetFoldingOffset implements DataManipulate {
  98         public int getFoldingOffset(int value) {
  99             return value;
 100         }
 101     }
 102 
 103     // protected constructor -------------------------------------------
 104 
 105     /**
 106     * Trie constructor for CharTrie use.
 107     * @param inputStream ICU data file input stream which contains the
 108     *                        trie
 109     * @param dataManipulate object containing the information to parse the
 110     *                       trie data
 111     * @throws IOException thrown when input stream does not have the
 112     *                        right header.
 113     */
 114     protected Trie(InputStream inputStream,
 115                    DataManipulate  dataManipulate) throws IOException
 116     {
 117         DataInputStream input = new DataInputStream(inputStream);
 118         // Magic number to authenticate the data.
 119         int signature = input.readInt();
 120         m_options_    = input.readInt();
 121 
 122         if (!checkHeader(signature)) {
 123             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
 124         }
 125 
 126         if(dataManipulate != null) {
 127             m_dataManipulate_ = dataManipulate;
 128         } else {
 129             m_dataManipulate_ = new DefaultGetFoldingOffset();
 130         }
 131         m_isLatin1Linear_ = (m_options_ &
 132                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 133         m_dataOffset_     = input.readInt();
 134         m_dataLength_     = input.readInt();
 135         unserialize(inputStream);
 136     }
 137 
 138     /**
 139     * Trie constructor
 140     * @param index array to be used for index
 141     * @param options used by the trie
 142     * @param dataManipulate object containing the information to parse the
 143     *                       trie data
 144     */
 145     protected Trie(char index[], int options, DataManipulate dataManipulate)
 146     {
 147         m_options_ = options;
 148         if(dataManipulate != null) {
 149             m_dataManipulate_ = dataManipulate;
 150         } else {
 151             m_dataManipulate_ = new DefaultGetFoldingOffset();
 152         }
 153         m_isLatin1Linear_ = (m_options_ &
 154                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 155         m_index_ = index;
 156         m_dataOffset_ = m_index_.length;
 157     }
 158 
 159     // protected data members ------------------------------------------
 160 
 161     /**
 162     * Lead surrogate code points' index displacement in the index array.
 163     * 0x10000-0xd800=0x2800
 164     * 0x2800 >> INDEX_STAGE_1_SHIFT_
 165     */
 166     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
 167     /**
 168     * Shift size for shifting right the input index. 1..9
 169     */
 170     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
 171     /**
 172     * Shift size for shifting left the index array values.
 173     * Increases possible data size with 16-bit index values at the cost
 174     * of compactability.
 175     * This requires blocks of stage 2 data to be aligned by
 176     * DATA_GRANULARITY.
 177     * 0..INDEX_STAGE_1_SHIFT
 178     */
 179     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
 180     /**
 181      * Number of data values in a stage 2 (data array) block.
 182      */
 183     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
 184     /**
 185     * Mask for getting the lower bits from the input index.
 186     * DATA_BLOCK_LENGTH - 1.
 187     */
 188     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
 189     /** Number of bits of a trail surrogate that are used in index table lookups. */
 190     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
 191     /**
 192      * Number of index (stage 1) entries per lead surrogate.
 193      * Same as number of index entries for 1024 trail surrogates,
 194      * ==0x400>>INDEX_STAGE_1_SHIFT_
 195      */
 196     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
 197     /** Length of the BMP portion of the index (stage 1) array. */
 198     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
 199     /**
 200     * Surrogate mask to use when shifting offset to retrieve supplementary
 201     * values
 202     */
 203     protected static final int SURROGATE_MASK_ = 0x3FF;
 204     /**
 205     * Index or UTF16 characters
 206     */
 207     protected char m_index_[];
 208     /**
 209     * Internal TrieValue which handles the parsing of the data value.
 210     * This class is to be implemented by the user
 211     */
 212     protected DataManipulate m_dataManipulate_;
 213     /**
 214     * Start index of the data portion of the trie. CharTrie combines
 215     * index and data into a char array, so this is used to indicate the
 216     * initial offset to the data portion.
 217     * Note this index always points to the initial value.
 218     */
 219     protected int m_dataOffset_;
 220     /**
 221     * Length of the data array
 222     */
 223     protected int m_dataLength_;
 224 
 225     // protected methods -----------------------------------------------
 226 
 227     /**
 228     * Gets the offset to the data which the surrogate pair points to.
 229     * @param lead lead surrogate
 230     * @param trail trailing surrogate
 231     * @return offset to data
 232     */
 233     protected abstract int getSurrogateOffset(char lead, char trail);
 234 
 235     /**
 236     * Gets the value at the argument index
 237     * @param index value at index will be retrieved
 238     * @return 32 bit value
 239     */
 240     protected abstract int getValue(int index);
 241 
 242     /**
 243     * Gets the default initial value
 244     * @return 32 bit value
 245     */
 246     protected abstract int getInitialValue();
 247 
 248     /**
 249     * Gets the offset to the data which the index ch after variable offset
 250     * points to.
 251     * Note for locating a non-supplementary character data offset, calling
 252     * <p>
 253     * getRawOffset(0, ch);
 254     * </p>
 255     * will do. Otherwise if it is a supplementary character formed by
 256     * surrogates lead and trail. Then we would have to call getRawOffset()
 257     * with getFoldingIndexOffset(). See getSurrogateOffset().
 258     * @param offset index offset which ch is to start from
 259     * @param ch index to be used after offset
 260     * @return offset to the data
 261     */
 262     protected final int getRawOffset(int offset, char ch)
 263     {
 264         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
 265                 << INDEX_STAGE_2_SHIFT_)
 266                 + (ch & INDEX_STAGE_3_MASK_);
 267     }
 268 
 269     /**
 270     * Gets the offset to data which the BMP character points to
 271     * Treats a lead surrogate as a normal code point.
 272     * @param ch BMP character
 273     * @return offset to data
 274     */
 275     protected final int getBMPOffset(char ch)
 276     {
 277         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
 278                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
 279                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
 280                 : getRawOffset(0, ch);
 281                 // using a getRawOffset(ch) makes no diff
 282     }
 283 
 284     /**
 285     * Gets the offset to the data which this lead surrogate character points
 286     * to.
 287     * Data at the returned offset may contain folding offset information for
 288     * the next trailing surrogate character.
 289     * @param ch lead surrogate character
 290     * @return offset to data
 291     */
 292     protected final int getLeadOffset(char ch)
 293     {
 294        return getRawOffset(0, ch);
 295     }
 296 
 297     /**
 298     * Internal trie getter from a code point.
 299     * Could be faster(?) but longer with
 300     *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
 301     * Gets the offset to data which the codepoint points to
 302     * @param ch codepoint
 303     * @return offset to data
 304     */
 305     protected final int getCodePointOffset(int ch)
 306     {
 307         // if ((ch >> 16) == 0) slower
 308         if (ch < 0) {
 309             return -1;
 310         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 311             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
 312             return getRawOffset(0, (char)ch);
 313         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
 314             // BMP codepoint
 315             return getBMPOffset((char)ch);
 316         } else if (ch <= UCharacter.MAX_VALUE) {
 317             // look at the construction of supplementary characters
 318             // trail forms the ends of it.
 319             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
 320                                       (char)(ch & SURROGATE_MASK_));
 321         } else {
 322             // return -1 // if there is an error, in this case we return
 323             return -1;
 324         }
 325     }
 326 
 327     /**
 328     * <p>Parses the inputstream and creates the trie index with it.</p>
 329     * <p>This is overwritten by the child classes.
 330     * @param inputStream input stream containing the trie information
 331     * @exception IOException thrown when data reading fails.
 332     */
 333     protected void unserialize(InputStream inputStream) throws IOException
 334     {
 335         //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
 336         m_index_              = new char[m_dataOffset_];
 337         DataInputStream input = new DataInputStream(inputStream);
 338         for (int i = 0; i < m_dataOffset_; i ++) {
 339              m_index_[i] = input.readChar();
 340         }
 341     }
 342 
 343     /**
 344     * Determines if this is a 32 bit trie
 345     * @return true if options specifies this is a 32 bit trie
 346     */
 347     protected final boolean isIntTrie()
 348     {
 349         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
 350     }
 351 
 352     /**
 353     * Determines if this is a 16 bit trie
 354     * @return true if this is a 16 bit trie
 355     */
 356     protected final boolean isCharTrie()
 357     {
 358         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
 359     }
 360 
 361     // private data members --------------------------------------------
 362 
 363     /**
 364     * Latin 1 option mask
 365     */
 366     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
 367     /**
 368     * Constant number to authenticate the byte block
 369     */
 370     protected static final int HEADER_SIGNATURE_ = 0x54726965;
 371     /**
 372     * Header option formatting
 373     */
 374     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
 375     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
 376     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
 377 
 378     /**
 379     * Flag indicator for Latin quick access data block
 380     */
 381     private boolean m_isLatin1Linear_;
 382 
 383     /**
 384     * <p>Trie options field.</p>
 385     * <p>options bit field:<br>
 386     * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
 387     * 8  0 = 16-bit data, 1=32-bit data<br>
 388     * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
 389     * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
 390     */
 391     private int m_options_;
 392 
 393     // private methods ---------------------------------------------------
 394 
 395     /**
 396     * Authenticates raw data header.
 397     * Checking the header information, signature and options.
 398     * @param signature This contains the options and type of a Trie
 399     * @return true if the header is authenticated valid
 400     */
 401     private final boolean checkHeader(int signature)
 402     {
 403         // check the signature
 404         // Trie in big-endian US-ASCII (0x54726965).
 405         // Magic number to authenticate the data.
 406         if (signature != HEADER_SIGNATURE_) {
 407             return false;
 408         }
 409 
 410         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
 411                                                     INDEX_STAGE_1_SHIFT_ ||
 412             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
 413                                                 HEADER_OPTIONS_SHIFT_MASK_)
 414                                                  != INDEX_STAGE_2_SHIFT_) {
 415             return false;
 416         }
 417         return true;
 418     }
 419 }