1 /*
   2  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  * Copyright (C) 1996-2014, International Business Machines Corporation and
  29  * others. All Rights Reserved.
  30  ******************************************************************************
  31  */
  32 
  33 package jdk.internal.icu.impl;
  34 
  35 import jdk.internal.icu.lang.UCharacter;
  36 import jdk.internal.icu.text.UTF16;
  37 
  38 import java.io.DataInputStream;
  39 import java.io.InputStream;
  40 import java.io.IOException;
  41 
  42 /**
  43  * <p>A trie is a kind of compressed, serializable table of values
  44  * associated with Unicode code points (0..0x10ffff).</p>
  45  * <p>This class defines the basic structure of a trie and provides methods
  46  * to <b>retrieve the offsets to the actual data</b>.</p>
  47  * <p>Data will be the form of an array of basic types, char or int.</p>
  48  * <p>The actual data format will have to be specified by the user in the
  49  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
  50  * <p>This trie implementation is optimized for getting offset while walking
  51  * forward through a UTF-16 string.
  52  * Therefore, the simplest and fastest access macros are the
  53  * fromLead() and fromOffsetTrail() methods.
  54  * The fromBMP() method are a little more complicated; they get offsets even
  55  * for lead surrogate codepoints, while the fromLead() method get special
  56  * "folded" offsets for lead surrogate code units if there is relevant data
  57  * associated with them.
  58  * From such a folded offsets, an offset needs to be extracted to supply
  59  * to the fromOffsetTrail() methods.
  60  * To handle such supplementary codepoints, some offset information are kept
  61  * in the data.</p>
  62  * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
  63  * that offset from the folded value for the lead surrogate unit.</p>
  64  * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
  65  * com.ibm.icu.impl.IntTrie.</p>
  66  * @author synwee
  67  * @see com.ibm.icu.impl.CharTrie
  68  * @see com.ibm.icu.impl.IntTrie
  69  * @since release 2.1, Jan 01 2002
  70  */
  71 public abstract class Trie
  72 {
  73     // public class declaration ----------------------------------------
  74 
  75     /**
  76     * Character data in com.ibm.impl.Trie have different user-specified format
  77     * for different purposes.
  78     * This interface specifies methods to be implemented in order for
  79     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
  80     * the data.
  81     */
  82     public static interface DataManipulate
  83     {
  84         /**
  85         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
  86         * data
  87         * the index array offset of the indexes for that lead surrogate.
  88         * @param value data value for a surrogate from the trie, including the
  89         *        folding offset
  90         * @return data offset or 0 if there is no data for the lead surrogate
  91         */
  92         public int getFoldingOffset(int value);
  93     }
  94 
  95     // default implementation
  96     private static class DefaultGetFoldingOffset implements DataManipulate {
  97         public int getFoldingOffset(int value) {
  98             return value;
  99         }
 100     }
 101 
 102     // protected constructor -------------------------------------------
 103 
 104     /**
 105     * Trie constructor for CharTrie use.
 106     * @param inputStream ICU data file input stream which contains the
 107     *                        trie
 108     * @param dataManipulate object containing the information to parse the
 109     *                       trie data
 110     * @throws IOException thrown when input stream does not have the
 111     *                        right header.
 112     */
 113     protected Trie(InputStream inputStream,
 114                    DataManipulate  dataManipulate) throws IOException
 115     {
 116         DataInputStream input = new DataInputStream(inputStream);
 117         // Magic number to authenticate the data.
 118         int signature = input.readInt();
 119         m_options_    = input.readInt();
 120 
 121         if (!checkHeader(signature)) {
 122             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
 123         }
 124 
 125         if(dataManipulate != null) {
 126             m_dataManipulate_ = dataManipulate;
 127         } else {
 128             m_dataManipulate_ = new DefaultGetFoldingOffset();
 129         }
 130         m_isLatin1Linear_ = (m_options_ &
 131                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
 132         m_dataOffset_     = input.readInt();
 133         m_dataLength_     = input.readInt();
 134         unserialize(inputStream);
 135     }
 136 
 137     // protected data members ------------------------------------------
 138 
 139     /**
 140      * Lead surrogate code points' index displacement in the index array.
 141      * <pre>{@code
 142      * 0x10000-0xd800=0x2800
 143      * 0x2800 >> INDEX_STAGE_1_SHIFT_
 144      * }</pre>
 145      */
 146     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
 147     /**
 148      * Shift size for shifting right the input index. 1..9
 149      */
 150     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
 151     /**
 152      * Shift size for shifting left the index array values.
 153      * Increases possible data size with 16-bit index values at the cost
 154      * of compactability.
 155      * This requires blocks of stage 2 data to be aligned by
 156      * DATA_GRANULARITY.
 157      * 0..INDEX_STAGE_1_SHIFT
 158      */
 159     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
 160     /**
 161      * Number of data values in a stage 2 (data array) block.
 162      */
 163     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
 164     /**
 165      * Mask for getting the lower bits from the input index.
 166      * DATA_BLOCK_LENGTH - 1.
 167      */
 168     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
 169     /**
 170      * Surrogate mask to use when shifting offset to retrieve supplementary
 171      * values
 172      */
 173     protected static final int SURROGATE_MASK_ = 0x3FF;
 174     /**
 175      * Index or UTF16 characters
 176      */
 177     protected char m_index_[];
 178     /**
 179      * Internal TrieValue which handles the parsing of the data value.
 180      * This class is to be implemented by the user
 181      */
 182     protected DataManipulate m_dataManipulate_;
 183     /**
 184      * Start index of the data portion of the trie. CharTrie combines
 185      * index and data into a char array, so this is used to indicate the
 186      * initial offset to the data portion.
 187      * Note this index always points to the initial value.
 188      */
 189     protected int m_dataOffset_;
 190     /**
 191      * Length of the data array
 192      */
 193     protected int m_dataLength_;
 194 
 195     // protected methods -----------------------------------------------
 196 
 197     /**
 198     * Gets the offset to the data which the surrogate pair points to.
 199     * @param lead lead surrogate
 200     * @param trail trailing surrogate
 201     * @return offset to data
 202     */
 203     protected abstract int getSurrogateOffset(char lead, char trail);
 204 
 205     /**
 206     * Gets the offset to the data which the index ch after variable offset
 207     * points to.
 208     * Note for locating a non-supplementary character data offset, calling
 209     * <p>
 210     * getRawOffset(0, ch);
 211     * </p>
 212     * will do. Otherwise if it is a supplementary character formed by
 213     * surrogates lead and trail. Then we would have to call getRawOffset()
 214     * with getFoldingIndexOffset(). See getSurrogateOffset().
 215     * @param offset index offset which ch is to start from
 216     * @param ch index to be used after offset
 217     * @return offset to the data
 218     */
 219     protected final int getRawOffset(int offset, char ch)
 220     {
 221         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
 222                 << INDEX_STAGE_2_SHIFT_)
 223                 + (ch & INDEX_STAGE_3_MASK_);
 224     }
 225 
 226     /**
 227     * Gets the offset to data which the BMP character points to
 228     * Treats a lead surrogate as a normal code point.
 229     * @param ch BMP character
 230     * @return offset to data
 231     */
 232     protected final int getBMPOffset(char ch)
 233     {
 234         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
 235                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
 236                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
 237                 : getRawOffset(0, ch);
 238                 // using a getRawOffset(ch) makes no diff
 239     }
 240 
 241     /**
 242     * Gets the offset to the data which this lead surrogate character points
 243     * to.
 244     * Data at the returned offset may contain folding offset information for
 245     * the next trailing surrogate character.
 246     * @param ch lead surrogate character
 247     * @return offset to data
 248     */
 249     protected final int getLeadOffset(char ch)
 250     {
 251        return getRawOffset(0, ch);
 252     }
 253 
 254     /**
 255      * Internal trie getter from a code point.
 256      * Could be faster(?) but longer with
 257      * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
 258      * Gets the offset to data which the codepoint points to
 259      * @param ch codepoint
 260      * @return offset to data
 261      */
 262     protected final int getCodePointOffset(int ch)
 263     {
 264         // if ((ch >> 16) == 0) slower
 265         if (ch < 0) {
 266             return -1;
 267         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 268             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
 269             return getRawOffset(0, (char)ch);
 270         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
 271             // BMP codepoint
 272             return getBMPOffset((char)ch);
 273         } else if (ch <= UCharacter.MAX_VALUE) {
 274             // look at the construction of supplementary characters
 275             // trail forms the ends of it.
 276             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
 277                                       (char)(ch & SURROGATE_MASK_));
 278         } else {
 279             // return -1 if there is an error, in this case we return
 280             return -1;
 281         }
 282     }
 283 
 284     /**
 285     * <p>Parses the inputstream and creates the trie index with it.</p>
 286     * <p>This is overwritten by the child classes.
 287     * @param inputStream input stream containing the trie information
 288     * @exception IOException thrown when data reading fails.
 289     */
 290     protected void unserialize(InputStream inputStream) throws IOException
 291     {
 292         //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
 293         m_index_              = new char[m_dataOffset_];
 294         DataInputStream input = new DataInputStream(inputStream);
 295         for (int i = 0; i < m_dataOffset_; i ++) {
 296              m_index_[i] = input.readChar();
 297         }
 298     }
 299 
 300     /**
 301     * Determines if this is a 16 bit trie
 302     * @return true if this is a 16 bit trie
 303     */
 304     protected final boolean isCharTrie()
 305     {
 306         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
 307     }
 308 
 309     // private data members --------------------------------------------
 310 
 311     /**
 312      * Latin 1 option mask
 313      */
 314     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
 315     /**
 316     * Constant number to authenticate the byte block
 317     */
 318     protected static final int HEADER_SIGNATURE_ = 0x54726965;
 319     /**
 320     * Header option formatting
 321     */
 322     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
 323     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
 324     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
 325 
 326     /**
 327      * Flag indicator for Latin quick access data block
 328      */
 329     private boolean m_isLatin1Linear_;
 330 
 331     /**
 332      * <p>Trie options field.</p>
 333      * <p>options bit field:<br>
 334      * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
 335      * 8  0 = 16-bit data, 1=32-bit data<br>
 336      * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
 337      * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
 338      */
 339     private int m_options_;
 340 
 341     // private methods ---------------------------------------------------
 342 
 343     /**
 344      * Authenticates raw data header.
 345      * Checking the header information, signature and options.
 346      * @param signature This contains the options and type of a Trie
 347      * @return true if the header is authenticated valid
 348      */
 349     private final boolean checkHeader(int signature)
 350     {
 351         // check the signature
 352         // Trie in big-endian US-ASCII (0x54726965).
 353         // Magic number to authenticate the data.
 354         if (signature != HEADER_SIGNATURE_) {
 355             return false;
 356         }
 357 
 358         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
 359                                                     INDEX_STAGE_1_SHIFT_ ||
 360             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
 361                                                 HEADER_OPTIONS_SHIFT_MASK_)
 362                                                  != INDEX_STAGE_2_SHIFT_) {
 363             return false;
 364         }
 365         return true;
 366     }
 367 }