1 /* 2 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ****************************************************************************** 28 * Copyright (C) 1996-2014, International Business Machines Corporation and 29 * others. All Rights Reserved. 30 ****************************************************************************** 31 */ 32 33 package jdk.internal.icu.impl; 34 35 import jdk.internal.icu.lang.UCharacter; 36 import jdk.internal.icu.text.UTF16; 37 38 import java.io.DataInputStream; 39 import java.io.InputStream; 40 import java.io.IOException; 41 42 /** 43 * <p>A trie is a kind of compressed, serializable table of values 44 * associated with Unicode code points (0..0x10ffff).</p> 45 * <p>This class defines the basic structure of a trie and provides methods 46 * to <b>retrieve the offsets to the actual data</b>.</p> 47 * <p>Data will be the form of an array of basic types, char or int.</p> 48 * <p>The actual data format will have to be specified by the user in the 49 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> 50 * <p>This trie implementation is optimized for getting offset while walking 51 * forward through a UTF-16 string. 52 * Therefore, the simplest and fastest access macros are the 53 * fromLead() and fromOffsetTrail() methods. 54 * The fromBMP() method are a little more complicated; they get offsets even 55 * for lead surrogate codepoints, while the fromLead() method get special 56 * "folded" offsets for lead surrogate code units if there is relevant data 57 * associated with them. 58 * From such a folded offsets, an offset needs to be extracted to supply 59 * to the fromOffsetTrail() methods. 60 * To handle such supplementary codepoints, some offset information are kept 61 * in the data.</p> 62 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve 63 * that offset from the folded value for the lead surrogate unit.</p> 64 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or 65 * com.ibm.icu.impl.IntTrie.</p> 66 * @author synwee 67 * @see com.ibm.icu.impl.CharTrie 68 * @see com.ibm.icu.impl.IntTrie 69 * @since release 2.1, Jan 01 2002 70 */ 71 public abstract class Trie 72 { 73 // public class declaration ---------------------------------------- 74 75 /** 76 * Character data in com.ibm.impl.Trie have different user-specified format 77 * for different purposes. 78 * This interface specifies methods to be implemented in order for 79 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 80 * the data. 81 */ 82 public static interface DataManipulate 83 { 84 /** 85 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's 86 * data 87 * the index array offset of the indexes for that lead surrogate. 88 * @param value data value for a surrogate from the trie, including the 89 * folding offset 90 * @return data offset or 0 if there is no data for the lead surrogate 91 */ 92 public int getFoldingOffset(int value); 93 } 94 95 // default implementation 96 private static class DefaultGetFoldingOffset implements DataManipulate { 97 public int getFoldingOffset(int value) { 98 return value; 99 } 100 } 101 102 // protected constructor ------------------------------------------- 103 104 /** 105 * Trie constructor for CharTrie use. 106 * @param inputStream ICU data file input stream which contains the 107 * trie 108 * @param dataManipulate object containing the information to parse the 109 * trie data 110 * @throws IOException thrown when input stream does not have the 111 * right header. 112 */ 113 protected Trie(InputStream inputStream, 114 DataManipulate dataManipulate) throws IOException 115 { 116 DataInputStream input = new DataInputStream(inputStream); 117 // Magic number to authenticate the data. 118 int signature = input.readInt(); 119 m_options_ = input.readInt(); 120 121 if (!checkHeader(signature)) { 122 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 123 } 124 125 if(dataManipulate != null) { 126 m_dataManipulate_ = dataManipulate; 127 } else { 128 m_dataManipulate_ = new DefaultGetFoldingOffset(); 129 } 130 m_isLatin1Linear_ = (m_options_ & 131 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 132 m_dataOffset_ = input.readInt(); 133 m_dataLength_ = input.readInt(); 134 unserialize(inputStream); 135 } 136 137 // protected data members ------------------------------------------ 138 139 /** 140 * Lead surrogate code points' index displacement in the index array. 141 * <pre>{@code 142 * 0x10000-0xd800=0x2800 143 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 144 * }</pre> 145 */ 146 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 147 /** 148 * Shift size for shifting right the input index. 1..9 149 */ 150 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 151 /** 152 * Shift size for shifting left the index array values. 153 * Increases possible data size with 16-bit index values at the cost 154 * of compactability. 155 * This requires blocks of stage 2 data to be aligned by 156 * DATA_GRANULARITY. 157 * 0..INDEX_STAGE_1_SHIFT 158 */ 159 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 160 /** 161 * Number of data values in a stage 2 (data array) block. 162 */ 163 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 164 /** 165 * Mask for getting the lower bits from the input index. 166 * DATA_BLOCK_LENGTH - 1. 167 */ 168 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 169 /** 170 * Surrogate mask to use when shifting offset to retrieve supplementary 171 * values 172 */ 173 protected static final int SURROGATE_MASK_ = 0x3FF; 174 /** 175 * Index or UTF16 characters 176 */ 177 protected char m_index_[]; 178 /** 179 * Internal TrieValue which handles the parsing of the data value. 180 * This class is to be implemented by the user 181 */ 182 protected DataManipulate m_dataManipulate_; 183 /** 184 * Start index of the data portion of the trie. CharTrie combines 185 * index and data into a char array, so this is used to indicate the 186 * initial offset to the data portion. 187 * Note this index always points to the initial value. 188 */ 189 protected int m_dataOffset_; 190 /** 191 * Length of the data array 192 */ 193 protected int m_dataLength_; 194 195 // protected methods ----------------------------------------------- 196 197 /** 198 * Gets the offset to the data which the surrogate pair points to. 199 * @param lead lead surrogate 200 * @param trail trailing surrogate 201 * @return offset to data 202 */ 203 protected abstract int getSurrogateOffset(char lead, char trail); 204 205 /** 206 * Gets the offset to the data which the index ch after variable offset 207 * points to. 208 * Note for locating a non-supplementary character data offset, calling 209 * <p> 210 * getRawOffset(0, ch); 211 * </p> 212 * will do. Otherwise if it is a supplementary character formed by 213 * surrogates lead and trail. Then we would have to call getRawOffset() 214 * with getFoldingIndexOffset(). See getSurrogateOffset(). 215 * @param offset index offset which ch is to start from 216 * @param ch index to be used after offset 217 * @return offset to the data 218 */ 219 protected final int getRawOffset(int offset, char ch) 220 { 221 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 222 << INDEX_STAGE_2_SHIFT_) 223 + (ch & INDEX_STAGE_3_MASK_); 224 } 225 226 /** 227 * Gets the offset to data which the BMP character points to 228 * Treats a lead surrogate as a normal code point. 229 * @param ch BMP character 230 * @return offset to data 231 */ 232 protected final int getBMPOffset(char ch) 233 { 234 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 235 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 236 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 237 : getRawOffset(0, ch); 238 // using a getRawOffset(ch) makes no diff 239 } 240 241 /** 242 * Gets the offset to the data which this lead surrogate character points 243 * to. 244 * Data at the returned offset may contain folding offset information for 245 * the next trailing surrogate character. 246 * @param ch lead surrogate character 247 * @return offset to data 248 */ 249 protected final int getLeadOffset(char ch) 250 { 251 return getRawOffset(0, ch); 252 } 253 254 /** 255 * Internal trie getter from a code point. 256 * Could be faster(?) but longer with 257 * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} 258 * Gets the offset to data which the codepoint points to 259 * @param ch codepoint 260 * @return offset to data 261 */ 262 protected final int getCodePointOffset(int ch) 263 { 264 // if ((ch >> 16) == 0) slower 265 if (ch < 0) { 266 return -1; 267 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 268 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 269 return getRawOffset(0, (char)ch); 270 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 271 // BMP codepoint 272 return getBMPOffset((char)ch); 273 } else if (ch <= UCharacter.MAX_VALUE) { 274 // look at the construction of supplementary characters 275 // trail forms the ends of it. 276 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 277 (char)(ch & SURROGATE_MASK_)); 278 } else { 279 // return -1 if there is an error, in this case we return 280 return -1; 281 } 282 } 283 284 /** 285 * <p>Parses the inputstream and creates the trie index with it.</p> 286 * <p>This is overwritten by the child classes. 287 * @param inputStream input stream containing the trie information 288 * @exception IOException thrown when data reading fails. 289 */ 290 protected void unserialize(InputStream inputStream) throws IOException 291 { 292 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ 293 m_index_ = new char[m_dataOffset_]; 294 DataInputStream input = new DataInputStream(inputStream); 295 for (int i = 0; i < m_dataOffset_; i ++) { 296 m_index_[i] = input.readChar(); 297 } 298 } 299 300 /** 301 * Determines if this is a 16 bit trie 302 * @return true if this is a 16 bit trie 303 */ 304 protected final boolean isCharTrie() 305 { 306 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 307 } 308 309 // private data members -------------------------------------------- 310 311 /** 312 * Latin 1 option mask 313 */ 314 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 315 /** 316 * Constant number to authenticate the byte block 317 */ 318 protected static final int HEADER_SIGNATURE_ = 0x54726965; 319 /** 320 * Header option formatting 321 */ 322 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 323 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 324 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 325 326 /** 327 * Flag indicator for Latin quick access data block 328 */ 329 private boolean m_isLatin1Linear_; 330 331 /** 332 * <p>Trie options field.</p> 333 * <p>options bit field:<br> 334 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 335 * 8 0 = 16-bit data, 1=32-bit data<br> 336 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 337 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 338 */ 339 private int m_options_; 340 341 // private methods --------------------------------------------------- 342 343 /** 344 * Authenticates raw data header. 345 * Checking the header information, signature and options. 346 * @param signature This contains the options and type of a Trie 347 * @return true if the header is authenticated valid 348 */ 349 private final boolean checkHeader(int signature) 350 { 351 // check the signature 352 // Trie in big-endian US-ASCII (0x54726965). 353 // Magic number to authenticate the data. 354 if (signature != HEADER_SIGNATURE_) { 355 return false; 356 } 357 358 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 359 INDEX_STAGE_1_SHIFT_ || 360 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 361 HEADER_OPTIONS_SHIFT_MASK_) 362 != INDEX_STAGE_2_SHIFT_) { 363 return false; 364 } 365 return true; 366 } 367 }