1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 import java.io.DataInputStream; 40 import java.io.InputStream; 41 import java.io.IOException; 42 43 /** 44 * <p>A trie is a kind of compressed, serializable table of values 45 * associated with Unicode code points (0..0x10ffff).</p> 46 * <p>This class defines the basic structure of a trie and provides methods 47 * to <b>retrieve the offsets to the actual data</b>.</p> 48 * <p>Data will be the form of an array of basic types, char or int.</p> 49 * <p>The actual data format will have to be specified by the user in the 50 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> 51 * <p>This trie implementation is optimized for getting offset while walking 52 * forward through a UTF-16 string. 53 * Therefore, the simplest and fastest access macros are the 54 * fromLead() and fromOffsetTrail() methods. 55 * The fromBMP() method are a little more complicated; they get offsets even 56 * for lead surrogate codepoints, while the fromLead() method get special 57 * "folded" offsets for lead surrogate code units if there is relevant data 58 * associated with them. 59 * From such a folded offsets, an offset needs to be extracted to supply 60 * to the fromOffsetTrail() methods. 61 * To handle such supplementary codepoints, some offset information are kept 62 * in the data.</p> 63 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve 64 * that offset from the folded value for the lead surrogate unit.</p> 65 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or 66 * com.ibm.icu.impl.IntTrie.</p> 67 * @author synwee 68 * @see com.ibm.icu.impl.CharTrie 69 * @see com.ibm.icu.impl.IntTrie 70 * @since release 2.1, Jan 01 2002 71 */ 72 public abstract class Trie 73 { 74 // public class declaration ---------------------------------------- 75 76 /** 77 * Character data in com.ibm.impl.Trie have different user-specified format 78 * for different purposes. 79 * This interface specifies methods to be implemented in order for 80 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 81 * the data. 82 */ 83 public static interface DataManipulate 84 { 85 /** 86 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's 87 * data 88 * the index array offset of the indexes for that lead surrogate. 89 * @param value data value for a surrogate from the trie, including the 90 * folding offset 91 * @return data offset or 0 if there is no data for the lead surrogate 92 */ 93 public int getFoldingOffset(int value); 94 } 95 96 // default implementation 97 private static class DefaultGetFoldingOffset implements DataManipulate { 98 public int getFoldingOffset(int value) { 99 return value; 100 } 101 } 102 103 // protected constructor ------------------------------------------- 104 105 /** 106 * Trie constructor for CharTrie use. 107 * @param inputStream ICU data file input stream which contains the 108 * trie 109 * @param dataManipulate object containing the information to parse the 110 * trie data 111 * @throws IOException thrown when input stream does not have the 112 * right header. 113 */ 114 protected Trie(InputStream inputStream, 115 DataManipulate dataManipulate) throws IOException 116 { 117 DataInputStream input = new DataInputStream(inputStream); 118 // Magic number to authenticate the data. 119 int signature = input.readInt(); 120 m_options_ = input.readInt(); 121 122 if (!checkHeader(signature)) { 123 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 124 } 125 126 if(dataManipulate != null) { 127 m_dataManipulate_ = dataManipulate; 128 } else { 129 m_dataManipulate_ = new DefaultGetFoldingOffset(); 130 } 131 m_isLatin1Linear_ = (m_options_ & 132 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 133 m_dataOffset_ = input.readInt(); 134 m_dataLength_ = input.readInt(); 135 unserialize(inputStream); 136 } 137 138 /** 139 * Trie constructor 140 * @param index array to be used for index 141 * @param options used by the trie 142 * @param dataManipulate object containing the information to parse the 143 * trie data 144 */ 145 protected Trie(char index[], int options, DataManipulate dataManipulate) 146 { 147 m_options_ = options; 148 if(dataManipulate != null) { 149 m_dataManipulate_ = dataManipulate; 150 } else { 151 m_dataManipulate_ = new DefaultGetFoldingOffset(); 152 } 153 m_isLatin1Linear_ = (m_options_ & 154 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 155 m_index_ = index; 156 m_dataOffset_ = m_index_.length; 157 } 158 159 // protected data members ------------------------------------------ 160 161 /** 162 * Lead surrogate code points' index displacement in the index array. 163 * <pre>{@code 164 * 0x10000-0xd800=0x2800 165 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 166 * }</pre> 167 */ 168 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 169 /** 170 * Shift size for shifting right the input index. 1..9 171 */ 172 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 173 /** 174 * Shift size for shifting left the index array values. 175 * Increases possible data size with 16-bit index values at the cost 176 * of compactability. 177 * This requires blocks of stage 2 data to be aligned by 178 * DATA_GRANULARITY. 179 * 0..INDEX_STAGE_1_SHIFT 180 */ 181 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 182 /** 183 * Number of data values in a stage 2 (data array) block. 184 */ 185 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 186 /** 187 * Mask for getting the lower bits from the input index. 188 * DATA_BLOCK_LENGTH - 1. 189 */ 190 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 191 /** Number of bits of a trail surrogate that are used in index table lookups. */ 192 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_; 193 /** 194 * Number of index (stage 1) entries per lead surrogate. 195 * Same as number of index entries for 1024 trail surrogates, 196 * {@code ==0x400>>INDEX_STAGE_1_SHIFT_} 197 */ 198 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS); 199 /** Length of the BMP portion of the index (stage 1) array. */ 200 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_; 201 /** 202 * Surrogate mask to use when shifting offset to retrieve supplementary 203 * values 204 */ 205 protected static final int SURROGATE_MASK_ = 0x3FF; 206 /** 207 * Index or UTF16 characters 208 */ 209 protected char m_index_[]; 210 /** 211 * Internal TrieValue which handles the parsing of the data value. 212 * This class is to be implemented by the user 213 */ 214 protected DataManipulate m_dataManipulate_; 215 /** 216 * Start index of the data portion of the trie. CharTrie combines 217 * index and data into a char array, so this is used to indicate the 218 * initial offset to the data portion. 219 * Note this index always points to the initial value. 220 */ 221 protected int m_dataOffset_; 222 /** 223 * Length of the data array 224 */ 225 protected int m_dataLength_; 226 227 // protected methods ----------------------------------------------- 228 229 /** 230 * Gets the offset to the data which the surrogate pair points to. 231 * @param lead lead surrogate 232 * @param trail trailing surrogate 233 * @return offset to data 234 */ 235 protected abstract int getSurrogateOffset(char lead, char trail); 236 237 /** 238 * Gets the value at the argument index 239 * @param index value at index will be retrieved 240 * @return 32 bit value 241 */ 242 protected abstract int getValue(int index); 243 244 /** 245 * Gets the default initial value 246 * @return 32 bit value 247 */ 248 protected abstract int getInitialValue(); 249 250 /** 251 * Gets the offset to the data which the index ch after variable offset 252 * points to. 253 * Note for locating a non-supplementary character data offset, calling 254 * <p> 255 * getRawOffset(0, ch); 256 * </p> 257 * will do. Otherwise if it is a supplementary character formed by 258 * surrogates lead and trail. Then we would have to call getRawOffset() 259 * with getFoldingIndexOffset(). See getSurrogateOffset(). 260 * @param offset index offset which ch is to start from 261 * @param ch index to be used after offset 262 * @return offset to the data 263 */ 264 protected final int getRawOffset(int offset, char ch) 265 { 266 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 267 << INDEX_STAGE_2_SHIFT_) 268 + (ch & INDEX_STAGE_3_MASK_); 269 } 270 271 /** 272 * Gets the offset to data which the BMP character points to 273 * Treats a lead surrogate as a normal code point. 274 * @param ch BMP character 275 * @return offset to data 276 */ 277 protected final int getBMPOffset(char ch) 278 { 279 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 280 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 281 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 282 : getRawOffset(0, ch); 283 // using a getRawOffset(ch) makes no diff 284 } 285 286 /** 287 * Gets the offset to the data which this lead surrogate character points 288 * to. 289 * Data at the returned offset may contain folding offset information for 290 * the next trailing surrogate character. 291 * @param ch lead surrogate character 292 * @return offset to data 293 */ 294 protected final int getLeadOffset(char ch) 295 { 296 return getRawOffset(0, ch); 297 } 298 299 /** 300 * Internal trie getter from a code point. 301 * Could be faster(?) but longer with 302 * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} 303 * Gets the offset to data which the codepoint points to 304 * @param ch codepoint 305 * @return offset to data 306 */ 307 protected final int getCodePointOffset(int ch) 308 { 309 // if ((ch >> 16) == 0) slower 310 if (ch < 0) { 311 return -1; 312 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 313 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 314 return getRawOffset(0, (char)ch); 315 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 316 // BMP codepoint 317 return getBMPOffset((char)ch); 318 } else if (ch <= UCharacter.MAX_VALUE) { 319 // look at the construction of supplementary characters 320 // trail forms the ends of it. 321 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 322 (char)(ch & SURROGATE_MASK_)); 323 } else { 324 // return -1 // if there is an error, in this case we return 325 return -1; 326 } 327 } 328 329 /** 330 * <p>Parses the inputstream and creates the trie index with it.</p> 331 * <p>This is overwritten by the child classes. 332 * @param inputStream input stream containing the trie information 333 * @exception IOException thrown when data reading fails. 334 */ 335 protected void unserialize(InputStream inputStream) throws IOException 336 { 337 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ 338 m_index_ = new char[m_dataOffset_]; 339 DataInputStream input = new DataInputStream(inputStream); 340 for (int i = 0; i < m_dataOffset_; i ++) { 341 m_index_[i] = input.readChar(); 342 } 343 } 344 345 /** 346 * Determines if this is a 32 bit trie 347 * @return true if options specifies this is a 32 bit trie 348 */ 349 protected final boolean isIntTrie() 350 { 351 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; 352 } 353 354 /** 355 * Determines if this is a 16 bit trie 356 * @return true if this is a 16 bit trie 357 */ 358 protected final boolean isCharTrie() 359 { 360 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 361 } 362 363 // private data members -------------------------------------------- 364 365 /** 366 * Latin 1 option mask 367 */ 368 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 369 /** 370 * Constant number to authenticate the byte block 371 */ 372 protected static final int HEADER_SIGNATURE_ = 0x54726965; 373 /** 374 * Header option formatting 375 */ 376 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 377 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 378 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 379 380 /** 381 * Flag indicator for Latin quick access data block 382 */ 383 private boolean m_isLatin1Linear_; 384 385 /** 386 * <p>Trie options field.</p> 387 * <p>options bit field:<br> 388 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 389 * 8 0 = 16-bit data, 1=32-bit data<br> 390 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 391 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 392 */ 393 private int m_options_; 394 395 // private methods --------------------------------------------------- 396 397 /** 398 * Authenticates raw data header. 399 * Checking the header information, signature and options. 400 * @param signature This contains the options and type of a Trie 401 * @return true if the header is authenticated valid 402 */ 403 private final boolean checkHeader(int signature) 404 { 405 // check the signature 406 // Trie in big-endian US-ASCII (0x54726965). 407 // Magic number to authenticate the data. 408 if (signature != HEADER_SIGNATURE_) { 409 return false; 410 } 411 412 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 413 INDEX_STAGE_1_SHIFT_ || 414 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 415 HEADER_OPTIONS_SHIFT_MASK_) 416 != INDEX_STAGE_2_SHIFT_) { 417 return false; 418 } 419 return true; 420 } 421 }