1 /*
2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 /*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37 package sun.text.normalizer;
38
39 import java.io.DataInputStream;
40 import java.io.InputStream;
41 import java.io.IOException;
42
43 /**
44 * <p>A trie is a kind of compressed, serializable table of values
45 * associated with Unicode code points (0..0x10ffff).</p>
46 * <p>This class defines the basic structure of a trie and provides methods
47 * to <b>retrieve the offsets to the actual data</b>.</p>
48 * <p>Data will be the form of an array of basic types, char or int.</p>
49 * <p>The actual data format will have to be specified by the user in the
50 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
51 * <p>This trie implementation is optimized for getting offset while walking
52 * forward through a UTF-16 string.
53 * Therefore, the simplest and fastest access macros are the
54 * fromLead() and fromOffsetTrail() methods.
118 // Magic number to authenticate the data.
119 int signature = input.readInt();
120 m_options_ = input.readInt();
121
122 if (!checkHeader(signature)) {
123 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
124 }
125
126 if(dataManipulate != null) {
127 m_dataManipulate_ = dataManipulate;
128 } else {
129 m_dataManipulate_ = new DefaultGetFoldingOffset();
130 }
131 m_isLatin1Linear_ = (m_options_ &
132 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
133 m_dataOffset_ = input.readInt();
134 m_dataLength_ = input.readInt();
135 unserialize(inputStream);
136 }
137
138 /**
139 * Trie constructor
140 * @param index array to be used for index
141 * @param options used by the trie
142 * @param dataManipulate object containing the information to parse the
143 * trie data
144 */
145 protected Trie(char index[], int options, DataManipulate dataManipulate)
146 {
147 m_options_ = options;
148 if(dataManipulate != null) {
149 m_dataManipulate_ = dataManipulate;
150 } else {
151 m_dataManipulate_ = new DefaultGetFoldingOffset();
152 }
153 m_isLatin1Linear_ = (m_options_ &
154 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
155 m_index_ = index;
156 m_dataOffset_ = m_index_.length;
157 }
158
159 // protected data members ------------------------------------------
160
161 /**
162 * Lead surrogate code points' index displacement in the index array.
163 * <pre>{@code
164 * 0x10000-0xd800=0x2800
165 * 0x2800 >> INDEX_STAGE_1_SHIFT_
166 * }</pre>
167 */
168 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
169 /**
170 * Shift size for shifting right the input index. 1..9
171 */
172 protected static final int INDEX_STAGE_1_SHIFT_ = 5;
173 /**
174 * Shift size for shifting left the index array values.
175 * Increases possible data size with 16-bit index values at the cost
176 * of compactability.
177 * This requires blocks of stage 2 data to be aligned by
178 * DATA_GRANULARITY.
179 * 0..INDEX_STAGE_1_SHIFT
180 */
181 protected static final int INDEX_STAGE_2_SHIFT_ = 2;
182 /**
183 * Number of data values in a stage 2 (data array) block.
184 */
185 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
186 /**
187 * Mask for getting the lower bits from the input index.
188 * DATA_BLOCK_LENGTH - 1.
189 */
190 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
191 /** Number of bits of a trail surrogate that are used in index table lookups. */
192 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
193 /**
194 * Number of index (stage 1) entries per lead surrogate.
195 * Same as number of index entries for 1024 trail surrogates,
196 * {@code ==0x400>>INDEX_STAGE_1_SHIFT_}
197 */
198 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
199 /** Length of the BMP portion of the index (stage 1) array. */
200 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
201 /**
202 * Surrogate mask to use when shifting offset to retrieve supplementary
203 * values
204 */
205 protected static final int SURROGATE_MASK_ = 0x3FF;
206 /**
207 * Index or UTF16 characters
208 */
209 protected char m_index_[];
210 /**
211 * Internal TrieValue which handles the parsing of the data value.
212 * This class is to be implemented by the user
213 */
214 protected DataManipulate m_dataManipulate_;
215 /**
216 * Start index of the data portion of the trie. CharTrie combines
217 * index and data into a char array, so this is used to indicate the
218 * initial offset to the data portion.
219 * Note this index always points to the initial value.
220 */
221 protected int m_dataOffset_;
222 /**
223 * Length of the data array
224 */
225 protected int m_dataLength_;
226
227 // protected methods -----------------------------------------------
228
229 /**
230 * Gets the offset to the data which the surrogate pair points to.
231 * @param lead lead surrogate
232 * @param trail trailing surrogate
233 * @return offset to data
234 */
235 protected abstract int getSurrogateOffset(char lead, char trail);
236
237 /**
238 * Gets the value at the argument index
239 * @param index value at index will be retrieved
240 * @return 32 bit value
241 */
242 protected abstract int getValue(int index);
243
244 /**
245 * Gets the default initial value
246 * @return 32 bit value
247 */
248 protected abstract int getInitialValue();
249
250 /**
251 * Gets the offset to the data which the index ch after variable offset
252 * points to.
253 * Note for locating a non-supplementary character data offset, calling
254 * <p>
255 * getRawOffset(0, ch);
256 * </p>
257 * will do. Otherwise if it is a supplementary character formed by
258 * surrogates lead and trail. Then we would have to call getRawOffset()
259 * with getFoldingIndexOffset(). See getSurrogateOffset().
260 * @param offset index offset which ch is to start from
261 * @param ch index to be used after offset
262 * @return offset to the data
263 */
264 protected final int getRawOffset(int offset, char ch)
265 {
266 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
267 << INDEX_STAGE_2_SHIFT_)
268 + (ch & INDEX_STAGE_3_MASK_);
269 }
270
304 * @param ch codepoint
305 * @return offset to data
306 */
307 protected final int getCodePointOffset(int ch)
308 {
309 // if ((ch >> 16) == 0) slower
310 if (ch < 0) {
311 return -1;
312 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
313 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
314 return getRawOffset(0, (char)ch);
315 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
316 // BMP codepoint
317 return getBMPOffset((char)ch);
318 } else if (ch <= UCharacter.MAX_VALUE) {
319 // look at the construction of supplementary characters
320 // trail forms the ends of it.
321 return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
322 (char)(ch & SURROGATE_MASK_));
323 } else {
324 // return -1 // if there is an error, in this case we return
325 return -1;
326 }
327 }
328
329 /**
330 * <p>Parses the inputstream and creates the trie index with it.</p>
331 * <p>This is overwritten by the child classes.
332 * @param inputStream input stream containing the trie information
333 * @exception IOException thrown when data reading fails.
334 */
335 protected void unserialize(InputStream inputStream) throws IOException
336 {
337 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
338 m_index_ = new char[m_dataOffset_];
339 DataInputStream input = new DataInputStream(inputStream);
340 for (int i = 0; i < m_dataOffset_; i ++) {
341 m_index_[i] = input.readChar();
342 }
343 }
344
345 /**
346 * Determines if this is a 32 bit trie
347 * @return true if options specifies this is a 32 bit trie
348 */
349 protected final boolean isIntTrie()
350 {
351 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
352 }
353
354 /**
355 * Determines if this is a 16 bit trie
356 * @return true if this is a 16 bit trie
357 */
358 protected final boolean isCharTrie()
359 {
360 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
361 }
362
363 // private data members --------------------------------------------
364
365 /**
366 * Latin 1 option mask
367 */
368 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
369 /**
370 * Constant number to authenticate the byte block
371 */
372 protected static final int HEADER_SIGNATURE_ = 0x54726965;
373 /**
374 * Header option formatting
|
1 /*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 ******************************************************************************
28 * Copyright (C) 1996-2014, International Business Machines Corporation and
29 * others. All Rights Reserved.
30 ******************************************************************************
31 */
32
33 package sun.text.normalizer;
34
35 import java.io.DataInputStream;
36 import java.io.InputStream;
37 import java.io.IOException;
38
39 /**
40 * <p>A trie is a kind of compressed, serializable table of values
41 * associated with Unicode code points (0..0x10ffff).</p>
42 * <p>This class defines the basic structure of a trie and provides methods
43 * to <b>retrieve the offsets to the actual data</b>.</p>
44 * <p>Data will be the form of an array of basic types, char or int.</p>
45 * <p>The actual data format will have to be specified by the user in the
46 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
47 * <p>This trie implementation is optimized for getting offset while walking
48 * forward through a UTF-16 string.
49 * Therefore, the simplest and fastest access macros are the
50 * fromLead() and fromOffsetTrail() methods.
114 // Magic number to authenticate the data.
115 int signature = input.readInt();
116 m_options_ = input.readInt();
117
118 if (!checkHeader(signature)) {
119 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
120 }
121
122 if(dataManipulate != null) {
123 m_dataManipulate_ = dataManipulate;
124 } else {
125 m_dataManipulate_ = new DefaultGetFoldingOffset();
126 }
127 m_isLatin1Linear_ = (m_options_ &
128 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
129 m_dataOffset_ = input.readInt();
130 m_dataLength_ = input.readInt();
131 unserialize(inputStream);
132 }
133
134 // protected data members ------------------------------------------
135
136 /**
137 * Lead surrogate code points' index displacement in the index array.
138 * <pre>{@code
139 * 0x10000-0xd800=0x2800
140 * 0x2800 >> INDEX_STAGE_1_SHIFT_
141 * }</pre>
142 */
143 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
144 /**
145 * Shift size for shifting right the input index. 1..9
146 */
147 protected static final int INDEX_STAGE_1_SHIFT_ = 5;
148 /**
149 * Shift size for shifting left the index array values.
150 * Increases possible data size with 16-bit index values at the cost
151 * of compactability.
152 * This requires blocks of stage 2 data to be aligned by
153 * DATA_GRANULARITY.
154 * 0..INDEX_STAGE_1_SHIFT
155 */
156 protected static final int INDEX_STAGE_2_SHIFT_ = 2;
157 /**
158 * Number of data values in a stage 2 (data array) block.
159 */
160 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
161 /**
162 * Mask for getting the lower bits from the input index.
163 * DATA_BLOCK_LENGTH - 1.
164 */
165 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
166 /**
167 * Surrogate mask to use when shifting offset to retrieve supplementary
168 * values
169 */
170 protected static final int SURROGATE_MASK_ = 0x3FF;
171 /**
172 * Index or UTF16 characters
173 */
174 protected char m_index_[];
175 /**
176 * Internal TrieValue which handles the parsing of the data value.
177 * This class is to be implemented by the user
178 */
179 protected DataManipulate m_dataManipulate_;
180 /**
181 * Start index of the data portion of the trie. CharTrie combines
182 * index and data into a char array, so this is used to indicate the
183 * initial offset to the data portion.
184 * Note this index always points to the initial value.
185 */
186 protected int m_dataOffset_;
187 /**
188 * Length of the data array
189 */
190 protected int m_dataLength_;
191
192 // protected methods -----------------------------------------------
193
194 /**
195 * Gets the offset to the data which the surrogate pair points to.
196 * @param lead lead surrogate
197 * @param trail trailing surrogate
198 * @return offset to data
199 */
200 protected abstract int getSurrogateOffset(char lead, char trail);
201
202 /**
203 * Gets the offset to the data which the index ch after variable offset
204 * points to.
205 * Note for locating a non-supplementary character data offset, calling
206 * <p>
207 * getRawOffset(0, ch);
208 * </p>
209 * will do. Otherwise if it is a supplementary character formed by
210 * surrogates lead and trail. Then we would have to call getRawOffset()
211 * with getFoldingIndexOffset(). See getSurrogateOffset().
212 * @param offset index offset which ch is to start from
213 * @param ch index to be used after offset
214 * @return offset to the data
215 */
216 protected final int getRawOffset(int offset, char ch)
217 {
218 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
219 << INDEX_STAGE_2_SHIFT_)
220 + (ch & INDEX_STAGE_3_MASK_);
221 }
222
256 * @param ch codepoint
257 * @return offset to data
258 */
259 protected final int getCodePointOffset(int ch)
260 {
261 // if ((ch >> 16) == 0) slower
262 if (ch < 0) {
263 return -1;
264 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
265 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
266 return getRawOffset(0, (char)ch);
267 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
268 // BMP codepoint
269 return getBMPOffset((char)ch);
270 } else if (ch <= UCharacter.MAX_VALUE) {
271 // look at the construction of supplementary characters
272 // trail forms the ends of it.
273 return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
274 (char)(ch & SURROGATE_MASK_));
275 } else {
276 // return -1 if there is an error, in this case we return
277 return -1;
278 }
279 }
280
281 /**
282 * <p>Parses the inputstream and creates the trie index with it.</p>
283 * <p>This is overwritten by the child classes.
284 * @param inputStream input stream containing the trie information
285 * @exception IOException thrown when data reading fails.
286 */
287 protected void unserialize(InputStream inputStream) throws IOException
288 {
289 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
290 m_index_ = new char[m_dataOffset_];
291 DataInputStream input = new DataInputStream(inputStream);
292 for (int i = 0; i < m_dataOffset_; i ++) {
293 m_index_[i] = input.readChar();
294 }
295 }
296
297 /**
298 * Determines if this is a 16 bit trie
299 * @return true if this is a 16 bit trie
300 */
301 protected final boolean isCharTrie()
302 {
303 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
304 }
305
306 // private data members --------------------------------------------
307
308 /**
309 * Latin 1 option mask
310 */
311 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
312 /**
313 * Constant number to authenticate the byte block
314 */
315 protected static final int HEADER_SIGNATURE_ = 0x54726965;
316 /**
317 * Header option formatting
|