1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 /** 40 * <p>Class enabling iteration of the values in a Trie.</p> 41 * <p>Result of each iteration contains the interval of codepoints that have 42 * the same value type and the value type itself.</p> 43 * <p>The comparison of each codepoint value is done via extract(), which the 44 * default implementation is to return the value as it is.</p> 45 * <p>Method extract() can be overwritten to perform manipulations on 46 * codepoint values in order to perform specialized comparison.</p> 47 * <p>TrieIterator is designed to be a generic iterator for the CharTrie 48 * and the IntTrie, hence to accommodate both types of data, the return 49 * result will be in terms of int (32 bit) values.</p> 50 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p> 51 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> 52 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java 53 * sense, the caller will have to pass a object with a callback function 54 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, 55 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of 56 * codepoints with the same value as determined by 57 * UTrieEnumValue(const void *context, uint32_t value). for each range, 58 * utrie_enum calls the callback function to perform a task. In this way, 59 * icu4c performs the iteration within utrie_enum. 60 * To follow the JDK model, icu4j is slightly different from icu4c. 61 * Instead of requesting the caller to implement an object for a callback. 62 * The caller will have to implement a subclass of TrieIterator, fleshing out 63 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, 64 * the caller will have to code his own iteration and flesh out the task 65 * (equivalent to UTrieEnumRange) to be performed in the iteration loop. 66 * </p> 67 * <p>There are basically 3 usage scenarios for porting:</p> 68 * <p>1) UTrieEnumValue is the only implemented callback then just implement a 69 * subclass of TrieIterator and override the extract(int) method. The 70 * extract(int) method is analogus to UTrieEnumValue callback. 71 * </p> 72 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement 73 * a subclass of TrieIterator, override the extract method and iterate, e.g 74 * </p> 75 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, 76 * set);<br> 77 * In Java :<br> 78 * <pre> 79 * class TrieIteratorImpl extends TrieIterator{ 80 * public TrieIteratorImpl(Trie data){ 81 * super(data); 82 * } 83 * public int extract(int value){ 84 * // port the implementation of _enumPropertyStartsValue here 85 * } 86 * } 87 * .... 88 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); 89 * while(fcdIter.next(result)) { 90 * // port the implementation of _enumPropertyStartsRange 91 * } 92 * </pre> 93 * </p> 94 * <p>3) UTrieEnumRange is the only implemented callback then just implement 95 * the while loop, when utrie_enum is called 96 * <pre> 97 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); 98 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); 99 * while(fcdIter.next(result)){ 100 * set.add(result.start); 101 * } 102 * </pre> 103 * </p> 104 * @author synwee 105 * @see com.ibm.icu.impl.Trie 106 * @see com.ibm.icu.lang.UCharacterTypeIterator 107 * @since release 2.1, Jan 17 2002 108 */ 109 public class TrieIterator implements RangeValueIterator 110 { 111 112 // public constructor --------------------------------------------- 113 114 /** 115 * TrieEnumeration constructor 116 * @param trie to be used 117 * @exception IllegalArgumentException throw when argument is null. 118 */ 119 public TrieIterator(Trie trie) 120 { 121 if (trie == null) { 122 throw new IllegalArgumentException( 123 "Argument trie cannot be null"); 124 } 125 m_trie_ = trie; 126 // synwee: check that extract belongs to the child class 127 m_initialValue_ = extract(m_trie_.getInitialValue()); 128 reset(); 129 } 130 131 // public methods ------------------------------------------------- 132 133 /** 134 * <p>Returns true if we are not at the end of the iteration, false 135 * otherwise.</p> 136 * <p>The next set of codepoints with the same value type will be 137 * calculated during this call and returned in the arguement element.</p> 138 * @param element return result 139 * @return true if we are not at the end of the iteration, false otherwise. 140 * @exception NoSuchElementException - if no more elements exist. 141 * @see com.ibm.icu.util.RangeValueIterator.Element 142 */ 143 public final boolean next(Element element) 144 { 145 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { 146 return false; 147 } 148 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && 149 calculateNextBMPElement(element)) { 150 return true; 151 } 152 calculateNextSupplementaryElement(element); 153 return true; 154 } 155 156 /** 157 * Resets the iterator to the beginning of the iteration 158 */ 159 public final void reset() 160 { 161 m_currentCodepoint_ = 0; 162 m_nextCodepoint_ = 0; 163 m_nextIndex_ = 0; 164 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; 165 if (m_nextBlock_ == 0) { 166 m_nextValue_ = m_initialValue_; 167 } 168 else { 169 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); 170 } 171 m_nextBlockIndex_ = 0; 172 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; 173 } 174 175 // protected methods ---------------------------------------------- 176 177 /** 178 * Called by next() to extracts a 32 bit value from a trie value 179 * used for comparison. 180 * This method is to be overwritten if special manipulation is to be done 181 * to retrieve a relevant comparison. 182 * The default function is to return the value as it is. 183 * @param value a value from the trie 184 * @return extracted value 185 */ 186 protected int extract(int value) 187 { 188 return value; 189 } 190 191 // private methods ------------------------------------------------ 192 193 /** 194 * Set the result values 195 * @param element return result object 196 * @param start codepoint of range 197 * @param limit (end + 1) codepoint of range 198 * @param value common value of range 199 */ 200 private final void setResult(Element element, int start, int limit, 201 int value) 202 { 203 element.start = start; 204 element.limit = limit; 205 element.value = value; 206 } 207 208 /** 209 * Finding the next element. 210 * This method is called just before returning the result of 211 * next(). 212 * We always store the next element before it is requested. 213 * In the case that we have to continue calculations into the 214 * supplementary planes, a false will be returned. 215 * @param element return result object 216 * @return true if the next range is found, false if we have to proceed to 217 * the supplementary range. 218 */ 219 private final boolean calculateNextBMPElement(Element element) 220 { 221 int currentBlock = m_nextBlock_; 222 int currentValue = m_nextValue_; 223 m_currentCodepoint_ = m_nextCodepoint_; 224 m_nextCodepoint_ ++; 225 m_nextBlockIndex_ ++; 226 if (!checkBlockDetail(currentValue)) { 227 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 228 currentValue); 229 return true; 230 } 231 // synwee check that next block index == 0 here 232 // enumerate BMP - the main loop enumerates data blocks 233 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { 234 m_nextIndex_ ++; 235 // because of the way the character is split to form the index 236 // the lead surrogate and trail surrogate can not be in the 237 // mid of a block 238 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { 239 // skip lead surrogate code units, 240 // go to lead surrogate codepoints 241 m_nextIndex_ = BMP_INDEX_LENGTH_; 242 } 243 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { 244 // go back to regular BMP code points 245 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; 246 } 247 248 m_nextBlockIndex_ = 0; 249 if (!checkBlock(currentBlock, currentValue)) { 250 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 251 currentValue); 252 return true; 253 } 254 } 255 m_nextCodepoint_ --; // step one back since this value has not been 256 m_nextBlockIndex_ --; // retrieved yet. 257 return false; 258 } 259 260 /** 261 * Finds the next supplementary element. 262 * For each entry in the trie, the value to be delivered is passed through 263 * extract(). 264 * We always store the next element before it is requested. 265 * Called after calculateNextBMP() completes its round of BMP characters. 266 * There is a slight difference in the usage of m_currentCodepoint_ 267 * here as compared to calculateNextBMP(). Though both represents the 268 * lower bound of the next element, in calculateNextBMP() it gets set 269 * at the start of any loop, where-else, in calculateNextSupplementary() 270 * since m_currentCodepoint_ already contains the lower bound of the 271 * next element (passed down from calculateNextBMP()), we keep it till 272 * the end before resetting it to the new value. 273 * Note, if there are no more iterations, it will never get to here. 274 * Blocked out by next(). 275 * @param element return result object 276 */ 277 private final void calculateNextSupplementaryElement(Element element) 278 { 279 int currentValue = m_nextValue_; 280 int currentBlock = m_nextBlock_; 281 m_nextCodepoint_ ++; 282 m_nextBlockIndex_ ++; 283 284 if (UTF16.getTrailSurrogate(m_nextCodepoint_) 285 != UTF16.TRAIL_SURROGATE_MIN_VALUE) { 286 // this piece is only called when we are in the middle of a lead 287 // surrogate block 288 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { 289 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 290 currentValue); 291 m_currentCodepoint_ = m_nextCodepoint_; 292 return; 293 } 294 // we have cleared one block 295 m_nextIndex_ ++; 296 m_nextTrailIndexOffset_ ++; 297 if (!checkTrailBlock(currentBlock, currentValue)) { 298 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 299 currentValue); 300 m_currentCodepoint_ = m_nextCodepoint_; 301 return; 302 } 303 } 304 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 305 // enumerate supplementary code points 306 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { 307 // lead surrogate access 308 int leadBlock = 309 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 310 Trie.INDEX_STAGE_2_SHIFT_; 311 if (leadBlock == m_trie_.m_dataOffset_) { 312 // no entries for a whole block of lead surrogates 313 if (currentValue != m_initialValue_) { 314 m_nextValue_ = m_initialValue_; 315 m_nextBlock_ = 0; 316 m_nextBlockIndex_ = 0; 317 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 318 currentValue); 319 m_currentCodepoint_ = m_nextCodepoint_; 320 return; 321 } 322 323 nextLead += DATA_BLOCK_LENGTH_; 324 // number of total affected supplementary codepoints in one 325 // block 326 // this is not a simple addition of 327 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider 328 // that we might have moved some of the codepoints 329 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary( 330 (char)nextLead, 331 (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); 332 continue; 333 } 334 if (m_trie_.m_dataManipulate_ == null) { 335 throw new NullPointerException( 336 "The field DataManipulate in this Trie is null"); 337 } 338 // enumerate trail surrogates for this lead surrogate 339 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 340 m_trie_.getValue(leadBlock + 341 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 342 if (m_nextIndex_ <= 0) { 343 // no data for this lead surrogate 344 if (currentValue != m_initialValue_) { 345 m_nextValue_ = m_initialValue_; 346 m_nextBlock_ = 0; 347 m_nextBlockIndex_ = 0; 348 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 349 currentValue); 350 m_currentCodepoint_ = m_nextCodepoint_; 351 return; 352 } 353 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; 354 } else { 355 m_nextTrailIndexOffset_ = 0; 356 if (!checkTrailBlock(currentBlock, currentValue)) { 357 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 358 currentValue); 359 m_currentCodepoint_ = m_nextCodepoint_; 360 return; 361 } 362 } 363 nextLead ++; 364 } 365 366 // deliver last range 367 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, 368 currentValue); 369 } 370 371 /** 372 * Internal block value calculations 373 * Performs calculations on a data block to find codepoints in m_nextBlock_ 374 * after the index m_nextBlockIndex_ that has the same value. 375 * Note m_*_ variables at this point is the next codepoint whose value 376 * has not been calculated. 377 * But when returned with false, it will be the last codepoint whose 378 * value has been calculated. 379 * @param currentValue the value which other codepoints are tested against 380 * @return true if the whole block has the same value as currentValue or if 381 * the whole block has been calculated, false otherwise. 382 */ 383 private final boolean checkBlockDetail(int currentValue) 384 { 385 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { 386 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + 387 m_nextBlockIndex_)); 388 if (m_nextValue_ != currentValue) { 389 return false; 390 } 391 ++ m_nextBlockIndex_; 392 ++ m_nextCodepoint_; 393 } 394 return true; 395 } 396 397 /** 398 * Internal block value calculations 399 * Performs calculations on a data block to find codepoints in m_nextBlock_ 400 * that has the same value. 401 * Will call checkBlockDetail() if highlevel check fails. 402 * Note m_*_ variables at this point is the next codepoint whose value 403 * has not been calculated. 404 * @param currentBlock the initial block containing all currentValue 405 * @param currentValue the value which other codepoints are tested against 406 * @return true if the whole block has the same value as currentValue or if 407 * the whole block has been calculated, false otherwise. 408 */ 409 private final boolean checkBlock(int currentBlock, int currentValue) 410 { 411 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << 412 Trie.INDEX_STAGE_2_SHIFT_; 413 if (m_nextBlock_ == currentBlock && 414 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { 415 // the block is the same as the previous one, filled with 416 // currentValue 417 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 418 } 419 else if (m_nextBlock_ == 0) { 420 // this is the all-initial-value block 421 if (currentValue != m_initialValue_) { 422 m_nextValue_ = m_initialValue_; 423 m_nextBlockIndex_ = 0; 424 return false; 425 } 426 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 427 } 428 else { 429 if (!checkBlockDetail(currentValue)) { 430 return false; 431 } 432 } 433 return true; 434 } 435 436 /** 437 * Internal block value calculations 438 * Performs calculations on multiple data blocks for a set of trail 439 * surrogates to find codepoints in m_nextBlock_ that has the same value. 440 * Will call checkBlock() for internal block checks. 441 * Note m_*_ variables at this point is the next codepoint whose value 442 * has not been calculated. 443 * @param currentBlock the initial block containing all currentValue 444 * @param currentValue the value which other codepoints are tested against 445 * @return true if the whole block has the same value as currentValue or if 446 * the whole block has been calculated, false otherwise. 447 */ 448 private final boolean checkTrailBlock(int currentBlock, 449 int currentValue) 450 { 451 // enumerate code points for this lead surrogate 452 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) 453 { 454 // if we ever reach here, we are at the start of a new block 455 m_nextBlockIndex_ = 0; 456 // copy of most of the body of the BMP loop 457 if (!checkBlock(currentBlock, currentValue)) { 458 return false; 459 } 460 m_nextTrailIndexOffset_ ++; 461 m_nextIndex_ ++; 462 } 463 return true; 464 } 465 466 /** 467 * Checks if we are beginning at the start of a initial block. 468 * If we are then the rest of the codepoints in this initial block 469 * has the same values. 470 * We increment m_nextCodepoint_ and relevant data members if so. 471 * This is used only in for the supplementary codepoints because 472 * the offset to the trail indexes could be 0. 473 * @return true if we are at the start of a initial block. 474 */ 475 private final boolean checkNullNextTrailIndex() 476 { 477 if (m_nextIndex_ <= 0) { 478 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; 479 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 480 int leadBlock = 481 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 482 Trie.INDEX_STAGE_2_SHIFT_; 483 if (m_trie_.m_dataManipulate_ == null) { 484 throw new NullPointerException( 485 "The field DataManipulate in this Trie is null"); 486 } 487 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 488 m_trie_.getValue(leadBlock + 489 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 490 m_nextIndex_ --; 491 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; 492 return true; 493 } 494 return false; 495 } 496 497 // private data members -------------------------------------------- 498 499 /** 500 * Size of the stage 1 BMP indexes 501 */ 502 private static final int BMP_INDEX_LENGTH_ = 503 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; 504 /** 505 * Lead surrogate minimum value 506 */ 507 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; 508 /** 509 * Trail surrogate minimum value 510 */ 511 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; 512 /** 513 * Number of trail surrogate 514 */ 515 private static final int TRAIL_SURROGATE_COUNT_ = 0x400; 516 /** 517 * Number of stage 1 indexes for supplementary calculations that maps to 518 * each lead surrogate character. 519 * See second pass into getRawOffset for the trail surrogate character. 520 * 10 for significant number of bits for trail surrogates, 5 for what we 521 * discard during shifting. 522 */ 523 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 524 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); 525 /** 526 * Number of data values in a stage 2 (data array) block. 527 */ 528 private static final int DATA_BLOCK_LENGTH_ = 529 1 << Trie.INDEX_STAGE_1_SHIFT_; 530 /** 531 * Trie instance 532 */ 533 private Trie m_trie_; 534 /** 535 * Initial value for trie values 536 */ 537 private int m_initialValue_; 538 /** 539 * Next element results and data. 540 */ 541 private int m_currentCodepoint_; 542 private int m_nextCodepoint_; 543 private int m_nextValue_; 544 private int m_nextIndex_; 545 private int m_nextBlock_; 546 private int m_nextBlockIndex_; 547 private int m_nextTrailIndexOffset_; 548 }