1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 /** 40 * Class enabling iteration of the values in a Trie. 41 * <p>Result of each iteration contains the interval of codepoints that have 42 * the same value type and the value type itself. 43 * <p>The comparison of each codepoint value is done via extract(), which the 44 * default implementation is to return the value as it is. 45 * <p>Method extract() can be overwritten to perform manipulations on 46 * codepoint values in order to perform specialized comparison. 47 * <p>TrieIterator is designed to be a generic iterator for the CharTrie 48 * and the IntTrie, hence to accommodate both types of data, the return 49 * result will be in terms of int (32 bit) values. 50 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use. 51 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> 52 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java 53 * sense, the caller will have to pass a object with a callback function 54 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, 55 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of 56 * codepoints with the same value as determined by 57 * UTrieEnumValue(const void *context, uint32_t value). for each range, 58 * utrie_enum calls the callback function to perform a task. In this way, 59 * icu4c performs the iteration within utrie_enum. 60 * To follow the JDK model, icu4j is slightly different from icu4c. 61 * Instead of requesting the caller to implement an object for a callback. 62 * The caller will have to implement a subclass of TrieIterator, fleshing out 63 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, 64 * the caller will have to code his own iteration and flesh out the task 65 * (equivalent to UTrieEnumRange) to be performed in the iteration loop. 66 * 67 * <p>There are basically 3 usage scenarios for porting: 68 * <p>1) UTrieEnumValue is the only implemented callback then just implement a 69 * subclass of TrieIterator and override the extract(int) method. The 70 * extract(int) method is analogus to UTrieEnumValue callback. 71 * 72 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement 73 * a subclass of TrieIterator, override the extract method and iterate, e.g.<br> 74 * {@code utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, 75 * set);}<br> 76 * In Java:<br> 77 * <pre> 78 * class TrieIteratorImpl extends TrieIterator{ 79 * public TrieIteratorImpl(Trie data){ 80 * super(data); 81 * } 82 * public int extract(int value){ 83 * // port the implementation of _enumPropertyStartsValue here 84 * } 85 * } 86 * .... 87 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); 88 * while(fcdIter.next(result)) { 89 * // port the implementation of _enumPropertyStartsRange 90 * } 91 * </pre> 92 * 93 * <p>3) UTrieEnumRange is the only implemented callback then just implement 94 * the while loop, when utrie_enum is called 95 * <pre>{@code 96 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); 97 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); 98 * while(fcdIter.next(result)){ 99 * set.add(result.start); 100 * } 101 * }</pre> 102 * 103 * @author synwee 104 * @see com.ibm.icu.impl.Trie 105 * @see com.ibm.icu.lang.UCharacterTypeIterator 106 * @since release 2.1, Jan 17 2002 107 */ 108 public class TrieIterator implements RangeValueIterator 109 { 110 111 // public constructor --------------------------------------------- 112 113 /** 114 * TrieEnumeration constructor 115 * @param trie to be used 116 * @exception IllegalArgumentException throw when argument is null. 117 */ 118 public TrieIterator(Trie trie) 119 { 120 if (trie == null) { 121 throw new IllegalArgumentException( 122 "Argument trie cannot be null"); 123 } 124 m_trie_ = trie; 125 // synwee: check that extract belongs to the child class 126 m_initialValue_ = extract(m_trie_.getInitialValue()); 127 reset(); 128 } 129 130 // public methods ------------------------------------------------- 131 132 /** 133 * <p>Returns true if we are not at the end of the iteration, false 134 * otherwise.</p> 135 * <p>The next set of codepoints with the same value type will be 136 * calculated during this call and returned in the arguement element.</p> 137 * @param element return result 138 * @return true if we are not at the end of the iteration, false otherwise. 139 * @exception NoSuchElementException - if no more elements exist. 140 * @see com.ibm.icu.util.RangeValueIterator.Element 141 */ 142 public final boolean next(Element element) 143 { 144 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { 145 return false; 146 } 147 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && 148 calculateNextBMPElement(element)) { 149 return true; 150 } 151 calculateNextSupplementaryElement(element); 152 return true; 153 } 154 155 /** 156 * Resets the iterator to the beginning of the iteration 157 */ 158 public final void reset() 159 { 160 m_currentCodepoint_ = 0; 161 m_nextCodepoint_ = 0; 162 m_nextIndex_ = 0; 163 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; 164 if (m_nextBlock_ == 0) { 165 m_nextValue_ = m_initialValue_; 166 } 167 else { 168 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); 169 } 170 m_nextBlockIndex_ = 0; 171 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; 172 } 173 174 // protected methods ---------------------------------------------- 175 176 /** 177 * Called by next() to extracts a 32 bit value from a trie value 178 * used for comparison. 179 * This method is to be overwritten if special manipulation is to be done 180 * to retrieve a relevant comparison. 181 * The default function is to return the value as it is. 182 * @param value a value from the trie 183 * @return extracted value 184 */ 185 protected int extract(int value) 186 { 187 return value; 188 } 189 190 // private methods ------------------------------------------------ 191 192 /** 193 * Set the result values 194 * @param element return result object 195 * @param start codepoint of range 196 * @param limit (end + 1) codepoint of range 197 * @param value common value of range 198 */ 199 private final void setResult(Element element, int start, int limit, 200 int value) 201 { 202 element.start = start; 203 element.limit = limit; 204 element.value = value; 205 } 206 207 /** 208 * Finding the next element. 209 * This method is called just before returning the result of 210 * next(). 211 * We always store the next element before it is requested. 212 * In the case that we have to continue calculations into the 213 * supplementary planes, a false will be returned. 214 * @param element return result object 215 * @return true if the next range is found, false if we have to proceed to 216 * the supplementary range. 217 */ 218 private final boolean calculateNextBMPElement(Element element) 219 { 220 int currentBlock = m_nextBlock_; 221 int currentValue = m_nextValue_; 222 m_currentCodepoint_ = m_nextCodepoint_; 223 m_nextCodepoint_ ++; 224 m_nextBlockIndex_ ++; 225 if (!checkBlockDetail(currentValue)) { 226 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 227 currentValue); 228 return true; 229 } 230 // synwee check that next block index == 0 here 231 // enumerate BMP - the main loop enumerates data blocks 232 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { 233 m_nextIndex_ ++; 234 // because of the way the character is split to form the index 235 // the lead surrogate and trail surrogate can not be in the 236 // mid of a block 237 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { 238 // skip lead surrogate code units, 239 // go to lead surrogate codepoints 240 m_nextIndex_ = BMP_INDEX_LENGTH_; 241 } 242 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { 243 // go back to regular BMP code points 244 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; 245 } 246 247 m_nextBlockIndex_ = 0; 248 if (!checkBlock(currentBlock, currentValue)) { 249 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 250 currentValue); 251 return true; 252 } 253 } 254 m_nextCodepoint_ --; // step one back since this value has not been 255 m_nextBlockIndex_ --; // retrieved yet. 256 return false; 257 } 258 259 /** 260 * Finds the next supplementary element. 261 * For each entry in the trie, the value to be delivered is passed through 262 * extract(). 263 * We always store the next element before it is requested. 264 * Called after calculateNextBMP() completes its round of BMP characters. 265 * There is a slight difference in the usage of m_currentCodepoint_ 266 * here as compared to calculateNextBMP(). Though both represents the 267 * lower bound of the next element, in calculateNextBMP() it gets set 268 * at the start of any loop, where-else, in calculateNextSupplementary() 269 * since m_currentCodepoint_ already contains the lower bound of the 270 * next element (passed down from calculateNextBMP()), we keep it till 271 * the end before resetting it to the new value. 272 * Note, if there are no more iterations, it will never get to here. 273 * Blocked out by next(). 274 * @param element return result object 275 */ 276 private final void calculateNextSupplementaryElement(Element element) 277 { 278 int currentValue = m_nextValue_; 279 int currentBlock = m_nextBlock_; 280 m_nextCodepoint_ ++; 281 m_nextBlockIndex_ ++; 282 283 if (UTF16.getTrailSurrogate(m_nextCodepoint_) 284 != UTF16.TRAIL_SURROGATE_MIN_VALUE) { 285 // this piece is only called when we are in the middle of a lead 286 // surrogate block 287 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { 288 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 289 currentValue); 290 m_currentCodepoint_ = m_nextCodepoint_; 291 return; 292 } 293 // we have cleared one block 294 m_nextIndex_ ++; 295 m_nextTrailIndexOffset_ ++; 296 if (!checkTrailBlock(currentBlock, currentValue)) { 297 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 298 currentValue); 299 m_currentCodepoint_ = m_nextCodepoint_; 300 return; 301 } 302 } 303 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 304 // enumerate supplementary code points 305 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { 306 // lead surrogate access 307 int leadBlock = 308 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 309 Trie.INDEX_STAGE_2_SHIFT_; 310 if (leadBlock == m_trie_.m_dataOffset_) { 311 // no entries for a whole block of lead surrogates 312 if (currentValue != m_initialValue_) { 313 m_nextValue_ = m_initialValue_; 314 m_nextBlock_ = 0; 315 m_nextBlockIndex_ = 0; 316 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 317 currentValue); 318 m_currentCodepoint_ = m_nextCodepoint_; 319 return; 320 } 321 322 nextLead += DATA_BLOCK_LENGTH_; 323 // number of total affected supplementary codepoints in one 324 // block 325 // this is not a simple addition of 326 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider 327 // that we might have moved some of the codepoints 328 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary( 329 (char)nextLead, 330 (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); 331 continue; 332 } 333 if (m_trie_.m_dataManipulate_ == null) { 334 throw new NullPointerException( 335 "The field DataManipulate in this Trie is null"); 336 } 337 // enumerate trail surrogates for this lead surrogate 338 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 339 m_trie_.getValue(leadBlock + 340 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 341 if (m_nextIndex_ <= 0) { 342 // no data for this lead surrogate 343 if (currentValue != m_initialValue_) { 344 m_nextValue_ = m_initialValue_; 345 m_nextBlock_ = 0; 346 m_nextBlockIndex_ = 0; 347 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 348 currentValue); 349 m_currentCodepoint_ = m_nextCodepoint_; 350 return; 351 } 352 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; 353 } else { 354 m_nextTrailIndexOffset_ = 0; 355 if (!checkTrailBlock(currentBlock, currentValue)) { 356 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 357 currentValue); 358 m_currentCodepoint_ = m_nextCodepoint_; 359 return; 360 } 361 } 362 nextLead ++; 363 } 364 365 // deliver last range 366 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, 367 currentValue); 368 } 369 370 /** 371 * Internal block value calculations 372 * Performs calculations on a data block to find codepoints in m_nextBlock_ 373 * after the index m_nextBlockIndex_ that has the same value. 374 * Note m_*_ variables at this point is the next codepoint whose value 375 * has not been calculated. 376 * But when returned with false, it will be the last codepoint whose 377 * value has been calculated. 378 * @param currentValue the value which other codepoints are tested against 379 * @return true if the whole block has the same value as currentValue or if 380 * the whole block has been calculated, false otherwise. 381 */ 382 private final boolean checkBlockDetail(int currentValue) 383 { 384 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { 385 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + 386 m_nextBlockIndex_)); 387 if (m_nextValue_ != currentValue) { 388 return false; 389 } 390 ++ m_nextBlockIndex_; 391 ++ m_nextCodepoint_; 392 } 393 return true; 394 } 395 396 /** 397 * Internal block value calculations 398 * Performs calculations on a data block to find codepoints in m_nextBlock_ 399 * that has the same value. 400 * Will call checkBlockDetail() if highlevel check fails. 401 * Note m_*_ variables at this point is the next codepoint whose value 402 * has not been calculated. 403 * @param currentBlock the initial block containing all currentValue 404 * @param currentValue the value which other codepoints are tested against 405 * @return true if the whole block has the same value as currentValue or if 406 * the whole block has been calculated, false otherwise. 407 */ 408 private final boolean checkBlock(int currentBlock, int currentValue) 409 { 410 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << 411 Trie.INDEX_STAGE_2_SHIFT_; 412 if (m_nextBlock_ == currentBlock && 413 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { 414 // the block is the same as the previous one, filled with 415 // currentValue 416 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 417 } 418 else if (m_nextBlock_ == 0) { 419 // this is the all-initial-value block 420 if (currentValue != m_initialValue_) { 421 m_nextValue_ = m_initialValue_; 422 m_nextBlockIndex_ = 0; 423 return false; 424 } 425 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 426 } 427 else { 428 if (!checkBlockDetail(currentValue)) { 429 return false; 430 } 431 } 432 return true; 433 } 434 435 /** 436 * Internal block value calculations 437 * Performs calculations on multiple data blocks for a set of trail 438 * surrogates to find codepoints in m_nextBlock_ that has the same value. 439 * Will call checkBlock() for internal block checks. 440 * Note m_*_ variables at this point is the next codepoint whose value 441 * has not been calculated. 442 * @param currentBlock the initial block containing all currentValue 443 * @param currentValue the value which other codepoints are tested against 444 * @return true if the whole block has the same value as currentValue or if 445 * the whole block has been calculated, false otherwise. 446 */ 447 private final boolean checkTrailBlock(int currentBlock, 448 int currentValue) 449 { 450 // enumerate code points for this lead surrogate 451 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) 452 { 453 // if we ever reach here, we are at the start of a new block 454 m_nextBlockIndex_ = 0; 455 // copy of most of the body of the BMP loop 456 if (!checkBlock(currentBlock, currentValue)) { 457 return false; 458 } 459 m_nextTrailIndexOffset_ ++; 460 m_nextIndex_ ++; 461 } 462 return true; 463 } 464 465 /** 466 * Checks if we are beginning at the start of a initial block. 467 * If we are then the rest of the codepoints in this initial block 468 * has the same values. 469 * We increment m_nextCodepoint_ and relevant data members if so. 470 * This is used only in for the supplementary codepoints because 471 * the offset to the trail indexes could be 0. 472 * @return true if we are at the start of a initial block. 473 */ 474 private final boolean checkNullNextTrailIndex() 475 { 476 if (m_nextIndex_ <= 0) { 477 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; 478 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 479 int leadBlock = 480 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 481 Trie.INDEX_STAGE_2_SHIFT_; 482 if (m_trie_.m_dataManipulate_ == null) { 483 throw new NullPointerException( 484 "The field DataManipulate in this Trie is null"); 485 } 486 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 487 m_trie_.getValue(leadBlock + 488 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 489 m_nextIndex_ --; 490 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; 491 return true; 492 } 493 return false; 494 } 495 496 // private data members -------------------------------------------- 497 498 /** 499 * Size of the stage 1 BMP indexes 500 */ 501 private static final int BMP_INDEX_LENGTH_ = 502 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; 503 /** 504 * Lead surrogate minimum value 505 */ 506 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; 507 /** 508 * Trail surrogate minimum value 509 */ 510 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; 511 /** 512 * Number of trail surrogate 513 */ 514 private static final int TRAIL_SURROGATE_COUNT_ = 0x400; 515 /** 516 * Number of stage 1 indexes for supplementary calculations that maps to 517 * each lead surrogate character. 518 * See second pass into getRawOffset for the trail surrogate character. 519 * 10 for significant number of bits for trail surrogates, 5 for what we 520 * discard during shifting. 521 */ 522 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 523 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); 524 /** 525 * Number of data values in a stage 2 (data array) block. 526 */ 527 private static final int DATA_BLOCK_LENGTH_ = 528 1 << Trie.INDEX_STAGE_1_SHIFT_; 529 /** 530 * Trie instance 531 */ 532 private Trie m_trie_; 533 /** 534 * Initial value for trie values 535 */ 536 private int m_initialValue_; 537 /** 538 * Next element results and data. 539 */ 540 private int m_currentCodepoint_; 541 private int m_nextCodepoint_; 542 private int m_nextValue_; 543 private int m_nextIndex_; 544 private int m_nextBlock_; 545 private int m_nextBlockIndex_; 546 private int m_nextTrailIndexOffset_; 547 }