1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * Class enabling iteration of the values in a Trie.
  41  * <p>Result of each iteration contains the interval of codepoints that have
  42  * the same value type and the value type itself.
  43  * <p>The comparison of each codepoint value is done via extract(), which the
  44  * default implementation is to return the value as it is.
  45  * <p>Method extract() can be overwritten to perform manipulations on
  46  * codepoint values in order to perform specialized comparison.
  47  * <p>TrieIterator is designed to be a generic iterator for the CharTrie
  48  * and the IntTrie, hence to accommodate both types of data, the return
  49  * result will be in terms of int (32 bit) values.
  50  * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.
  51  * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
  52  * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
  53  * sense, the caller will have to pass a object with a callback function
  54  * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
  55  * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
  56  * codepoints with the same value as determined by
  57  * UTrieEnumValue(const void *context, uint32_t value). for each range,
  58  * utrie_enum calls the callback function to perform a task. In this way,
  59  * icu4c performs the iteration within utrie_enum.
  60  * To follow the JDK model, icu4j is slightly different from icu4c.
  61  * Instead of requesting the caller to implement an object for a callback.
  62  * The caller will have to implement a subclass of TrieIterator, fleshing out
  63  * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
  64  * the caller will have to code his own iteration and flesh out the task
  65  * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
  66  *
  67  * <p>There are basically 3 usage scenarios for porting:
  68  * <p>1) UTrieEnumValue is the only implemented callback then just implement a
  69  * subclass of TrieIterator and override the extract(int) method. The
  70  * extract(int) method is analogus to UTrieEnumValue callback.
  71  *
  72  * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
  73  * a subclass of TrieIterator, override the extract method and iterate, e.g.<br>
  74  * {@code utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
  75  *               set);}<br>
  76  * In Java:<br>
  77  * <pre>
  78  * class TrieIteratorImpl extends TrieIterator{
  79  *     public TrieIteratorImpl(Trie data){
  80  *         super(data);
  81  *     }
  82  *     public int extract(int value){
  83  *         // port the implementation of _enumPropertyStartsValue here
  84  *     }
  85  * }
  86  * ....
  87  * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
  88  * while(fcdIter.next(result)) {
  89  *     // port the implementation of _enumPropertyStartsRange
  90  * }
  91  * </pre>
  92  *
  93  * <p>3) UTrieEnumRange is the only implemented callback then just implement
  94  * the while loop, when utrie_enum is called
  95  * <pre>{@code
  96  * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
  97  * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
  98  * while(fcdIter.next(result)){
  99  *     set.add(result.start);
 100  * }
 101  * }</pre>
 102  *
 103  * @author synwee
 104  * @see com.ibm.icu.impl.Trie
 105  * @see com.ibm.icu.lang.UCharacterTypeIterator
 106  * @since release 2.1, Jan 17 2002
 107  */
 108 public class TrieIterator implements RangeValueIterator
 109 {
 110 
 111     // public constructor ---------------------------------------------
 112 
 113     /**
 114     * TrieEnumeration constructor
 115     * @param trie to be used
 116     * @exception IllegalArgumentException throw when argument is null.
 117     */
 118     public TrieIterator(Trie trie)
 119     {
 120         if (trie == null) {
 121             throw new IllegalArgumentException(
 122                                           "Argument trie cannot be null");
 123         }
 124         m_trie_             = trie;
 125         // synwee: check that extract belongs to the child class
 126         m_initialValue_     = extract(m_trie_.getInitialValue());
 127         reset();
 128     }
 129 
 130     // public methods -------------------------------------------------
 131 
 132     /**
 133     * <p>Returns true if we are not at the end of the iteration, false
 134     * otherwise.</p>
 135     * <p>The next set of codepoints with the same value type will be
 136     * calculated during this call and returned in the arguement element.</p>
 137     * @param element return result
 138     * @return true if we are not at the end of the iteration, false otherwise.
 139     * @exception NoSuchElementException - if no more elements exist.
 140     * @see com.ibm.icu.util.RangeValueIterator.Element
 141     */
 142     public final boolean next(Element element)
 143     {
 144         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
 145             return false;
 146         }
 147         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
 148             calculateNextBMPElement(element)) {
 149             return true;
 150         }
 151         calculateNextSupplementaryElement(element);
 152         return true;
 153     }
 154 
 155     /**
 156     * Resets the iterator to the beginning of the iteration
 157     */
 158     public final void reset()
 159     {
 160         m_currentCodepoint_ = 0;
 161         m_nextCodepoint_    = 0;
 162         m_nextIndex_        = 0;
 163         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
 164         if (m_nextBlock_ == 0) {
 165             m_nextValue_ = m_initialValue_;
 166         }
 167         else {
 168             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
 169         }
 170         m_nextBlockIndex_ = 0;
 171         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
 172     }
 173 
 174     // protected methods ----------------------------------------------
 175 
 176     /**
 177     * Called by next() to extracts a 32 bit value from a trie value
 178     * used for comparison.
 179     * This method is to be overwritten if special manipulation is to be done
 180     * to retrieve a relevant comparison.
 181     * The default function is to return the value as it is.
 182     * @param value a value from the trie
 183     * @return extracted value
 184     */
 185     protected int extract(int value)
 186     {
 187         return value;
 188     }
 189 
 190     // private methods ------------------------------------------------
 191 
 192     /**
 193     * Set the result values
 194     * @param element return result object
 195     * @param start codepoint of range
 196     * @param limit (end + 1) codepoint of range
 197     * @param value common value of range
 198     */
 199     private final void setResult(Element element, int start, int limit,
 200                                  int value)
 201     {
 202         element.start = start;
 203         element.limit = limit;
 204         element.value = value;
 205     }
 206 
 207     /**
 208     * Finding the next element.
 209     * This method is called just before returning the result of
 210     * next().
 211     * We always store the next element before it is requested.
 212     * In the case that we have to continue calculations into the
 213     * supplementary planes, a false will be returned.
 214     * @param element return result object
 215     * @return true if the next range is found, false if we have to proceed to
 216     *         the supplementary range.
 217     */
 218     private final boolean calculateNextBMPElement(Element element)
 219     {
 220         int currentBlock    = m_nextBlock_;
 221         int currentValue    = m_nextValue_;
 222         m_currentCodepoint_ = m_nextCodepoint_;
 223         m_nextCodepoint_ ++;
 224         m_nextBlockIndex_ ++;
 225         if (!checkBlockDetail(currentValue)) {
 226             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 227                       currentValue);
 228             return true;
 229         }
 230         // synwee check that next block index == 0 here
 231         // enumerate BMP - the main loop enumerates data blocks
 232         while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
 233             m_nextIndex_ ++;
 234             // because of the way the character is split to form the index
 235             // the lead surrogate and trail surrogate can not be in the
 236             // mid of a block
 237             if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
 238                 // skip lead surrogate code units,
 239                 // go to lead surrogate codepoints
 240                 m_nextIndex_ = BMP_INDEX_LENGTH_;
 241             }
 242             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
 243                 // go back to regular BMP code points
 244                 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
 245             }
 246 
 247             m_nextBlockIndex_ = 0;
 248             if (!checkBlock(currentBlock, currentValue)) {
 249                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 250                           currentValue);
 251                 return true;
 252             }
 253         }
 254         m_nextCodepoint_ --;   // step one back since this value has not been
 255         m_nextBlockIndex_ --;  // retrieved yet.
 256         return false;
 257     }
 258 
 259     /**
 260     * Finds the next supplementary element.
 261     * For each entry in the trie, the value to be delivered is passed through
 262     * extract().
 263     * We always store the next element before it is requested.
 264     * Called after calculateNextBMP() completes its round of BMP characters.
 265     * There is a slight difference in the usage of m_currentCodepoint_
 266     * here as compared to calculateNextBMP(). Though both represents the
 267     * lower bound of the next element, in calculateNextBMP() it gets set
 268     * at the start of any loop, where-else, in calculateNextSupplementary()
 269     * since m_currentCodepoint_ already contains the lower bound of the
 270     * next element (passed down from calculateNextBMP()), we keep it till
 271     * the end before resetting it to the new value.
 272     * Note, if there are no more iterations, it will never get to here.
 273     * Blocked out by next().
 274     * @param element return result object
 275     */
 276     private final void calculateNextSupplementaryElement(Element element)
 277     {
 278         int currentValue = m_nextValue_;
 279         int currentBlock = m_nextBlock_;
 280         m_nextCodepoint_ ++;
 281         m_nextBlockIndex_ ++;
 282 
 283         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
 284                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
 285             // this piece is only called when we are in the middle of a lead
 286             // surrogate block
 287             if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
 288                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 289                           currentValue);
 290                 m_currentCodepoint_ = m_nextCodepoint_;
 291                 return;
 292             }
 293             // we have cleared one block
 294             m_nextIndex_ ++;
 295             m_nextTrailIndexOffset_ ++;
 296             if (!checkTrailBlock(currentBlock, currentValue)) {
 297                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 298                           currentValue);
 299                 m_currentCodepoint_ = m_nextCodepoint_;
 300                 return;
 301             }
 302         }
 303         int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
 304         // enumerate supplementary code points
 305         while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
 306             // lead surrogate access
 307             int leadBlock =
 308                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
 309                                                    Trie.INDEX_STAGE_2_SHIFT_;
 310             if (leadBlock == m_trie_.m_dataOffset_) {
 311                 // no entries for a whole block of lead surrogates
 312                 if (currentValue != m_initialValue_) {
 313                     m_nextValue_      = m_initialValue_;
 314                     m_nextBlock_      = 0;
 315                     m_nextBlockIndex_ = 0;
 316                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 317                               currentValue);
 318                     m_currentCodepoint_ = m_nextCodepoint_;
 319                     return;
 320                 }
 321 
 322                 nextLead += DATA_BLOCK_LENGTH_;
 323                 // number of total affected supplementary codepoints in one
 324                 // block
 325                 // this is not a simple addition of
 326                 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
 327                 // that we might have moved some of the codepoints
 328                 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
 329                                      (char)nextLead,
 330                                      (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
 331                 continue;
 332             }
 333             if (m_trie_.m_dataManipulate_ == null) {
 334                 throw new NullPointerException(
 335                             "The field DataManipulate in this Trie is null");
 336             }
 337             // enumerate trail surrogates for this lead surrogate
 338             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
 339                                m_trie_.getValue(leadBlock +
 340                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
 341             if (m_nextIndex_ <= 0) {
 342                 // no data for this lead surrogate
 343                 if (currentValue != m_initialValue_) {
 344                     m_nextValue_      = m_initialValue_;
 345                     m_nextBlock_      = 0;
 346                     m_nextBlockIndex_ = 0;
 347                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 348                               currentValue);
 349                     m_currentCodepoint_ = m_nextCodepoint_;
 350                     return;
 351                 }
 352                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
 353             } else {
 354                 m_nextTrailIndexOffset_ = 0;
 355                 if (!checkTrailBlock(currentBlock, currentValue)) {
 356                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 357                               currentValue);
 358                     m_currentCodepoint_ = m_nextCodepoint_;
 359                     return;
 360                 }
 361             }
 362             nextLead ++;
 363          }
 364 
 365          // deliver last range
 366          setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
 367                    currentValue);
 368     }
 369 
 370     /**
 371     * Internal block value calculations
 372     * Performs calculations on a data block to find codepoints in m_nextBlock_
 373     * after the index m_nextBlockIndex_ that has the same value.
 374     * Note m_*_ variables at this point is the next codepoint whose value
 375     * has not been calculated.
 376     * But when returned with false, it will be the last codepoint whose
 377     * value has been calculated.
 378     * @param currentValue the value which other codepoints are tested against
 379     * @return true if the whole block has the same value as currentValue or if
 380     *              the whole block has been calculated, false otherwise.
 381     */
 382     private final boolean checkBlockDetail(int currentValue)
 383     {
 384         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
 385             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
 386                                                     m_nextBlockIndex_));
 387             if (m_nextValue_ != currentValue) {
 388                 return false;
 389             }
 390             ++ m_nextBlockIndex_;
 391             ++ m_nextCodepoint_;
 392         }
 393         return true;
 394     }
 395 
 396     /**
 397     * Internal block value calculations
 398     * Performs calculations on a data block to find codepoints in m_nextBlock_
 399     * that has the same value.
 400     * Will call checkBlockDetail() if highlevel check fails.
 401     * Note m_*_ variables at this point is the next codepoint whose value
 402     * has not been calculated.
 403     * @param currentBlock the initial block containing all currentValue
 404     * @param currentValue the value which other codepoints are tested against
 405     * @return true if the whole block has the same value as currentValue or if
 406     *              the whole block has been calculated, false otherwise.
 407     */
 408     private final boolean checkBlock(int currentBlock, int currentValue)
 409     {
 410         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
 411                                                   Trie.INDEX_STAGE_2_SHIFT_;
 412         if (m_nextBlock_ == currentBlock &&
 413             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
 414             // the block is the same as the previous one, filled with
 415             // currentValue
 416             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
 417         }
 418         else if (m_nextBlock_ == 0) {
 419             // this is the all-initial-value block
 420             if (currentValue != m_initialValue_) {
 421                 m_nextValue_      = m_initialValue_;
 422                 m_nextBlockIndex_ = 0;
 423                 return false;
 424             }
 425             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
 426         }
 427         else {
 428             if (!checkBlockDetail(currentValue)) {
 429                 return false;
 430             }
 431         }
 432         return true;
 433     }
 434 
 435     /**
 436     * Internal block value calculations
 437     * Performs calculations on multiple data blocks for a set of trail
 438     * surrogates to find codepoints in m_nextBlock_ that has the same value.
 439     * Will call checkBlock() for internal block checks.
 440     * Note m_*_ variables at this point is the next codepoint whose value
 441     * has not been calculated.
 442     * @param currentBlock the initial block containing all currentValue
 443     * @param currentValue the value which other codepoints are tested against
 444     * @return true if the whole block has the same value as currentValue or if
 445     *              the whole block has been calculated, false otherwise.
 446     */
 447     private final boolean checkTrailBlock(int currentBlock,
 448                                           int currentValue)
 449     {
 450         // enumerate code points for this lead surrogate
 451         while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
 452         {
 453             // if we ever reach here, we are at the start of a new block
 454             m_nextBlockIndex_ = 0;
 455             // copy of most of the body of the BMP loop
 456             if (!checkBlock(currentBlock, currentValue)) {
 457                 return false;
 458             }
 459             m_nextTrailIndexOffset_ ++;
 460             m_nextIndex_ ++;
 461         }
 462         return true;
 463     }
 464 
 465     /**
 466     * Checks if we are beginning at the start of a initial block.
 467     * If we are then the rest of the codepoints in this initial block
 468     * has the same values.
 469     * We increment m_nextCodepoint_ and relevant data members if so.
 470     * This is used only in for the supplementary codepoints because
 471     * the offset to the trail indexes could be 0.
 472     * @return true if we are at the start of a initial block.
 473     */
 474     private final boolean checkNullNextTrailIndex()
 475     {
 476         if (m_nextIndex_ <= 0) {
 477             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
 478             int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
 479             int leadBlock =
 480                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
 481                                                    Trie.INDEX_STAGE_2_SHIFT_;
 482             if (m_trie_.m_dataManipulate_ == null) {
 483                 throw new NullPointerException(
 484                             "The field DataManipulate in this Trie is null");
 485             }
 486             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
 487                                m_trie_.getValue(leadBlock +
 488                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
 489             m_nextIndex_ --;
 490             m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
 491             return true;
 492         }
 493         return false;
 494     }
 495 
 496     // private data members --------------------------------------------
 497 
 498     /**
 499     * Size of the stage 1 BMP indexes
 500     */
 501     private static final int BMP_INDEX_LENGTH_ =
 502                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
 503     /**
 504     * Lead surrogate minimum value
 505     */
 506     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
 507     /**
 508     * Trail surrogate minimum value
 509     */
 510     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
 511     /**
 512     * Number of trail surrogate
 513     */
 514     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
 515     /**
 516     * Number of stage 1 indexes for supplementary calculations that maps to
 517     * each lead surrogate character.
 518     * See second pass into getRawOffset for the trail surrogate character.
 519     * 10 for significant number of bits for trail surrogates, 5 for what we
 520     * discard during shifting.
 521     */
 522     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
 523                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
 524     /**
 525     * Number of data values in a stage 2 (data array) block.
 526     */
 527     private static final int DATA_BLOCK_LENGTH_ =
 528                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
 529     /**
 530     * Trie instance
 531     */
 532     private Trie m_trie_;
 533     /**
 534     * Initial value for trie values
 535     */
 536     private int m_initialValue_;
 537     /**
 538     * Next element results and data.
 539     */
 540     private int m_currentCodepoint_;
 541     private int m_nextCodepoint_;
 542     private int m_nextValue_;
 543     private int m_nextIndex_;
 544     private int m_nextBlock_;
 545     private int m_nextBlockIndex_;
 546     private int m_nextTrailIndexOffset_;
 547 }