1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 /**
  40  * <p>Class enabling iteration of the values in a Trie.</p>
  41  * <p>Result of each iteration contains the interval of codepoints that have
  42  * the same value type and the value type itself.</p>
  43  * <p>The comparison of each codepoint value is done via extract(), which the
  44  * default implementation is to return the value as it is.</p>
  45  * <p>Method extract() can be overwritten to perform manipulations on
  46  * codepoint values in order to perform specialized comparison.</p>
  47  * <p>TrieIterator is designed to be a generic iterator for the CharTrie
  48  * and the IntTrie, hence to accommodate both types of data, the return
  49  * result will be in terms of int (32 bit) values.</p>
  50  * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
  51  * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
  52  * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
  53  * sense, the caller will have to pass a object with a callback function
  54  * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
  55  * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
  56  * codepoints with the same value as determined by
  57  * UTrieEnumValue(const void *context, uint32_t value). for each range,
  58  * utrie_enum calls the callback function to perform a task. In this way,
  59  * icu4c performs the iteration within utrie_enum.
  60  * To follow the JDK model, icu4j is slightly different from icu4c.
  61  * Instead of requesting the caller to implement an object for a callback.
  62  * The caller will have to implement a subclass of TrieIterator, fleshing out
  63  * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
  64  * the caller will have to code his own iteration and flesh out the task
  65  * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
  66  * </p>
  67  * <p>There are basically 3 usage scenarios for porting:</p>
  68  * <p>1) UTrieEnumValue is the only implemented callback then just implement a
  69  * subclass of TrieIterator and override the extract(int) method. The
  70  * extract(int) method is analogus to UTrieEnumValue callback.
  71  * </p>
  72  * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
  73  * a subclass of TrieIterator, override the extract method and iterate, e.g
  74  * </p>
  75  * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
  76  *               set);<br>
  77  * In Java :<br>
  78  * <pre>
  79  * class TrieIteratorImpl extends TrieIterator{
  80  *     public TrieIteratorImpl(Trie data){
  81  *         super(data);
  82  *     }
  83  *     public int extract(int value){
  84  *         // port the implementation of _enumPropertyStartsValue here
  85  *     }
  86  * }
  87  * ....
  88  * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
  89  * while(fcdIter.next(result)) {
  90  *     // port the implementation of _enumPropertyStartsRange
  91  * }
  92  * </pre>
  93  * </p>
  94  * <p>3) UTrieEnumRange is the only implemented callback then just implement
  95  * the while loop, when utrie_enum is called
  96  * <pre>
  97  * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
  98  * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
  99  * while(fcdIter.next(result)){
 100  *     set.add(result.start);
 101  * }
 102  * </pre>
 103  * </p>
 104  * @author synwee
 105  * @see com.ibm.icu.impl.Trie
 106  * @see com.ibm.icu.lang.UCharacterTypeIterator
 107  * @since release 2.1, Jan 17 2002
 108  */
 109 public class TrieIterator implements RangeValueIterator
 110 {
 111 
 112     // public constructor ---------------------------------------------
 113 
 114     /**
 115     * TrieEnumeration constructor
 116     * @param trie to be used
 117     * @exception IllegalArgumentException throw when argument is null.
 118     */
 119     public TrieIterator(Trie trie)
 120     {
 121         if (trie == null) {
 122             throw new IllegalArgumentException(
 123                                           "Argument trie cannot be null");
 124         }
 125         m_trie_             = trie;
 126         // synwee: check that extract belongs to the child class
 127         m_initialValue_     = extract(m_trie_.getInitialValue());
 128         reset();
 129     }
 130 
 131     // public methods -------------------------------------------------
 132 
 133     /**
 134     * <p>Returns true if we are not at the end of the iteration, false
 135     * otherwise.</p>
 136     * <p>The next set of codepoints with the same value type will be
 137     * calculated during this call and returned in the arguement element.</p>
 138     * @param element return result
 139     * @return true if we are not at the end of the iteration, false otherwise.
 140     * @exception NoSuchElementException - if no more elements exist.
 141     * @see com.ibm.icu.util.RangeValueIterator.Element
 142     */
 143     public final boolean next(Element element)
 144     {
 145         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
 146             return false;
 147         }
 148         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
 149             calculateNextBMPElement(element)) {
 150             return true;
 151         }
 152         calculateNextSupplementaryElement(element);
 153         return true;
 154     }
 155 
 156     /**
 157     * Resets the iterator to the beginning of the iteration
 158     */
 159     public final void reset()
 160     {
 161         m_currentCodepoint_ = 0;
 162         m_nextCodepoint_    = 0;
 163         m_nextIndex_        = 0;
 164         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
 165         if (m_nextBlock_ == 0) {
 166             m_nextValue_ = m_initialValue_;
 167         }
 168         else {
 169             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
 170         }
 171         m_nextBlockIndex_ = 0;
 172         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
 173     }
 174 
 175     // protected methods ----------------------------------------------
 176 
 177     /**
 178     * Called by next() to extracts a 32 bit value from a trie value
 179     * used for comparison.
 180     * This method is to be overwritten if special manipulation is to be done
 181     * to retrieve a relevant comparison.
 182     * The default function is to return the value as it is.
 183     * @param value a value from the trie
 184     * @return extracted value
 185     */
 186     protected int extract(int value)
 187     {
 188         return value;
 189     }
 190 
 191     // private methods ------------------------------------------------
 192 
 193     /**
 194     * Set the result values
 195     * @param element return result object
 196     * @param start codepoint of range
 197     * @param limit (end + 1) codepoint of range
 198     * @param value common value of range
 199     */
 200     private final void setResult(Element element, int start, int limit,
 201                                  int value)
 202     {
 203         element.start = start;
 204         element.limit = limit;
 205         element.value = value;
 206     }
 207 
 208     /**
 209     * Finding the next element.
 210     * This method is called just before returning the result of
 211     * next().
 212     * We always store the next element before it is requested.
 213     * In the case that we have to continue calculations into the
 214     * supplementary planes, a false will be returned.
 215     * @param element return result object
 216     * @return true if the next range is found, false if we have to proceed to
 217     *         the supplementary range.
 218     */
 219     private final boolean calculateNextBMPElement(Element element)
 220     {
 221         int currentBlock    = m_nextBlock_;
 222         int currentValue    = m_nextValue_;
 223         m_currentCodepoint_ = m_nextCodepoint_;
 224         m_nextCodepoint_ ++;
 225         m_nextBlockIndex_ ++;
 226         if (!checkBlockDetail(currentValue)) {
 227             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 228                       currentValue);
 229             return true;
 230         }
 231         // synwee check that next block index == 0 here
 232         // enumerate BMP - the main loop enumerates data blocks
 233         while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
 234             m_nextIndex_ ++;
 235             // because of the way the character is split to form the index
 236             // the lead surrogate and trail surrogate can not be in the
 237             // mid of a block
 238             if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
 239                 // skip lead surrogate code units,
 240                 // go to lead surrogate codepoints
 241                 m_nextIndex_ = BMP_INDEX_LENGTH_;
 242             }
 243             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
 244                 // go back to regular BMP code points
 245                 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
 246             }
 247 
 248             m_nextBlockIndex_ = 0;
 249             if (!checkBlock(currentBlock, currentValue)) {
 250                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 251                           currentValue);
 252                 return true;
 253             }
 254         }
 255         m_nextCodepoint_ --;   // step one back since this value has not been
 256         m_nextBlockIndex_ --;  // retrieved yet.
 257         return false;
 258     }
 259 
 260     /**
 261     * Finds the next supplementary element.
 262     * For each entry in the trie, the value to be delivered is passed through
 263     * extract().
 264     * We always store the next element before it is requested.
 265     * Called after calculateNextBMP() completes its round of BMP characters.
 266     * There is a slight difference in the usage of m_currentCodepoint_
 267     * here as compared to calculateNextBMP(). Though both represents the
 268     * lower bound of the next element, in calculateNextBMP() it gets set
 269     * at the start of any loop, where-else, in calculateNextSupplementary()
 270     * since m_currentCodepoint_ already contains the lower bound of the
 271     * next element (passed down from calculateNextBMP()), we keep it till
 272     * the end before resetting it to the new value.
 273     * Note, if there are no more iterations, it will never get to here.
 274     * Blocked out by next().
 275     * @param element return result object
 276     */
 277     private final void calculateNextSupplementaryElement(Element element)
 278     {
 279         int currentValue = m_nextValue_;
 280         int currentBlock = m_nextBlock_;
 281         m_nextCodepoint_ ++;
 282         m_nextBlockIndex_ ++;
 283 
 284         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
 285                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
 286             // this piece is only called when we are in the middle of a lead
 287             // surrogate block
 288             if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
 289                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 290                           currentValue);
 291                 m_currentCodepoint_ = m_nextCodepoint_;
 292                 return;
 293             }
 294             // we have cleared one block
 295             m_nextIndex_ ++;
 296             m_nextTrailIndexOffset_ ++;
 297             if (!checkTrailBlock(currentBlock, currentValue)) {
 298                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 299                           currentValue);
 300                 m_currentCodepoint_ = m_nextCodepoint_;
 301                 return;
 302             }
 303         }
 304         int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
 305         // enumerate supplementary code points
 306         while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
 307             // lead surrogate access
 308             int leadBlock =
 309                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
 310                                                    Trie.INDEX_STAGE_2_SHIFT_;
 311             if (leadBlock == m_trie_.m_dataOffset_) {
 312                 // no entries for a whole block of lead surrogates
 313                 if (currentValue != m_initialValue_) {
 314                     m_nextValue_      = m_initialValue_;
 315                     m_nextBlock_      = 0;
 316                     m_nextBlockIndex_ = 0;
 317                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 318                               currentValue);
 319                     m_currentCodepoint_ = m_nextCodepoint_;
 320                     return;
 321                 }
 322 
 323                 nextLead += DATA_BLOCK_LENGTH_;
 324                 // number of total affected supplementary codepoints in one
 325                 // block
 326                 // this is not a simple addition of
 327                 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
 328                 // that we might have moved some of the codepoints
 329                 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
 330                                      (char)nextLead,
 331                                      (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
 332                 continue;
 333             }
 334             if (m_trie_.m_dataManipulate_ == null) {
 335                 throw new NullPointerException(
 336                             "The field DataManipulate in this Trie is null");
 337             }
 338             // enumerate trail surrogates for this lead surrogate
 339             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
 340                                m_trie_.getValue(leadBlock +
 341                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
 342             if (m_nextIndex_ <= 0) {
 343                 // no data for this lead surrogate
 344                 if (currentValue != m_initialValue_) {
 345                     m_nextValue_      = m_initialValue_;
 346                     m_nextBlock_      = 0;
 347                     m_nextBlockIndex_ = 0;
 348                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 349                               currentValue);
 350                     m_currentCodepoint_ = m_nextCodepoint_;
 351                     return;
 352                 }
 353                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
 354             } else {
 355                 m_nextTrailIndexOffset_ = 0;
 356                 if (!checkTrailBlock(currentBlock, currentValue)) {
 357                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
 358                               currentValue);
 359                     m_currentCodepoint_ = m_nextCodepoint_;
 360                     return;
 361                 }
 362             }
 363             nextLead ++;
 364          }
 365 
 366          // deliver last range
 367          setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
 368                    currentValue);
 369     }
 370 
 371     /**
 372     * Internal block value calculations
 373     * Performs calculations on a data block to find codepoints in m_nextBlock_
 374     * after the index m_nextBlockIndex_ that has the same value.
 375     * Note m_*_ variables at this point is the next codepoint whose value
 376     * has not been calculated.
 377     * But when returned with false, it will be the last codepoint whose
 378     * value has been calculated.
 379     * @param currentValue the value which other codepoints are tested against
 380     * @return true if the whole block has the same value as currentValue or if
 381     *              the whole block has been calculated, false otherwise.
 382     */
 383     private final boolean checkBlockDetail(int currentValue)
 384     {
 385         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
 386             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
 387                                                     m_nextBlockIndex_));
 388             if (m_nextValue_ != currentValue) {
 389                 return false;
 390             }
 391             ++ m_nextBlockIndex_;
 392             ++ m_nextCodepoint_;
 393         }
 394         return true;
 395     }
 396 
 397     /**
 398     * Internal block value calculations
 399     * Performs calculations on a data block to find codepoints in m_nextBlock_
 400     * that has the same value.
 401     * Will call checkBlockDetail() if highlevel check fails.
 402     * Note m_*_ variables at this point is the next codepoint whose value
 403     * has not been calculated.
 404     * @param currentBlock the initial block containing all currentValue
 405     * @param currentValue the value which other codepoints are tested against
 406     * @return true if the whole block has the same value as currentValue or if
 407     *              the whole block has been calculated, false otherwise.
 408     */
 409     private final boolean checkBlock(int currentBlock, int currentValue)
 410     {
 411         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
 412                                                   Trie.INDEX_STAGE_2_SHIFT_;
 413         if (m_nextBlock_ == currentBlock &&
 414             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
 415             // the block is the same as the previous one, filled with
 416             // currentValue
 417             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
 418         }
 419         else if (m_nextBlock_ == 0) {
 420             // this is the all-initial-value block
 421             if (currentValue != m_initialValue_) {
 422                 m_nextValue_      = m_initialValue_;
 423                 m_nextBlockIndex_ = 0;
 424                 return false;
 425             }
 426             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
 427         }
 428         else {
 429             if (!checkBlockDetail(currentValue)) {
 430                 return false;
 431             }
 432         }
 433         return true;
 434     }
 435 
 436     /**
 437     * Internal block value calculations
 438     * Performs calculations on multiple data blocks for a set of trail
 439     * surrogates to find codepoints in m_nextBlock_ that has the same value.
 440     * Will call checkBlock() for internal block checks.
 441     * Note m_*_ variables at this point is the next codepoint whose value
 442     * has not been calculated.
 443     * @param currentBlock the initial block containing all currentValue
 444     * @param currentValue the value which other codepoints are tested against
 445     * @return true if the whole block has the same value as currentValue or if
 446     *              the whole block has been calculated, false otherwise.
 447     */
 448     private final boolean checkTrailBlock(int currentBlock,
 449                                           int currentValue)
 450     {
 451         // enumerate code points for this lead surrogate
 452         while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
 453         {
 454             // if we ever reach here, we are at the start of a new block
 455             m_nextBlockIndex_ = 0;
 456             // copy of most of the body of the BMP loop
 457             if (!checkBlock(currentBlock, currentValue)) {
 458                 return false;
 459             }
 460             m_nextTrailIndexOffset_ ++;
 461             m_nextIndex_ ++;
 462         }
 463         return true;
 464     }
 465 
 466     /**
 467     * Checks if we are beginning at the start of a initial block.
 468     * If we are then the rest of the codepoints in this initial block
 469     * has the same values.
 470     * We increment m_nextCodepoint_ and relevant data members if so.
 471     * This is used only in for the supplementary codepoints because
 472     * the offset to the trail indexes could be 0.
 473     * @return true if we are at the start of a initial block.
 474     */
 475     private final boolean checkNullNextTrailIndex()
 476     {
 477         if (m_nextIndex_ <= 0) {
 478             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
 479             int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
 480             int leadBlock =
 481                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
 482                                                    Trie.INDEX_STAGE_2_SHIFT_;
 483             if (m_trie_.m_dataManipulate_ == null) {
 484                 throw new NullPointerException(
 485                             "The field DataManipulate in this Trie is null");
 486             }
 487             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
 488                                m_trie_.getValue(leadBlock +
 489                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
 490             m_nextIndex_ --;
 491             m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
 492             return true;
 493         }
 494         return false;
 495     }
 496 
 497     // private data members --------------------------------------------
 498 
 499     /**
 500     * Size of the stage 1 BMP indexes
 501     */
 502     private static final int BMP_INDEX_LENGTH_ =
 503                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
 504     /**
 505     * Lead surrogate minimum value
 506     */
 507     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
 508     /**
 509     * Trail surrogate minimum value
 510     */
 511     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
 512     /**
 513     * Number of trail surrogate
 514     */
 515     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
 516     /**
 517     * Number of stage 1 indexes for supplementary calculations that maps to
 518     * each lead surrogate character.
 519     * See second pass into getRawOffset for the trail surrogate character.
 520     * 10 for significant number of bits for trail surrogates, 5 for what we
 521     * discard during shifting.
 522     */
 523     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
 524                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
 525     /**
 526     * Number of data values in a stage 2 (data array) block.
 527     */
 528     private static final int DATA_BLOCK_LENGTH_ =
 529                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
 530     /**
 531     * Trie instance
 532     */
 533     private Trie m_trie_;
 534     /**
 535     * Initial value for trie values
 536     */
 537     private int m_initialValue_;
 538     /**
 539     * Next element results and data.
 540     */
 541     private int m_currentCodepoint_;
 542     private int m_nextCodepoint_;
 543     private int m_nextValue_;
 544     private int m_nextIndex_;
 545     private int m_nextBlock_;
 546     private int m_nextBlockIndex_;
 547     private int m_nextTrailIndexOffset_;
 548 }