1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * Copyright (C) 2000-2014, International Business Machines Corporation and
  29  * others. All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 package sun.text.normalizer;
  33 
  34 import java.text.CharacterIterator;
  35 import java.text.Normalizer;
  36 
  37 /**
  38  * Unicode Normalization
  39  *
  40  * <h2>Unicode normalization API</h2>
  41  *
  42  * <code>normalize</code> transforms Unicode text into an equivalent composed or
  43  * decomposed form, allowing for easier sorting and searching of text.
  44  * <code>normalize</code> supports the standard normalization forms described in
  45  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  46  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  47  *
  48  * Characters with accents or other adornments can be encoded in
  49  * several different ways in Unicode.  For example, take the character A-acute.
  50  * In Unicode, this can be encoded as a single character (the
  51  * "composed" form):
  52  *
  53  * <pre>
  54  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  55  * </pre>
  56  *
  57  * or as two separate characters (the "decomposed" form):
  58  *
  59  * <pre>
  60  *      0041    LATIN CAPITAL LETTER A
  61  *      0301    COMBINING ACUTE ACCENT
  62  * </pre>
  63  *
  64  * To a user of your program, however, both of these sequences should be
  65  * treated as the same "user-level" character "A with acute accent".  When you
  66  * are searching or comparing text, you must ensure that these two sequences are
  67  * treated equivalently.  In addition, you must handle characters with more than
  68  * one accent.  Sometimes the order of a character's combining accents is
  69  * significant, while in other cases accent sequences in different orders are
  70  * really equivalent.
  71  *
  72  * Similarly, the string "ffi" can be encoded as three separate letters:
  73  *
  74  * <pre>
  75  *      0066    LATIN SMALL LETTER F
  76  *      0066    LATIN SMALL LETTER F
  77  *      0069    LATIN SMALL LETTER I
  78  * </pre>
  79  *
  80  * or as the single character
  81  *
  82  * <pre>
  83  *      FB03    LATIN SMALL LIGATURE FFI
  84  * </pre>
  85  *
  86  * The ffi ligature is not a distinct semantic character, and strictly speaking
  87  * it shouldn't be in Unicode at all, but it was included for compatibility
  88  * with existing character sets that already provided it.  The Unicode standard
  89  * identifies such characters by giving them "compatibility" decompositions
  90  * into the corresponding semantic characters.  When sorting and searching, you
  91  * will often want to use these mappings.
  92  *
  93  * <code>normalize</code> helps solve these problems by transforming text into
  94  * the canonical composed and decomposed forms as shown in the first example
  95  * above. In addition, you can have it perform compatibility decompositions so
  96  * that you can treat compatibility characters the same as their equivalents.
  97  * Finally, <code>normalize</code> rearranges accents into the proper canonical
  98  * order, so that you do not have to worry about accent rearrangement on your
  99  * own.
 100  *
 101  * Form FCD, "Fast C or D", is also designed for collation.
 * It allows working on strings that are not necessarily normalized
 103  * with an algorithm (like in collation) that works under "canonical closure",
 104  * i.e., it treats precomposed characters and their decomposed equivalents the
 105  * same.
 106  *
 107  * It is not a normalization form because it does not provide for uniqueness of
 108  * representation. Multiple strings may be canonically equivalent (their NFDs
 109  * are identical) and may all conform to FCD without being identical themselves.
 110  *
 111  * The form is defined such that the "raw decomposition", the recursive
 112  * canonical decomposition of each character, results in a string that is
 * canonically ordered. This means that precomposed characters are allowed
 * as long as their decompositions do not need canonical reordering.
 115  *
 116  * Its advantage for a process like collation is that all NFD and most NFC texts
 117  * - and many unnormalized texts - already conform to FCD and do not need to be
 118  * normalized (NFD) for such a process. The FCD quick check will return YES for
 119  * most strings in practice.
 120  *
 121  * normalize(FCD) may be implemented with NFD.
 122  *
 123  * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
 124  * http://www.unicode.org/notes/tn5/#FCD
 125  *
 126  * ICU collation performs either NFD or FCD normalization automatically if
 127  * normalization is turned on for the collator object. Beyond collation and
 128  * string search, normalized strings may be useful for string equivalence
 129  * comparisons, transliteration/transcription, unique representations, etc.
 130  *
 * The W3C generally recommends exchanging texts in NFC.
 132  * Note also that most legacy character encodings use only precomposed forms and
 133  * often do not encode any combining marks by themselves. For conversion to such
 134  * character encodings the Unicode text needs to be normalized to NFC.
 135  * For more usage examples, see the Unicode Standard Annex.
 136  *
 137  * Note: The Normalizer class also provides API for iterative normalization.
 138  * While the setIndex() and getIndex() refer to indices in the
 139  * underlying Unicode input text, the next() and previous() methods
 140  * iterate through characters in the normalized output.
 141  * This means that there is not necessarily a one-to-one correspondence
 142  * between characters returned by next() and previous() and the indices
 143  * passed to and returned from setIndex() and getIndex().
 144  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
 145  *
 146  * @stable ICU 2.8
 147  */
 148 // Original filename in ICU4J: Normalizer.java
 149 public final class NormalizerBase implements Cloneable {
 150 
 151     // The input text and our position in it
 152     private UCharacterIterator  text;
 153     private Normalizer2         norm2;
 154     private Mode                mode;
 155     private int                 options;
 156 
 157     // The normalization buffer is the result of normalization
 158     // of the source in [currentIndex..nextIndex] .
 159     private int                 currentIndex;
 160     private int                 nextIndex;
 161 
 162     // A buffer for holding intermediate results
 163     private StringBuilder       buffer;
 164     private int                 bufferPos;
 165 
 166     // Helper classes to defer loading of normalization data.
 167     private static final class ModeImpl {
 168         private ModeImpl(Normalizer2 n2) {
 169             normalizer2 = n2;
 170         }
 171         private final Normalizer2 normalizer2;
 172     }
 173 
 174     private static final class NFDModeImpl {
 175         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
 176     }
 177 
 178     private static final class NFKDModeImpl {
 179         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
 180     }
 181 
 182     private static final class NFCModeImpl {
 183         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
 184     }
 185 
 186     private static final class NFKCModeImpl {
 187         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
 188     }
 189 
 190     private static final class Unicode32 {
 191         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
 192     }
 193 
 194     private static final class NFD32ModeImpl {
 195         private static final ModeImpl INSTANCE =
 196             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
 197                                                  Unicode32.INSTANCE));
 198     }
 199 
 200     private static final class NFKD32ModeImpl {
 201         private static final ModeImpl INSTANCE =
 202             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
 203                                                  Unicode32.INSTANCE));
 204     }
 205 
 206     private static final class NFC32ModeImpl {
 207         private static final ModeImpl INSTANCE =
 208             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
 209                                                  Unicode32.INSTANCE));
 210     }
 211 
 212     private static final class NFKC32ModeImpl {
 213         private static final ModeImpl INSTANCE =
 214             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
 215                                                  Unicode32.INSTANCE));
 216     }
 217 
 218     /**
 219      * Options bit set value to select Unicode 3.2 normalization
 220      * (except NormalizationCorrections).
 221      * At most one Unicode version can be selected at a time.
 222      * @stable ICU 2.6
 223      */
 224     public static final int UNICODE_3_2=0x20;
 225 
 226     public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
 227 
 228     /*
 229      * Default option for the latest Unicode normalization. This option is
 230      * provided mainly for testing.
 231      * The value zero means that normalization is done with the fixes for
 232      *   - Corrigendum 4 (Five CJK Canonical Mapping Errors)
 233      *   - Corrigendum 5 (Normalization Idempotency)
 234      */
 235     public static final int UNICODE_LATEST = 0x00;
 236 
 237     /**
 238      * Constant indicating that the end of the iteration has been reached.
 239      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
 240      * @stable ICU 2.8
 241      */
 242     public static final int DONE = UCharacterIterator.DONE;
 243 
 244     /**
 245      * Constants for normalization modes.
 246      * <p>
 247      * The Mode class is not intended for public subclassing.
 248      * Only the Mode constants provided by the Normalizer class should be used,
 249      * and any fields or methods should not be called or overridden by users.
 250      * @stable ICU 2.8
 251      */
 252     public abstract static class Mode {
 253 
 254         /**
 255          * Sole constructor
 256          * @internal
 257          * @deprecated This API is ICU internal only.
 258          */
 259         @Deprecated
 260         protected Mode() {
 261         }
 262 
 263         /**
 264          * @internal
 265          * @deprecated This API is ICU internal only.
 266          */
 267         @Deprecated
 268         protected abstract Normalizer2 getNormalizer2(int options);
 269     }
 270 
 271     private static Mode toMode(Normalizer.Form form) {
 272         switch (form) {
 273         case NFC :
 274             return NFC;
 275         case NFD :
 276             return NFD;
 277         case NFKC :
 278             return NFKC;
 279         case NFKD :
 280             return NFKD;
 281         }
 282 
 283         throw new IllegalArgumentException("Unexpected normalization form: " +
 284                                            form);
 285     }
 286 
 287     private static final class NONEMode extends Mode {
 288         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
 289     }
 290 
 291     private static final class NFDMode extends Mode {
 292         protected Normalizer2 getNormalizer2(int options) {
 293             return (options&UNICODE_3_2) != 0 ?
 294                     NFD32ModeImpl.INSTANCE.normalizer2 :
 295                     NFDModeImpl.INSTANCE.normalizer2;
 296         }
 297     }
 298 
 299     private static final class NFKDMode extends Mode {
 300         protected Normalizer2 getNormalizer2(int options) {
 301             return (options&UNICODE_3_2) != 0 ?
 302                     NFKD32ModeImpl.INSTANCE.normalizer2 :
 303                     NFKDModeImpl.INSTANCE.normalizer2;
 304         }
 305     }
 306 
 307     private static final class NFCMode extends Mode {
 308         protected Normalizer2 getNormalizer2(int options) {
 309             return (options&UNICODE_3_2) != 0 ?
 310                     NFC32ModeImpl.INSTANCE.normalizer2 :
 311                     NFCModeImpl.INSTANCE.normalizer2;
 312         }
 313     }
 314 
 315     private static final class NFKCMode extends Mode {
 316         protected Normalizer2 getNormalizer2(int options) {
 317             return (options&UNICODE_3_2) != 0 ?
 318                     NFKC32ModeImpl.INSTANCE.normalizer2 :
 319                     NFKCModeImpl.INSTANCE.normalizer2;
 320         }
 321     }
 322 
 323     /**
 324      * No decomposition/composition.
 325      * @stable ICU 2.8
 326      */
 327     public static final Mode NONE = new NONEMode();
 328 
 329     /**
 330      * Canonical decomposition.
 331      * @stable ICU 2.8
 332      */
 333     public static final Mode NFD = new NFDMode();
 334 
 335     /**
 336      * Compatibility decomposition.
 337      * @stable ICU 2.8
 338      */
 339     public static final Mode NFKD = new NFKDMode();
 340 
 341     /**
 342      * Canonical decomposition followed by canonical composition.
 343      * @stable ICU 2.8
 344      */
 345     public static final Mode NFC = new NFCMode();
 346 
 347     public static final Mode NFKC =new NFKCMode();
 348 
 349     //-------------------------------------------------------------------------
 350     // Iterator constructors
 351     //-------------------------------------------------------------------------
 352 
 353     /**
 354      * Creates a new {@code NormalizerBase} object for iterating over the
 355      * normalized form of a given string.
 356      * <p>
 357      * The {@code options} parameter specifies which optional
 358      * {@code NormalizerBase} features are to be enabled for this object.
 359      * <p>
 360      * @param str  The string to be normalized.  The normalization
 361      *              will start at the beginning of the string.
 362      *
 363      * @param mode The normalization mode.
 364      *
 365      * @param opt Any optional features to be enabled.
 366      *            Currently the only available option is {@link #UNICODE_3_2}.
 367      *            If you want the default behavior corresponding to one of the
 368      *            standard Unicode Normalization Forms, use 0 for this argument.
 369      * @stable ICU 2.6
 370      */
 371     public NormalizerBase(String str, Mode mode, int opt) {
 372         this.text = UCharacterIterator.getInstance(str);
 373         this.mode = mode;
 374         this.options=opt;
 375         norm2 = mode.getNormalizer2(opt);
 376         buffer = new StringBuilder();
 377     }
 378 
 379     public NormalizerBase(String str, Mode mode) {
 380        this(str, mode, 0);
 381     }
 382 
 383 
 384     /**
 385      * Creates a new {@code NormalizerBase} object for iterating over the
 386      * normalized form of the given text.
 387      * <p>
 388      * @param iter  The input text to be normalized.  The normalization
 389      *              will start at the beginning of the string.
 390      *
 391      * @param mode  The normalization mode.
 392      *
 393      * @param opt Any optional features to be enabled.
 394      *            Currently the only available option is {@link #UNICODE_3_2}.
 395      *            If you want the default behavior corresponding to one of the
 396      *            standard Unicode Normalization Forms, use 0 for this argument.
 397      * @stable ICU 2.6
 398      */
 399     public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
 400         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
 401         this.mode = mode;
 402         this.options = opt;
 403         norm2 = mode.getNormalizer2(opt);
 404         buffer = new StringBuilder();
 405     }
 406 
 407     public NormalizerBase(CharacterIterator iter, Mode mode) {
 408        this(iter, mode, 0);
 409     }
 410 
 411     /**
 412      * Clones this {@code NormalizerBase} object.  All properties of this
 413      * object are duplicated in the new object, including the cloning of any
 414      * {@link CharacterIterator} that was passed in to the constructor
 415      * or to {@link #setText(CharacterIterator) setText}.
 416      * However, the text storage underlying
 417      * the {@code CharacterIterator} is not duplicated unless the
 418      * iterator's {@code clone} method does so.
 419      * @stable ICU 2.8
 420      */
 421     public Object clone() {
 422         try {
 423             NormalizerBase copy = (NormalizerBase) super.clone();
 424             copy.text = (UCharacterIterator) text.clone();
 425             copy.mode = mode;
 426             copy.options = options;
 427             copy.norm2 = norm2;
 428             copy.buffer = new StringBuilder(buffer);
 429             copy.bufferPos = bufferPos;
 430             copy.currentIndex = currentIndex;
 431             copy.nextIndex = nextIndex;
 432             return copy;
 433         }
 434         catch (CloneNotSupportedException e) {
 435             throw new InternalError(e.toString(), e);
 436         }
 437     }
 438 
 439     /**
 440      * Normalizes a {@code String} using the given normalization operation.
 441      * <p>
 442      * The {@code options} parameter specifies which optional
 443      * {@code NormalizerBase} features are to be enabled for this operation.
 444      * Currently the only available option is {@link #UNICODE_3_2}.
 445      * If you want the default behavior corresponding to one of the standard
 446      * Unicode Normalization Forms, use 0 for this argument.
 447      * <p>
 448      * @param str       the input string to be normalized.
 449      * @param mode      the normalization mode
 450      * @param options   the optional features to be enabled.
 451      * @return String   the normalized string
 452      * @stable ICU 2.6
 453      */
 454     public static String normalize(String str, Mode mode, int options) {
 455         return mode.getNormalizer2(options).normalize(str);
 456     }
 457 
 458     public static String normalize(String str, Normalizer.Form form) {
 459         return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
 460     }
 461 
 462     public static String normalize(String str, Normalizer.Form form, int options) {
 463         return NormalizerBase.normalize(str, toMode(form), options);
 464     }
 465 
 466     /**
 467      * Test if a string is in a given normalization form.
 468      * This is semantically equivalent to source.equals(normalize(source, mode)).
 469      *
 470      * Unlike quickCheck(), this function returns a definitive result,
 471      * never a "maybe".
 472      * For NFD, NFKD, and FCD, both functions work exactly the same.
 473      * For NFC and NFKC where quickCheck may return "maybe", this function will
 474      * perform further tests to arrive at a true/false result.
 475      * @param str       the input string to be checked to see if it is
 476      *                   normalized
 477      * @param mode      the normalization mode
 478      * @param options   Options for use with exclusion set and tailored Normalization
 479      *                  The only option that is currently recognized is UNICODE_3_2
 480      * @see #isNormalized
 481      * @stable ICU 2.6
 482      */
 483     public static boolean isNormalized(String str, Mode mode, int options) {
 484         return mode.getNormalizer2(options).isNormalized(str);
 485     }
 486 
 487     public static boolean isNormalized(String str, Normalizer.Form form) {
 488         return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
 489     }
 490 
 491     public static boolean isNormalized(String str, Normalizer.Form form, int options) {
 492         return NormalizerBase.isNormalized(str, toMode(form), options);
 493     }
 494 
 495     //-------------------------------------------------------------------------
 496     // Iteration API
 497     //-------------------------------------------------------------------------
 498 
 499     /**
 500      * Return the current character in the normalized text.
 501      * @return The codepoint as an int
 502      * @stable ICU 2.8
 503      */
 504     public int current() {
 505         if(bufferPos<buffer.length() || nextNormalize()) {
 506             return buffer.codePointAt(bufferPos);
 507         } else {
 508             return DONE;
 509         }
 510     }
 511 
 512     /**
 513      * Return the next character in the normalized text and advance
 514      * the iteration position by one.  If the end
 515      * of the text has already been reached, {@link #DONE} is returned.
 516      * @return The codepoint as an int
 517      * @stable ICU 2.8
 518      */
 519     public int next() {
 520         if(bufferPos<buffer.length() ||  nextNormalize()) {
 521             int c=buffer.codePointAt(bufferPos);
 522             bufferPos+=Character.charCount(c);
 523             return c;
 524         } else {
 525             return DONE;
 526         }
 527     }
 528 
 529     /**
 530      * Return the previous character in the normalized text and decrement
 531      * the iteration position by one.  If the beginning
 532      * of the text has already been reached, {@link #DONE} is returned.
 533      * @return The codepoint as an int
 534      * @stable ICU 2.8
 535      */
 536     public int previous() {
 537         if(bufferPos>0 || previousNormalize()) {
 538             int c=buffer.codePointBefore(bufferPos);
 539             bufferPos-=Character.charCount(c);
 540             return c;
 541         } else {
 542             return DONE;
 543         }
 544     }
 545 
 546     /**
 547      * Reset the index to the beginning of the text.
 548      * This is equivalent to setIndexOnly(startIndex)).
 549      * @stable ICU 2.8
 550      */
 551     public void reset() {
 552         text.setIndex(0);
 553         currentIndex=nextIndex=0;
 554         clearBuffer();
 555     }
 556 
 557     /**
 558      * Set the iteration position in the input text that is being normalized,
 559      * without any immediate normalization.
 560      * After setIndexOnly(), getIndex() will return the same index that is
 561      * specified here.
 562      *
 563      * @param index the desired index in the input text.
 564      * @stable ICU 2.8
 565      */
 566     public void setIndexOnly(int index) {
 567         text.setIndex(index);  // validates index
 568         currentIndex=nextIndex=index;
 569         clearBuffer();
 570     }
 571 
 572     /**
 573      * Set the iteration position in the input text that is being normalized
 574      * and return the first normalized character at that position.
 575      * <p>
 576      * <b>Note:</b> This method sets the position in the <em>input</em> text,
 577      * while {@link #next} and {@link #previous} iterate through characters
 578      * in the normalized <em>output</em>.  This means that there is not
 579      * necessarily a one-to-one correspondence between characters returned
 580      * by {@code next} and {@code previous} and the indices passed to and
 581      * returned from {@code setIndex} and {@link #getIndex}.
 582      * <p>
 583      * @param index the desired index in the input text.
 584      *
 585      * @return   the first normalized character that is the result of iterating
 586      *            forward starting at the given index.
 587      *
 588      * @throws IllegalArgumentException if the given index is less than
 589      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 590      * deprecated ICU 3.2
 591      * @obsolete ICU 3.2
 592      */
 593      public int setIndex(int index) {
 594          setIndexOnly(index);
 595          return current();
 596      }
 597 
 598     /**
 599      * Retrieve the index of the start of the input text. This is the begin
 600      * index of the {@code CharacterIterator} or the start (i.e. 0) of the
 601      * {@code String} over which this {@code NormalizerBase} is iterating
 602      * @deprecated ICU 2.2. Use startIndex() instead.
 603      * @return The codepoint as an int
 604      * @see #startIndex
 605      */
 606     @Deprecated
 607     public int getBeginIndex() {
 608         return 0;
 609     }
 610 
 611     /**
 612      * Retrieve the index of the end of the input text.  This is the end index
 613      * of the {@code CharacterIterator} or the length of the {@code String}
 614      * over which this {@code NormalizerBase} is iterating
 615      * @deprecated ICU 2.2. Use endIndex() instead.
 616      * @return The codepoint as an int
 617      * @see #endIndex
 618      */
 619     @Deprecated
 620     public int getEndIndex() {
 621         return endIndex();
 622     }
 623 
 624     /**
 625      * Retrieve the current iteration position in the input text that is
 626      * being normalized.  This method is useful in applications such as
 627      * searching, where you need to be able to determine the position in
 628      * the input text that corresponds to a given normalized output character.
 629      * <p>
 630      * <b>Note:</b> This method sets the position in the <em>input</em>, while
 631      * {@link #next} and {@link #previous} iterate through characters in the
 632      * <em>output</em>.  This means that there is not necessarily a one-to-one
 633      * correspondence between characters returned by {@code next} and
 634      * {@code previous} and the indices passed to and returned from
 635      * {@code setIndex} and {@link #getIndex}.
 636      * @return The current iteration position
 637      * @stable ICU 2.8
 638      */
 639     public int getIndex() {
 640         if(bufferPos<buffer.length()) {
 641             return currentIndex;
 642         } else {
 643             return nextIndex;
 644         }
 645     }
 646 
 647     /**
 648      * Retrieve the index of the end of the input text.  This is the end index
 649      * of the {@code CharacterIterator} or the length of the {@code String}
 650      * over which this {@code NormalizerBase} is iterating
 651      * @return The current iteration position
 652      * @stable ICU 2.8
 653      */
 654     public int endIndex() {
 655         return text.getLength();
 656     }
 657 
 658     //-------------------------------------------------------------------------
 659     // Iterator attributes
 660     //-------------------------------------------------------------------------
 661     /**
 662      * Set the normalization mode for this object.
 663      * <p>
 664      * <b>Note:</b>If the normalization mode is changed while iterating
 665      * over a string, calls to {@link #next} and {@link #previous} may
 666      * return previously buffers characters in the old normalization mode
 667      * until the iteration is able to re-sync at the next base character.
 668      * It is safest to call {@link #setText setText()}, {@link #first},
 669      * {@link #last}, etc. after calling {@code setMode}.
 670      * <p>
 671      * @param newMode the new mode for this {@code NormalizerBase}.
 672      * The supported modes are:
 673      * <ul>
 674      *  <li>{@link #NFC}    - Unicode canonical decompositiion
 675      *                        followed by canonical composition.
 676      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
 677      *                        follwed by canonical composition.
 678      *  <li>{@link #NFD}    - Unicode canonical decomposition
 679      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
 680      *  <li>{@link #NONE}   - Do nothing but return characters
 681      *                        from the underlying input text.
 682      * </ul>
 683      *
 684      * @see #getMode
 685      * @stable ICU 2.8
 686      */
 687     public void setMode(Mode newMode) {
 688         mode = newMode;
 689         norm2 = mode.getNormalizer2(options);
 690     }
 691 
 692     /**
 693      * Return the basic operation performed by this {@code NormalizerBase}
 694      *
 695      * @see #setMode
 696      * @stable ICU 2.8
 697      */
 698     public Mode getMode() {
 699         return mode;
 700     }
 701 
 702     /**
 703      * Set the input text over which this {@code NormalizerBase} will iterate.
 704      * The iteration position is set to the beginning of the input text.
 705      * @param newText   The new string to be normalized.
 706      * @stable ICU 2.8
 707      */
 708     public void setText(String newText) {
 709         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
 710         if (newIter == null) {
 711             throw new IllegalStateException("Could not create a new UCharacterIterator");
 712         }
 713         text = newIter;
 714         reset();
 715     }
 716 
 717     /**
 718      * Set the input text over which this {@code NormalizerBase} will iterate.
 719      * The iteration position is set to the beginning of the input text.
 720      * @param newText   The new string to be normalized.
 721      * @stable ICU 2.8
 722      */
 723     public void setText(CharacterIterator newText) {
 724         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
 725         if (newIter == null) {
 726             throw new IllegalStateException("Could not create a new UCharacterIterator");
 727         }
 728         text = newIter;
 729         currentIndex=nextIndex=0;
 730         clearBuffer();
 731     }
 732 
 733     private void clearBuffer() {
 734         buffer.setLength(0);
 735         bufferPos=0;
 736     }
 737 
 738     private boolean nextNormalize() {
 739         clearBuffer();
 740         currentIndex=nextIndex;
 741         text.setIndex(nextIndex);
 742         // Skip at least one character so we make progress.
 743         int c=text.nextCodePoint();
 744         if(c<0) {
 745             return false;
 746         }
 747         StringBuilder segment=new StringBuilder().appendCodePoint(c);
 748         while((c=text.nextCodePoint())>=0) {
 749             if(norm2.hasBoundaryBefore(c)) {
 750                 text.moveCodePointIndex(-1);
 751                 break;
 752             }
 753             segment.appendCodePoint(c);
 754         }
 755         nextIndex=text.getIndex();
 756         norm2.normalize(segment, buffer);
 757         return buffer.length()!=0;
 758     }
 759 
 760     private boolean previousNormalize() {
 761         clearBuffer();
 762         nextIndex=currentIndex;
 763         text.setIndex(currentIndex);
 764         StringBuilder segment=new StringBuilder();
 765         int c;
 766         while((c=text.previousCodePoint())>=0) {
 767             if(c<=0xffff) {
 768                 segment.insert(0, (char)c);
 769             } else {
 770                 segment.insert(0, Character.toChars(c));
 771             }
 772             if(norm2.hasBoundaryBefore(c)) {
 773                 break;
 774             }
 775         }
 776         currentIndex=text.getIndex();
 777         norm2.normalize(segment, buffer);
 778         bufferPos=buffer.length();
 779         return buffer.length()!=0;
 780     }
 781 
 782 }