1 /*
   2  * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.text.CharacterIterator;
  40 import java.text.Normalizer;
  41 
  42 /**
  43  * Unicode Normalization
  44  *
  45  * <h2>Unicode normalization API</h2>
  46  *
  47  * <code>normalize</code> transforms Unicode text into an equivalent composed or
  48  * decomposed form, allowing for easier sorting and searching of text.
  49  * <code>normalize</code> supports the standard normalization forms described in
  50  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  51  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  52  *
  53  * Characters with accents or other adornments can be encoded in
  54  * several different ways in Unicode.  For example, take the character A-acute.
  55  * In Unicode, this can be encoded as a single character (the
  56  * "composed" form):
  57  *
  58  * <p>
  59  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  60  * </p>
  61  *
  62  * or as two separate characters (the "decomposed" form):
  63  *
  64  * <p>
  65  *      0041    LATIN CAPITAL LETTER A
  66  *      0301    COMBINING ACUTE ACCENT
  67  * </p>
  68  *
  69  * To a user of your program, however, both of these sequences should be
  70  * treated as the same "user-level" character "A with acute accent".  When you
  71  * are searching or comparing text, you must ensure that these two sequences are
  72  * treated equivalently.  In addition, you must handle characters with more than
  73  * one accent.  Sometimes the order of a character's combining accents is
  74  * significant, while in other cases accent sequences in different orders are
  75  * really equivalent.
  76  *
  77  * Similarly, the string "ffi" can be encoded as three separate letters:
  78  *
  79  * <p>
  80  *      0066    LATIN SMALL LETTER F
  81  *      0066    LATIN SMALL LETTER F
  82  *      0069    LATIN SMALL LETTER I
  83  * </p>
  84  *
  85  * or as the single character
  86  *
  87  * <p>
  88  *      FB03    LATIN SMALL LIGATURE FFI
  89  * </p>
  90  *
  91  * The ffi ligature is not a distinct semantic character, and strictly speaking
  92  * it shouldn't be in Unicode at all, but it was included for compatibility
  93  * with existing character sets that already provided it.  The Unicode standard
  94  * identifies such characters by giving them "compatibility" decompositions
  95  * into the corresponding semantic characters.  When sorting and searching, you
  96  * will often want to use these mappings.
  97  *
  98  * <code>normalize</code> helps solve these problems by transforming text into
  99  * the canonical composed and decomposed forms as shown in the first example
 100  * above. In addition, you can have it perform compatibility decompositions so
 101  * that you can treat compatibility characters the same as their equivalents.
 102  * Finally, <code>normalize</code> rearranges accents into the proper canonical
 103  * order, so that you do not have to worry about accent rearrangement on your
 104  * own.
 105  *
 * Form FCD, "Fast C or D", is also designed for collation.
 * It allows an algorithm that works under "canonical closure" (as collation
 * does), i.e., one that treats precomposed characters and their decomposed
 * equivalents the same, to work on strings that are not necessarily
 * normalized.
 111  *
 112  * It is not a normalization form because it does not provide for uniqueness of
 113  * representation. Multiple strings may be canonically equivalent (their NFDs
 114  * are identical) and may all conform to FCD without being identical themselves.
 115  *
 * The form is defined such that the "raw decomposition", the recursive
 * canonical decomposition of each character, results in a string that is
 * canonically ordered. This means that precomposed characters are allowed
 * as long as their decompositions do not need canonical reordering.
 120  *
 121  * Its advantage for a process like collation is that all NFD and most NFC texts
 122  * - and many unnormalized texts - already conform to FCD and do not need to be
 123  * normalized (NFD) for such a process. The FCD quick check will return YES for
 124  * most strings in practice.
 125  *
 126  * normalize(FCD) may be implemented with NFD.
 127  *
 128  * For more details on FCD see the collation design document:
 129  * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
 130  *
 131  * ICU collation performs either NFD or FCD normalization automatically if
 132  * normalization is turned on for the collator object. Beyond collation and
 133  * string search, normalized strings may be useful for string equivalence
 134  * comparisons, transliteration/transcription, unique representations, etc.
 135  *
 * The W3C generally recommends exchanging text in NFC.
 137  * Note also that most legacy character encodings use only precomposed forms and
 138  * often do not encode any combining marks by themselves. For conversion to such
 139  * character encodings the Unicode text needs to be normalized to NFC.
 140  * For more usage examples, see the Unicode Standard Annex.
 141  * @stable ICU 2.8
 142  */
 143 
 144 public final class NormalizerBase implements Cloneable {
 145 
 146     //-------------------------------------------------------------------------
 147     // Private data
 148     //-------------------------------------------------------------------------
    // Scratch array of UTF-16 code units. clone() gives each copy its own
    // array so that copies never share this storage.
    private char[] buffer = new char[100];
    // Indices into buffer.
    // NOTE(review): presumably start / current position / end of the valid
    // data held in buffer -- confirm against the iteration methods.
    private int bufferStart = 0;
    private int bufferPos   = 0;
    private int bufferLimit = 0;

    // The input text and our position in it
    private UCharacterIterator  text;
    // Normalization mode; NFC unless a constructor assigns another one.
    private Mode                mode = NFC;
    // Option bits supplied to the constructor; 0 means no options.
    private int                 options = 0;
    private int                 currentIndex;
    private int                 nextIndex;
 160 
    /**
     * Option bit ({@code 0x20}) that selects Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     * @stable ICU 2.6
     */
    public static final int UNICODE_3_2=0x20;

    /**
     * Constant indicating that the end of the iteration has been reached.
     * It is defined as {@link UCharacterIterator#DONE}, so the two values are
     * always the same.
     * @stable ICU 2.8
     */
    public static final int DONE = UCharacterIterator.DONE;
 175 
 176     /**
 177      * Constants for normalization modes.
 178      * @stable ICU 2.8
 179      */
 180     public static class Mode {
 181         private int modeValue;
 182         private Mode(int value) {
 183             modeValue = value;
 184         }
 185 
 186         /**
 187          * This method is used for method dispatch
 188          * @stable ICU 2.6
 189          */
 190         protected int normalize(char[] src, int srcStart, int srcLimit,
 191                                 char[] dest,int destStart,int destLimit,
 192                                 UnicodeSet nx) {
 193             int srcLen = (srcLimit - srcStart);
 194             int destLen = (destLimit - destStart);
 195             if( srcLen > destLen ) {
 196                 return srcLen;
 197             }
 198             System.arraycopy(src,srcStart,dest,destStart,srcLen);
 199             return srcLen;
 200         }
 201 
 202         /**
 203          * This method is used for method dispatch
 204          * @stable ICU 2.6
 205          */
 206         protected int normalize(char[] src, int srcStart, int srcLimit,
 207                                 char[] dest,int destStart,int destLimit,
 208                                 int options) {
 209             return normalize(   src, srcStart, srcLimit,
 210                                 dest,destStart,destLimit,
 211                                 NormalizerImpl.getNX(options)
 212                                 );
 213         }
 214 
 215         /**
 216          * This method is used for method dispatch
 217          * @stable ICU 2.6
 218          */
 219         protected String normalize(String src, int options) {
 220             return src;
 221         }
 222 
 223         /**
 224          * This method is used for method dispatch
 225          * @stable ICU 2.8
 226          */
 227         protected int getMinC() {
 228             return -1;
 229         }
 230 
 231         /**
 232          * This method is used for method dispatch
 233          * @stable ICU 2.8
 234          */
 235         protected int getMask() {
 236             return -1;
 237         }
 238 
 239         /**
 240          * This method is used for method dispatch
 241          * @stable ICU 2.8
 242          */
 243         protected IsPrevBoundary getPrevBoundary() {
 244             return null;
 245         }
 246 
 247         /**
 248          * This method is used for method dispatch
 249          * @stable ICU 2.8
 250          */
 251         protected IsNextBoundary getNextBoundary() {
 252             return null;
 253         }
 254 
 255         /**
 256          * This method is used for method dispatch
 257          * @stable ICU 2.6
 258          */
 259         protected QuickCheckResult quickCheck(char[] src,int start, int limit,
 260                                               boolean allowMaybe,UnicodeSet nx) {
 261             if(allowMaybe) {
 262                 return MAYBE;
 263             }
 264             return NO;
 265         }
 266 
 267         /**
 268          * This method is used for method dispatch
 269          * @stable ICU 2.8
 270          */
 271         protected boolean isNFSkippable(int c) {
 272             return true;
 273         }
 274     }
 275 
 276     /**
 277      * No decomposition/composition.
 278      * @stable ICU 2.8
 279      */
 280     public static final Mode NONE = new Mode(1);
 281 
 282     /**
 283      * Canonical decomposition.
 284      * @stable ICU 2.8
 285      */
 286     public static final Mode NFD = new NFDMode(2);
 287 
 288     private static final class NFDMode extends Mode {
 289         private NFDMode(int value) {
 290             super(value);
 291         }
 292 
 293         protected int normalize(char[] src, int srcStart, int srcLimit,
 294                                 char[] dest,int destStart,int destLimit,
 295                                 UnicodeSet nx) {
 296             int[] trailCC = new int[1];
 297             return NormalizerImpl.decompose(src,  srcStart,srcLimit,
 298                                             dest, destStart,destLimit,
 299                                             false, trailCC,nx);
 300         }
 301 
 302         protected String normalize( String src, int options) {
 303             return decompose(src,false,options);
 304         }
 305 
 306         protected int getMinC() {
 307             return NormalizerImpl.MIN_WITH_LEAD_CC;
 308         }
 309 
 310         protected IsPrevBoundary getPrevBoundary() {
 311             return new IsPrevNFDSafe();
 312         }
 313 
 314         protected IsNextBoundary getNextBoundary() {
 315             return new IsNextNFDSafe();
 316         }
 317 
 318         protected int getMask() {
 319             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
 320         }
 321 
 322         protected QuickCheckResult quickCheck(char[] src,int start,
 323                                               int limit,boolean allowMaybe,
 324                                               UnicodeSet nx) {
 325             return NormalizerImpl.quickCheck(
 326                                              src, start,limit,
 327                                              NormalizerImpl.getFromIndexesArr(
 328                                                                               NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
 329                                                                               ),
 330                                              NormalizerImpl.QC_NFD,
 331                                              0,
 332                                              allowMaybe,
 333                                              nx
 334                                              );
 335         }
 336 
 337         protected boolean isNFSkippable(int c) {
 338             return NormalizerImpl.isNFSkippable(c,this,
 339                                                 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
 340                                                 );
 341         }
 342     }
 343 
 344     /**
 345      * Compatibility decomposition.
 346      * @stable ICU 2.8
 347      */
 348     public static final Mode NFKD = new NFKDMode(3);
 349 
 350     private static final class NFKDMode extends Mode {
 351         private NFKDMode(int value) {
 352             super(value);
 353         }
 354 
 355         protected int normalize(char[] src, int srcStart, int srcLimit,
 356                                 char[] dest,int destStart,int destLimit,
 357                                 UnicodeSet nx) {
 358             int[] trailCC = new int[1];
 359             return NormalizerImpl.decompose(src,  srcStart,srcLimit,
 360                                             dest, destStart,destLimit,
 361                                             true, trailCC, nx);
 362         }
 363 
 364         protected String normalize( String src, int options) {
 365             return decompose(src,true,options);
 366         }
 367 
 368         protected int getMinC() {
 369             return NormalizerImpl.MIN_WITH_LEAD_CC;
 370         }
 371 
 372         protected IsPrevBoundary getPrevBoundary() {
 373             return new IsPrevNFDSafe();
 374         }
 375 
 376         protected IsNextBoundary getNextBoundary() {
 377             return new IsNextNFDSafe();
 378         }
 379 
 380         protected int getMask() {
 381             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
 382         }
 383 
 384         protected QuickCheckResult quickCheck(char[] src,int start,
 385                                               int limit,boolean allowMaybe,
 386                                               UnicodeSet nx) {
 387             return NormalizerImpl.quickCheck(
 388                                              src,start,limit,
 389                                              NormalizerImpl.getFromIndexesArr(
 390                                                                               NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
 391                                                                               ),
 392                                              NormalizerImpl.QC_NFKD,
 393                                              NormalizerImpl.OPTIONS_COMPAT,
 394                                              allowMaybe,
 395                                              nx
 396                                              );
 397         }
 398 
 399         protected boolean isNFSkippable(int c) {
 400             return NormalizerImpl.isNFSkippable(c, this,
 401                                                 (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
 402                                                 );
 403         }
 404     }
 405 
 406     /**
 407      * Canonical decomposition followed by canonical composition.
 408      * @stable ICU 2.8
 409      */
 410     public static final Mode NFC = new NFCMode(4);
 411 
 412     private static final class NFCMode extends Mode{
 413         private NFCMode(int value) {
 414             super(value);
 415         }
 416         protected int normalize(char[] src, int srcStart, int srcLimit,
 417                                 char[] dest,int destStart,int destLimit,
 418                                 UnicodeSet nx) {
 419             return NormalizerImpl.compose( src, srcStart, srcLimit,
 420                                            dest,destStart,destLimit,
 421                                            0, nx);
 422         }
 423 
 424         protected String normalize( String src, int options) {
 425             return compose(src, false, options);
 426         }
 427 
 428         protected int getMinC() {
 429             return NormalizerImpl.getFromIndexesArr(
 430                                                     NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
 431                                                     );
 432         }
 433         protected IsPrevBoundary getPrevBoundary() {
 434             return new IsPrevTrueStarter();
 435         }
 436         protected IsNextBoundary getNextBoundary() {
 437             return new IsNextTrueStarter();
 438         }
 439         protected int getMask() {
 440             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
 441         }
 442         protected QuickCheckResult quickCheck(char[] src,int start,
 443                                               int limit,boolean allowMaybe,
 444                                               UnicodeSet nx) {
 445             return NormalizerImpl.quickCheck(
 446                                              src,start,limit,
 447                                              NormalizerImpl.getFromIndexesArr(
 448                                                                               NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
 449                                                                               ),
 450                                              NormalizerImpl.QC_NFC,
 451                                              0,
 452                                              allowMaybe,
 453                                              nx
 454                                              );
 455         }
 456         protected boolean isNFSkippable(int c) {
 457             return NormalizerImpl.isNFSkippable(c,this,
 458                                                 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
 459                                                   (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
 460                                                   )
 461                                                 );
 462         }
 463     };
 464 
 465     /**
 466      * Compatibility decomposition followed by canonical composition.
 467      * @stable ICU 2.8
 468      */
 469     public static final Mode NFKC =new NFKCMode(5);
 470 
 471     private static final class NFKCMode extends Mode{
 472         private NFKCMode(int value) {
 473             super(value);
 474         }
 475         protected int normalize(char[] src, int srcStart, int srcLimit,
 476                                 char[] dest,int destStart,int destLimit,
 477                                 UnicodeSet nx) {
 478             return NormalizerImpl.compose(src,  srcStart,srcLimit,
 479                                           dest, destStart,destLimit,
 480                                           NormalizerImpl.OPTIONS_COMPAT, nx);
 481         }
 482 
 483         protected String normalize( String src, int options) {
 484             return compose(src, true, options);
 485         }
 486         protected int getMinC() {
 487             return NormalizerImpl.getFromIndexesArr(
 488                                                     NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
 489                                                     );
 490         }
 491         protected IsPrevBoundary getPrevBoundary() {
 492             return new IsPrevTrueStarter();
 493         }
 494         protected IsNextBoundary getNextBoundary() {
 495             return new IsNextTrueStarter();
 496         }
 497         protected int getMask() {
 498             return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
 499         }
 500         protected QuickCheckResult quickCheck(char[] src,int start,
 501                                               int limit,boolean allowMaybe,
 502                                               UnicodeSet nx) {
 503             return NormalizerImpl.quickCheck(
 504                                              src,start,limit,
 505                                              NormalizerImpl.getFromIndexesArr(
 506                                                                               NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
 507                                                                               ),
 508                                              NormalizerImpl.QC_NFKC,
 509                                              NormalizerImpl.OPTIONS_COMPAT,
 510                                              allowMaybe,
 511                                              nx
 512                                              );
 513         }
 514         protected boolean isNFSkippable(int c) {
 515             return NormalizerImpl.isNFSkippable(c, this,
 516                                                 ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
 517                                                   (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
 518                                                   )
 519                                                 );
 520         }
 521     };
 522 
 523     /**
 524      * Result values for quickCheck().
 525      * For details see Unicode Technical Report 15.
 526      * @stable ICU 2.8
 527      */
 528     public static final class QuickCheckResult{
 529         private int resultValue;
 530         private QuickCheckResult(int value) {
 531             resultValue=value;
 532         }
 533     }
 534     /**
 535      * Indicates that string is not in the normalized format
 536      * @stable ICU 2.8
 537      */
 538     public static final QuickCheckResult NO = new QuickCheckResult(0);
 539 
 540     /**
 541      * Indicates that string is in the normalized format
 542      * @stable ICU 2.8
 543      */
 544     public static final QuickCheckResult YES = new QuickCheckResult(1);
 545 
 546     /**
 547      * Indicates it cannot be determined if string is in the normalized
 548      * format without further thorough checks.
 549      * @stable ICU 2.8
 550      */
 551     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
 552 
 553     //-------------------------------------------------------------------------
 554     // Constructors
 555     //-------------------------------------------------------------------------
 556 
 557     /**
 558      * Creates a new <tt>Normalizer</tt> object for iterating over the
 559      * normalized form of a given string.
 560      * <p>
 561      * The <tt>options</tt> parameter specifies which optional
 562      * <tt>Normalizer</tt> features are to be enabled for this object.
 563      * <p>
 564      * @param str  The string to be normalized.  The normalization
 565      *              will start at the beginning of the string.
 566      *
 567      * @param mode The normalization mode.
 568      *
 569      * @param opt Any optional features to be enabled.
 570      *            Currently the only available option is {@link #UNICODE_3_2}.
 571      *            If you want the default behavior corresponding to one of the
 572      *            standard Unicode Normalization Forms, use 0 for this argument.
 573      * @stable ICU 2.6
 574      */
 575     public NormalizerBase(String str, Mode mode, int opt) {
 576         this.text = UCharacterIterator.getInstance(str);
 577         this.mode = mode;
 578         this.options=opt;
 579     }
 580 
 581     /**
 582      * Creates a new <tt>Normalizer</tt> object for iterating over the
 583      * normalized form of the given text.
 584      * <p>
 585      * @param iter  The input text to be normalized.  The normalization
 586      *              will start at the beginning of the string.
 587      *
 588      * @param mode  The normalization mode.
 589      */
 590     public NormalizerBase(CharacterIterator iter, Mode mode) {
 591           this(iter, mode, UNICODE_LATEST);
 592     }
 593 
 594     /**
 595      * Creates a new <tt>Normalizer</tt> object for iterating over the
 596      * normalized form of the given text.
 597      * <p>
 598      * @param iter  The input text to be normalized.  The normalization
 599      *              will start at the beginning of the string.
 600      *
 601      * @param mode  The normalization mode.
 602      *
 603      * @param opt Any optional features to be enabled.
 604      *            Currently the only available option is {@link #UNICODE_3_2}.
 605      *            If you want the default behavior corresponding to one of the
 606      *            standard Unicode Normalization Forms, use 0 for this argument.
 607      * @stable ICU 2.6
 608      */
 609     public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
 610         this.text = UCharacterIterator.getInstance(
 611                                                    (CharacterIterator)iter.clone()
 612                                                    );
 613         this.mode = mode;
 614         this.options = opt;
 615     }
 616 
 617     /**
 618      * Clones this <tt>Normalizer</tt> object.  All properties of this
 619      * object are duplicated in the new object, including the cloning of any
 620      * {@link CharacterIterator} that was passed in to the constructor
 621      * or to {@link #setText(CharacterIterator) setText}.
 622      * However, the text storage underlying
 623      * the <tt>CharacterIterator</tt> is not duplicated unless the
 624      * iterator's <tt>clone</tt> method does so.
 625      * @stable ICU 2.8
 626      */
 627     public Object clone() {
 628         try {
 629             NormalizerBase copy = (NormalizerBase) super.clone();
 630             copy.text = (UCharacterIterator) text.clone();
 631             //clone the internal buffer
 632             if (buffer != null) {
 633                 copy.buffer = new char[buffer.length];
 634                 System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
 635             }
 636             return copy;
 637         }
 638         catch (CloneNotSupportedException e) {
 639             throw new InternalError(e.toString(), e);
 640         }
 641     }
 642 
 643     //--------------------------------------------------------------------------
 644     // Static Utility methods
 645     //--------------------------------------------------------------------------
 646 
 647     /**
 648      * Compose a string.
 649      * The string will be composed according to the specified mode.
 650      * @param str        The string to compose.
 651      * @param compat     If true the string will be composed according to
 652      *                    NFKC rules and if false will be composed according to
 653      *                    NFC rules.
 654      * @param options    The only recognized option is UNICODE_3_2
 655      * @return String    The composed string
 656      * @stable ICU 2.6
 657      */
 658     public static String compose(String str, boolean compat, int options) {
 659 
 660         char[] dest, src;
 661         if (options == UNICODE_3_2_0_ORIGINAL) {
 662             String mappedStr = NormalizerImpl.convert(str);
 663             dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE];
 664             src = mappedStr.toCharArray();
 665         } else {
 666             dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
 667             src = str.toCharArray();
 668         }
 669         int destSize=0;
 670 
 671         UnicodeSet nx = NormalizerImpl.getNX(options);
 672 
 673         /* reset options bits that should only be set here or inside compose() */
 674         options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
 675 
 676         if(compat) {
 677             options|=NormalizerImpl.OPTIONS_COMPAT;
 678         }
 679 
 680         for(;;) {
 681             destSize=NormalizerImpl.compose(src,0,src.length,
 682                                             dest,0,dest.length,options,
 683                                             nx);
 684             if(destSize<=dest.length) {
 685                 return new String(dest,0,destSize);
 686             } else {
 687                 dest = new char[destSize];
 688             }
 689         }
 690     }
 691 
    // Initial destination-buffer size, as a multiple of the source length,
    // used by compose() and decompose() respectively. It is only a first
    // guess: both methods retry with a larger buffer when the result does
    // not fit.
    private static final int MAX_BUF_SIZE_COMPOSE = 2;
    private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
 694 
 695     /**
 696      * Decompose a string.
 697      * The string will be decomposed according to the specified mode.
 698      * @param str       The string to decompose.
 699      * @param compat    If true the string will be decomposed according to NFKD
 700      *                   rules and if false will be decomposed according to NFD
 701      *                   rules.
 702      * @return String   The decomposed string
 703      * @stable ICU 2.8
 704      */
 705     public static String decompose(String str, boolean compat) {
 706         return decompose(str,compat,UNICODE_LATEST);
 707     }
 708 
 709     /**
 710      * Decompose a string.
 711      * The string will be decomposed according to the specified mode.
 712      * @param str     The string to decompose.
 713      * @param compat  If true the string will be decomposed according to NFKD
 714      *                 rules and if false will be decomposed according to NFD
 715      *                 rules.
 716      * @param options The normalization options, ORed together (0 for no options).
 717      * @return String The decomposed string
 718      * @stable ICU 2.6
 719      */
 720     public static String decompose(String str, boolean compat, int options) {
 721 
 722         int[] trailCC = new int[1];
 723         int destSize=0;
 724         UnicodeSet nx = NormalizerImpl.getNX(options);
 725         char[] dest;
 726 
 727         if (options == UNICODE_3_2_0_ORIGINAL) {
 728             String mappedStr = NormalizerImpl.convert(str);
 729             dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE];
 730 
 731             for(;;) {
 732                 destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(),
 733                                                   dest,0,dest.length,
 734                                                   compat,trailCC, nx);
 735                 if(destSize<=dest.length) {
 736                     return new String(dest,0,destSize);
 737                 } else {
 738                     dest = new char[destSize];
 739                 }
 740             }
 741         } else {
 742             dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];
 743 
 744             for(;;) {
 745                 destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
 746                                                   dest,0,dest.length,
 747                                                   compat,trailCC, nx);
 748                 if(destSize<=dest.length) {
 749                     return new String(dest,0,destSize);
 750                 } else {
 751                     dest = new char[destSize];
 752                 }
 753             }
 754         }
 755     }
 756 
 757     /**
 758      * Normalize a string.
 759      * The string will be normalized according to the specified normalization
 760      * mode and options.
 761      * @param src       The char array to compose.
 762      * @param srcStart  Start index of the source
 763      * @param srcLimit  Limit index of the source
 764      * @param dest      The char buffer to fill in
 765      * @param destStart Start index of the destination buffer
 766      * @param destLimit End index of the destination buffer
 767      * @param mode      The normalization mode; one of Normalizer.NONE,
 768      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
 769      *                   Normalizer.NFKD, Normalizer.DEFAULT
 770      * @param options The normalization options, ORed together (0 for no options).
 771      * @return int      The total buffer size needed;if greater than length of
 772      *                   result, the output was truncated.
 773      * @exception       IndexOutOfBoundsException if the target capacity is
 774      *                   less than the required length
 775      * @stable ICU 2.6
 776      */
 777     public static int normalize(char[] src,int srcStart, int srcLimit,
 778                                 char[] dest,int destStart, int destLimit,
 779                                 Mode  mode, int options) {
 780         int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);
 781 
 782         if(length<=(destLimit-destStart)) {
 783             return length;
 784         } else {
 785             throw new IndexOutOfBoundsException(Integer.toString(length));
 786         }
 787     }
 788 
 789     //-------------------------------------------------------------------------
 790     // Iteration API
 791     //-------------------------------------------------------------------------
 792 
 793     /**
 794      * Return the current character in the normalized text->
 795      * @return The codepoint as an int
 796      * @stable ICU 2.8
 797      */
 798     public int current() {
 799         if(bufferPos<bufferLimit || nextNormalize()) {
 800             return getCodePointAt(bufferPos);
 801         } else {
 802             return DONE;
 803         }
 804     }
 805 
 806     /**
 807      * Return the next character in the normalized text and advance
 808      * the iteration position by one.  If the end
 809      * of the text has already been reached, {@link #DONE} is returned.
 810      * @return The codepoint as an int
 811      * @stable ICU 2.8
 812      */
 813     public int next() {
 814         if(bufferPos<bufferLimit ||  nextNormalize()) {
 815             int c=getCodePointAt(bufferPos);
 816             bufferPos+=(c>0xFFFF) ? 2 : 1;
 817             return c;
 818         } else {
 819             return DONE;
 820         }
 821     }
 822 
 823 
 824     /**
 825      * Return the previous character in the normalized text and decrement
 826      * the iteration position by one.  If the beginning
 827      * of the text has already been reached, {@link #DONE} is returned.
 828      * @return The codepoint as an int
 829      * @stable ICU 2.8
 830      */
 831     public int previous() {
 832         if(bufferPos>0 || previousNormalize()) {
 833             int c=getCodePointAt(bufferPos-1);
 834             bufferPos-=(c>0xFFFF) ? 2 : 1;
 835             return c;
 836         } else {
 837             return DONE;
 838         }
 839     }
 840 
 841     /**
 842      * Reset the index to the beginning of the text.
 843      * This is equivalent to setIndexOnly(startIndex)).
 844      * @stable ICU 2.8
 845      */
 846     public void reset() {
 847         text.setIndex(0);
 848         currentIndex=nextIndex=0;
 849         clearBuffer();
 850     }
 851 
 852     /**
 853      * Set the iteration position in the input text that is being normalized,
 854      * without any immediate normalization.
 855      * After setIndexOnly(), getIndex() will return the same index that is
 856      * specified here.
 857      *
 858      * @param index the desired index in the input text.
 859      * @stable ICU 2.8
 860      */
 861     public void setIndexOnly(int index) {
 862         text.setIndex(index);
 863         currentIndex=nextIndex=index; // validates index
 864         clearBuffer();
 865     }
 866 
 867     /**
 868      * Set the iteration position in the input text that is being normalized
 869      * and return the first normalized character at that position.
 870      * <p>
 871      * <b>Note:</b> This method sets the position in the <em>input</em> text,
 872      * while {@link #next} and {@link #previous} iterate through characters
 873      * in the normalized <em>output</em>.  This means that there is not
 874      * necessarily a one-to-one correspondence between characters returned
 875      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
 876      * returned from <tt>setIndex</tt> and {@link #getIndex}.
 877      * <p>
 878      * @param index the desired index in the input text->
 879      *
 880      * @return   the first normalized character that is the result of iterating
 881      *            forward starting at the given index.
 882      *
 883      * @throws IllegalArgumentException if the given index is less than
 884      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 885      * @return The codepoint as an int
 886      * @deprecated ICU 3.2
 887      * @obsolete ICU 3.2
 888      */
 889      @Deprecated
 890      public int setIndex(int index) {
 891          setIndexOnly(index);
 892          return current();
 893      }
 894 
 895     /**
 896      * Retrieve the index of the start of the input text. This is the begin
 897      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
 898      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
 899      * @deprecated ICU 2.2. Use startIndex() instead.
 900      * @return The codepoint as an int
 901      * @see #startIndex
 902      */
 903     @Deprecated
 904     public int getBeginIndex() {
 905         return 0;
 906     }
 907 
 908     /**
 909      * Retrieve the index of the end of the input text.  This is the end index
 910      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 911      * over which this <tt>Normalizer</tt> is iterating
 912      * @deprecated ICU 2.2. Use endIndex() instead.
 913      * @return The codepoint as an int
 914      * @see #endIndex
 915      */
 916     @Deprecated
 917     public int getEndIndex() {
 918         return endIndex();
 919     }
 920 
 921     /**
 922      * Retrieve the current iteration position in the input text that is
 923      * being normalized.  This method is useful in applications such as
 924      * searching, where you need to be able to determine the position in
 925      * the input text that corresponds to a given normalized output character.
 926      * <p>
 927      * <b>Note:</b> This method sets the position in the <em>input</em>, while
 928      * {@link #next} and {@link #previous} iterate through characters in the
 929      * <em>output</em>.  This means that there is not necessarily a one-to-one
 930      * correspondence between characters returned by <tt>next</tt> and
 931      * <tt>previous</tt> and the indices passed to and returned from
 932      * <tt>setIndex</tt> and {@link #getIndex}.
 933      * @return The current iteration position
 934      * @stable ICU 2.8
 935      */
 936     public int getIndex() {
 937         if(bufferPos<bufferLimit) {
 938             return currentIndex;
 939         } else {
 940             return nextIndex;
 941         }
 942     }
 943 
 944     /**
 945      * Retrieve the index of the end of the input text->  This is the end index
 946      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 947      * over which this <tt>Normalizer</tt> is iterating
 948      * @return The current iteration position
 949      * @stable ICU 2.8
 950      */
 951     public int endIndex() {
 952         return text.getLength();
 953     }
 954 
 955     //-------------------------------------------------------------------------
 956     // Property access methods
 957     //-------------------------------------------------------------------------
 958     /**
 959      * Set the normalization mode for this object.
 960      * <p>
 961      * <b>Note:</b>If the normalization mode is changed while iterating
 962      * over a string, calls to {@link #next} and {@link #previous} may
 963      * return previously buffers characters in the old normalization mode
 964      * until the iteration is able to re-sync at the next base character.
 965      * It is safest to call {@link #setText setText()}, {@link #first},
 966      * {@link #last}, etc. after calling <tt>setMode</tt>.
 967      * <p>
 968      * @param newMode the new mode for this <tt>Normalizer</tt>.
 969      * The supported modes are:
 970      * <ul>
 971      *  <li>{@link #COMPOSE}        - Unicode canonical decompositiion
 972      *                                  followed by canonical composition.
 973      *  <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
 974      *                                  follwed by canonical composition.
 975      *  <li>{@link #DECOMP}         - Unicode canonical decomposition
 976      *  <li>{@link #DECOMP_COMPAT}  - Unicode compatibility decomposition.
 977      *  <li>{@link #NO_OP}          - Do nothing but return characters
 978      *                                  from the underlying input text.
 979      * </ul>
 980      *
 981      * @see #getMode
 982      * @stable ICU 2.8
 983      */
 984     public void setMode(Mode newMode) {
 985         mode = newMode;
 986     }
 987     /**
 988      * Return the basic operation performed by this <tt>Normalizer</tt>
 989      *
 990      * @see #setMode
 991      * @stable ICU 2.8
 992      */
 993     public Mode getMode() {
 994         return mode;
 995     }
 996 
 997     /**
 998      * Set the input text over which this <tt>Normalizer</tt> will iterate.
 999      * The iteration position is set to the beginning of the input text->
1000      * @param newText   The new string to be normalized.
1001      * @stable ICU 2.8
1002      */
1003     public void setText(String newText) {
1004 
1005         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1006         if (newIter == null) {
1007             throw new InternalError("Could not create a new UCharacterIterator");
1008         }
1009         text = newIter;
1010         reset();
1011     }
1012 
1013     /**
1014      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1015      * The iteration position is set to the beginning of the input text->
1016      * @param newText   The new string to be normalized.
1017      * @stable ICU 2.8
1018      */
1019     public void setText(CharacterIterator newText) {
1020 
1021         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1022         if (newIter == null) {
1023             throw new InternalError("Could not create a new UCharacterIterator");
1024         }
1025         text = newIter;
1026         currentIndex=nextIndex=0;
1027         clearBuffer();
1028     }
1029 
1030     //-------------------------------------------------------------------------
1031     // Private utility methods
1032     //-------------------------------------------------------------------------
1033 
1034 
1035     /* backward iteration --------------------------------------------------- */
1036 
1037     /*
1038      * read backwards and get norm32
1039      * return 0 if the character is <minC
1040      * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1041      * surrogate but read second!)
1042      */
1043 
1044     private static  long getPrevNorm32(UCharacterIterator src,
1045                                        int/*unsigned*/ minC,
1046                                        int/*unsigned*/ mask,
1047                                        char[] chars) {
1048         long norm32;
1049         int ch=0;
1050         /* need src.hasPrevious() */
1051         if((ch=src.previous()) == UCharacterIterator.DONE) {
1052             return 0;
1053         }
1054         chars[0]=(char)ch;
1055         chars[1]=0;
1056 
1057         /* check for a surrogate before getting norm32 to see if we need to
1058          * predecrement further */
1059         if(chars[0]<minC) {
1060             return 0;
1061         } else if(!UTF16.isSurrogate(chars[0])) {
1062             return NormalizerImpl.getNorm32(chars[0]);
1063         } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) {
1064             /* unpaired surrogate */
1065             chars[1]=(char)src.current();
1066             return 0;
1067         } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) {
1068             norm32=NormalizerImpl.getNorm32(chars[1]);
1069             if((norm32&mask)==0) {
1070                 /* all surrogate pairs with this lead surrogate have irrelevant
1071                  * data */
1072                 return 0;
1073             } else {
1074                 /* norm32 must be a surrogate special */
1075                 return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]);
1076             }
1077         } else {
1078             /* unpaired second surrogate, undo the c2=src.previous() movement */
1079             src.moveIndex( 1);
1080             return 0;
1081         }
1082     }
1083 
1084     private interface IsPrevBoundary{
1085         public boolean isPrevBoundary(UCharacterIterator src,
1086                                       int/*unsigned*/ minC,
1087                                       int/*unsigned*/ mask,
1088                                       char[] chars);
1089     }
1090     private static final class IsPrevNFDSafe implements IsPrevBoundary{
1091         /*
1092          * for NF*D:
1093          * read backwards and check if the lead combining class is 0
1094          * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1095          * surrogate but read second!)
1096          */
1097         public boolean isPrevBoundary(UCharacterIterator src,
1098                                       int/*unsigned*/ minC,
1099                                       int/*unsigned*/ ccOrQCMask,
1100                                       char[] chars) {
1101 
1102             return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
1103                                                           ccOrQCMask, chars),
1104                                             ccOrQCMask,
1105                                             ccOrQCMask& NormalizerImpl.QC_MASK);
1106         }
1107     }
1108 
1109     private static final class IsPrevTrueStarter implements IsPrevBoundary{
1110         /*
1111          * read backwards and check if the character is (or its decomposition
1112          * begins with) a "true starter" (cc==0 and NF*C_YES)
1113          * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
1114          * surrogate but read second!)
1115          */
1116         public boolean isPrevBoundary(UCharacterIterator src,
1117                                       int/*unsigned*/ minC,
1118                                       int/*unsigned*/ ccOrQCMask,
1119                                       char[] chars) {
1120             long norm32;
1121             int/*unsigned*/ decompQCMask;
1122 
1123             decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
1124             norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
1125             return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask);
1126         }
1127     }
1128 
1129     private static int findPreviousIterationBoundary(UCharacterIterator src,
1130                                                      IsPrevBoundary obj,
1131                                                      int/*unsigned*/ minC,
1132                                                      int/*mask*/ mask,
1133                                                      char[] buffer,
1134                                                      int[] startIndex) {
1135         char[] chars=new char[2];
1136         boolean isBoundary;
1137 
1138         /* fill the buffer from the end backwards */
1139         startIndex[0] = buffer.length;
1140         chars[0]=0;
1141         while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) {
1142             isBoundary=obj.isPrevBoundary(src, minC, mask, chars);
1143 
1144             /* always write this character to the front of the buffer */
1145             /* make sure there is enough space in the buffer */
1146             if(startIndex[0] < (chars[1]==0 ? 1 : 2)) {
1147 
1148                 // grow the buffer
1149                 char[] newBuf = new char[buffer.length*2];
1150                 /* move the current buffer contents up */
1151                 System.arraycopy(buffer,startIndex[0],newBuf,
1152                                  newBuf.length-(buffer.length-startIndex[0]),
1153                                  buffer.length-startIndex[0]);
1154                 //adjust the startIndex
1155                 startIndex[0]+=newBuf.length-buffer.length;
1156 
1157                 buffer=newBuf;
1158                 newBuf=null;
1159 
1160             }
1161 
1162             buffer[--startIndex[0]]=chars[0];
1163             if(chars[1]!=0) {
1164                 buffer[--startIndex[0]]=chars[1];
1165             }
1166 
1167             /* stop if this just-copied character is a boundary */
1168             if(isBoundary) {
1169                 break;
1170             }
1171         }
1172 
1173         /* return the length of the buffer contents */
1174         return buffer.length-startIndex[0];
1175     }
1176 
1177     private static int previous(UCharacterIterator src,
1178                                 char[] dest, int destStart, int destLimit,
1179                                 Mode mode,
1180                                 boolean doNormalize,
1181                                 boolean[] pNeededToNormalize,
1182                                 int options) {
1183 
1184         IsPrevBoundary isPreviousBoundary;
1185         int destLength, bufferLength;
1186         int/*unsigned*/ mask;
1187         int c,c2;
1188 
1189         char minC;
1190         int destCapacity = destLimit-destStart;
1191         destLength=0;
1192 
1193         if(pNeededToNormalize!=null) {
1194             pNeededToNormalize[0]=false;
1195         }
1196         minC = (char)mode.getMinC();
1197         mask = mode.getMask();
1198         isPreviousBoundary = mode.getPrevBoundary();
1199 
1200         if(isPreviousBoundary==null) {
1201             destLength=0;
1202             if((c=src.previous())>=0) {
1203                 destLength=1;
1204                 if(UTF16.isTrailSurrogate((char)c)) {
1205                     c2= src.previous();
1206                     if(c2!= UCharacterIterator.DONE) {
1207                         if(UTF16.isLeadSurrogate((char)c2)) {
1208                             if(destCapacity>=2) {
1209                                 dest[1]=(char)c; // trail surrogate
1210                                 destLength=2;
1211                             }
1212                             // lead surrogate to be written below
1213                             c=c2;
1214                         } else {
1215                             src.moveIndex(1);
1216                         }
1217                     }
1218                 }
1219 
1220                 if(destCapacity>0) {
1221                     dest[0]=(char)c;
1222                 }
1223             }
1224             return destLength;
1225         }
1226 
1227         char[] buffer = new char[100];
1228         int[] startIndex= new int[1];
1229         bufferLength=findPreviousIterationBoundary(src,
1230                                                    isPreviousBoundary,
1231                                                    minC, mask,buffer,
1232                                                    startIndex);
1233         if(bufferLength>0) {
1234             if(doNormalize) {
1235                 destLength=NormalizerBase.normalize(buffer,startIndex[0],
1236                                                 startIndex[0]+bufferLength,
1237                                                 dest, destStart,destLimit,
1238                                                 mode, options);
1239 
1240                 if(pNeededToNormalize!=null) {
1241                     pNeededToNormalize[0]=destLength!=bufferLength ||
1242                                           Utility.arrayRegionMatches(
1243                                             buffer,0,dest,
1244                                             destStart,destLimit
1245                                           );
1246                 }
1247             } else {
1248                 /* just copy the source characters */
1249                 if(destCapacity>0) {
1250                     System.arraycopy(buffer,startIndex[0],dest,0,
1251                                      (bufferLength<destCapacity) ?
1252                                      bufferLength : destCapacity
1253                                      );
1254                 }
1255             }
1256         }
1257 
1258 
1259         return destLength;
1260     }
1261 
1262 
1263 
1264     /* forward iteration ---------------------------------------------------- */
1265     /*
1266      * read forward and check if the character is a next-iteration boundary
1267      * if c2!=0 then (c, c2) is a surrogate pair
1268      */
1269     private interface IsNextBoundary{
1270         boolean isNextBoundary(UCharacterIterator src,
1271                                int/*unsigned*/ minC,
1272                                int/*unsigned*/ mask,
1273                                int[] chars);
1274     }
1275     /*
1276      * read forward and get norm32
1277      * return 0 if the character is <minC
1278      * if c2!=0 then (c2, c) is a surrogate pair
1279      * always reads complete characters
1280      */
1281     private static long /*unsigned*/ getNextNorm32(UCharacterIterator src,
1282                                                    int/*unsigned*/ minC,
1283                                                    int/*unsigned*/ mask,
1284                                                    int[] chars) {
1285         long norm32;
1286 
1287         /* need src.hasNext() to be true */
1288         chars[0]=src.next();
1289         chars[1]=0;
1290 
1291         if(chars[0]<minC) {
1292             return 0;
1293         }
1294 
1295         norm32=NormalizerImpl.getNorm32((char)chars[0]);
1296         if(UTF16.isLeadSurrogate((char)chars[0])) {
1297             if(src.current()!=UCharacterIterator.DONE &&
1298                UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) {
1299                 src.moveIndex(1); /* skip the c2 surrogate */
1300                 if((norm32&mask)==0) {
1301                     /* irrelevant data */
1302                     return 0;
1303                 } else {
1304                     /* norm32 must be a surrogate special */
1305                     return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]);
1306                 }
1307             } else {
1308                 /* unmatched surrogate */
1309                 return 0;
1310             }
1311         }
1312         return norm32;
1313     }
1314 
1315 
1316     /*
1317      * for NF*D:
1318      * read forward and check if the lead combining class is 0
1319      * if c2!=0 then (c, c2) is a surrogate pair
1320      */
1321     private static final class IsNextNFDSafe implements IsNextBoundary{
1322         public boolean isNextBoundary(UCharacterIterator src,
1323                                       int/*unsigned*/ minC,
1324                                       int/*unsigned*/ ccOrQCMask,
1325                                       int[] chars) {
1326             return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars),
1327                                             ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK);
1328         }
1329     }
1330 
1331     /*
1332      * for NF*C:
1333      * read forward and check if the character is (or its decomposition begins
1334      * with) a "true starter" (cc==0 and NF*C_YES)
1335      * if c2!=0 then (c, c2) is a surrogate pair
1336      */
1337     private static final class IsNextTrueStarter implements IsNextBoundary{
1338         public boolean isNextBoundary(UCharacterIterator src,
1339                                       int/*unsigned*/ minC,
1340                                       int/*unsigned*/ ccOrQCMask,
1341                                       int[] chars) {
1342             long norm32;
1343             int/*unsigned*/ decompQCMask;
1344 
1345             decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
1346             norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
1347             return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask);
1348         }
1349     }
1350 
1351     private static int findNextIterationBoundary(UCharacterIterator src,
1352                                                  IsNextBoundary obj,
1353                                                  int/*unsigned*/ minC,
1354                                                  int/*unsigned*/ mask,
1355                                                  char[] buffer) {
1356         if(src.current()==UCharacterIterator.DONE) {
1357             return 0;
1358         }
1359 
1360         /* get one character and ignore its properties */
1361         int[] chars = new int[2];
1362         chars[0]=src.next();
1363         buffer[0]=(char)chars[0];
1364         int bufferIndex = 1;
1365 
1366         if(UTF16.isLeadSurrogate((char)chars[0])&&
1367            src.current()!=UCharacterIterator.DONE) {
1368             if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) {
1369                 buffer[bufferIndex++]=(char)chars[1];
1370             } else {
1371                 src.moveIndex(-1); /* back out the non-trail-surrogate */
1372             }
1373         }
1374 
1375         /* get all following characters until we see a boundary */
1376         /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
1377          * is part of the string */
1378         while( src.current()!=UCharacterIterator.DONE) {
1379             if(obj.isNextBoundary(src, minC, mask, chars)) {
1380                 /* back out the latest movement to stop at the boundary */
1381                 src.moveIndex(chars[1]==0 ? -1 : -2);
1382                 break;
1383             } else {
1384                 if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) {
1385                     buffer[bufferIndex++]=(char)chars[0];
1386                     if(chars[1]!=0) {
1387                         buffer[bufferIndex++]=(char)chars[1];
1388                     }
1389                 } else {
1390                     char[] newBuf = new char[buffer.length*2];
1391                     System.arraycopy(buffer,0,newBuf,0,bufferIndex);
1392                     buffer = newBuf;
1393                     buffer[bufferIndex++]=(char)chars[0];
1394                     if(chars[1]!=0) {
1395                         buffer[bufferIndex++]=(char)chars[1];
1396                     }
1397                 }
1398             }
1399         }
1400 
1401         /* return the length of the buffer contents */
1402         return bufferIndex;
1403     }
1404 
1405     private static int next(UCharacterIterator src,
1406                             char[] dest, int destStart, int destLimit,
1407                             NormalizerBase.Mode mode,
1408                             boolean doNormalize,
1409                             boolean[] pNeededToNormalize,
1410                             int options) {
1411 
1412         IsNextBoundary isNextBoundary;
1413         int /*unsigned*/ mask;
1414         int /*unsigned*/ bufferLength;
1415         int c,c2;
1416         char minC;
1417         int destCapacity = destLimit - destStart;
1418         int destLength = 0;
1419         if(pNeededToNormalize!=null) {
1420             pNeededToNormalize[0]=false;
1421         }
1422 
1423         minC = (char)mode.getMinC();
1424         mask = mode.getMask();
1425         isNextBoundary = mode.getNextBoundary();
1426 
1427         if(isNextBoundary==null) {
1428             destLength=0;
1429             c=src.next();
1430             if(c!=UCharacterIterator.DONE) {
1431                 destLength=1;
1432                 if(UTF16.isLeadSurrogate((char)c)) {
1433                     c2= src.next();
1434                     if(c2!= UCharacterIterator.DONE) {
1435                         if(UTF16.isTrailSurrogate((char)c2)) {
1436                             if(destCapacity>=2) {
1437                                 dest[1]=(char)c2; // trail surrogate
1438                                 destLength=2;
1439                             }
1440                             // lead surrogate to be written below
1441                         } else {
1442                             src.moveIndex(-1);
1443                         }
1444                     }
1445                 }
1446 
1447                 if(destCapacity>0) {
1448                     dest[0]=(char)c;
1449                 }
1450             }
1451             return destLength;
1452         }
1453 
1454         char[] buffer=new char[100];
1455         int[] startIndex = new int[1];
1456         bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask,
1457                                                buffer);
1458         if(bufferLength>0) {
1459             if(doNormalize) {
1460                 destLength=mode.normalize(buffer,startIndex[0],bufferLength,
1461                                           dest,destStart,destLimit, options);
1462 
1463                 if(pNeededToNormalize!=null) {
1464                     pNeededToNormalize[0]=destLength!=bufferLength ||
1465                                           Utility.arrayRegionMatches(buffer,startIndex[0],
1466                                             dest,destStart,
1467                                             destLength);
1468                 }
1469             } else {
1470                 /* just copy the source characters */
1471                 if(destCapacity>0) {
1472                     System.arraycopy(buffer,0,dest,destStart,
1473                                      Math.min(bufferLength,destCapacity)
1474                                      );
1475                 }
1476 
1477 
1478             }
1479         }
1480         return destLength;
1481     }
1482 
1483     private void clearBuffer() {
1484         bufferLimit=bufferStart=bufferPos=0;
1485     }
1486 
1487     private boolean nextNormalize() {
1488 
1489         clearBuffer();
1490         currentIndex=nextIndex;
1491         text.setIndex(nextIndex);
1492 
1493         bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options);
1494 
1495         nextIndex=text.getIndex();
1496         return (bufferLimit>0);
1497     }
1498 
1499     private boolean previousNormalize() {
1500 
1501         clearBuffer();
1502         nextIndex=currentIndex;
1503         text.setIndex(currentIndex);
1504         bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options);
1505 
1506         currentIndex=text.getIndex();
1507         bufferPos = bufferLimit;
1508         return bufferLimit>0;
1509     }
1510 
1511     private int getCodePointAt(int index) {
1512         if( UTF16.isSurrogate(buffer[index])) {
1513             if(UTF16.isLeadSurrogate(buffer[index])) {
1514                 if((index+1)<bufferLimit &&
1515                    UTF16.isTrailSurrogate(buffer[index+1])) {
1516                     return UCharacterProperty.getRawSupplementary(
1517                                                                   buffer[index],
1518                                                                   buffer[index+1]
1519                                                                   );
1520                 }
1521             }else if(UTF16.isTrailSurrogate(buffer[index])) {
1522                 if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) {
1523                     return UCharacterProperty.getRawSupplementary(
1524                                                                   buffer[index-1],
1525                                                                   buffer[index]
1526                                                                   );
1527                 }
1528             }
1529         }
1530         return buffer[index];
1531 
1532     }
1533 
1534     /**
1535      * Internal API
1536      * @internal
1537      */
1538     public static boolean isNFSkippable(int c, Mode mode) {
1539         return mode.isNFSkippable(c);
1540     }
1541 
1542     //
1543     // Options
1544     //
1545 
    /*
     * Default option for Unicode 3.2.0 normalization: UNICODE_3_2 combined
     * with NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS and
     * NormalizerImpl.BEFORE_PRI_29.
     *
     * Corrigendum 4 was fixed in Unicode 3.2.0, but the fix isn't supported
     * in IDNA/StringPrep.
     * Public Review Issue #29 was fixed in Unicode 4.1.0. Corrigendum 5
     * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but, like
     * Corrigendum 4, it isn't supported by IDNA/StringPrep.
     */
    public static final int UNICODE_3_2_0_ORIGINAL =
                               UNICODE_3_2 |
                               NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
                               NormalizerImpl.BEFORE_PRI_29;
1558 
    /*
     * Default option for the latest Unicode normalization. This option is
     * provided mainly for testing.
     * The value zero (no option bits set) means that normalization is done
     * with the fixes for
     *   - Corrigendum 4 (Five CJK Canonical Mapping Errors)
     *   - Corrigendum 5 (Normalization Idempotency)
     */
    public static final int UNICODE_LATEST = 0x00;
1567 
1568     //
1569     // public constructor and methods for java.text.Normalizer and
1570     // sun.text.Normalizer
1571     //
1572 
    /**
     * Creates a new <tt>Normalizer</tt> object for iterating over the
     * normalized form of a given string.
     *
     * This is a convenience constructor: it delegates to the three-argument
     * constructor, passing {@link #UNICODE_LATEST} as the options value.
     *
     * @param str  The string to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode The normalization mode.
     */
    public NormalizerBase(String str, Mode mode) {
          this(str, mode, UNICODE_LATEST);
    }
1585 
1586     /**
1587      * Normalizes a <code>String</code> using the given normalization form.
1588      *
1589      * @param str      the input string to be normalized.
1590      * @param form     the normalization form
1591      */
1592     public static String normalize(String str, Normalizer.Form form) {
1593         return normalize(str, form, UNICODE_LATEST);
1594     }
1595 
1596     /**
1597      * Normalizes a <code>String</code> using the given normalization form.
1598      *
1599      * @param str      the input string to be normalized.
1600      * @param form     the normalization form
1601      * @param options   the optional features to be enabled.
1602      */
1603     public static String normalize(String str, Normalizer.Form form, int options) {
1604         int len = str.length();
1605         boolean asciiOnly = true;
1606         if (len < 80) {
1607             for (int i = 0; i < len; i++) {
1608                 if (str.charAt(i) > 127) {
1609                     asciiOnly = false;
1610                     break;
1611                 }
1612             }
1613         } else {
1614             char[] a = str.toCharArray();
1615             for (int i = 0; i < len; i++) {
1616                 if (a[i] > 127) {
1617                     asciiOnly = false;
1618                     break;
1619                 }
1620             }
1621         }
1622 
1623         switch (form) {
1624         case NFC :
1625             return asciiOnly ? str : NFC.normalize(str, options);
1626         case NFD :
1627             return asciiOnly ? str : NFD.normalize(str, options);
1628         case NFKC :
1629             return asciiOnly ? str : NFKC.normalize(str, options);
1630         case NFKD :
1631             return asciiOnly ? str : NFKD.normalize(str, options);
1632         }
1633 
1634         throw new IllegalArgumentException("Unexpected normalization form: " +
1635                                            form);
1636     }
1637 
1638     /**
1639      * Test if a string is in a given normalization form.
1640      * This is semantically equivalent to source.equals(normalize(source, mode)).
1641      *
1642      * Unlike quickCheck(), this function returns a definitive result,
1643      * never a "maybe".
1644      * For NFD, NFKD, and FCD, both functions work exactly the same.
1645      * For NFC and NFKC where quickCheck may return "maybe", this function will
1646      * perform further tests to arrive at a true/false result.
1647      * @param str       the input string to be checked to see if it is normalized
1648      * @param form      the normalization form
1649      * @param options   the optional features to be enabled.
1650      */
1651     public static boolean isNormalized(String str, Normalizer.Form form) {
1652         return isNormalized(str, form, UNICODE_LATEST);
1653     }
1654 
1655     /**
1656      * Test if a string is in a given normalization form.
1657      * This is semantically equivalent to source.equals(normalize(source, mode)).
1658      *
1659      * Unlike quickCheck(), this function returns a definitive result,
1660      * never a "maybe".
1661      * For NFD, NFKD, and FCD, both functions work exactly the same.
1662      * For NFC and NFKC where quickCheck may return "maybe", this function will
1663      * perform further tests to arrive at a true/false result.
1664      * @param str       the input string to be checked to see if it is normalized
1665      * @param form      the normalization form
1666      * @param options   the optional features to be enabled.
1667      */
1668     public static boolean isNormalized(String str, Normalizer.Form form, int options) {
1669         switch (form) {
1670         case NFC:
1671             return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1672         case NFD:
1673             return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1674         case NFKC:
1675             return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1676         case NFKD:
1677             return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
1678         }
1679 
1680         throw new IllegalArgumentException("Unexpected normalization form: " +
1681                                            form);
1682     }
1683 }