1 /*
   2  * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package sun.text.normalizer;
  34 
  35 /**
  36  * Unicode normalization functionality for standard Unicode normalization or
  37  * for using custom mapping tables.
  38  * All instances of this class are unmodifiable/immutable.
  39  * The Normalizer2 class is not intended for public subclassing.
  40  * <p>
  41  * The primary functions are to produce a normalized string and to detect whether
  42  * a string is already normalized.
  43  * The most commonly used normalization forms are those defined in
  44  * http://www.unicode.org/unicode/reports/tr15/
  45  * However, this API supports additional normalization forms for specialized purposes.
  46  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
  47  * and can be used in implementations of UTS #46.
  48  * <p>
  49  * Not only are the standard compose and decompose modes supplied,
  50  * but additional modes are provided as documented in the Mode enum.
  51  * <p>
  52  * Some of the functions in this class identify normalization boundaries.
  53  * At a normalization boundary, the portions of the string
  54  * before it and starting from it do not interact and can be handled independently.
  55  * <p>
  56  * The spanQuickCheckYes() stops at a normalization boundary.
  57  * When the goal is a normalized string, then the text before the boundary
  58  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  59  * <p>
  60  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  61  * a character is guaranteed to be at a normalization boundary,
  62  * regardless of context.
  63  * This is used for moving from one normalization boundary to the next
  64  * or preceding boundary, and for performing iterative normalization.
  65  * <p>
  66  * Iterative normalization is useful when only a small portion of a
  67  * longer string needs to be processed.
  68  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  69  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  70  * (to process only the substring for which sort key bytes are computed).
  71  * <p>
  72  * The set of normalization boundaries returned by these functions may not be
  73  * complete: There may be more boundaries that could be returned.
  74  * Different functions may return different boundaries.
  75  * @stable ICU 4.4
  76  * @author Markus W. Scherer
  77  */
  78 abstract class Normalizer2 {
  79 
  80     /**
  81      * Returns a Normalizer2 instance for Unicode NFC normalization.
  82      * Same as getInstance(null, "nfc", Mode.COMPOSE).
  83      * Returns an unmodifiable singleton instance.
  84      * @return the requested Normalizer2, if successful
  85      * @stable ICU 49
  86      */
  87     public static Normalizer2 getNFCInstance() {
  88         return Norm2AllModes.getNFCInstance().comp;
  89     }
  90 
  91     /**
  92      * Returns a Normalizer2 instance for Unicode NFD normalization.
  93      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
  94      * Returns an unmodifiable singleton instance.
  95      * @return the requested Normalizer2, if successful
  96      * @stable ICU 49
  97      */
  98     public static Normalizer2 getNFDInstance() {
  99         return Norm2AllModes.getNFCInstance().decomp;
 100     }
 101 
 102     /**
 103      * Returns a Normalizer2 instance for Unicode NFKC normalization.
 104      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
 105      * Returns an unmodifiable singleton instance.
 106      * @return the requested Normalizer2, if successful
 107      * @stable ICU 49
 108      */
 109     public static Normalizer2 getNFKCInstance() {
 110         return Norm2AllModes.getNFKCInstance().comp;
 111     }
 112 
 113     /**
 114      * Returns a Normalizer2 instance for Unicode NFKD normalization.
 115      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
 116      * Returns an unmodifiable singleton instance.
 117      * @return the requested Normalizer2, if successful
 118      * @stable ICU 49
 119      */
 120     public static Normalizer2 getNFKDInstance() {
 121         return Norm2AllModes.getNFKCInstance().decomp;
 122     }
 123 
 124     /**
 125      * Returns the normalized form of the source string.
 126      * @param src source string
 127      * @return normalized src
 128      * @stable ICU 4.4
 129      */
 130     public String normalize(CharSequence src) {
 131         if(src instanceof String) {
 132             // Fastpath: Do not construct a new String if the src is a String
 133             // and is already normalized.
 134             int spanLength=spanQuickCheckYes(src);
 135             if(spanLength==src.length()) {
 136                 return (String)src;
 137             }
 138             if (spanLength != 0) {
 139                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
 140                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
 141             }
 142         }
 143         return normalize(src, new StringBuilder(src.length())).toString();
 144     }
 145 
 146     /**
 147      * Writes the normalized form of the source string to the destination string
 148      * (replacing its contents) and returns the destination string.
 149      * The source and destination strings must be different objects.
 150      * @param src source string
 151      * @param dest destination string; its contents is replaced with normalized src
 152      * @return dest
 153      * @stable ICU 4.4
 154      */
 155     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
 156 
 157     /**
 158      * Writes the normalized form of the source string to the destination Appendable
 159      * and returns the destination Appendable.
 160      * The source and destination strings must be different objects.
 161      *
 162      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
 163      *
 164      * @param src source string
 165      * @param dest destination Appendable; gets normalized src appended
 166      * @return dest
 167      * @stable ICU 4.6
 168      */
 169     public abstract Appendable normalize(CharSequence src, Appendable dest);
 170 
 171     /**
 172      * Appends the normalized form of the second string to the first string
 173      * (merging them at the boundary) and returns the first string.
 174      * The result is normalized if the first string was normalized.
 175      * The first and second strings must be different objects.
 176      * @param first string, should be normalized
 177      * @param second string, will be normalized
 178      * @return first
 179      * @stable ICU 4.4
 180      */
 181     public abstract StringBuilder normalizeSecondAndAppend(
 182             StringBuilder first, CharSequence second);
 183 
 184     /**
 185      * Appends the second string to the first string
 186      * (merging them at the boundary) and returns the first string.
 187      * The result is normalized if both the strings were normalized.
 188      * The first and second strings must be different objects.
 189      * @param first string, should be normalized
 190      * @param second string, should be normalized
 191      * @return first
 192      * @stable ICU 4.4
 193      */
 194     public abstract StringBuilder append(StringBuilder first, CharSequence second);
 195 
 196     /**
 197      * Gets the decomposition mapping of c.
 198      * Roughly equivalent to normalizing the String form of c
 199      * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
 200      * returns null if c does not have a decomposition mapping in this instance's data.
 201      * This function is independent of the mode of the Normalizer2.
 202      * @param c code point
 203      * @return c's decomposition mapping, if any; otherwise null
 204      * @stable ICU 4.6
 205      */
 206     public abstract String getDecomposition(int c);
 207 
 208     /**
 209      * Gets the combining class of c.
 210      * The default implementation returns 0
 211      * but all standard implementations return the Unicode Canonical_Combining_Class value.
 212      * @param c code point
 213      * @return c's combining class
 214      * @stable ICU 49
 215      */
 216     public int getCombiningClass(int c) { return 0; }
 217 
 218     /**
 219      * Tests if the string is normalized.
 220      * Internally, in cases where the quickCheck() method would return "maybe"
 221      * (which is only possible for the two COMPOSE modes) this method
 222      * resolves to "yes" or "no" to provide a definitive result,
 223      * at the cost of doing more work in those cases.
 224      * @param s input string
 225      * @return true if s is normalized
 226      * @stable ICU 4.4
 227      */
 228     public abstract boolean isNormalized(CharSequence s);
 229 
 230     /**
 231      * Returns the end of the normalized substring of the input string.
 232      * In other words, with <code>end=spanQuickCheckYes(s);</code>
 233      * the substring <code>s.subSequence(0, end)</code>
 234      * will pass the quick check with a "yes" result.
 235      * <p>
 236      * The returned end index is usually one or more characters before the
 237      * "no" or "maybe" character: The end index is at a normalization boundary.
 238      * (See the class documentation for more about normalization boundaries.)
 239      * <p>
 240      * When the goal is a normalized string and most input strings are expected
 241      * to be normalized already, then call this method,
 242      * and if it returns a prefix shorter than the input string,
 243      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
 244      * @param s input string
 245      * @return "yes" span end index
 246      * @stable ICU 4.4
 247      */
 248     public abstract int spanQuickCheckYes(CharSequence s);
 249 
 250     /**
 251      * Tests if the character always has a normalization boundary before it,
 252      * regardless of context.
 253      * If true, then the character does not normalization-interact with
 254      * preceding characters.
 255      * In other words, a string containing this character can be normalized
 256      * by processing portions before this character and starting from this
 257      * character independently.
 258      * This is used for iterative normalization. See the class documentation for details.
 259      * @param c character to test
 260      * @return true if c has a normalization boundary before it
 261      * @stable ICU 4.4
 262      */
 263     public abstract boolean hasBoundaryBefore(int c);
 264 
 265     /**
 266      * Sole constructor.  (For invocation by subclass constructors,
 267      * typically implicit.)
 268      * @internal
 269      * deprecated This API is ICU internal only.
 270      */
 271     protected Normalizer2() {
 272     }
 273 }