1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package jdk.internal.icu.text;
  34 
  35 import jdk.internal.icu.impl.Norm2AllModes;
  36 
  37 /**
  38  * Unicode normalization functionality for standard Unicode normalization or
  39  * for using custom mapping tables.
  40  * All instances of this class are unmodifiable/immutable.
  41  * The Normalizer2 class is not intended for public subclassing.
  42  * <p>
  43  * The primary functions are to produce a normalized string and to detect whether
  44  * a string is already normalized.
  45  * The most commonly used normalization forms are those defined in
  46  * http://www.unicode.org/unicode/reports/tr15/
  47  * However, this API supports additional normalization forms for specialized purposes.
  48  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
  49  * and can be used in implementations of UTS #46.
  50  * <p>
  51  * Not only are the standard compose and decompose modes supplied,
  52  * but additional modes are provided as documented in the Mode enum.
  53  * <p>
  54  * Some of the functions in this class identify normalization boundaries.
  55  * At a normalization boundary, the portions of the string
  56  * before it and starting from it do not interact and can be handled independently.
  57  * <p>
  58  * The spanQuickCheckYes() stops at a normalization boundary.
  59  * When the goal is a normalized string, then the text before the boundary
  60  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  61  * <p>
  62  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  63  * a character is guaranteed to be at a normalization boundary,
  64  * regardless of context.
  65  * This is used for moving from one normalization boundary to the next
  66  * or preceding boundary, and for performing iterative normalization.
  67  * <p>
  68  * Iterative normalization is useful when only a small portion of a
  69  * longer string needs to be processed.
  70  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  71  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  72  * (to process only the substring for which sort key bytes are computed).
  73  * <p>
  74  * The set of normalization boundaries returned by these functions may not be
  75  * complete: There may be more boundaries that could be returned.
  76  * Different functions may return different boundaries.
  77  * @stable ICU 4.4
  78  * @author Markus W. Scherer
  79  */
  80 public abstract class Normalizer2 {
  81 
  82     /**
  83      * Returns a Normalizer2 instance for Unicode NFC normalization.
  84      * Same as getInstance(null, "nfc", Mode.COMPOSE).
  85      * Returns an unmodifiable singleton instance.
  86      * @return the requested Normalizer2, if successful
  87      * @stable ICU 49
  88      */
  89     public static Normalizer2 getNFCInstance() {
  90         return Norm2AllModes.getNFCInstance().comp;
  91     }
  92 
  93     /**
  94      * Returns a Normalizer2 instance for Unicode NFD normalization.
  95      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
  96      * Returns an unmodifiable singleton instance.
  97      * @return the requested Normalizer2, if successful
  98      * @stable ICU 49
  99      */
 100     public static Normalizer2 getNFDInstance() {
 101         return Norm2AllModes.getNFCInstance().decomp;
 102     }
 103 
 104     /**
 105      * Returns a Normalizer2 instance for Unicode NFKC normalization.
 106      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
 107      * Returns an unmodifiable singleton instance.
 108      * @return the requested Normalizer2, if successful
 109      * @stable ICU 49
 110      */
 111     public static Normalizer2 getNFKCInstance() {
 112         return Norm2AllModes.getNFKCInstance().comp;
 113     }
 114 
 115     /**
 116      * Returns a Normalizer2 instance for Unicode NFKD normalization.
 117      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
 118      * Returns an unmodifiable singleton instance.
 119      * @return the requested Normalizer2, if successful
 120      * @stable ICU 49
 121      */
 122     public static Normalizer2 getNFKDInstance() {
 123         return Norm2AllModes.getNFKCInstance().decomp;
 124     }
 125 
 126     /**
 127      * Returns the normalized form of the source string.
 128      * @param src source string
 129      * @return normalized src
 130      * @stable ICU 4.4
 131      */
 132     public String normalize(CharSequence src) {
 133         if(src instanceof String) {
 134             // Fastpath: Do not construct a new String if the src is a String
 135             // and is already normalized.
 136             int spanLength=spanQuickCheckYes(src);
 137             if(spanLength==src.length()) {
 138                 return (String)src;
 139             }
 140             if (spanLength != 0) {
 141                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
 142                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
 143             }
 144         }
 145         return normalize(src, new StringBuilder(src.length())).toString();
 146     }
 147 
 148     /**
 149      * Writes the normalized form of the source string to the destination string
 150      * (replacing its contents) and returns the destination string.
 151      * The source and destination strings must be different objects.
 152      * @param src source string
 153      * @param dest destination string; its contents is replaced with normalized src
 154      * @return dest
 155      * @stable ICU 4.4
 156      */
 157     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
 158 
 159     /**
 160      * Writes the normalized form of the source string to the destination Appendable
 161      * and returns the destination Appendable.
 162      * The source and destination strings must be different objects.
 163      *
 164      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
 165      *
 166      * @param src source string
 167      * @param dest destination Appendable; gets normalized src appended
 168      * @return dest
 169      * @stable ICU 4.6
 170      */
 171     public abstract Appendable normalize(CharSequence src, Appendable dest);
 172 
 173     /**
 174      * Appends the normalized form of the second string to the first string
 175      * (merging them at the boundary) and returns the first string.
 176      * The result is normalized if the first string was normalized.
 177      * The first and second strings must be different objects.
 178      * @param first string, should be normalized
 179      * @param second string, will be normalized
 180      * @return first
 181      * @stable ICU 4.4
 182      */
 183     public abstract StringBuilder normalizeSecondAndAppend(
 184             StringBuilder first, CharSequence second);
 185 
 186     /**
 187      * Appends the second string to the first string
 188      * (merging them at the boundary) and returns the first string.
 189      * The result is normalized if both the strings were normalized.
 190      * The first and second strings must be different objects.
 191      * @param first string, should be normalized
 192      * @param second string, should be normalized
 193      * @return first
 194      * @stable ICU 4.4
 195      */
 196     public abstract StringBuilder append(StringBuilder first, CharSequence second);
 197 
 198     /**
 199      * Gets the decomposition mapping of c.
 200      * Roughly equivalent to normalizing the String form of c
 201      * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
 202      * returns null if c does not have a decomposition mapping in this instance's data.
 203      * This function is independent of the mode of the Normalizer2.
 204      * @param c code point
 205      * @return c's decomposition mapping, if any; otherwise null
 206      * @stable ICU 4.6
 207      */
 208     public abstract String getDecomposition(int c);
 209 
 210     /**
 211      * Gets the combining class of c.
 212      * The default implementation returns 0
 213      * but all standard implementations return the Unicode Canonical_Combining_Class value.
 214      * @param c code point
 215      * @return c's combining class
 216      * @stable ICU 49
 217      */
 218     public int getCombiningClass(int c) { return 0; }
 219 
 220     /**
 221      * Tests if the string is normalized.
 222      * Internally, in cases where the quickCheck() method would return "maybe"
 223      * (which is only possible for the two COMPOSE modes) this method
 224      * resolves to "yes" or "no" to provide a definitive result,
 225      * at the cost of doing more work in those cases.
 226      * @param s input string
 227      * @return true if s is normalized
 228      * @stable ICU 4.4
 229      */
 230     public abstract boolean isNormalized(CharSequence s);
 231 
 232     /**
 233      * Returns the end of the normalized substring of the input string.
 234      * In other words, with <code>end=spanQuickCheckYes(s);</code>
 235      * the substring <code>s.subSequence(0, end)</code>
 236      * will pass the quick check with a "yes" result.
 237      * <p>
 238      * The returned end index is usually one or more characters before the
 239      * "no" or "maybe" character: The end index is at a normalization boundary.
 240      * (See the class documentation for more about normalization boundaries.)
 241      * <p>
 242      * When the goal is a normalized string and most input strings are expected
 243      * to be normalized already, then call this method,
 244      * and if it returns a prefix shorter than the input string,
 245      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
 246      * @param s input string
 247      * @return "yes" span end index
 248      * @stable ICU 4.4
 249      */
 250     public abstract int spanQuickCheckYes(CharSequence s);
 251 
 252     /**
 253      * Tests if the character always has a normalization boundary before it,
 254      * regardless of context.
 255      * If true, then the character does not normalization-interact with
 256      * preceding characters.
 257      * In other words, a string containing this character can be normalized
 258      * by processing portions before this character and starting from this
 259      * character independently.
 260      * This is used for iterative normalization. See the class documentation for details.
 261      * @param c character to test
 262      * @return true if c has a normalization boundary before it
 263      * @stable ICU 4.4
 264      */
 265     public abstract boolean hasBoundaryBefore(int c);
 266 
 267     /**
 268      * Sole constructor.  (For invocation by subclass constructors,
 269      * typically implicit.)
 270      * @internal
 271      * deprecated This API is ICU internal only.
 272      */
 273     protected Normalizer2() {
 274     }
 275 }