New jdk/src/java.base/share/classes/sun/text/normalizer/Normalizer2.java

   1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *   Copyright (C) 2009-2014, International Business Machines
  29  *   Corporation and others.  All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package sun.text.normalizer;
  34 
  35 /**
  36  * Unicode normalization functionality for standard Unicode normalization or
  37  * for using custom mapping tables.
  38  * All instances of this class are unmodifiable/immutable.
  39  * The Normalizer2 class is not intended for public subclassing.
  40  * <p>
  41  * The primary functions are to produce a normalized string and to detect whether
  42  * a string is already normalized.
  43  * The most commonly used normalization forms are those defined in
  44  * Unicode Standard Annex #15 (http://www.unicode.org/unicode/reports/tr15/).
  45  * However, this API supports additional normalization forms for specialized purposes.
  46  * For example, NFKC_Casefold is provided via getInstance(null, "nfkc_cf", Mode.COMPOSE)
  47  * and can be used in implementations of UTS #46.
  48  * <p>
  49  * Not only are the standard compose and decompose modes supplied,
  50  * but additional modes are provided as documented in the Mode enum.
  51  * <p>
  52  * Some of the functions in this class identify normalization boundaries.
  53  * At a normalization boundary, the portions of the string
  54  * before it and starting from it do not interact and can be handled independently.
  55  * <p>
  56  * The spanQuickCheckYes() function stops at a normalization boundary.
  57  * When the goal is a normalized string, then the text before the boundary
  58  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  59  * <p>
  60  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  61  * a character is guaranteed to be at a normalization boundary,
  62  * regardless of context.
  63  * This is used for moving from one normalization boundary to the next
  64  * or preceding boundary, and for performing iterative normalization.
  65  * <p>
  66  * Iterative normalization is useful when only a small portion of a
  67  * longer string needs to be processed.
  68  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  69  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  70  * (to process only the substring for which sort key bytes are computed).
  71  * <p>
  72  * The set of normalization boundaries returned by these functions may not be
  73  * complete: There may be more boundaries that could be returned.
  74  * Different functions may return different boundaries.
  75  * @stable ICU 4.4
  76  * @author Markus W. Scherer
  77  */
  78 abstract class Normalizer2 {
  79
  80     /**
  81      * Obtains the Normalizer2 that performs Unicode NFC normalization.
  82      * Corresponds to getInstance(null, "nfc", Mode.COMPOSE).
  83      * The result is an unmodifiable singleton.
  84      * @return the NFC Normalizer2, if successful
  85      * @stable ICU 49
  86      */
  87     public static Normalizer2 getNFCInstance() {
  88         return Norm2AllModes.getNFCInstance().comp;
  89     }
  90
  91     /**
  92      * Obtains the Normalizer2 that performs Unicode NFD normalization.
  93      * Corresponds to getInstance(null, "nfc", Mode.DECOMPOSE): the result is
  94      * the decomposing counterpart held by the same NFC data object that
  95      * backs getNFCInstance().
  96      * The result is an unmodifiable singleton.
  97      * @return the NFD Normalizer2, if successful
  98      * @stable ICU 49
  99      */
 100     public static Normalizer2 getNFDInstance() {
 101         return Norm2AllModes.getNFCInstance().decomp;
 102     }
 103
 104     /**
 105      * Obtains the Normalizer2 that performs Unicode NFKC normalization.
 106      * Corresponds to getInstance(null, "nfkc", Mode.COMPOSE).
 107      * The result is an unmodifiable singleton.
 108      * @return the NFKC Normalizer2, if successful
 109      * @stable ICU 49
 110      */
 111     public static Normalizer2 getNFKCInstance() {
 112         return Norm2AllModes.getNFKCInstance().comp;
 113     }
 114
 115     /**
 116      * Obtains the Normalizer2 that performs Unicode NFKD normalization.
 117      * Corresponds to getInstance(null, "nfkc", Mode.DECOMPOSE): the result is
 118      * the decomposing counterpart held by the same NFKC data object that
 119      * backs getNFKCInstance().
 120      * The result is an unmodifiable singleton.
 121      * @return the NFKD Normalizer2, if successful
 122      * @stable ICU 49
 123      */
 124     public static Normalizer2 getNFKDInstance() {
 125         return Norm2AllModes.getNFKCInstance().decomp;
 126     }
 127
 128     /**
 129      * Produces the normalized form of the source string.
 130      * When src is a String that is already fully normalized, src itself is
 131      * returned and no new String is created.
 132      * @param src source string
 133      * @return normalized src
 134      * @stable ICU 4.4
 135      */
 136     public String normalize(CharSequence src) {
 137         if (!(src instanceof String)) {
 138             // General case: normalize everything into a fresh buffer.
 139             StringBuilder out = new StringBuilder(src.length());
 140             return normalize(src, out).toString();
 141         }
 142
 143         // String fast path: find how much of src already passes the quick check.
 144         final int yesEnd = spanQuickCheckYes(src);
 145         final int srcLength = src.length();
 146         if (yesEnd == srcLength) {
 147             // Nothing to change, so hand back the caller's own String.
 148             return (String) src;
 149         }
 150
 151         // Copy the "yes" prefix verbatim, then normalize only the remainder onto it.
 152         StringBuilder out = new StringBuilder(srcLength);
 153         out.append(src, 0, yesEnd);
 154         CharSequence remainder = src.subSequence(yesEnd, srcLength);
 155         return normalizeSecondAndAppend(out, remainder).toString();
 156     }
 157
 158     /**
 159      * Replaces the contents of the destination with the normalized form of the
 160      * source string and returns the destination.
 161      * The source and destination must not be the same object.
 162      * @param src source string
 163      * @param dest destination string; its previous contents are discarded in
 164      *             favor of normalized src
 165      * @return dest
 166      * @stable ICU 4.4
 167      */
 168     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
 169
 170     /**
 171      * Appends the normalized form of the source string to the destination
 172      * Appendable and returns the destination.
 173      * The source and destination must not be the same object.
 174      *
 175      * <p>As inherited from the ICU documentation, any
 176      * {@link java.io.IOException} is wrapped into a
 177      * {@code com.ibm.icu.util.ICUUncheckedIOException}.
 178      * NOTE(review): that type lives in the ICU package com.ibm.icu.util, not in
 179      * sun.text.normalizer; confirm which unchecked exception implementations here throw.
 180      *
 181      * @param src source string
 182      * @param dest destination Appendable; normalized src is appended to it
 183      * @return dest
 184      * @stable ICU 4.6
 185      */
 186     public abstract Appendable normalize(CharSequence src, Appendable dest);
 187
 188     /**
 189      * Normalizes the second string and appends it to the first string,
 190      * merging the two at their boundary, then returns the first string.
 191      * Provided the first string was normalized, the result is normalized too.
 192      * The first and second strings must not be the same object.
 193      * @param first string, should be normalized
 194      * @param second string, will be normalized
 195      * @return first
 196      * @stable ICU 4.4
 197      */
 198     public abstract StringBuilder normalizeSecondAndAppend(
 199             StringBuilder first, CharSequence second);
 200
 201     /**
 202      * Appends the second string to the first string, merging the two at
 203      * their boundary, then returns the first string.
 204      * Provided both strings were normalized, the result is normalized too.
 205      * The first and second strings must not be the same object.
 206      * @param first string, should be normalized
 207      * @param second string, should be normalized
 208      * @return first
 209      * @stable ICU 4.4
 210      */
 211     public abstract StringBuilder append(StringBuilder first, CharSequence second);
 212
 213     /**
 214      * Looks up the decomposition mapping of c.
 215      * This is roughly what normalizing the String form of c with a DECOMPOSE
 216      * Normalizer2 instance would produce, only much faster, and with the
 217      * difference that null is returned when this instance's data holds no
 218      * decomposition mapping for c.
 219      * The result does not depend on the mode of the Normalizer2.
 220      * @param c code point
 221      * @return c's decomposition mapping, if any; otherwise null
 222      * @stable ICU 4.6
 223      */
 224     public abstract String getDecomposition(int c);
 225
 226     /**
 227      * Looks up the combining class of c.
 228      * This default implementation always answers 0; all standard
 229      * implementations return the Unicode Canonical_Combining_Class value instead.
 230      * @param c code point
 231      * @return c's combining class
 232      * @stable ICU 49
 233      */
 234     public int getCombiningClass(int c) {
 235         return 0;
 236     }
 237
 238     /**
 239      * Determines whether the string is normalized.
 240      * Where the quickCheck() method would answer "maybe" (which can only
 241      * happen for the two COMPOSE modes), this method does the extra work
 242      * needed to settle on "yes" or "no", so its result is always definitive.
 243      * @param s input string
 244      * @return true if s is normalized
 245      * @stable ICU 4.4
 246      */
 247     public abstract boolean isNormalized(CharSequence s);
 248
 249     /**
 250      * Finds the end of the normalized prefix of the input string.
 251      * That is, after <code>end=spanQuickCheckYes(s);</code>
 252      * the prefix <code>s.subSequence(0, end)</code>
 253      * passes the quick check with a "yes" result.
 254      * <p>
 255      * The end index lies at a normalization boundary, so it is usually one
 256      * or more characters before the first "no" or "maybe" character.
 257      * (Normalization boundaries are described in the class documentation.)
 258      * <p>
 259      * Intended use: when a normalized string is wanted and most inputs are
 260      * expected to be normalized already, call this method first; if the
 261      * returned prefix is shorter than the input, copy the prefix and pass
 262      * the remainder to normalizeSecondAndAppend().
 263      * @param s input string
 264      * @return "yes" span end index
 265      * @stable ICU 4.4
 266      */
 267     public abstract int spanQuickCheckYes(CharSequence s);
 268
 269     /**
 270      * Determines whether the character always has a normalization boundary
 271      * before it, whatever its context.
 272      * A true result means the character does not normalization-interact
 273      * with the characters preceding it, so a string containing it can be
 274      * normalized by handling the part before this character and the part
 275      * starting at this character independently.
 276      * Iterative normalization relies on this; see the class documentation.
 277      * @param c character to test
 278      * @return true if c has a normalization boundary before it
 279      * @stable ICU 4.4
 280      */
 281     public abstract boolean hasBoundaryBefore(int c);
 282
 283     /**
 284      * The only constructor; subclass constructors invoke it,
 285      * typically implicitly.
 286      * @internal
 287      * deprecated This API is ICU internal only.
 288      */
 289     protected Normalizer2() {
 290     }
 291 }