1 /* 2 * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 33 package sun.text.normalizer; 34 35 /** 36 * Unicode normalization functionality for standard Unicode normalization or 37 * for using custom mapping tables. 38 * All instances of this class are unmodifiable/immutable. 39 * The Normalizer2 class is not intended for public subclassing. 40 * <p> 41 * The primary functions are to produce a normalized string and to detect whether 42 * a string is already normalized. 43 * The most commonly used normalization forms are those defined in 44 * http://www.unicode.org/unicode/reports/tr15/ 45 * However, this API supports additional normalization forms for specialized purposes. 46 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 47 * and can be used in implementations of UTS #46. 48 * <p> 49 * Not only are the standard compose and decompose modes supplied, 50 * but additional modes are provided as documented in the Mode enum. 51 * <p> 52 * Some of the functions in this class identify normalization boundaries. 53 * At a normalization boundary, the portions of the string 54 * before it and starting from it do not interact and can be handled independently. 55 * <p> 56 * The spanQuickCheckYes() stops at a normalization boundary. 57 * When the goal is a normalized string, then the text before the boundary 58 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 59 * <p> 60 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 61 * a character is guaranteed to be at a normalization boundary, 62 * regardless of context. 63 * This is used for moving from one normalization boundary to the next 64 * or preceding boundary, and for performing iterative normalization. 65 * <p> 66 * Iterative normalization is useful when only a small portion of a 67 * longer string needs to be processed. 68 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 69 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 70 * (to process only the substring for which sort key bytes are computed). 71 * <p> 72 * The set of normalization boundaries returned by these functions may not be 73 * complete: There may be more boundaries that could be returned. 74 * Different functions may return different boundaries. 75 * @stable ICU 4.4 76 * @author Markus W. Scherer 77 */ 78 abstract class Normalizer2 { 79 80 /** 81 * Returns a Normalizer2 instance for Unicode NFC normalization. 82 * Same as getInstance(null, "nfc", Mode.COMPOSE). 83 * Returns an unmodifiable singleton instance. 84 * @return the requested Normalizer2, if successful 85 * @stable ICU 49 86 */ 87 public static Normalizer2 getNFCInstance() { 88 return Norm2AllModes.getNFCInstance().comp; 89 } 90 91 /** 92 * Returns a Normalizer2 instance for Unicode NFD normalization. 93 * Same as getInstance(null, "nfc", Mode.DECOMPOSE). 94 * Returns an unmodifiable singleton instance. 95 * @return the requested Normalizer2, if successful 96 * @stable ICU 49 97 */ 98 public static Normalizer2 getNFDInstance() { 99 return Norm2AllModes.getNFCInstance().decomp; 100 } 101 102 /** 103 * Returns a Normalizer2 instance for Unicode NFKC normalization. 104 * Same as getInstance(null, "nfkc", Mode.COMPOSE). 105 * Returns an unmodifiable singleton instance. 106 * @return the requested Normalizer2, if successful 107 * @stable ICU 49 108 */ 109 public static Normalizer2 getNFKCInstance() { 110 return Norm2AllModes.getNFKCInstance().comp; 111 } 112 113 /** 114 * Returns a Normalizer2 instance for Unicode NFKD normalization. 115 * Same as getInstance(null, "nfkc", Mode.DECOMPOSE). 116 * Returns an unmodifiable singleton instance. 117 * @return the requested Normalizer2, if successful 118 * @stable ICU 49 119 */ 120 public static Normalizer2 getNFKDInstance() { 121 return Norm2AllModes.getNFKCInstance().decomp; 122 } 123 124 /** 125 * Returns the normalized form of the source string. 126 * @param src source string 127 * @return normalized src 128 * @stable ICU 4.4 129 */ 130 public String normalize(CharSequence src) { 131 if(src instanceof String) { 132 // Fastpath: Do not construct a new String if the src is a String 133 // and is already normalized. 134 int spanLength=spanQuickCheckYes(src); 135 if(spanLength==src.length()) { 136 return (String)src; 137 } 138 if (spanLength != 0) { 139 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); 140 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); 141 } 142 } 143 return normalize(src, new StringBuilder(src.length())).toString(); 144 } 145 146 /** 147 * Writes the normalized form of the source string to the destination string 148 * (replacing its contents) and returns the destination string. 149 * The source and destination strings must be different objects. 150 * @param src source string 151 * @param dest destination string; its contents is replaced with normalized src 152 * @return dest 153 * @stable ICU 4.4 154 */ 155 public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); 156 157 /** 158 * Writes the normalized form of the source string to the destination Appendable 159 * and returns the destination Appendable. 160 * The source and destination strings must be different objects. 161 * 162 * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}. 163 * 164 * @param src source string 165 * @param dest destination Appendable; gets normalized src appended 166 * @return dest 167 * @stable ICU 4.6 168 */ 169 public abstract Appendable normalize(CharSequence src, Appendable dest); 170 171 /** 172 * Appends the normalized form of the second string to the first string 173 * (merging them at the boundary) and returns the first string. 174 * The result is normalized if the first string was normalized. 175 * The first and second strings must be different objects. 176 * @param first string, should be normalized 177 * @param second string, will be normalized 178 * @return first 179 * @stable ICU 4.4 180 */ 181 public abstract StringBuilder normalizeSecondAndAppend( 182 StringBuilder first, CharSequence second); 183 184 /** 185 * Appends the second string to the first string 186 * (merging them at the boundary) and returns the first string. 187 * The result is normalized if both the strings were normalized. 188 * The first and second strings must be different objects. 189 * @param first string, should be normalized 190 * @param second string, should be normalized 191 * @return first 192 * @stable ICU 4.4 193 */ 194 public abstract StringBuilder append(StringBuilder first, CharSequence second); 195 196 /** 197 * Gets the decomposition mapping of c. 198 * Roughly equivalent to normalizing the String form of c 199 * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function 200 * returns null if c does not have a decomposition mapping in this instance's data. 201 * This function is independent of the mode of the Normalizer2. 202 * @param c code point 203 * @return c's decomposition mapping, if any; otherwise null 204 * @stable ICU 4.6 205 */ 206 public abstract String getDecomposition(int c); 207 208 /** 209 * Gets the combining class of c. 210 * The default implementation returns 0 211 * but all standard implementations return the Unicode Canonical_Combining_Class value. 212 * @param c code point 213 * @return c's combining class 214 * @stable ICU 49 215 */ 216 public int getCombiningClass(int c) { return 0; } 217 218 /** 219 * Tests if the string is normalized. 220 * Internally, in cases where the quickCheck() method would return "maybe" 221 * (which is only possible for the two COMPOSE modes) this method 222 * resolves to "yes" or "no" to provide a definitive result, 223 * at the cost of doing more work in those cases. 224 * @param s input string 225 * @return true if s is normalized 226 * @stable ICU 4.4 227 */ 228 public abstract boolean isNormalized(CharSequence s); 229 230 /** 231 * Returns the end of the normalized substring of the input string. 232 * In other words, with <code>end=spanQuickCheckYes(s);</code> 233 * the substring <code>s.subSequence(0, end)</code> 234 * will pass the quick check with a "yes" result. 235 * <p> 236 * The returned end index is usually one or more characters before the 237 * "no" or "maybe" character: The end index is at a normalization boundary. 238 * (See the class documentation for more about normalization boundaries.) 239 * <p> 240 * When the goal is a normalized string and most input strings are expected 241 * to be normalized already, then call this method, 242 * and if it returns a prefix shorter than the input string, 243 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 244 * @param s input string 245 * @return "yes" span end index 246 * @stable ICU 4.4 247 */ 248 public abstract int spanQuickCheckYes(CharSequence s); 249 250 /** 251 * Tests if the character always has a normalization boundary before it, 252 * regardless of context. 253 * If true, then the character does not normalization-interact with 254 * preceding characters. 255 * In other words, a string containing this character can be normalized 256 * by processing portions before this character and starting from this 257 * character independently. 258 * This is used for iterative normalization. See the class documentation for details. 259 * @param c character to test 260 * @return true if c has a normalization boundary before it 261 * @stable ICU 4.4 262 */ 263 public abstract boolean hasBoundaryBefore(int c); 264 265 /** 266 * Sole constructor. (For invocation by subclass constructors, 267 * typically implicit.) 268 * @internal 269 * deprecated This API is ICU internal only. 270 */ 271 protected Normalizer2() { 272 } 273 }