1 /*
   2  * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
  29  *                                                                             *
  30  * The original version of this source code and documentation is copyrighted   *
  31  * and owned by IBM, These materials are provided under terms of a License     *
  32  * Agreement between IBM and Sun. This technology is protected by multiple     *
  33  * US and International patents. This notice and attribution to IBM may not    *
  34  * to removed.                                                                 *
  35  *******************************************************************************
  36  */
  37 
  38 package java.text;
  39 
  40 import sun.text.normalizer.NormalizerBase;
  41 
  42 /**
  43  * This class provides the method {@code normalize} which transforms Unicode
  44  * text into an equivalent composed or decomposed form, allowing for easier
  45  * sorting and searching of text.
  46  * The {@code normalize} method supports the standard normalization forms
  47  * described in
  48  * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
  49  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  50  * <p>
  51  * Characters with accents or other adornments can be encoded in
  52  * several different ways in Unicode.  For example, take the character A-acute.
  53  * In Unicode, this can be encoded as a single character (the "composed" form):
  54  *
  55  * <pre>
  56  *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
  57  *
  58  * or as two separate characters (the "decomposed" form):
  59  *
  60  * <pre>
  61  *      U+0041    LATIN CAPITAL LETTER A
  62  *      U+0301    COMBINING ACUTE ACCENT</pre>
  63  *
  64  * To a user of your program, however, both of these sequences should be
  65  * treated as the same "user-level" character "A with acute accent".  When you
  66  * are searching or comparing text, you must ensure that these two sequences are
  67  * treated as equivalent.  In addition, you must handle characters with more than
  68  * one accent. Sometimes the order of a character's combining accents is
  69  * significant, while in other cases accent sequences in different orders are
  70  * really equivalent.
  71  * <p>
  72  * Similarly, the string "ffi" can be encoded as three separate letters:
  73  *
  74  * <pre>
  75  *      U+0066    LATIN SMALL LETTER F
  76  *      U+0066    LATIN SMALL LETTER F
  77  *      U+0069    LATIN SMALL LETTER I</pre>
  78  *
  79  * or as the single character
  80  *
  81  * <pre>
  82  *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
  83  *
  84  * The ffi ligature is not a distinct semantic character, and strictly speaking
  85  * it shouldn't be in Unicode at all, but it was included for compatibility
  86  * with existing character sets that already provided it.  The Unicode standard
  87  * identifies such characters by giving them "compatibility" decompositions
  88  * into the corresponding semantic characters.  When sorting and searching, you
  89  * will often want to use these mappings.
  90  * <p>
  91  * The {@code normalize} method helps solve these problems by transforming
  92  * text into the canonical composed and decomposed forms as shown in the first
  93  * example above. In addition, you can have it perform compatibility
  94  * decompositions so that you can treat compatibility characters the same as
  95  * their equivalents.
  96  * Finally, the {@code normalize} method rearranges accents into the
  97  * proper canonical order, so that you do not have to worry about accent
  98  * rearrangement on your own.
  99  * <p>
 * The W3C generally recommends exchanging text in NFC.
 101  * Note also that most legacy character encodings use only precomposed forms and
 102  * often do not encode any combining marks by themselves. For conversion to such
 103  * character encodings the Unicode text needs to be normalized to NFC.
 104  * For more usage examples, see the Unicode Standard Annex.
 105  *
 106  * @since 1.6
 107  */
 108 public final class Normalizer {
 109 
 110    private Normalizer() {};
 111 
 112     /**
 113      * This enum provides constants of the four Unicode normalization forms
 114      * that are described in
 115      * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
 116      * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
 117      * and two methods to access them.
 118      *
 119      * @since 1.6
 120      */
 121     public static enum Form {
 122 
 123         /**
 124          * Canonical decomposition.
 125          */
 126         NFD,
 127 
 128         /**
 129          * Canonical decomposition, followed by canonical composition.
 130          */
 131         NFC,
 132 
 133         /**
 134          * Compatibility decomposition.
 135          */
 136         NFKD,
 137 
 138         /**
 139          * Compatibility decomposition, followed by canonical composition.
 140          */
 141         NFKC
 142     }
 143 
 144     /**
 145      * Normalize a sequence of char values.
 146      * The sequence will be normalized according to the specified normalization
 147      * from.
 148      * @param src        The sequence of char values to normalize.
 149      * @param form       The normalization form; one of
 150      *                   {@link java.text.Normalizer.Form#NFC},
 151      *                   {@link java.text.Normalizer.Form#NFD},
 152      *                   {@link java.text.Normalizer.Form#NFKC},
 153      *                   {@link java.text.Normalizer.Form#NFKD}
 154      * @return The normalized String
 155      * @throws NullPointerException If {@code src} or {@code form}
 156      * is null.
 157      */
 158     public static String normalize(CharSequence src, Form form) {
 159         return NormalizerBase.normalize(src.toString(), form);
 160     }
 161 
 162     /**
 163      * Determines if the given sequence of char values is normalized.
 164      * @param src        The sequence of char values to be checked.
 165      * @param form       The normalization form; one of
 166      *                   {@link java.text.Normalizer.Form#NFC},
 167      *                   {@link java.text.Normalizer.Form#NFD},
 168      *                   {@link java.text.Normalizer.Form#NFKC},
 169      *                   {@link java.text.Normalizer.Form#NFKD}
 170      * @return true if the sequence of char values is normalized;
 171      * false otherwise.
 172      * @throws NullPointerException If {@code src} or {@code form}
 173      * is null.
 174      */
 175     public static boolean isNormalized(CharSequence src, Form form) {
 176         return NormalizerBase.isNormalized(src.toString(), form);
 177     }
 178 }