1 /*
   2  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
  29  *                                                                             *
  30  * The original version of this source code and documentation is copyrighted   *
  31  * and owned by IBM, These materials are provided under terms of a License     *
  32  * Agreement between IBM and Sun. This technology is protected by multiple     *
  33  * US and International patents. This notice and attribution to IBM may not    *
  34  * to removed.                                                                 *
  35  *******************************************************************************
  36  */
  37 
  38 package java.text;
  39 
  40 import sun.text.normalizer.NormalizerBase;
  41 import sun.text.normalizer.NormalizerImpl;
  42 
  43 /**
  44  * This class provides the method <code>normalize</code> which transforms Unicode
  45  * text into an equivalent composed or decomposed form, allowing for easier
  46  * sorting and searching of text.
  47  * The <code>normalize</code> method supports the standard normalization forms
  48  * described in
  49  * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
  50  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  51  * <p>
  52  * Characters with accents or other adornments can be encoded in
  53  * several different ways in Unicode.  For example, take the character A-acute.
  54  * In Unicode, this can be encoded as a single character (the "composed" form):
  55  *
  56  * <pre>
  57  *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
  58  *
  59  * or as two separate characters (the "decomposed" form):
  60  *
  61  * <pre>
  62  *      U+0041    LATIN CAPITAL LETTER A
  63  *      U+0301    COMBINING ACUTE ACCENT</pre>
  64  *
  65  * To a user of your program, however, both of these sequences should be
  66  * treated as the same "user-level" character "A with acute accent".  When you
  67  * are searching or comparing text, you must ensure that these two sequences are
  68  * treated as equivalent.  In addition, you must handle characters with more than
  69  * one accent. Sometimes the order of a character's combining accents is
  70  * significant, while in other cases accent sequences in different orders are
  71  * really equivalent.
  72  * <p>
  73  * Similarly, the string "ffi" can be encoded as three separate letters:
  74  *
  75  * <pre>
  76  *      U+0066    LATIN SMALL LETTER F
  77  *      U+0066    LATIN SMALL LETTER F
  78  *      U+0069    LATIN SMALL LETTER I</pre>
  79  *
  80  * or as the single character
  81  *
  82  * <pre>
  83  *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
  84  *
  85  * The ffi ligature is not a distinct semantic character, and strictly speaking
  86  * it shouldn't be in Unicode at all, but it was included for compatibility
  87  * with existing character sets that already provided it.  The Unicode standard
  88  * identifies such characters by giving them "compatibility" decompositions
  89  * into the corresponding semantic characters.  When sorting and searching, you
  90  * will often want to use these mappings.
  91  * <p>
  92  * The <code>normalize</code> method helps solve these problems by transforming
  93  * text into the canonical composed and decomposed forms as shown in the first
  94  * example above. In addition, you can have it perform compatibility
  95  * decompositions so that you can treat compatibility characters the same as
  96  * their equivalents.
  97  * Finally, the <code>normalize</code> method rearranges accents into the
  98  * proper canonical order, so that you do not have to worry about accent
  99  * rearrangement on your own.
 100  * <p>
 * The W3C generally recommends exchanging text in NFC.
 102  * Note also that most legacy character encodings use only precomposed forms and
 103  * often do not encode any combining marks by themselves. For conversion to such
 104  * character encodings the Unicode text needs to be normalized to NFC.
 105  * For more usage examples, see the Unicode Standard Annex.
 106  *
 107  * @since 1.6
 108  */
 109 public final class Normalizer {
 110 
 111    private Normalizer() {};
 112 
 113     /**
 114      * This enum provides constants of the four Unicode normalization forms
 115      * that are described in
 116      * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
 117      * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
 118      * and two methods to access them.
 119      *
 120      * @since 1.6
 121      */
 122     public static enum Form {
 123 
 124         /**
 125          * Canonical decomposition.
 126          */
 127         NFD,
 128 
 129         /**
 130          * Canonical decomposition, followed by canonical composition.
 131          */
 132         NFC,
 133 
 134         /**
 135          * Compatibility decomposition.
 136          */
 137         NFKD,
 138 
 139         /**
 140          * Compatibility decomposition, followed by canonical composition.
 141          */
 142         NFKC
 143     }
 144 
 145     /**
 146      * Normalize a sequence of char values.
 147      * The sequence will be normalized according to the specified normalization
 148      * from.
 149      * @param src        The sequence of char values to normalize.
 150      * @param form       The normalization form; one of
 151      *                   {@link java.text.Normalizer.Form#NFC},
 152      *                   {@link java.text.Normalizer.Form#NFD},
 153      *                   {@link java.text.Normalizer.Form#NFKC},
 154      *                   {@link java.text.Normalizer.Form#NFKD}
 155      * @return The normalized String
 156      * @throws NullPointerException If <code>src</code> or <code>form</code>
 157      * is null.
 158      */
 159     public static String normalize(CharSequence src, Form form) {
 160         return NormalizerBase.normalize(src.toString(), form);
 161     }
 162 
 163     /**
 164      * Determines if the given sequence of char values is normalized.
 165      * @param src        The sequence of char values to be checked.
 166      * @param form       The normalization form; one of
 167      *                   {@link java.text.Normalizer.Form#NFC},
 168      *                   {@link java.text.Normalizer.Form#NFD},
 169      *                   {@link java.text.Normalizer.Form#NFKC},
 170      *                   {@link java.text.Normalizer.Form#NFKD}
 171      * @return true if the sequence of char values is normalized;
 172      * false otherwise.
 173      * @throws NullPointerException If <code>src</code> or <code>form</code>
 174      * is null.
 175      */
 176     public static boolean isNormalized(CharSequence src, Form form) {
 177         return NormalizerBase.isNormalized(src.toString(), form);
 178     }
 179 }