< prev index next >

jdk/src/java.base/share/classes/sun/text/normalizer/UCharacter.java

Print this page


   1 /*
   2  * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;





  38 
  39 import java.io.IOException;
  40 import java.util.MissingResourceException;
  41 
  42 /**
  43  * <p>
  44  * The UCharacter class provides extensions to the
  45  * <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
  46  * java.lang.Character</a> class. These extensions provide support for
  47  * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
  48  * class, provide support for supplementary characters (those with code
  49  * points above U+FFFF).
  50  * Each ICU release supports the latest version of Unicode available at that time.
  51  * </p>
  52  * <p>
  53  * Code points are represented in these API using ints. While it would be
  54  * more convenient in Java to have a separate primitive datatype for them,
  55  * ints suffice in the meantime.
  56  * </p>
  57  * <p>
  58  * To use this class please add the jar file name icu4j.jar to the
  59  * class path, since it contains data files which supply the information used
  60  * by this file.<br>
  61  * E.g. In Windows <br>
  62  * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
  63  * Otherwise, another method would be to copy the files uprops.dat and
  64  * unames.icu from the icu4j source subdirectory
  65  * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
  66  * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
  67  * </p>
  68  * <p>
  69  * Aside from the additions for UTF-16 support, and the updated Unicode
  70  * properties, the main differences between UCharacter and Character are:
  71  * <ul>
  72  * <li> UCharacter is not designed to be a char wrapper and does not have
  73  *      APIs which involve management of that single char.<br>
  74  *      These include:
  75  *      <ul>
  76  *        <li> char charValue(),
  77  *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
  78  *      </ul>
  79  * <li> UCharacter does not include Character APIs that are deprecated, nor
  80  *      does it include the Java-specific character information, such as
  81  *      boolean isJavaIdentifierPart(char ch).
  82  * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
  83  *      values '10' - '35'. UCharacter also does this in digit and
  84  *      getNumericValue, to adhere to the java semantics of these
  85  *      methods.  New methods unicodeDigit, and
  86  *      getUnicodeNumericValue do not treat the above code points
  87  *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
  88  * </ul>
  89  * <p>
  90  * Further detail on differences can be determined using the program
  91  *        <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">

  92  *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
  93  * </p>
  94  * <p>
  95  * In addition to Java compatibility functions, which calculate derived properties,
  96  * this API provides low-level access to the Unicode Character Database.
  97  * </p>
  98  * <p>
  99  * Unicode assigns each code point (not just assigned character) values for
 100  * many properties.
 101  * Most of them are simple boolean flags, or constants from a small enumerated list.
 102  * For some properties, values are strings or other relatively more complex types.
 103  * </p>
 104  * <p>
 105  * For more information see
 106  * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
 107  * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).



 108  * </p>
 109  * <p>
 110  * There are also functions that provide easy migration from C/POSIX functions
 111  * like isblank(). Their use is generally discouraged because the C/POSIX
 112  * standards do not define their semantics beyond the ASCII range, which means
 113  * that different implementations exhibit very different behavior.
 114  * Instead, Unicode properties should be used directly.
 115  * </p>
 116  * <p>
 117  * There are also only a few, broad C/POSIX character classes, and they tend
 118  * to be used for conflicting purposes. For example, the "isalpha()" class
 119  * is sometimes used to determine word boundaries, while a more sophisticated
 120  * approach would at least distinguish initial letters from continuation
 121  * characters (the latter including combining marks).
 122  * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
 123  * Another example: There is no "istitle()" class for titlecase characters.
 124  * </p>
 125  * <p>
 126  * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
 127  * ICU implements them according to the Standard Recommendations in
 128  * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
 129  * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
 130  * </p>
 131  * <pre>{@code
 132  * API access for C/POSIX character classes is as follows:

 133  * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
 134  * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
 135  * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
 136  * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0


 137  * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
 138  * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
 139  * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
 140  * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
 141  * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
 142  * - cntrl:     getType(c)==CONTROL
 143  * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
 144  * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
 145  * }</pre>

 146  * <p>
 147  * The C/POSIX character classes are also available in UnicodeSet patterns,
 148  * using patterns like [:graph:] or \p{graph}.
 149  * </p>
 150  * <p>
 151  * Note: There are several ICU (and Java) whitespace functions.
 152  * Comparison:
 153  * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
 154  *       most of general categories "Z" (separators) + most whitespace ISO controls
 155  *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 156  * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 157  * - isSpaceChar: just Z (including no-break spaces)
 158  * </p>
 159  * <p>
 160  * This class is not subclassable
 161  * </p>
 162  * @author Syn Wee Quek
 163  * @stable ICU 2.1
 164  * @see com.ibm.icu.lang.UCharacterEnums
 165  */
 166 
 167 public final class UCharacter
 168 {
 169 
 170     /**













 171      * Numeric Type constants.
 172      * @see UProperty#NUMERIC_TYPE
 173      * @stable ICU 2.4
 174      */
 175     public static interface NumericType
 176     {
 177         /** Numeric type that digit(int, int) compares against getNumericType(props) before reading a digit value from the property word.
 178          * @stable ICU 2.4
 179          */




 180         public static final int DECIMAL = 1;


















































 181     }
 182 
 183     // public data members -----------------------------------------------
 184 
 185     /**
 186      * The lowest Unicode code point value (taken from UTF16.CODEPOINT_MIN_VALUE).
 187      * @stable ICU 2.1
 188      */
 189     public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
 190 
 191     /**
 192      * The highest Unicode code point value (scalar value) according to the
 193      * Unicode Standard (taken from UTF16.CODEPOINT_MAX_VALUE).
 194      * This is a 21-bit value (21 bits, rounded up).<br>
 195      * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
 196      * @stable ICU 2.1
 197      */
 198     public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
 199 
 200     /**
 201      * The minimum value for Supplementary code points (taken from UTF16.SUPPLEMENTARY_MIN_VALUE).
 202      * @stable ICU 2.1
 203      */
 204     public static final int SUPPLEMENTARY_MIN_VALUE =
 205         UTF16.SUPPLEMENTARY_MIN_VALUE;
 206 
 207     // public methods ----------------------------------------------------
 208 
 209     /**
 210      * Retrieves the numeric value of a decimal digit code point.
 211      * <br>This method observes the semantics of
 212      * <code>java.lang.Character.digit()</code>.  Note that this
 213      * will return positive values for code points for which isDigit
 214      * returns false, just like java.lang.Character.
 215      * <br><em>Semantic Change:</em> In release 1.3.1 and
 216      * prior, this did not treat the European letters as having a
 217      * digit value, and also treated numeric letters and other numbers as
 218      * digits.
 219      * This has been changed to conform to the java semantics.
 220      * <br>A code point is a valid digit if and only if:
 221      * <ul>
 222      *   <li>ch is a decimal digit (numeric type DECIMAL) or one of the european letters, and
 223      *   <li>the value of ch is non-negative and less than the specified radix.
 224      * </ul>
 225      * @param ch the code point to query
 226      * @param radix the radix; it is only used as the upper bound of the value test and is not range-checked here
 227      * @return the numeric value represented by the code point in the
 228      * specified radix, or -1 if the code point is not a decimal digit
 229      * or European letter, or if its value is too large for the radix
 230      * @stable ICU 2.1
 231      */
 232     public static int digit(int ch, int radix)
 233     {
 234         // for an out-of-range ch, getProperty falls back to PROPERTY_INITIAL_VALUE_ (original note: getProperty == 0)
 235         int props = getProperty(ch);
 236         int value;
 237         if (getNumericType(props) == NumericType.DECIMAL) {
 238             value = UCharacterProperty.getUnsignedValue(props); // presumably the digit value stored in the property word -- confirm in UCharacterProperty


 239         } else {
 240             value = getEuropeanDigit(ch); // -1 unless ch is one of the letter ranges handled there
 241         }
 242         return (0 <= value && value < radix) ? value : -1;





































 243     }
 244 
 245     /**
 246      * Returns the Bidirectional property of a code point.
 247      * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
 248      * property.<br>
 249      * Result returned belongs to the interface
 250      * <a href=UCharacterDirection.html>UCharacterDirection</a>
 251      * @param ch the code point whose direction is to be determined
 252      * @return direction constant from UCharacterDirection.
 253      * @stable ICU 2.1
 254      */
 255     public static int getDirection(int ch)
 256     {
 257         return gBdp.getClass(ch); // gBdp is assigned once in the static initializer of this class




























































 258     }
 259 
 260     /**
 261      * Returns a code point corresponding to the two UTF16 characters.
 262      * @param lead the lead char; must satisfy UTF16.isLeadSurrogate
 263      * @param trail the trail char; must satisfy UTF16.isTrailSurrogate
 264      * @return the value of UCharacterProperty.getRawSupplementary(lead, trail) if both surrogate characters are valid.
 265      * @exception IllegalArgumentException thrown when argument characters do
 266      *            not form a valid codepoint (lead is not a lead surrogate or trail is not a trail surrogate)
 267      * @stable ICU 2.1
 268      */
 269     public static int getCodePoint(char lead, char trail)
 270     {
 271         if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
 272             return UCharacterProperty.getRawSupplementary(lead, trail);
 273         }
 274         throw new IllegalArgumentException("Illegal surrogate characters");
 275     }
 276 
 277     /**
 278      * <p>Get the "age" of the code point.</p>
 279      * <p>The "age" is the Unicode version when the code point was first
 280      * designated (as a non-character or for Private Use) or assigned a
 281      * character.</p>
 282      * <p>This can be useful to avoid emitting code points to receiving
 283      * processes that do not accept newer characters.</p>
 284      * <p>The data is from the UCD file DerivedAge.txt.</p>
 285      * @param ch The code point; an IllegalArgumentException is thrown if it is below MIN_VALUE or above MAX_VALUE.
 286      * @return the Unicode version number, as returned by the property database (PROPERTY_.getAge)
 287      * @stable ICU 2.6
 288      */
 289     public static VersionInfo getAge(int ch)
 290     {
 291         if (ch < MIN_VALUE || ch > MAX_VALUE) { // reject anything outside the code point range before the lookup
 292         throw new IllegalArgumentException("Codepoint out of bounds");
 293         }
 294         return PROPERTY_.getAge(ch);
 295     }
 296 
 297     // private variables -------------------------------------------------
 298 
 299     /**
 300      * Database storing the sets of character properties; assigned in the static initializer from UCharacterProperty.getInstance().
 301      */
 302     private static final UCharacterProperty PROPERTY_;
 303     /**
 304      * For optimization: the trie index, trie data and initial value of PROPERTY_, copied in the static initializer and read directly by getProperty.




































 305      */
 306     private static final char[] PROPERTY_TRIE_INDEX_;
 307     private static final char[] PROPERTY_TRIE_DATA_;
 308     private static final int PROPERTY_INITIAL_VALUE_; // returned by getProperty when no trie entry can be read for ch

 309 
 310     private static final UBiDiProps gBdp; // BiDi properties queried by getDirection
 311 
 312     // block to initialise the character property database, its cached trie fields, and the BiDi properties
 313     static
 314     {
 315         try
 316         {
 317             PROPERTY_ = UCharacterProperty.getInstance();
 318             PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
 319             PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
 320             PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
 321         }
 322         catch (Exception e)
 323         {
 324             throw new MissingResourceException(e.getMessage(),"",""); // NOTE(review): only the message is kept; the original exception is not attached as the cause
 325         }
 326 
 327         UBiDiProps bdp;
 328         try {
 329             bdp=UBiDiProps.getSingleton();
 330         } catch(IOException e) {
 331             bdp=UBiDiProps.getDummy(); // fall back to the dummy instance when getSingleton() throws IOException
 332         }
 333         gBdp=bdp; // single assignment through a local, as required for a static final field
 334     }
 335 
 336     /**
 337      * Shift to get numeric type: getNumericType shifts the masked property word right by this many bits.

 338      */
 339     private static final int NUMERIC_TYPE_SHIFT_ = 5;
 340     /**
 341      * Mask to get numeric type: three bits (0x7) positioned at NUMERIC_TYPE_SHIFT_, i.e. bits 5..7 of the property word.

 342      */
 343     private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
 344 
 345     // private methods ---------------------------------------------------
 346 
 347     /**
 348      * Getting the digit values of characters like 'A' - 'Z', normal,
 349      * half-width and full-width. This method assumes that the other digit
 350      * characters are checked by the calling method.
 351      * Accepted ranges are 0x41..0x5a, 0x61..0x7a, 0xff21..0xff3a and 0xff41..0xff5a; each maps its first character to 10.
 352      * @param ch character to test
 353      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
 354      *         its corresponding digit (10..35) will be returned.
 355      */
 355     private static int getEuropeanDigit(int ch) {
 356         if ((ch > 0x7a && ch < 0xff21)
 357             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
 358             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
 359             return -1;
 360         }
 361         if (ch <= 0x7a) {
 362             // here ch is in 0x41..0x5a or in 0x61..0x7a; pick the matching base
 363             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
 364         }
 365         // ch >= 0xff21
 366         if (ch <= 0xff3a) {
 367             return ch + 10 - 0xff21;
 368         }
 369         // ch >= 0xff41 && ch <= 0xff5a
 370         return ch + 10 - 0xff41;
 371     }
 372 
 373     /**
 374      * Gets the numeric type of the property argument by masking with NUMERIC_TYPE_MASK_ and shifting right by NUMERIC_TYPE_SHIFT_.
 375      * @param props 32 bit property
 376      * @return the numeric type, a value in 0..7 (compared against NumericType.DECIMAL by digit)
 377      */
 378     private static int getNumericType(int props)
 379     {
 380         return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
 381     }
 382 
 383     /**
 384      * Gets the property value at the index.
 385      * This is optimized.
 386      * Note this is a little different from CharTrie: the index m_trieData_
 387      * is never negative.
 388      * This is a duplicate of UCharacterProperty.getProperty. For optimization
 389      * purposes, this method calls the trie data directly instead of through
 390      * UCharacterProperty.getProperty.
 391      * @param ch code point whose property value is to be retrieved
 392      * @return property value of code point, or PROPERTY_INITIAL_VALUE_ when ch is negative or above UTF16.CODEPOINT_MAX_VALUE
 393      * @stable ICU 2.6
 394      */
 395     private static final int getProperty(int ch)
 396     {
 397         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
 398             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
 399                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
 400             // BMP codepoint 0000..D7FF or DC00..FFFF (negative ch also enters this branch)
 401             try { // using try for ch < 0 is faster than using an if statement; a negative ch gives a negative array index and ends up in the catch
 402                 return PROPERTY_TRIE_DATA_[
 403                               (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
 404                               + (ch & 0x1f)];
 405             } catch (ArrayIndexOutOfBoundsException e) {
 406                 return PROPERTY_INITIAL_VALUE_;
 407             }
 408         }
 409         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 410             // lead surrogate D800..DBFF: same lookup, with the index offset by (0x2800 >> 5)
 411             return PROPERTY_TRIE_DATA_[
 412                               (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
 413                               + (ch & 0x1f)];
 414         }
 415         // from here on ch >= UTF16.SUPPLEMENTARY_MIN_VALUE
 416         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
 417             // supplementary code point 10000..10FFFF
 418             // look at the construction of supplementary characters
 419             // trail forms the ends of it (only the low 10 bits of ch are passed as the trail argument).
 420             return PROPERTY_.m_trie_.getSurrogateValue(
 421                                       UTF16.getLeadSurrogate(ch),
 422                                       (char)(ch & 0x3ff));
 423         }
 424         // ch > UTF16.CODEPOINT_MAX_VALUE: return m_dataOffset_ if there is an error, in this case we return
 425         // the default value: m_initialValue_
 426         // we cannot assume that m_initialValue_ is at offset 0
 427         // this is for optimization.
 428         return PROPERTY_INITIAL_VALUE_;
 429     }
 430 
 431 }
   1 /*
   2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */











  25 
  26 /**
  27 *******************************************************************************
  28 * Copyright (C) 1996-2014, International Business Machines Corporation and
  29 * others. All Rights Reserved.
  30 *******************************************************************************
  31 */
  32 
  33 package sun.text.normalizer;

  34 
  35 /**
  36  * <p>The UCharacter class provides extensions to the
  37  * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">

  38  * java.lang.Character</a> class. These extensions provide support for
  39  * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
  40  * class, provide support for supplementary characters (those with code
  41  * points above U+FFFF).
  42  * Each ICU release supports the latest version of Unicode available at that time.
  43  *
  44  * <p>Code points are represented in these API using ints. While it would be

  45  * more convenient in Java to have a separate primitive datatype for them,
  46  * ints suffice in the meantime.
  47  *
  48  * <p>To use this class please add the jar file name icu4j.jar to the

  49  * class path, since it contains data files which supply the information used
  50  * by this file.<br>
  51  * E.g. In Windows <br>
  52  * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
  53  * Otherwise, another method would be to copy the files uprops.dat and
  54  * unames.icu from the icu4j source subdirectory
  55  * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
  56  * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
  57  *
  58  * <p>Aside from the additions for UTF-16 support, and the updated Unicode

  59  * properties, the main differences between UCharacter and Character are:
  60  * <ul>
  61  * <li> UCharacter is not designed to be a char wrapper and does not have
  62  *      APIs which involve management of that single char.<br>
  63  *      These include:
  64  *      <ul>
  65  *        <li> char charValue(),
  66  *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
  67  *      </ul>
  68  * <li> UCharacter does not include Character APIs that are deprecated, nor
  69  *      does it include the Java-specific character information, such as
  70  *      boolean isJavaIdentifierPart(char ch).
  71  * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
  72  *      values '10' - '35'. UCharacter also does this in digit and
  73  *      getNumericValue, to adhere to the java semantics of these
  74  *      methods.  New methods unicodeDigit, and
  75  *      getUnicodeNumericValue do not treat the above code points
  76  *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
  77  * </ul>
  78  * <p>
  79  * Further detail on differences can be determined using the program
  80  *        <a href=
  81  * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
  82  *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
  83  * </p>
  84  * <p>
  85  * In addition to Java compatibility functions, which calculate derived properties,
  86  * this API provides low-level access to the Unicode Character Database.
  87  * </p>
  88  * <p>
  89  * Unicode assigns each code point (not just assigned character) values for
  90  * many properties.
  91  * Most of them are simple boolean flags, or constants from a small enumerated list.
  92  * For some properties, values are strings or other relatively more complex types.
  93  * </p>
  94  * <p>
  95  * For more information see
  96  * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
  97  * (http://www.unicode.org/ucd/)
  98  * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
  99  * User Guide chapter on Properties</a>
 100  * (http://www.icu-project.org/userguide/properties.html).
 101  * </p>
 102  * <p>
 103  * There are also functions that provide easy migration from C/POSIX functions
 104  * like isblank(). Their use is generally discouraged because the C/POSIX
 105  * standards do not define their semantics beyond the ASCII range, which means
 106  * that different implementations exhibit very different behavior.
 107  * Instead, Unicode properties should be used directly.
 108  * </p>
 109  * <p>
 110  * There are also only a few, broad C/POSIX character classes, and they tend
 111  * to be used for conflicting purposes. For example, the "isalpha()" class
 112  * is sometimes used to determine word boundaries, while a more sophisticated
 113  * approach would at least distinguish initial letters from continuation
 114  * characters (the latter including combining marks).
 115  * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
 116  * Another example: There is no "istitle()" class for titlecase characters.
 117  * </p>
 118  * <p>
 119  * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
 120  * ICU implements them according to the Standard Recommendations in
 121  * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
 122  * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
 123  * </p>
 124  * <p>
 125  * API access for C/POSIX character classes is as follows:
 126  * <pre>{@code
 127  * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
 128  * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
 129  * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
 130  * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
 131  *               (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
 132  *               (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
 133  * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
 134  * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
 135  * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
 136  * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
 137  * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
 138  * - cntrl:     getType(c)==CONTROL
 139  * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
 140  * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
 141  * }</pre>
 142  * </p>
 143  * <p>
 144  * The C/POSIX character classes are also available in UnicodeSet patterns,
 145  * using patterns like [:graph:] or \p{graph}.
 146  * </p>
 147  *
 148  * There are several ICU (and Java) whitespace functions.
 149  * Comparison:<ul>
 150  * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
 151  *       most of general categories "Z" (separators) + most whitespace ISO controls
 152  *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
 153  * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
 154  * <li> isSpaceChar: just Z (including no-break spaces)</ul>
 155  * </p>
 156  * <p>
 157  * This class is not subclassable.
 158  * </p>
 159  * @author Syn Wee Quek
 160  * @stable ICU 2.1
 161  * @see com.ibm.icu.lang.UCharacterEnums
 162  */
 163 
 164 public final class UCharacter
 165 {
 166 
 167     /**
 168      * Joining Group constants.
 169      * @see UProperty#JOINING_GROUP
 170      * @stable ICU 2.4
 171      */
 172     public static interface JoiningGroup
 173     {
 174         /** Joining group value 0; the only constant declared in this interface.
 175          * @stable ICU 2.4
 176          */
 177         public static final int NO_JOINING_GROUP = 0;
 178     }
 179 
 180     /**
 181      * Numeric Type constants (consecutive values 0..3, followed by COUNT).
 182      * @see UProperty#NUMERIC_TYPE
 183      * @stable ICU 2.4
 184      */
 185     public static interface NumericType
 186     {
 187         /** Numeric type value 0.
 188          * @stable ICU 2.4
 189          */
 190         public static final int NONE = 0;
 191         /** Numeric type value 1.
 192          * @stable ICU 2.4
 193          */
 194         public static final int DECIMAL = 1;
 195         /** Numeric type value 2.
 196          * @stable ICU 2.4
 197          */
 198         public static final int DIGIT = 2;
 199         /** Numeric type value 3.
 200          * @stable ICU 2.4
 201          */
 202         public static final int NUMERIC = 3;
 203         /** One more than the largest numeric type value declared above.
 204          * @stable ICU 2.4
 205          */
 206         public static final int COUNT = 4;
 207     }
 208 
 209     /**
 210      * Hangul Syllable Type constants (consecutive values 0..5, followed by COUNT).
 211      * The bracketed tag after each constant is its short label, e.g. [L], [V], [T].
 212      * @see UProperty#HANGUL_SYLLABLE_TYPE
 213      * @stable ICU 2.6
 214      */
 215     public static interface HangulSyllableType
 216     {
 217         /** NOTE(review): the trailing "See note" remark refers to a note that is not present in this file.
 218          * @stable ICU 2.6
 219          */
 220         public static final int NOT_APPLICABLE      = 0;   /*[NA]*/ /*See note !!*/
 221         /**
 222          * @stable ICU 2.6
 223          */
 224         public static final int LEADING_JAMO        = 1;   /*[L]*/
 225         /**
 226          * @stable ICU 2.6
 227          */
 228         public static final int VOWEL_JAMO          = 2;   /*[V]*/
 229         /**
 230          * @stable ICU 2.6
 231          */
 232         public static final int TRAILING_JAMO       = 3;   /*[T]*/
 233         /**
 234          * @stable ICU 2.6
 235          */
 236         public static final int LV_SYLLABLE         = 4;   /*[LV]*/
 237         /**
 238          * @stable ICU 2.6
 239          */
 240         public static final int LVT_SYLLABLE        = 5;   /*[LVT]*/
 241         /** One more than the largest syllable type value declared above.
 242          * @stable ICU 2.6
 243          */
 244         public static final int COUNT               = 6;
 245     }
 246 
 247     // public data members -----------------------------------------------
 248 
 249     /**
 250      * The lowest Unicode code point value (taken from UTF16.CODEPOINT_MIN_VALUE).
 251      * @stable ICU 2.1
 252      */
 253     public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
 254 
 255     /**
 256      * The highest Unicode code point value (scalar value) according to the
 257      * Unicode Standard (taken from UTF16.CODEPOINT_MAX_VALUE).
 258      * This is a 21-bit value (21 bits, rounded up).<br>
 259      * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
 260      * @stable ICU 2.1
 261      */
 262     public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
 263 







 264     // public methods ----------------------------------------------------
 265 
 266     /**
 267      * Returns the numeric value of a decimal digit code point.
 268      * <br>This method observes the semantics of
 269      * <code>java.lang.Character.digit()</code>.  Note that this
 270      * will return positive values for code points for which isDigit
 271      * returns false, just like java.lang.Character.
 272      * <br><em>Semantic Change:</em> In release 1.3.1 and
 273      * prior, this did not treat the European letters as having a
 274      * digit value, and also treated numeric letters and other numbers as
 275      * digits.
 276      * This has been changed to conform to the java semantics.
 277      * <br>A code point is a valid digit if and only if:
 278      * <ul>
 279      *   <li>ch is a decimal digit or one of the european letters, and
 280      *   <li>the value of ch is less than the specified radix.
 281      * </ul>
 282      * @param ch the code point to query
 283      * @param radix the radix; values outside 2..36 always yield -1
 284      * @return the numeric value represented by the code point in the
 285      * specified radix, or -1 if the code point is not a decimal digit
 286      * or European letter, if its value is too large for the radix, or if the radix is invalid
 287      * @stable ICU 2.1
 288      */
 289     public static int digit(int ch, int radix)
 290     {
 291         if (2 <= radix && radix <= 36) {
 292             int value = digit(ch); // single-argument overload; a negative result means "not a decimal digit"
 293             if (value < 0) {
 294                 // ch is not a decimal digit, try latin letters
 295                 value = UCharacterProperty.getEuropeanDigit(ch);
 296             }
 297             return (value < radix) ? value : -1; // a negative value (no match) is passed through unchanged since it is always < radix
 298         } else {
 299             return -1;  // invalid radix
 300         }
 301     }
 302 
 303     /**
 304      * Returns the numeric value of a decimal digit code point.
 305      * <br>This is the decimal-radix counterpart of
 306      * <code>digit(int, int)</code>; the lookup is delegated to UCharacterProperty.
 307      * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
 308      * treated numeric letters and other numbers as digits.  This has
 309      * been changed to conform to the java semantics.
 310      * @param ch the code point to query
 311      * @return the numeric value represented by the code point,
 312      * or -1 if the code point is not a decimal digit or if its
 313      * value is too large for a decimal radix
 314      * @stable ICU 2.1
 315      */
 316     public static int digit(int ch)
 317     {
 318         return UCharacterProperty.INSTANCE.digit(ch);
 319     }
 320 
 321     /**
 322      * Returns a value indicating a code point's Unicode category.
 323      * Up-to-date Unicode implementation of java.lang.Character.getType()
 324      * except for the above mentioned code points that had their category
 325      * changed.<br>
 326      * Return results are constants from the interface
 327      * <a href="UCharacterCategory.html">UCharacterCategory</a><br>
 328      * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
 329      * those returned by java.lang.Character.getType.  UCharacterCategory values
 330      * match the ones used in ICU4C, while java.lang.Character type
 331      * values, though similar, skip the value 17.
 332      * @param ch code point whose type is to be determined
 333      * @return category which is a value of UCharacterCategory
 334      * @stable ICU 2.1
 335      */
 336     public static int getType(int ch)
 337     {
 338         return UCharacterProperty.INSTANCE.getType(ch);
 339     }
 340 
 341     /**
 342      * Returns the bidirectional property of a code point.
 343      * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
 344      * property.<br>
 345      * The result is a constant from the interface
 346      * <a href="UCharacterDirection.html">UCharacterDirection</a>
 347      * @param ch the code point whose direction is to be determined
 348      * @return direction constant from UCharacterDirection.
 349      * @stable ICU 2.1
 350      */
 351     public static int getDirection(int ch)
 352     {
 353         return UBiDiProps.INSTANCE.getClass(ch);  // getClass(int) overload taking a code point, not Object.getClass()
 354     }
 355 
 356     /**
 357      * Maps the specified code point to a "mirror-image" code point.
 358      * For code points with the "mirrored" property, implementations sometimes
 359      * need a "poor man's" mapping to another code point such that the default
 360      * glyph may serve as the mirror-image of the default glyph of the
 361      * specified code point.<br>
 362      * This is useful for text conversion to and from codepages with visual
 363      * order, and for displays without glyph selection capabilities.
 364      * @param ch code point whose mirror is to be retrieved
 365      * @return another code point that may serve as a mirror-image substitute,
 366      *         or {@code ch} itself if there is no such mapping or {@code ch}
 367      *         does not have the "mirrored" property
 368      * @stable ICU 2.1
 369      */
 370     public static int getMirror(int ch)
 371     {
 372         return UBiDiProps.INSTANCE.getMirror(ch);
 373     }
 374 
 375     /**
 376      * Maps the specified character to its paired bracket character.
 377      * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
 378      * Otherwise {@code c} itself is returned.
 379      * See http://www.unicode.org/reports/tr9/
 380      *
 381      * @param c the code point to be mapped
 382      * @return the paired bracket code point,
 383      *         or {@code c} itself if there is no such mapping
 384      *         (Bidi_Paired_Bracket_Type=None)
 385      *
 386      * @see UProperty#BIDI_PAIRED_BRACKET
 387      * @see UProperty#BIDI_PAIRED_BRACKET_TYPE
 388      * @see #getMirror(int)
 389      * @stable ICU 52
 390      */
 391     public static int getBidiPairedBracket(int c) {
 392         return UBiDiProps.INSTANCE.getPairedBracket(c);
 393     }
 394 
 395     /**
 396      * Returns the combining class of a code point, looked up via the NFD Normalizer2 instance.
 397      * @param ch code point whose combining class is to be retrieved
 398      * @return the combining class of the code point
 399      * @stable ICU 2.1
 400      */
 401     public static int getCombiningClass(int ch)
 402     {
 403         return Normalizer2.getNFDInstance().getCombiningClass(ch);
 404     }
 405 
 406     /**
 407      * Returns the version of Unicode data used, as held by UCharacterProperty.
 408      * @return the Unicode version number used
 409      * @stable ICU 2.1
 410      */
 411     public static VersionInfo getUnicodeVersion()
 412     {
 413         return UCharacterProperty.INSTANCE.m_unicodeVersion_;
 414     }
 415 
 416     /**
 417      * Combines a UTF-16 surrogate pair into the code point it encodes.
 418      * @param lead the lead (first) surrogate of the pair
 419      * @param trail the trail (second) surrogate of the pair
 420      * @return the code point represented by the two surrogates
 421      * @exception IllegalArgumentException if lead is not a lead surrogate
 422      *            or trail is not a trail surrogate
 423      * @stable ICU 2.1
 424      */
 425     public static int getCodePoint(char lead, char trail)
 426     {
 427         if (!UTF16.isLeadSurrogate(lead) || !UTF16.isTrailSurrogate(trail)) {
 428             throw new IllegalArgumentException("Illegal surrogate characters");
 429         }
 430         return UCharacterProperty.getRawSupplementary(lead, trail);
 431     }
 432 
 433     /**
 434      * Returns the "age" of a code point, i.e. the Unicode version in which
 435      * it was first designated (as a non-character or for Private Use) or
 436      * assigned a character.
 437      * <p>This can be useful to avoid emitting code points to receiving
 438      * processes that do not accept newer characters.</p>
 439      * <p>The data is from the UCD file DerivedAge.txt.</p>
 440      * @param ch the code point, within MIN_VALUE..MAX_VALUE inclusive
 441      * @return the Unicode version number
 442      * @exception IllegalArgumentException if ch lies outside that range
 443      * @stable ICU 2.6
 444      */
 445     public static VersionInfo getAge(int ch)
 446     {
 447         if (MIN_VALUE <= ch && ch <= MAX_VALUE) {
 448             return UCharacterProperty.INSTANCE.getAge(ch);
 449         }
 450         throw new IllegalArgumentException("Codepoint out of bounds");
 451     }
 452 






 453     /**
 454      * <p>Returns the property value for a Unicode property type of a code point.
 455      * Also returns binary and mask property values.</p>
 456      * <p>Unicode, especially in version 3.2, defines many more properties than
 457      * the original set in UnicodeData.txt.</p>
 458      * <p>The properties APIs are intended to reflect Unicode properties as
 459      * defined in the Unicode Character Database (UCD) and Unicode Technical
 460      * Reports (UTR). For details about the properties see
 461      * http://www.unicode.org/.</p>
 462      * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
 463      * </p>
 464      * <pre>
 465      * Sample usage:
 466      * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
 467      * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
 468      * boolean b = (ideo == 1);
 469      * </pre>
 470      * @param ch code point to test.
 471      * @param type UProperty selector constant, identifies which
 472      *        property to check. Must be
 473      *        UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
 474      *        UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
 475      *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
 476      * @return numeric value that is directly the property value or,
 477      *         for enumerated properties, corresponds to the numeric value of
 478      *         the enumerated constant of the respective property value
 479      *         enumeration type (cast to enum type if necessary).
 480      *         Returns 0 or 1 (for false / true) for binary Unicode properties.
 481      *         Returns a bit-mask for mask properties.
 482      *         Returns 0 if 'type' is out of bounds or if the Unicode version
 483      *         does not have data for the property at all, or not for this code
 484      *         point.
 485      * @see UProperty
 486      * @see #hasBinaryProperty
 487      * @see #getIntPropertyMinValue
 488      * @see #getIntPropertyMaxValue
 489      * @see #getUnicodeVersion
 490      * @stable ICU 2.4
 491      */
 492      // for BiDiBase.java
 493     public static int getIntPropertyValue(int ch, int type) {
 494         return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
 495     }
 496 
 497     // private constructor -----------------------------------------------
 498 
 499     /**
 500      * Private, empty constructor: prevents instantiation of this class.
 501      */
 502     private UCharacter() { }










 503 
 504       /*
 505        * Copied from UCharacterEnums.java
 506        */





 507 
 508         /**
 509          * Character type Mn (non-spacing mark)
 510          * @stable ICU 2.1
 511          */
 512         public static final byte NON_SPACING_MARK        = 6;
 513         /**
 514          * Character type Me (enclosing mark)
 515          * @stable ICU 2.1
 516          */
 517         public static final byte ENCLOSING_MARK          = 7;



 518         /**
 519          * Character type Mc (combining spacing mark)
 520          * @stable ICU 2.1




 521          */
 522         public static final byte COMBINING_SPACING_MARK  = 8;

















 523         /**
 524          * Character type count, i.e. the number of character type values
 525          * @stable ICU 2.1

 526          */
 527         public static final byte CHAR_CATEGORY_COUNT     = 30;



 528 
 529         /**
 530          * Directional type R (right-to-left)
 531          * @stable ICU 2.1








 532          */
 533         public static final int RIGHT_TO_LEFT              = 1;
 534         /**
 535          * Directional type AL (right-to-left Arabic)
 536          * @stable ICU 2.1
 537          */
 538         public static final int RIGHT_TO_LEFT_ARABIC       = 13;






























 539 }
< prev index next >