1 /* 2 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 import java.io.IOException; 40 import java.util.MissingResourceException; 41 42 /** 43 * <p> 44 * The UCharacter class provides extensions to the 45 * <a href="https://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html"> 46 * java.lang.Character</a> class. These extensions provide support for 47 * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a> 48 * class, provide support for supplementary characters (those with code 49 * points above U+FFFF). 50 * Each ICU release supports the latest version of Unicode available at that time. 51 * </p> 52 * <p> 53 * Code points are represented in these API using ints. While it would be 54 * more convenient in Java to have a separate primitive datatype for them, 55 * ints suffice in the meantime. 56 * </p> 57 * <p> 58 * To use this class please add the jar file name icu4j.jar to the 59 * class path, since it contains data files which supply the information used 60 * by this file.<br> 61 * E.g. In Windows <br> 62 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br> 63 * Otherwise, another method would be to copy the files uprops.dat and 64 * unames.icu from the icu4j source subdirectory 65 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory 66 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. 67 * </p> 68 * <p> 69 * Aside from the additions for UTF-16 support, and the updated Unicode 70 * properties, the main differences between UCharacter and Character are: 71 * <ul> 72 * <li> UCharacter is not designed to be a char wrapper and does not have 73 * APIs to which involves management of that single char.<br> 74 * These include: 75 * <ul> 76 * <li> char charValue(), 77 * <li> int compareTo(java.lang.Character, java.lang.Character), etc. 78 * </ul> 79 * <li> UCharacter does not include Character APIs that are deprecated, nor 80 * does it include the Java-specific character information, such as 81 * boolean isJavaIdentifierPart(char ch). 82 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric 83 * values '10' - '35'. UCharacter also does this in digit and 84 * getNumericValue, to adhere to the java semantics of these 85 * methods. New methods unicodeDigit, and 86 * getUnicodeNumericValue do not treat the above code points 87 * as having numeric values. This is a semantic change from ICU4J 1.3.1. 88 * </ul> 89 * <p> 90 * Further detail differences can be determined from the program 91 * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java"> 92 * com.ibm.icu.dev.test.lang.UCharacterCompare</a> 93 * </p> 94 * <p> 95 * In addition to Java compatibility functions, which calculate derived properties, 96 * this API provides low-level access to the Unicode Character Database. 97 * </p> 98 * <p> 99 * Unicode assigns each code point (not just assigned character) values for 100 * many properties. 101 * Most of them are simple boolean flags, or constants from a small enumerated list. 102 * For some properties, values are strings or other relatively more complex types. 103 * </p> 104 * <p> 105 * For more information see 106 * "About the Unicode Character Database" (http://www.unicode.org/ucd/) 107 * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html). 108 * </p> 109 * <p> 110 * There are also functions that provide easy migration from C/POSIX functions 111 * like isblank(). Their use is generally discouraged because the C/POSIX 112 * standards do not define their semantics beyond the ASCII range, which means 113 * that different implementations exhibit very different behavior. 114 * Instead, Unicode properties should be used directly. 115 * </p> 116 * <p> 117 * There are also only a few, broad C/POSIX character classes, and they tend 118 * to be used for conflicting purposes. For example, the "isalpha()" class 119 * is sometimes used to determine word boundaries, while a more sophisticated 120 * approach would at least distinguish initial letters from continuation 121 * characters (the latter including combining marks). 122 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) 123 * Another example: There is no "istitle()" class for titlecase characters. 124 * </p> 125 * <p> 126 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. 127 * ICU implements them according to the Standard Recommendations in 128 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions 129 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). 130 * </p> 131 * <p> 132 * API access for C/POSIX character classes is as follows: 133 * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC) 134 * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE) 135 * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE) 136 * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0 137 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER 138 * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT) 139 * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM) 140 * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE) 141 * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK) 142 * - cntrl: getType(c)==CONTROL 143 * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH) 144 * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT) 145 * </p> 146 * <p> 147 * The C/POSIX character classes are also available in UnicodeSet patterns, 148 * using patterns like [:graph:] or \p{graph}. 149 * </p> 150 * <p> 151 * Note: There are several ICU (and Java) whitespace functions. 152 * Comparison: 153 * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; 154 * most of general categories "Z" (separators) + most whitespace ISO controls 155 * (including no-break spaces, but excluding IS1..IS4 and ZWSP) 156 * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces 157 * - isSpaceChar: just Z (including no-break spaces) 158 * </p> 159 * <p> 160 * This class is not subclassable 161 * </p> 162 * @author Syn Wee Quek 163 * @stable ICU 2.1 164 * @see com.ibm.icu.lang.UCharacterEnums 165 */ 166 167 public final class UCharacter 168 { 169 170 /** 171 * Numeric Type constants. 172 * @see UProperty#NUMERIC_TYPE 173 * @stable ICU 2.4 174 */ 175 public static interface NumericType 176 { 177 /** 178 * @stable ICU 2.4 179 */ 180 public static final int DECIMAL = 1; 181 } 182 183 // public data members ----------------------------------------------- 184 185 /** 186 * The lowest Unicode code point value. 187 * @stable ICU 2.1 188 */ 189 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE; 190 191 /** 192 * The highest Unicode code point value (scalar value) according to the 193 * Unicode Standard. 194 * This is a 21-bit value (21 bits, rounded up).<br> 195 * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE 196 * @stable ICU 2.1 197 */ 198 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; 199 200 /** 201 * The minimum value for Supplementary code points 202 * @stable ICU 2.1 203 */ 204 public static final int SUPPLEMENTARY_MIN_VALUE = 205 UTF16.SUPPLEMENTARY_MIN_VALUE; 206 207 // public methods ---------------------------------------------------- 208 209 /** 210 * Retrieves the numeric value of a decimal digit code point. 211 * <br>This method observes the semantics of 212 * <code>java.lang.Character.digit()</code>. Note that this 213 * will return positive values for code points for which isDigit 214 * returns false, just like java.lang.Character. 215 * <br><em>Semantic Change:</em> In release 1.3.1 and 216 * prior, this did not treat the European letters as having a 217 * digit value, and also treated numeric letters and other numbers as 218 * digits. 219 * This has been changed to conform to the java semantics. 220 * <br>A code point is a valid digit if and only if: 221 * <ul> 222 * <li>ch is a decimal digit or one of the european letters, and 223 * <li>the value of ch is less than the specified radix. 224 * </ul> 225 * @param ch the code point to query 226 * @param radix the radix 227 * @return the numeric value represented by the code point in the 228 * specified radix, or -1 if the code point is not a decimal digit 229 * or if its value is too large for the radix 230 * @stable ICU 2.1 231 */ 232 public static int digit(int ch, int radix) 233 { 234 // when ch is out of bounds getProperty == 0 235 int props = getProperty(ch); 236 int value; 237 if (getNumericType(props) == NumericType.DECIMAL) { 238 value = UCharacterProperty.getUnsignedValue(props); 239 } else { 240 value = getEuropeanDigit(ch); 241 } 242 return (0 <= value && value < radix) ? value : -1; 243 } 244 245 /** 246 * Returns the Bidirection property of a code point. 247 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional 248 * property.<br> 249 * Result returned belongs to the interface 250 * <a href=UCharacterDirection.html>UCharacterDirection</a> 251 * @param ch the code point to be determined its direction 252 * @return direction constant from UCharacterDirection. 253 * @stable ICU 2.1 254 */ 255 public static int getDirection(int ch) 256 { 257 return gBdp.getClass(ch); 258 } 259 260 /** 261 * Returns a code point corresponding to the two UTF16 characters. 262 * @param lead the lead char 263 * @param trail the trail char 264 * @return code point if surrogate characters are valid. 265 * @exception IllegalArgumentException thrown when argument characters do 266 * not form a valid codepoint 267 * @stable ICU 2.1 268 */ 269 public static int getCodePoint(char lead, char trail) 270 { 271 if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { 272 return UCharacterProperty.getRawSupplementary(lead, trail); 273 } 274 throw new IllegalArgumentException("Illegal surrogate characters"); 275 } 276 277 /** 278 * <p>Get the "age" of the code point.</p> 279 * <p>The "age" is the Unicode version when the code point was first 280 * designated (as a non-character or for Private Use) or assigned a 281 * character. 282 * <p>This can be useful to avoid emitting code points to receiving 283 * processes that do not accept newer characters.</p> 284 * <p>The data is from the UCD file DerivedAge.txt.</p> 285 * @param ch The code point. 286 * @return the Unicode version number 287 * @stable ICU 2.6 288 */ 289 public static VersionInfo getAge(int ch) 290 { 291 if (ch < MIN_VALUE || ch > MAX_VALUE) { 292 throw new IllegalArgumentException("Codepoint out of bounds"); 293 } 294 return PROPERTY_.getAge(ch); 295 } 296 297 // private variables ------------------------------------------------- 298 299 /** 300 * Database storing the sets of character property 301 */ 302 private static final UCharacterProperty PROPERTY_; 303 /** 304 * For optimization 305 */ 306 private static final char[] PROPERTY_TRIE_INDEX_; 307 private static final char[] PROPERTY_TRIE_DATA_; 308 private static final int PROPERTY_INITIAL_VALUE_; 309 310 private static final UBiDiProps gBdp; 311 312 // block to initialise character property database 313 static 314 { 315 try 316 { 317 PROPERTY_ = UCharacterProperty.getInstance(); 318 PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_; 319 PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_; 320 PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_; 321 } 322 catch (Exception e) 323 { 324 throw new MissingResourceException(e.getMessage(),"",""); 325 } 326 327 UBiDiProps bdp; 328 try { 329 bdp=UBiDiProps.getSingleton(); 330 } catch(IOException e) { 331 bdp=UBiDiProps.getDummy(); 332 } 333 gBdp=bdp; 334 } 335 336 /** 337 * Shift to get numeric type 338 */ 339 private static final int NUMERIC_TYPE_SHIFT_ = 5; 340 /** 341 * Mask to get numeric type 342 */ 343 private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_; 344 345 // private methods --------------------------------------------------- 346 347 /** 348 * Getting the digit values of characters like 'A' - 'Z', normal, 349 * half-width and full-width. This method assumes that the other digit 350 * characters are checked by the calling method. 351 * @param ch character to test 352 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 353 * its corresponding digit will be returned. 354 */ 355 private static int getEuropeanDigit(int ch) { 356 if ((ch > 0x7a && ch < 0xff21) 357 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 358 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 359 return -1; 360 } 361 if (ch <= 0x7a) { 362 // ch >= 0x41 or ch < 0x61 363 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 364 } 365 // ch >= 0xff21 366 if (ch <= 0xff3a) { 367 return ch + 10 - 0xff21; 368 } 369 // ch >= 0xff41 && ch <= 0xff5a 370 return ch + 10 - 0xff41; 371 } 372 373 /** 374 * Gets the numeric type of the property argument 375 * @param props 32 bit property 376 * @return the numeric type 377 */ 378 private static int getNumericType(int props) 379 { 380 return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; 381 } 382 383 /** 384 * Gets the property value at the index. 385 * This is optimized. 386 * Note this is alittle different from CharTrie the index m_trieData_ 387 * is never negative. 388 * This is a duplicate of UCharacterProperty.getProperty. For optimization 389 * purposes, this method calls the trie data directly instead of through 390 * UCharacterProperty.getProperty. 391 * @param ch code point whose property value is to be retrieved 392 * @return property value of code point 393 * @stable ICU 2.6 394 */ 395 private static final int getProperty(int ch) 396 { 397 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE 398 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE 399 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { 400 // BMP codepoint 0000..D7FF or DC00..FFFF 401 try { // using try for ch < 0 is faster than using an if statement 402 return PROPERTY_TRIE_DATA_[ 403 (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) 404 + (ch & 0x1f)]; 405 } catch (ArrayIndexOutOfBoundsException e) { 406 return PROPERTY_INITIAL_VALUE_; 407 } 408 } 409 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 410 // lead surrogate D800..DBFF 411 return PROPERTY_TRIE_DATA_[ 412 (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) 413 + (ch & 0x1f)]; 414 } 415 // for optimization 416 if (ch <= UTF16.CODEPOINT_MAX_VALUE) { 417 // supplementary code point 10000..10FFFF 418 // look at the construction of supplementary characters 419 // trail forms the ends of it. 420 return PROPERTY_.m_trie_.getSurrogateValue( 421 UTF16.getLeadSurrogate(ch), 422 (char)(ch & 0x3ff)); 423 } 424 // return m_dataOffset_ if there is an error, in this case we return 425 // the default value: m_initialValue_ 426 // we cannot assume that m_initialValue_ is at offset 0 427 // this is for optimization. 428 return PROPERTY_INITIAL_VALUE_; 429 } 430 431 }