/*
 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.MissingResourceException;

/**
 * <p>Internal class used for the Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, an array of char is
 * used to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies on <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /**
     * Trie data
     */
    public CharTrie m_trie_;
    /**
     * Optimization
     * CharTrie index array
     */
    public char[] m_trieIndex_;
    /**
     * Optimization
     * CharTrie data array
     */
    public char[] m_trieData_;
    /**
     * Optimization
     * CharTrie data offset
     */
    public int m_trieInitialValue_;
    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=9;

    // public methods ----------------------------------------------------

    /**
     * Java friends implementation.
     * Copies the index array, data array and initial value exposed by the
     * friend agent into the public "optimization" fields of this object so
     * that {@link #getProperty(int)} can index them directly.
     * @param friendagent accessor to the private arrays of a CharTrie
     */
    public void setIndexData(CharTrie.FriendAgent friendagent)
    {
        m_trieIndex_ = friendagent.getPrivateIndex();
        m_trieData_ = friendagent.getPrivateData();
        m_trieInitialValue_ = friendagent.getPrivateInitialValue();
    }

    /**
     * Gets the property value at the index.
     * This is optimized.
     * Note this is a little different from CharTrie: the index into
     * m_trieData_ is never negative.
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point, or the trie's initial value if
     *         ch is not a valid code point
     */
    public final int getProperty(int ch)
    {
        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
            || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
                && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
            // BMP codepoint 0000..D7FF or DC00..FFFF
            // optimized
            try { // using try for ch < 0 is faster than using an if statement
                return m_trieData_[
                    (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
            } catch (ArrayIndexOutOfBoundsException e) {
                // deliberate: a negative ch yields a negative index and
                // lands here instead of paying for an explicit range check
                return m_trieInitialValue_;
            }
        }
        if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            // lead surrogate D800..DBFF
            return m_trieData_[
                    (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
                                  + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
        }
        if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
            // supplementary code point 10000..10FFFF
            // look at the construction of supplementary characters
            // trail forms the ends of it.
            return m_trie_.getSurrogateValue(
                                          UTF16.getLeadSurrogate(ch),
                                          (char)(ch & Trie.SURROGATE_MASK_));
        }
        // ch is out of bounds
        // return m_dataOffset_ if there is an error, in this case we return
        // the default value: m_initialValue_
        // we cannot assume that m_initialValue_ is at offset 0
        // this is for optimization.
        return m_trieInitialValue_;

        // this all is an inlined form of return m_trie_.getCodePointValue(ch);
    }

    /**
     * Getting the unsigned numeric value of a character embedded in the
     * property argument.
     * @param prop the property value of the character
     * @return unsigned numeric value
     */
    public static int getUnsignedValue(int prop)
    {
        return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
    }

    /**
     * Gets the unicode additional properties.
     * C version getUnicodeProperties.
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column index of the additional properties vector column to
     *               read; -1 returns the main property value from
     *               {@link #getProperty(int)} instead
     * @return unicode properties, or 0 if column is outside the range of
     *         available columns
     */
    public int getAdditional(int codepoint, int column) {
        if (column == -1) {
            return getProperty(codepoint);
        }
        if (column < 0 || column >= m_additionalColumnsCount_) {
            return 0;
        }
        // the additional trie maps the code point to the start of its
        // vector; column selects the word within that vector
        return m_additionalVectors_[
                     m_additionalTrie_.getCodePointValue(codepoint) + column];
    }

    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        // the age is stored above bit AGE_SHIFT_ of additional column 0,
        // one nibble for each of the two leading version fields
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Loads the property data and initializes the UCharacterProperty
     * instance on first use; later calls return the cached instance.
     * <p>NOTE(review): the lazy initialization below is not synchronized;
     * confirm whether callers can reach this method concurrently before
     * the first call has completed.</p>
     * @return the shared UCharacterProperty instance
     * @throws MissingResourceException when data is missing or data has been
     *         corrupted; the underlying exception is attached as the cause
     */
    public static UCharacterProperty getInstance()
    {
        if (INSTANCE_ == null) {
            try {
                INSTANCE_ = new UCharacterProperty();
            }
            catch (Exception e) {
                // MissingResourceException has no public constructor taking
                // a cause, so attach it explicitly to keep the original
                // failure and its stack trace available to callers.
                MissingResourceException failure =
                    new MissingResourceException(e.getMessage(), "", "");
                failure.initCause(e);
                throw failure;
            }
        }
        return INSTANCE_;
    }

    /**
     * Checks if the argument c is to be treated as a white space in ICU
     * rules. Usually ICU rule white spaces are ignored unless quoted.
     * Equivalent to test for Pattern_White_Space Unicode property.
     * Stable set of characters, won't change.
     * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
     * @param c codepoint to check
     * @return true if c is a ICU white space
     */
    public static boolean isRuleWhiteSpace(int c)
    {
        /* "white space" in the sense of ICU rule parsers
           This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
           See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
           U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
           Equivalent to test for Pattern_White_Space Unicode property.
        */
        return (c >= 0x0009 && c <= 0x2029 &&
                (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
                 c == 0x200E || c == 0x200F || c >= 0x2028));
    }

    // package-private variables -----------------------------------------

    /**
     * Extra property trie
     */
    CharTrie m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
    int m_maxJTGValue_;

    // private variables -------------------------------------------------

    /**
     * UnicodeData.txt property object
     */
    private static UCharacterProperty INSTANCE_ = null;

    /**
     * Default name of the datafile
     */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
     * Default buffer size of datafile
     */
    private static final int DATA_BUFFER_SIZE_ = 25000;

    /**
     * Numeric value shift
     */
    private static final int VALUE_SHIFT_ = 8;

    /**
     * Mask to be applied after shifting to obtain an unsigned numeric value
     */
    private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;

    // additional properties ----------------------------------------------

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
     * Constructor.
     * Opens the data file, lets a UCharacterPropertyReader fill in this
     * object, and then hands this object to the main trie so that the
     * optimization fields get populated.
     * @exception IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // jar access
        InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
        BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
        try {
            UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
            reader.read(this);
        } finally {
            // always release the stream, including when reading fails
            b.close();
        }

        m_trie_.putIndexData(this);
    }

    /**
     * Adds to the set the start code point of each same-value range of the
     * properties vectors trie. Does nothing when there are no additional
     * columns.
     * @param set set that receives the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
            RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
            while(propsVectorsIter.next(propsVectorsResult)){
                set.add(propsVectorsResult.start);
            }
        }
    }

}