1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. 
/*
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.MissingResourceException;

/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaningful form lies on
 * <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /**
     * Trie data
     */
    public CharTrie m_trie_;
    /**
     * Optimization
     * CharTrie index array
     */
    public char[] m_trieIndex_;
    /**
     * Optimization
     * CharTrie data array
     */
    public char[] m_trieData_;
    /**
     * Optimization
     * CharTrie data offset
     */
    public int m_trieInitialValue_;
    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=9;

    // public methods ----------------------------------------------------

    /**
     * Java friends implementation: copies the trie's private index array,
     * data array and initial value into this object's public fields.
     * @param friendagent accessor handed out by the CharTrie
     */
    public void setIndexData(CharTrie.FriendAgent friendagent)
    {
        m_trieIndex_ = friendagent.getPrivateIndex();
        m_trieData_ = friendagent.getPrivateData();
        m_trieInitialValue_ = friendagent.getPrivateInitialValue();
    }

    /**
     * Gets the property value at the index.
     * This is optimized.
     * Note this is a little different from CharTrie the index m_trieData_
     * is never negative.
     * This all is an inlined form of return m_trie_.getCodePointValue(ch);
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point
     */
    public final int getProperty(int ch)
    {
        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
            || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
                && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
            // BMP codepoint 0000..D7FF or DC00..FFFF
            // optimized
            try { // using try for ch < 0 is faster than using an if statement
                return m_trieData_[
                    (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
            } catch (ArrayIndexOutOfBoundsException e) {
                // deliberate: an out-of-range index maps to the default value
                return m_trieInitialValue_;
            }
        }
        if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            // lead surrogate D800..DBFF
            return m_trieData_[
                    (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
                                  + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
                          << Trie.INDEX_STAGE_2_SHIFT_)
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
        }
        if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
            // supplementary code point 10000..10FFFF
            // look at the construction of supplementary characters
            // trail forms the ends of it.
            return m_trie_.getSurrogateValue(
                          UTF16.getLeadSurrogate(ch),
                          (char)(ch & Trie.SURROGATE_MASK_));
        }
        // ch is out of bounds
        // return m_dataOffset_ if there is an error, in this case we return
        // the default value: m_initialValue_
        // we cannot assume that m_initialValue_ is at offset 0
        // this is for optimization.
        return m_trieInitialValue_;
    }

    /**
     * Getting the unsigned numeric value of a character embedded in the property
     * argument
     * @param prop the character
     * @return unsigned numeric value
     */
    public static int getUnsignedValue(int prop)
    {
        return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
    }

    /**
     * Gets the unicode additional properties.
     * C version getUnicodeProperties.
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column column of the additional vectors to read; -1 returns
     *               the main property value instead, and any other value
     *               outside 0..m_additionalColumnsCount_-1 yields 0
     * @return unicode properties
     */
    public int getAdditional(int codepoint, int column) {
        if (column == -1) {
            return getProperty(codepoint);
        }
        if (column < 0 || column >= m_additionalColumnsCount_) {
            return 0;
        }
        return m_additionalVectors_[
                     m_additionalTrie_.getCodePointValue(codepoint) + column];
    }

    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Loads the property data and initialize the UCharacterProperty instance.
     * The method is synchronized so that concurrent first callers cannot
     * each construct (and publish) a separate instance.
     * @return the shared UCharacterProperty instance
     * @throws MissingResourceException when data is missing or data has been
     *         corrupted; the underlying exception is attached as the cause
     */
    public static synchronized UCharacterProperty getInstance()
    {
        if(INSTANCE_ == null) {
            try {
                INSTANCE_ = new UCharacterProperty();
            }
            catch (Exception e) {
                // Boundary catch: callers are documented to receive a
                // MissingResourceException for any load failure. Keep the
                // original failure reachable through getCause().
                MissingResourceException failure =
                    new MissingResourceException(e.getMessage(), DATA_FILE_NAME_, "");
                failure.initCause(e);
                throw failure;
            }
        }
        return INSTANCE_;
    }

    /**
     * Checks if the argument c is to be treated as a white space in ICU
     * rules. Usually ICU rule white spaces are ignored unless quoted.
     * Equivalent to test for Pattern_White_Space Unicode property.
     * Stable set of characters, won't change.
     * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
     * @param c codepoint to check
     * @return true if c is a ICU white space
     */
    public static boolean isRuleWhiteSpace(int c)
    {
        /* "white space" in the sense of ICU rule parsers
           This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
           See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
           U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
           Equivalent to test for Pattern_White_Space Unicode property.
        */
        return (c >= 0x0009 && c <= 0x2029 &&
                (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
                 c == 0x200E || c == 0x200F || c >= 0x2028));
    }

    // protected variables -----------------------------------------------

    /**
     * Extra property trie
     */
    CharTrie m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
    int m_maxJTGValue_;

    // private variables -------------------------------------------------

    /**
     * UnicodeData.txt property object
     */
    private static UCharacterProperty INSTANCE_ = null;

    /**
     * Default name of the datafile
     */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
     * Default buffer size of datafile
     */
    private static final int DATA_BUFFER_SIZE_ = 25000;

    /**
     * Numeric value shift
     */
    private static final int VALUE_SHIFT_ = 8;

    /**
     * Mask to be applied after shifting to obtain an unsigned numeric value
     */
    private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;

    // additional properties ----------------------------------------------

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
     * Constructor. Reads the property data file through a buffered stream
     * and then hands this object to the main trie via putIndexData.
     * The stream is closed even when reading fails.
     * @exception IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // jar access
        InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
        BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
        try {
            UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
            reader.read(this);
        } finally {
            // previously skipped when read() threw, leaking the stream
            b.close();
        }

        m_trie_.putIndexData(this);
    }

    /**
     * Adds the start code point of each same-value range of the properties
     * vectors trie to the given set.
     * @param set set receiving the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
            RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
            while(propsVectorsIter.next(propsVectorsResult)){
                set.add(propsVectorsResult.start);
            }
        }
    }

}

/*
 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 */
/*
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 1996-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;

import sun.text.normalizer.UCharacter.HangulSyllableType;
import sun.text.normalizer.UCharacter.NumericType;

/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into more meaningful form lies on
 * <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /*
     * public singleton instance
     */
    public static final UCharacterProperty INSTANCE;

    /**
     * Trie data
     */
    public Trie2_16 m_trie_;

    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;

    /**
     * Character type mask
     */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=5;
    /** From normalizer2impl.cpp/nfc.nrm */
    public static final int SRC_NFC=8;
    /** From normalizer2impl.cpp/nfkc.nrm */
    public static final int SRC_NFKC=9;

    // public methods ----------------------------------------------------

    /**
     * Gets the main property value for code point ch.
     * @param ch code point whose property value is to be retrieved
     * @return property value of code point
     */
    public final int getProperty(int ch)
    {
        return m_trie_.get(ch);
    }

    /**
     * Gets the unicode additional properties.
     * Java version of C u_getUnicodeProperties().
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
     * @param column The column index; asserted to be non-negative, and a
     *               value at or beyond m_additionalColumnsCount_ yields 0.
     * @return unicode properties
     */
    public int getAdditional(int codepoint, int column) {
        assert column >= 0;
        if (column >= m_additionalColumnsCount_) {
            return 0;
        }
        return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    }

    /**
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Gets the character type bits of the main property value.
     * @param c code point
     * @return the main property value masked with TYPE_MASK
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };

    /**
     * An integer-valued property read from one additional-vector column by
     * masking and shifting. Kept as an inner (non-static) class because
     * getValue calls the enclosing instance's getAdditional.
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;

        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }

        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }

        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
    }

    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
    }

    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
    }

    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;
        int max;

        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
    }

    private IntProperty intProp = new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
        @Override
        int getValue(int c) {
            return UBiDiProps.INSTANCE.getPairedBracketType(c);
        }
    };

    /**
     * Gets the value of an integer property for a code point.
     * Only BIDI_PAIRED_BRACKET_TYPE is handled here.
     * @param c code point
     * @param which property selector
     * @return the property value, or 0 for any other selector
     */
    public int getIntPropertyValue(int c, int which) {
        if (which == BIDI_PAIRED_BRACKET_TYPE) {
            return intProp.getValue(c);
        }
        return 0; // undefined
    }

    /**
     * Forms a supplementary code point from the argument character<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail)
    {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /**
     * Gets the type mask
     * @param type character type
     * @return mask
     */
    public static final int getMask(int type)
    {
        return 1 << type;
    }

    /**
     * Returns the digit values of characters like 'A' - 'Z', normal,
     * half-width and full-width. This method assumes that the other digit
     * characters are checked by the calling method.
     * @param ch character to test
     * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
     *         its corresponding digit will be returned.
     */
    public static int getEuropeanDigit(int ch) {
        if ((ch > 0x7a && ch < 0xff21)
            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
            return -1;
        }
        if (ch <= 0x7a) {
            // ch >= 0x41 or ch < 0x61
            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
        }
        // ch >= 0xff21
        if (ch <= 0xff3a) {
            return ch + 10 - 0xff21;
        }
        // ch >= 0xff41 && ch <= 0xff5a
        return ch + 10 - 0xff41;
    }

    /**
     * Gets the decimal digit value stored for a code point.
     * @param c code point
     * @return the stored numeric type/value minus NTV_DECIMAL_START_ when
     *         that difference is at most 9 (which is -1 when the stored
     *         value is NTV_NONE_), otherwise -1
     */
    public int digit(int c) {
        int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
        if(value<=9) {
            return value;
        } else {
            return -1;
        }
    }

    // protected variables -----------------------------------------------

    /**
     * Extra property trie
     */
    Trie2_16 m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     */
    int m_maxJTGValue_;
    /**
     * Script_Extensions data
     */
    public char[] m_scriptExtensions_;

    // private variables -------------------------------------------------

    /**
     * Default name of the datafile
     */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;


    // property data constants -------------------------------------------------

    /**
     * Numeric types and values in the main properties words.
     */
    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
    private static final int getNumericTypeValue(int props) {
        return props >> NUMERIC_TYPE_VALUE_SHIFT_;
    }

    /* constants for the storage form of numeric types and values */
    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 */
    private static final int NTV_NUMERIC_START_ = 21;

    private static final int ntvGetType(int ntv) {
        return
            (ntv==NTV_NONE_) ? NumericType.NONE :
            (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
            (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
            NumericType.NUMERIC;
    }

    /*
     * Properties in vector word 0
     * Bits
     * 31..24   DerivedAge version major/minor one nibble each
     * 23..22   3..1: Bits 7..0 = Script_Extensions index
     *             3: Script value from Script_Extensions
     *             2: Script=Inherited
     *             1: Script=Common
     *             0: Script=bits 7..0
     * 21..20   reserved
     * 19..17   East Asian Width
     * 16.. 8   UBlockCode
     *  7.. 0   UScriptCode
     */
    /**
     * Script_Extensions: mask includes Script
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties mask and shift values for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties mask and shift values for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask and shift values for scripts.
     * Equivalent to icu4c UPROPS_SHIFT_MASK
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_    = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;

    /*
     * Properties in vector word 2
     * Bits
     * 31..26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type.
     * Equivalent to icu4c UPROPS_DT_MASK.
     */
    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
     * Constructor. Reads the header, the 16 index words, the main trie and,
     * when additional columns are present, the additional trie and vectors,
     * followed by the Script_Extensions characters.
     * @exception IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // jar access
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
        // Read or skip the 16 indexes.
        int propertyOffset = bytes.getInt();
        /* exceptionOffset = */ bytes.getInt();
        /* caseOffset = */ bytes.getInt();
        int additionalOffset = bytes.getInt();
        int additionalVectorsOffset = bytes.getInt();
        m_additionalColumnsCount_ = bytes.getInt();
        int scriptExtensionsOffset = bytes.getInt();
        int reservedOffset7 = bytes.getInt();
        /* reservedOffset8 = */ bytes.getInt();
        /* dataTopOffset = */ bytes.getInt();
        m_maxBlockScriptValue_ = bytes.getInt();
        m_maxJTGValue_ = bytes.getInt();
        // 12 of the 16 index words were consumed above; skip the remaining 4 ints
        ICUBinary.skipBytes(bytes, (16 - 12) << 2);

        // read the main properties trie
        m_trie_ = Trie2_16.createFromSerialized(bytes);
        int expectedTrieLength = (propertyOffset - 16) * 4;
        int trieLength = m_trie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = new int[size];
            for (int i = 0; i < size; i ++) {
                m_additionalVectors_[i] = bytes.getInt();
            }
        }

        // Script_Extensions
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = new char[numChars];
            for(int i = 0; i < numChars; ++i) {
                m_scriptExtensions_[i] = bytes.getChar();
            }
        }
    }

    /** Accepts data whose first version byte is 7. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 7;
        }
    }

    private static final int DATA_FORMAT = 0x5550726F;  // "UPro"

    /**
     * Adds the start code point of each same-value range of the properties
     * vectors trie to the given set, stopping at the first lead-surrogate
     * range reported by the trie iterator.
     * @param set set receiving the range start code points
     */
    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                set.add(range.startCodePoint);
            }
        }
    }

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UCharacterProperty();
        }
        catch (IOException e) {
            // Keep the I/O failure reachable through getCause(); the
            // MissingResourceException constructors do not take a cause.
            MissingResourceException failure =
                new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
            failure.initCause(e);
            throw failure;
        }
    }


    // Moved from UProperty.java
    /**
     * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
     * Used in UAX #9: Unicode Bidirectional Algorithm
     * (http://www.unicode.org/reports/tr9/)
     * Returns UCharacter.BidiPairedBracketType values.
     * @stable ICU 52
     */
    public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;

}