1 /* 2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 package sun.util.locale.provider; 41 42 import java.io.BufferedInputStream; 43 import java.io.InputStream; 44 import java.io.IOException; 45 import java.lang.reflect.Module; 46 import java.security.AccessController; 47 import java.security.PrivilegedActionException; 48 import java.security.PrivilegedExceptionAction; 49 import java.util.MissingResourceException; 50 import sun.text.CompactByteArray; 51 import sun.text.SupplementaryCharacterData; 52 53 /** 54 * This is the class that represents the list of known words used by 55 * DictionaryBasedBreakIterator. The conceptual data structure used 56 * here is a trie: there is a node hanging off the root node for every 57 * letter that can start a word. Each of these nodes has a node hanging 58 * off of it for every letter that can be the second letter of a word 59 * if this node is the first letter, and so on. The trie is represented 60 * as a two-dimensional array that can be treated as a table of state 61 * transitions. Indexes are used to compress this array, taking 62 * advantage of the fact that this array will always be very sparse. 63 */ 64 class BreakDictionary { 65 66 //========================================================================= 67 // data members 68 //========================================================================= 69 70 /** 71 * The version of the dictionary that was read in. 72 */ 73 private static int supportedVersion = 1; 74 75 /** 76 * Maps from characters to column numbers. The main use of this is to 77 * avoid making room in the array for empty columns. 78 */ 79 private CompactByteArray columnMap = null; 80 private SupplementaryCharacterData supplementaryCharColumnMap = null; 81 82 /** 83 * The number of actual columns in the table 84 */ 85 private int numCols; 86 87 /** 88 * Columns are organized into groups of 32. This says how many 89 * column groups. (We could calculate this, but we store the 90 * value to avoid having to repeatedly calculate it.) 91 */ 92 private int numColGroups; 93 94 /** 95 * The actual compressed state table. Each conceptual row represents 96 * a state, and the cells in it contain the row numbers of the states 97 * to transition to for each possible letter. 0 is used to indicate 98 * an illegal combination of letters (i.e., the error state). The 99 * table is compressed by eliminating all the unpopulated (i.e., zero) 100 * cells. Multiple conceptual rows can then be doubled up in a single 101 * physical row by sliding them up and possibly shifting them to one 102 * side or the other so the populated cells don't collide. Indexes 103 * are used to identify unpopulated cells and to locate populated cells. 104 */ 105 private short[] table = null; 106 107 /** 108 * This index maps logical row numbers to physical row numbers 109 */ 110 private short[] rowIndex = null; 111 112 /** 113 * A bitmap is used to tell which cells in the comceptual table are 114 * populated. This array contains all the unique bit combinations 115 * in that bitmap. If the table is more than 32 columns wide, 116 * successive entries in this array are used for a single row. 117 */ 118 private int[] rowIndexFlags = null; 119 120 /** 121 * This index maps from a logical row number into the bitmap table above. 122 * (This keeps us from storing duplicate bitmap combinations.) Since there 123 * are a lot of rows with only one populated cell, instead of wasting space 124 * in the bitmap table, we just store a negative number in this index for 125 * rows with one populated cell. The absolute value of that number is 126 * the column number of the populated cell. 127 */ 128 private short[] rowIndexFlagsIndex = null; 129 130 /** 131 * For each logical row, this index contains a constant that is added to 132 * the logical column number to get the physical column number 133 */ 134 private byte[] rowIndexShifts = null; 135 136 //========================================================================= 137 // deserialization 138 //========================================================================= 139 140 BreakDictionary(Module module, String dictionaryName) 141 throws IOException, MissingResourceException { 142 143 readDictionaryFile(module, dictionaryName); 144 } 145 146 private void readDictionaryFile(final Module module, final String dictionaryName) 147 throws IOException, MissingResourceException { 148 149 BufferedInputStream in; 150 try { 151 PrivilegedExceptionAction<BufferedInputStream> pa = () -> { 152 String pathName = "jdk.localedata".equals(module.getName()) ? 153 "sun/text/resources/ext/" : 154 "sun/text/resources/"; 155 InputStream is = module.getResourceAsStream(pathName + dictionaryName); 156 if (is == null) { 157 // Try to load the file with "java.base" module instance. Assumption 158 // here is that the fall back data files to be read should reside in 159 // java.base. 160 is = BreakDictionary.class.getModule().getResourceAsStream("sun/text/resources/" + dictionaryName); 161 } 162 163 return new BufferedInputStream(is); 164 }; 165 in = AccessController.doPrivileged(pa); 166 } 167 catch (PrivilegedActionException e) { 168 throw new InternalError(e.toString(), e); 169 } 170 171 byte[] buf = new byte[8]; 172 if (in.read(buf) != 8) { 173 throw new MissingResourceException("Wrong data length", 174 dictionaryName, ""); 175 } 176 177 // check version 178 int version = RuleBasedBreakIterator.getInt(buf, 0); 179 if (version != supportedVersion) { 180 throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", 181 dictionaryName, ""); 182 } 183 184 // get data size 185 int len = RuleBasedBreakIterator.getInt(buf, 4); 186 buf = new byte[len]; 187 if (in.read(buf) != len) { 188 throw new MissingResourceException("Wrong data length", 189 dictionaryName, ""); 190 } 191 192 // close the stream 193 in.close(); 194 195 int l; 196 int offset = 0; 197 198 // read in the column map for BMP characteres (this is serialized in 199 // its internal form: an index array followed by a data array) 200 l = RuleBasedBreakIterator.getInt(buf, offset); 201 offset += 4; 202 short[] temp = new short[l]; 203 for (int i = 0; i < l; i++, offset+=2) { 204 temp[i] = RuleBasedBreakIterator.getShort(buf, offset); 205 } 206 l = RuleBasedBreakIterator.getInt(buf, offset); 207 offset += 4; 208 byte[] temp2 = new byte[l]; 209 for (int i = 0; i < l; i++, offset++) { 210 temp2[i] = buf[offset]; 211 } 212 columnMap = new CompactByteArray(temp, temp2); 213 214 // read in numCols and numColGroups 215 numCols = RuleBasedBreakIterator.getInt(buf, offset); 216 offset += 4; 217 numColGroups = RuleBasedBreakIterator.getInt(buf, offset); 218 offset += 4; 219 220 // read in the row-number index 221 l = RuleBasedBreakIterator.getInt(buf, offset); 222 offset += 4; 223 rowIndex = new short[l]; 224 for (int i = 0; i < l; i++, offset+=2) { 225 rowIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); 226 } 227 228 // load in the populated-cells bitmap: index first, then bitmap list 229 l = RuleBasedBreakIterator.getInt(buf, offset); 230 offset += 4; 231 rowIndexFlagsIndex = new short[l]; 232 for (int i = 0; i < l; i++, offset+=2) { 233 rowIndexFlagsIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); 234 } 235 l = RuleBasedBreakIterator.getInt(buf, offset); 236 offset += 4; 237 rowIndexFlags = new int[l]; 238 for (int i = 0; i < l; i++, offset+=4) { 239 rowIndexFlags[i] = RuleBasedBreakIterator.getInt(buf, offset); 240 } 241 242 // load in the row-shift index 243 l = RuleBasedBreakIterator.getInt(buf, offset); 244 offset += 4; 245 rowIndexShifts = new byte[l]; 246 for (int i = 0; i < l; i++, offset++) { 247 rowIndexShifts[i] = buf[offset]; 248 } 249 250 // load in the actual state table 251 l = RuleBasedBreakIterator.getInt(buf, offset); 252 offset += 4; 253 table = new short[l]; 254 for (int i = 0; i < l; i++, offset+=2) { 255 table[i] = RuleBasedBreakIterator.getShort(buf, offset); 256 } 257 258 // finally, prepare the column map for supplementary characters 259 l = RuleBasedBreakIterator.getInt(buf, offset); 260 offset += 4; 261 int[] temp3 = new int[l]; 262 for (int i = 0; i < l; i++, offset+=4) { 263 temp3[i] = RuleBasedBreakIterator.getInt(buf, offset); 264 } 265 supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); 266 } 267 268 //========================================================================= 269 // access to the words 270 //========================================================================= 271 272 /** 273 * Uses the column map to map the character to a column number, then 274 * passes the row and column number to getNextState() 275 * @param row The current state 276 * @param ch The character whose column we're interested in 277 * @return The new state to transition to 278 */ 279 public final short getNextStateFromCharacter(int row, int ch) { 280 int col; 281 if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 282 col = columnMap.elementAt((char)ch); 283 } else { 284 col = supplementaryCharColumnMap.getValue(ch); 285 } 286 return getNextState(row, col); 287 } 288 289 /** 290 * Returns the value in the cell with the specified (logical) row and 291 * column numbers. In DictionaryBasedBreakIterator, the row number is 292 * a state number, the column number is an input, and the return value 293 * is the row number of the new state to transition to. (0 is the 294 * "error" state, and -1 is the "end of word" state in a dictionary) 295 * @param row The row number of the current state 296 * @param col The column number of the input character (0 means "not a 297 * dictionary character") 298 * @return The row number of the new state to transition to 299 */ 300 public final short getNextState(int row, int col) { 301 if (cellIsPopulated(row, col)) { 302 // we map from logical to physical row number by looking up the 303 // mapping in rowIndex; we map from logical column number to 304 // physical column number by looking up a shift value for this 305 // logical row and offsetting the logical column number by 306 // the shift amount. Then we can use internalAt() to actually 307 // get the value out of the table. 308 return internalAt(rowIndex[row], col + rowIndexShifts[row]); 309 } 310 else { 311 return 0; 312 } 313 } 314 315 /** 316 * Given (logical) row and column numbers, returns true if the 317 * cell in that position is populated 318 */ 319 private boolean cellIsPopulated(int row, int col) { 320 // look up the entry in the bitmap index for the specified row. 321 // If it's a negative number, it's the column number of the only 322 // populated cell in the row 323 if (rowIndexFlagsIndex[row] < 0) { 324 return col == -rowIndexFlagsIndex[row]; 325 } 326 327 // if it's a positive number, it's the offset of an entry in the bitmap 328 // list. If the table is more than 32 columns wide, the bitmap is stored 329 // successive entries in the bitmap list, so we have to divide the column 330 // number by 32 and offset the number we got out of the index by the result. 331 // Once we have the appropriate piece of the bitmap, test the appropriate 332 // bit and return the result. 333 else { 334 int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; 335 return (flags & (1 << (col & 0x1f))) != 0; 336 } 337 } 338 339 /** 340 * Implementation of getNextState() when we know the specified cell is 341 * populated. 342 * @param row The PHYSICAL row number of the cell 343 * @param col The PHYSICAL column number of the cell 344 * @return The value stored in the cell 345 */ 346 private short internalAt(int row, int col) { 347 // the table is a one-dimensional array, so this just does the math necessary 348 // to treat it as a two-dimensional array (we don't just use a two-dimensional 349 // array because two-dimensional arrays are inefficient in Java) 350 return table[row * numCols + col]; 351 } 352 }