1 /* 2 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 package sun.util.locale.provider; 41 42 import java.io.BufferedInputStream; 43 import java.io.IOException; 44 import java.security.AccessController; 45 import java.security.PrivilegedActionException; 46 import java.security.PrivilegedExceptionAction; 47 import java.util.MissingResourceException; 48 import sun.text.CompactByteArray; 49 import sun.text.SupplementaryCharacterData; 50 51 /** 52 * This is the class that represents the list of known words used by 53 * DictionaryBasedBreakIterator. The conceptual data structure used 54 * here is a trie: there is a node hanging off the root node for every 55 * letter that can start a word. Each of these nodes has a node hanging 56 * off of it for every letter that can be the second letter of a word 57 * if this node is the first letter, and so on. The trie is represented 58 * as a two-dimensional array that can be treated as a table of state 59 * transitions. Indexes are used to compress this array, taking 60 * advantage of the fact that this array will always be very sparse. 61 */ 62 class BreakDictionary { 63 64 //========================================================================= 65 // data members 66 //========================================================================= 67 68 /** 69 * The version of the dictionary that was read in. 70 */ 71 private static int supportedVersion = 1; 72 73 /** 74 * Maps from characters to column numbers. The main use of this is to 75 * avoid making room in the array for empty columns. 76 */ 77 private CompactByteArray columnMap = null; 78 private SupplementaryCharacterData supplementaryCharColumnMap = null; 79 80 /** 81 * The number of actual columns in the table 82 */ 83 private int numCols; 84 85 /** 86 * Columns are organized into groups of 32. This says how many 87 * column groups. (We could calculate this, but we store the 88 * value to avoid having to repeatedly calculate it.) 89 */ 90 private int numColGroups; 91 92 /** 93 * The actual compressed state table. Each conceptual row represents 94 * a state, and the cells in it contain the row numbers of the states 95 * to transition to for each possible letter. 0 is used to indicate 96 * an illegal combination of letters (i.e., the error state). The 97 * table is compressed by eliminating all the unpopulated (i.e., zero) 98 * cells. Multiple conceptual rows can then be doubled up in a single 99 * physical row by sliding them up and possibly shifting them to one 100 * side or the other so the populated cells don't collide. Indexes 101 * are used to identify unpopulated cells and to locate populated cells. 102 */ 103 private short[] table = null; 104 105 /** 106 * This index maps logical row numbers to physical row numbers 107 */ 108 private short[] rowIndex = null; 109 110 /** 111 * A bitmap is used to tell which cells in the comceptual table are 112 * populated. This array contains all the unique bit combinations 113 * in that bitmap. If the table is more than 32 columns wide, 114 * successive entries in this array are used for a single row. 115 */ 116 private int[] rowIndexFlags = null; 117 118 /** 119 * This index maps from a logical row number into the bitmap table above. 120 * (This keeps us from storing duplicate bitmap combinations.) Since there 121 * are a lot of rows with only one populated cell, instead of wasting space 122 * in the bitmap table, we just store a negative number in this index for 123 * rows with one populated cell. The absolute value of that number is 124 * the column number of the populated cell. 125 */ 126 private short[] rowIndexFlagsIndex = null; 127 128 /** 129 * For each logical row, this index contains a constant that is added to 130 * the logical column number to get the physical column number 131 */ 132 private byte[] rowIndexShifts = null; 133 134 //========================================================================= 135 // deserialization 136 //========================================================================= 137 138 BreakDictionary(String dictionaryName) 139 throws IOException, MissingResourceException { 140 141 readDictionaryFile(dictionaryName); 142 } 143 144 private void readDictionaryFile(final String dictionaryName) 145 throws IOException, MissingResourceException { 146 147 BufferedInputStream in; 148 try { 149 in = AccessController.doPrivileged( 150 new PrivilegedExceptionAction<BufferedInputStream>() { 151 @Override 152 public BufferedInputStream run() throws Exception { 153 return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + dictionaryName)); 154 } 155 } 156 ); 157 } 158 catch (PrivilegedActionException e) { 159 throw new InternalError(e.toString(), e); 160 } 161 162 byte[] buf = new byte[8]; 163 if (in.read(buf) != 8) { 164 throw new MissingResourceException("Wrong data length", 165 dictionaryName, ""); 166 } 167 168 // check version 169 int version = RuleBasedBreakIterator.getInt(buf, 0); 170 if (version != supportedVersion) { 171 throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", 172 dictionaryName, ""); 173 } 174 175 // get data size 176 int len = RuleBasedBreakIterator.getInt(buf, 4); 177 buf = new byte[len]; 178 if (in.read(buf) != len) { 179 throw new MissingResourceException("Wrong data length", 180 dictionaryName, ""); 181 } 182 183 // close the stream 184 in.close(); 185 186 int l; 187 int offset = 0; 188 189 // read in the column map for BMP characteres (this is serialized in 190 // its internal form: an index array followed by a data array) 191 l = RuleBasedBreakIterator.getInt(buf, offset); 192 offset += 4; 193 short[] temp = new short[l]; 194 for (int i = 0; i < l; i++, offset+=2) { 195 temp[i] = RuleBasedBreakIterator.getShort(buf, offset); 196 } 197 l = RuleBasedBreakIterator.getInt(buf, offset); 198 offset += 4; 199 byte[] temp2 = new byte[l]; 200 for (int i = 0; i < l; i++, offset++) { 201 temp2[i] = buf[offset]; 202 } 203 columnMap = new CompactByteArray(temp, temp2); 204 205 // read in numCols and numColGroups 206 numCols = RuleBasedBreakIterator.getInt(buf, offset); 207 offset += 4; 208 numColGroups = RuleBasedBreakIterator.getInt(buf, offset); 209 offset += 4; 210 211 // read in the row-number index 212 l = RuleBasedBreakIterator.getInt(buf, offset); 213 offset += 4; 214 rowIndex = new short[l]; 215 for (int i = 0; i < l; i++, offset+=2) { 216 rowIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); 217 } 218 219 // load in the populated-cells bitmap: index first, then bitmap list 220 l = RuleBasedBreakIterator.getInt(buf, offset); 221 offset += 4; 222 rowIndexFlagsIndex = new short[l]; 223 for (int i = 0; i < l; i++, offset+=2) { 224 rowIndexFlagsIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); 225 } 226 l = RuleBasedBreakIterator.getInt(buf, offset); 227 offset += 4; 228 rowIndexFlags = new int[l]; 229 for (int i = 0; i < l; i++, offset+=4) { 230 rowIndexFlags[i] = RuleBasedBreakIterator.getInt(buf, offset); 231 } 232 233 // load in the row-shift index 234 l = RuleBasedBreakIterator.getInt(buf, offset); 235 offset += 4; 236 rowIndexShifts = new byte[l]; 237 for (int i = 0; i < l; i++, offset++) { 238 rowIndexShifts[i] = buf[offset]; 239 } 240 241 // load in the actual state table 242 l = RuleBasedBreakIterator.getInt(buf, offset); 243 offset += 4; 244 table = new short[l]; 245 for (int i = 0; i < l; i++, offset+=2) { 246 table[i] = RuleBasedBreakIterator.getShort(buf, offset); 247 } 248 249 // finally, prepare the column map for supplementary characters 250 l = RuleBasedBreakIterator.getInt(buf, offset); 251 offset += 4; 252 int[] temp3 = new int[l]; 253 for (int i = 0; i < l; i++, offset+=4) { 254 temp3[i] = RuleBasedBreakIterator.getInt(buf, offset); 255 } 256 supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); 257 } 258 259 //========================================================================= 260 // access to the words 261 //========================================================================= 262 263 /** 264 * Uses the column map to map the character to a column number, then 265 * passes the row and column number to getNextState() 266 * @param row The current state 267 * @param ch The character whose column we're interested in 268 * @return The new state to transition to 269 */ 270 public final short getNextStateFromCharacter(int row, int ch) { 271 int col; 272 if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 273 col = columnMap.elementAt((char)ch); 274 } else { 275 col = supplementaryCharColumnMap.getValue(ch); 276 } 277 return getNextState(row, col); 278 } 279 280 /** 281 * Returns the value in the cell with the specified (logical) row and 282 * column numbers. In DictionaryBasedBreakIterator, the row number is 283 * a state number, the column number is an input, and the return value 284 * is the row number of the new state to transition to. (0 is the 285 * "error" state, and -1 is the "end of word" state in a dictionary) 286 * @param row The row number of the current state 287 * @param col The column number of the input character (0 means "not a 288 * dictionary character") 289 * @return The row number of the new state to transition to 290 */ 291 public final short getNextState(int row, int col) { 292 if (cellIsPopulated(row, col)) { 293 // we map from logical to physical row number by looking up the 294 // mapping in rowIndex; we map from logical column number to 295 // physical column number by looking up a shift value for this 296 // logical row and offsetting the logical column number by 297 // the shift amount. Then we can use internalAt() to actually 298 // get the value out of the table. 299 return internalAt(rowIndex[row], col + rowIndexShifts[row]); 300 } 301 else { 302 return 0; 303 } 304 } 305 306 /** 307 * Given (logical) row and column numbers, returns true if the 308 * cell in that position is populated 309 */ 310 private boolean cellIsPopulated(int row, int col) { 311 // look up the entry in the bitmap index for the specified row. 312 // If it's a negative number, it's the column number of the only 313 // populated cell in the row 314 if (rowIndexFlagsIndex[row] < 0) { 315 return col == -rowIndexFlagsIndex[row]; 316 } 317 318 // if it's a positive number, it's the offset of an entry in the bitmap 319 // list. If the table is more than 32 columns wide, the bitmap is stored 320 // successive entries in the bitmap list, so we have to divide the column 321 // number by 32 and offset the number we got out of the index by the result. 322 // Once we have the appropriate piece of the bitmap, test the appropriate 323 // bit and return the result. 324 else { 325 int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; 326 return (flags & (1 << (col & 0x1f))) != 0; 327 } 328 } 329 330 /** 331 * Implementation of getNextState() when we know the specified cell is 332 * populated. 333 * @param row The PHYSICAL row number of the cell 334 * @param col The PHYSICAL column number of the cell 335 * @return The value stored in the cell 336 */ 337 private short internalAt(int row, int col) { 338 // the table is a one-dimensional array, so this just does the math necessary 339 // to treat it as a two-dimensional array (we don't just use a two-dimensional 340 // array because two-dimensional arrays are inefficient in Java) 341 return table[row * numCols + col]; 342 } 343 }