1 /* 2 * Copyright (c) 1999, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 package java.text; 41 42 import java.io.*; 43 import java.security.AccessController; 44 import java.security.PrivilegedActionException; 45 import java.security.PrivilegedExceptionAction; 46 import java.util.MissingResourceException; 47 import sun.text.CompactByteArray; 48 import sun.text.SupplementaryCharacterData; 49 50 /** 51 * This is the class that represents the list of known words used by 52 * DictionaryBasedBreakIterator. The conceptual data structure used 53 * here is a trie: there is a node hanging off the root node for every 54 * letter that can start a word. Each of these nodes has a node hanging 55 * off of it for every letter that can be the second letter of a word 56 * if this node is the first letter, and so on. The trie is represented 57 * as a two-dimensional array that can be treated as a table of state 58 * transitions. Indexes are used to compress this array, taking 59 * advantage of the fact that this array will always be very sparse. 60 */ 61 class BreakDictionary { 62 63 //========================================================================= 64 // data members 65 //========================================================================= 66 67 /** 68 * The version of the dictionary that was read in. 69 */ 70 private static int supportedVersion = 1; 71 72 /** 73 * Maps from characters to column numbers. The main use of this is to 74 * avoid making room in the array for empty columns. 75 */ 76 private CompactByteArray columnMap = null; 77 private SupplementaryCharacterData supplementaryCharColumnMap = null; 78 79 /** 80 * The number of actual columns in the table 81 */ 82 private int numCols; 83 84 /** 85 * Columns are organized into groups of 32. This says how many 86 * column groups. (We could calculate this, but we store the 87 * value to avoid having to repeatedly calculate it.) 88 */ 89 private int numColGroups; 90 91 /** 92 * The actual compressed state table. Each conceptual row represents 93 * a state, and the cells in it contain the row numbers of the states 94 * to transition to for each possible letter. 0 is used to indicate 95 * an illegal combination of letters (i.e., the error state). The 96 * table is compressed by eliminating all the unpopulated (i.e., zero) 97 * cells. Multiple conceptual rows can then be doubled up in a single 98 * physical row by sliding them up and possibly shifting them to one 99 * side or the other so the populated cells don't collide. Indexes 100 * are used to identify unpopulated cells and to locate populated cells. 101 */ 102 private short[] table = null; 103 104 /** 105 * This index maps logical row numbers to physical row numbers 106 */ 107 private short[] rowIndex = null; 108 109 /** 110 * A bitmap is used to tell which cells in the comceptual table are 111 * populated. This array contains all the unique bit combinations 112 * in that bitmap. If the table is more than 32 columns wide, 113 * successive entries in this array are used for a single row. 114 */ 115 private int[] rowIndexFlags = null; 116 117 /** 118 * This index maps from a logical row number into the bitmap table above. 119 * (This keeps us from storing duplicate bitmap combinations.) Since there 120 * are a lot of rows with only one populated cell, instead of wasting space 121 * in the bitmap table, we just store a negative number in this index for 122 * rows with one populated cell. The absolute value of that number is 123 * the column number of the populated cell. 124 */ 125 private short[] rowIndexFlagsIndex = null; 126 127 /** 128 * For each logical row, this index contains a constant that is added to 129 * the logical column number to get the physical column number 130 */ 131 private byte[] rowIndexShifts = null; 132 133 //========================================================================= 134 // deserialization 135 //========================================================================= 136 137 public BreakDictionary(String dictionaryName) 138 throws IOException, MissingResourceException { 139 140 readDictionaryFile(dictionaryName); 141 } 142 143 private void readDictionaryFile(final String dictionaryName) 144 throws IOException, MissingResourceException { 145 146 BufferedInputStream in; 147 try { 148 in = AccessController.doPrivileged( 149 new PrivilegedExceptionAction<BufferedInputStream>() { 150 public BufferedInputStream run() throws Exception { 151 return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + dictionaryName)); 152 } 153 } 154 ); 155 } 156 catch (PrivilegedActionException e) { 157 throw new InternalError(e.toString(), e); 158 } 159 160 byte[] buf = new byte[8]; 161 if (in.read(buf) != 8) { 162 throw new MissingResourceException("Wrong data length", 163 dictionaryName, ""); 164 } 165 166 // check vesion 167 int version = BreakIterator.getInt(buf, 0); 168 if (version != supportedVersion) { 169 throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", 170 dictionaryName, ""); 171 } 172 173 // get data size 174 int len = BreakIterator.getInt(buf, 4); 175 buf = new byte[len]; 176 if (in.read(buf) != len) { 177 throw new MissingResourceException("Wrong data length", 178 dictionaryName, ""); 179 } 180 181 // close the stream 182 in.close(); 183 184 int l; 185 int offset = 0; 186 187 // read in the column map for BMP characteres (this is serialized in 188 // its internal form: an index array followed by a data array) 189 l = BreakIterator.getInt(buf, offset); 190 offset += 4; 191 short[] temp = new short[l]; 192 for (int i = 0; i < l; i++, offset+=2) { 193 temp[i] = BreakIterator.getShort(buf, offset); 194 } 195 l = BreakIterator.getInt(buf, offset); 196 offset += 4; 197 byte[] temp2 = new byte[l]; 198 for (int i = 0; i < l; i++, offset++) { 199 temp2[i] = buf[offset]; 200 } 201 columnMap = new CompactByteArray(temp, temp2); 202 203 // read in numCols and numColGroups 204 numCols = BreakIterator.getInt(buf, offset); 205 offset += 4; 206 numColGroups = BreakIterator.getInt(buf, offset); 207 offset += 4; 208 209 // read in the row-number index 210 l = BreakIterator.getInt(buf, offset); 211 offset += 4; 212 rowIndex = new short[l]; 213 for (int i = 0; i < l; i++, offset+=2) { 214 rowIndex[i] = BreakIterator.getShort(buf, offset); 215 } 216 217 // load in the populated-cells bitmap: index first, then bitmap list 218 l = BreakIterator.getInt(buf, offset); 219 offset += 4; 220 rowIndexFlagsIndex = new short[l]; 221 for (int i = 0; i < l; i++, offset+=2) { 222 rowIndexFlagsIndex[i] = BreakIterator.getShort(buf, offset); 223 } 224 l = BreakIterator.getInt(buf, offset); 225 offset += 4; 226 rowIndexFlags = new int[l]; 227 for (int i = 0; i < l; i++, offset+=4) { 228 rowIndexFlags[i] = BreakIterator.getInt(buf, offset); 229 } 230 231 // load in the row-shift index 232 l = BreakIterator.getInt(buf, offset); 233 offset += 4; 234 rowIndexShifts = new byte[l]; 235 for (int i = 0; i < l; i++, offset++) { 236 rowIndexShifts[i] = buf[offset]; 237 } 238 239 // load in the actual state table 240 l = BreakIterator.getInt(buf, offset); 241 offset += 4; 242 table = new short[l]; 243 for (int i = 0; i < l; i++, offset+=2) { 244 table[i] = BreakIterator.getShort(buf, offset); 245 } 246 247 // finally, prepare the column map for supplementary characters 248 l = BreakIterator.getInt(buf, offset); 249 offset += 4; 250 int[] temp3 = new int[l]; 251 for (int i = 0; i < l; i++, offset+=4) { 252 temp3[i] = BreakIterator.getInt(buf, offset); 253 } 254 supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); 255 } 256 257 //========================================================================= 258 // access to the words 259 //========================================================================= 260 261 /** 262 * Uses the column map to map the character to a column number, then 263 * passes the row and column number to getNextState() 264 * @param row The current state 265 * @param ch The character whose column we're interested in 266 * @return The new state to transition to 267 */ 268 public final short getNextStateFromCharacter(int row, int ch) { 269 int col; 270 if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 271 col = columnMap.elementAt((char)ch); 272 } else { 273 col = supplementaryCharColumnMap.getValue(ch); 274 } 275 return getNextState(row, col); 276 } 277 278 /** 279 * Returns the value in the cell with the specified (logical) row and 280 * column numbers. In DictionaryBasedBreakIterator, the row number is 281 * a state number, the column number is an input, and the return value 282 * is the row number of the new state to transition to. (0 is the 283 * "error" state, and -1 is the "end of word" state in a dictionary) 284 * @param row The row number of the current state 285 * @param col The column number of the input character (0 means "not a 286 * dictionary character") 287 * @return The row number of the new state to transition to 288 */ 289 public final short getNextState(int row, int col) { 290 if (cellIsPopulated(row, col)) { 291 // we map from logical to physical row number by looking up the 292 // mapping in rowIndex; we map from logical column number to 293 // physical column number by looking up a shift value for this 294 // logical row and offsetting the logical column number by 295 // the shift amount. Then we can use internalAt() to actually 296 // get the value out of the table. 297 return internalAt(rowIndex[row], col + rowIndexShifts[row]); 298 } 299 else { 300 return 0; 301 } 302 } 303 304 /** 305 * Given (logical) row and column numbers, returns true if the 306 * cell in that position is populated 307 */ 308 private final boolean cellIsPopulated(int row, int col) { 309 // look up the entry in the bitmap index for the specified row. 310 // If it's a negative number, it's the column number of the only 311 // populated cell in the row 312 if (rowIndexFlagsIndex[row] < 0) { 313 return col == -rowIndexFlagsIndex[row]; 314 } 315 316 // if it's a positive number, it's the offset of an entry in the bitmap 317 // list. If the table is more than 32 columns wide, the bitmap is stored 318 // successive entries in the bitmap list, so we have to divide the column 319 // number by 32 and offset the number we got out of the index by the result. 320 // Once we have the appropriate piece of the bitmap, test the appropriate 321 // bit and return the result. 322 else { 323 int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; 324 return (flags & (1 << (col & 0x1f))) != 0; 325 } 326 } 327 328 /** 329 * Implementation of getNextState() when we know the specified cell is 330 * populated. 331 * @param row The PHYSICAL row number of the cell 332 * @param col The PHYSICAL column number of the cell 333 * @return The value stored in the cell 334 */ 335 private final short internalAt(int row, int col) { 336 // the table is a one-dimensional array, so this just does the math necessary 337 // to treat it as a two-dimensional array (we don't just use a two-dimensional 338 // array because two-dimensional arrays are inefficient in Java) 339 return table[row * numCols + col]; 340 } 341 }