1 /*
   2  * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *
  28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  29  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  30  *
  31  * The original version of this source code and documentation
  32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  33  * subsidiary of IBM. These materials are provided under terms
  34  * of a License Agreement between Taligent and Sun. This technology
  35  * is protected by multiple US and International patents.
  36  *
  37  * This notice and attribution to Taligent may not be removed.
  38  * Taligent is a registered trademark of Taligent, Inc.
  39  */
  40 package sun.text;
  41 
  42 import java.nio.BufferUnderflowException;
  43 import java.nio.ByteBuffer;
  44 import java.util.MissingResourceException;
  45 import sun.text.CompactByteArray;
  46 import sun.text.SupplementaryCharacterData;
  47 
  48 /**
  49  * This is the class that represents the list of known words used by
  50  * DictionaryBasedBreakIterator.  The conceptual data structure used
  51  * here is a trie: there is a node hanging off the root node for every
  52  * letter that can start a word.  Each of these nodes has a node hanging
  53  * off of it for every letter that can be the second letter of a word
  54  * if this node is the first letter, and so on.  The trie is represented
  55  * as a two-dimensional array that can be treated as a table of state
  56  * transitions.  Indexes are used to compress this array, taking
  57  * advantage of the fact that this array will always be very sparse.
  58  */
  59 class BreakDictionary {
  60 
  61     //=========================================================================
  62     // data members
  63     //=========================================================================
  64 
  65     /**
  66       * The version of the dictionary that was read in.
  67       */
  68     private static int supportedVersion = 1;
  69 
  70     /**
  71      * Maps from characters to column numbers.  The main use of this is to
  72      * avoid making room in the array for empty columns.
  73      */
  74     private CompactByteArray columnMap = null;
  75     private SupplementaryCharacterData supplementaryCharColumnMap = null;
  76 
  77     /**
  78      * The number of actual columns in the table
  79      */
  80     private int numCols;
  81 
  82     /**
  83      * Columns are organized into groups of 32.  This says how many
  84      * column groups.  (We could calculate this, but we store the
  85      * value to avoid having to repeatedly calculate it.)
  86      */
  87     private int numColGroups;
  88 
  89     /**
  90      * The actual compressed state table.  Each conceptual row represents
  91      * a state, and the cells in it contain the row numbers of the states
  92      * to transition to for each possible letter.  0 is used to indicate
  93      * an illegal combination of letters (i.e., the error state).  The
  94      * table is compressed by eliminating all the unpopulated (i.e., zero)
  95      * cells.  Multiple conceptual rows can then be doubled up in a single
  96      * physical row by sliding them up and possibly shifting them to one
  97      * side or the other so the populated cells don't collide.  Indexes
  98      * are used to identify unpopulated cells and to locate populated cells.
  99      */
 100     private short[] table = null;
 101 
 102     /**
 103      * This index maps logical row numbers to physical row numbers
 104      */
 105     private short[] rowIndex = null;
 106 
 107     /**
 108      * A bitmap is used to tell which cells in the comceptual table are
 109      * populated.  This array contains all the unique bit combinations
 110      * in that bitmap.  If the table is more than 32 columns wide,
 111      * successive entries in this array are used for a single row.
 112      */
 113     private int[] rowIndexFlags = null;
 114 
 115     /**
 116      * This index maps from a logical row number into the bitmap table above.
 117      * (This keeps us from storing duplicate bitmap combinations.)  Since there
 118      * are a lot of rows with only one populated cell, instead of wasting space
 119      * in the bitmap table, we just store a negative number in this index for
 120      * rows with one populated cell.  The absolute value of that number is
 121      * the column number of the populated cell.
 122      */
 123     private short[] rowIndexFlagsIndex = null;
 124 
 125     /**
 126      * For each logical row, this index contains a constant that is added to
 127      * the logical column number to get the physical column number
 128      */
 129     private byte[] rowIndexShifts = null;
 130 
 131     //=========================================================================
 132     // deserialization
 133     //=========================================================================
 134 
 135     BreakDictionary(String dictionaryName, byte[] dictionaryData) {
 136         try {
 137             setupDictionary(dictionaryName, dictionaryData);
 138         } catch (BufferUnderflowException bue) {
 139             MissingResourceException e;
 140             e = new MissingResourceException("Corrupted dictionary data",
 141                                              dictionaryName, "");
 142             e.initCause(bue);
 143             throw e;
 144         }
 145     }
 146 
 147     private void setupDictionary(String dictionaryName, byte[] dictionaryData) {
 148         ByteBuffer bb = ByteBuffer.wrap(dictionaryData);
 149 
 150         // check version
 151         int version = bb.getInt();
 152         if (version != supportedVersion) {
 153             throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
 154                                                dictionaryName, "");
 155         }
 156 
 157         // Check data size
 158         int len = bb.getInt();
 159         if (bb.position() + len != bb.limit()) {
 160             throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(),
 161                                                dictionaryName, "");
 162         }
 163 
 164         // read in the column map for BMP characteres (this is serialized in
 165         // its internal form: an index array followed by a data array)
 166         len = bb.getInt();
 167         short[] temp = new short[len];
 168         for (int i = 0; i < len; i++) {
 169             temp[i] = bb.getShort();
 170         }
 171         len = bb.getInt();
 172         byte[] temp2 = new byte[len];
 173         bb.get(temp2);
 174         columnMap = new CompactByteArray(temp, temp2);
 175 
 176         // read in numCols and numColGroups
 177         numCols = bb.getInt();
 178         numColGroups = bb.getInt();
 179 
 180         // read in the row-number index
 181         len = bb.getInt();
 182         rowIndex = new short[len];
 183         for (int i = 0; i < len; i++) {
 184             rowIndex[i] = bb.getShort();
 185         }
 186 
 187         // load in the populated-cells bitmap: index first, then bitmap list
 188         len = bb.getInt();
 189         rowIndexFlagsIndex = new short[len];
 190         for (int i = 0; i < len; i++) {
 191             rowIndexFlagsIndex[i] = bb.getShort();
 192         }
 193         len = bb.getInt();
 194         rowIndexFlags = new int[len];
 195         for (int i = 0; i < len; i++) {
 196             rowIndexFlags[i] = bb.getInt();
 197         }
 198 
 199         // load in the row-shift index
 200         len = bb.getInt();
 201         rowIndexShifts = new byte[len];
 202         bb.get(rowIndexShifts);
 203 
 204         // load in the actual state table
 205         len = bb.getInt();
 206         table = new short[len];
 207         for (int i = 0; i < len; i++) {
 208             table[i] = bb.getShort();
 209         }
 210 
 211         // finally, prepare the column map for supplementary characters
 212         len = bb.getInt();
 213         int[] temp3 = new int[len];
 214         for (int i = 0; i < len; i++) {
 215             temp3[i] = bb.getInt();
 216         }
 217         assert bb.position() == bb.limit();
 218 
 219         supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
 220     }
 221 
 222     //=========================================================================
 223     // access to the words
 224     //=========================================================================
 225 
 226     /**
 227      * Uses the column map to map the character to a column number, then
 228      * passes the row and column number to getNextState()
 229      * @param row The current state
 230      * @param ch The character whose column we're interested in
 231      * @return The new state to transition to
 232      */
 233     public final short getNextStateFromCharacter(int row, int ch) {
 234         int col;
 235         if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
 236             col = columnMap.elementAt((char)ch);
 237         } else {
 238             col = supplementaryCharColumnMap.getValue(ch);
 239         }
 240         return getNextState(row, col);
 241     }
 242 
 243     /**
 244      * Returns the value in the cell with the specified (logical) row and
 245      * column numbers.  In DictionaryBasedBreakIterator, the row number is
 246      * a state number, the column number is an input, and the return value
 247      * is the row number of the new state to transition to.  (0 is the
 248      * "error" state, and -1 is the "end of word" state in a dictionary)
 249      * @param row The row number of the current state
 250      * @param col The column number of the input character (0 means "not a
 251      * dictionary character")
 252      * @return The row number of the new state to transition to
 253      */
 254     public final short getNextState(int row, int col) {
 255         if (cellIsPopulated(row, col)) {
 256             // we map from logical to physical row number by looking up the
 257             // mapping in rowIndex; we map from logical column number to
 258             // physical column number by looking up a shift value for this
 259             // logical row and offsetting the logical column number by
 260             // the shift amount.  Then we can use internalAt() to actually
 261             // get the value out of the table.
 262             return internalAt(rowIndex[row], col + rowIndexShifts[row]);
 263         }
 264         else {
 265             return 0;
 266         }
 267     }
 268 
 269     /**
 270      * Given (logical) row and column numbers, returns true if the
 271      * cell in that position is populated
 272      */
 273     private boolean cellIsPopulated(int row, int col) {
 274         // look up the entry in the bitmap index for the specified row.
 275         // If it's a negative number, it's the column number of the only
 276         // populated cell in the row
 277         if (rowIndexFlagsIndex[row] < 0) {
 278             return col == -rowIndexFlagsIndex[row];
 279         }
 280 
 281         // if it's a positive number, it's the offset of an entry in the bitmap
 282         // list.  If the table is more than 32 columns wide, the bitmap is stored
 283         // successive entries in the bitmap list, so we have to divide the column
 284         // number by 32 and offset the number we got out of the index by the result.
 285         // Once we have the appropriate piece of the bitmap, test the appropriate
 286         // bit and return the result.
 287         else {
 288             int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
 289             return (flags & (1 << (col & 0x1f))) != 0;
 290         }
 291     }
 292 
 293     /**
 294      * Implementation of getNextState() when we know the specified cell is
 295      * populated.
 296      * @param row The PHYSICAL row number of the cell
 297      * @param col The PHYSICAL column number of the cell
 298      * @return The value stored in the cell
 299      */
 300     private short internalAt(int row, int col) {
 301         // the table is a one-dimensional array, so this just does the math necessary
 302         // to treat it as a two-dimensional array (we don't just use a two-dimensional
 303         // array because two-dimensional arrays are inefficient in Java)
 304         return table[row * numCols + col];
 305     }
 306 }