Old jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerDataReader.java

   1 /*
   2  * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
  28  *                                                                             *
  29  * The original version of this source code and documentation is copyrighted   *
  30  * and owned by IBM, These materials are provided under terms of a License     *
  31  * Agreement between IBM and Sun. This technology is protected by multiple     *
  32  * US and International patents. This notice and attribution to IBM may not    *
  33  * to removed.                                                                 *
  34  *******************************************************************************
  35  */
  36 
  37 package sun.text.normalizer;
  38 
  39 import java.io.DataInputStream;
  40 import java.io.InputStream;
  41 import java.io.IOException;
  42 
  43 /**
  44  * @author        Ram Viswanadha
  45  */
  46 
  47     /*
  48      * Description of the format of unorm.icu version 2.1.
  49      *
  50      * Main change from version 1 to version 2:
  51      * Use of new, common Trie instead of normalization-specific tries.
  52      * Change to version 2.1: add third/auxiliary trie with associated data.
  53      *
  54      * For more details of how to use the data structures see the code
  55      * in unorm.cpp (runtime normalization code) and
  56      * in gennorm.c and gennorm/store.c (build-time data generation).
  57      *
  58      * For the serialized format of Trie see Trie.c/TrieHeader.
  59      *
  60      * - Overall partition
  61      *
  62      * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
  63      * After that there are the following structures:
  64      *
  65      * char indexes[INDEX_TOP];                   -- INDEX_TOP=32, see enum in this file
  66      *
  67      * Trie normTrie;                           -- size in bytes=indexes[INDEX_TRIE_SIZE]
  68      *
  69      * char extraData[extraDataTop];            -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
  70      *                                                 extraData[0] contains the number of units for
  71      *                                                 FC_NFKC_Closure (formatVersion>=2.1)
  72      *
  73      * char combiningTable[combiningTableTop];  -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
  74      *                                                 combiningTableTop may include one 16-bit padding unit
  75      *                                                 to make sure that fcdTrie is 32-bit-aligned
  76      *
  77      * Trie fcdTrie;                            -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
  78      *
  79      * Trie auxTrie;                            -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
  80      *
  81      *
  82      * The indexes array contains lengths and sizes of the following arrays and structures
  83      * as well as the following values:
  84      *  indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
  85      *      -- one more than the highest combining index computed for forward-only-combining characters
  86      *  indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
  87      *      -- number of combining indexes computed for both-ways-combining characters
  88      *  indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
  89      *      -- number of combining indexes computed for backward-only-combining characters
  90      *
  91      *  indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
  92      *      -- first code point with a quick check NF* value of NO/MAYBE
  93      *
  94      *
  95      * - Tries
  96      *
  97      * The main structures are two Trie tables ("compact arrays"),
  98      * each with one index array and one data array.
  99      * See Trie.h and Trie.c.
 100      *
 101      *
 102      * - Tries in unorm.icu
 103      *
 104      * The first trie (normTrie above)
 105      * provides data for the NF* quick checks and normalization.
 106      * The second trie (fcdTrie above) provides data just for FCD checks.
 107      *
 108      *
 109      * - norm32 data words from the first trie
 110      *
 111      * The norm32Table contains one 32-bit word "norm32" per code point.
 112      * It contains the following bit fields:
 113      * 31..16   extra data index, EXTRA_SHIFT is used to shift this field down
 114      *          if this index is <EXTRA_INDEX_TOP then it is an index into
 115      *              extraData[] where variable-length normalization data for this
 116      *              code point is found
 117      *          if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
 118      *              then this is a norm32 for a leading surrogate, and the index
 119      *              value is used together with the following trailing surrogate
 120      *              code unit in the second trie access
 121      *          if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
 122      *              then this is a norm32 for a "special" character,
 123      *              i.e., the character is a Hangul syllable or a Jamo
 124      *              see EXTRA_HANGUL etc.
 125      *          generally, instead of extracting this index from the norm32 and
 126      *              comparing it with the above constants,
 127      *              the normalization code compares the entire norm32 value
 128      *              with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
 129      *
 130      * 15..8    combining class (cc) according to UnicodeData.txt
 131      *
 132      *  7..6    COMBINES_ANY flags, used in composition to see if a character
 133      *              combines with any following or preceding character(s)
 134      *              at all
 135      *     7    COMBINES_BACK
 136      *     6    COMBINES_FWD
 137      *
 138      *  5..0    quick check flags, set for "no" or "maybe", with separate flags for
 139      *              each normalization form
 140      *              the higher bits are "maybe" flags; for NF*D there are no such flags
 141      *              the lower bits are "no" flags for all forms, in the same order
 142      *              as the "maybe" flags,
 143      *              which is (MSB to LSB): NFKD NFD NFKC NFC
 144      *  5..4    QC_ANY_MAYBE
 145      *  3..0    QC_ANY_NO
 146      *              see further related constants
 147      *
 148      *
 149      * - Extra data per code point
 150      *
 151      * "Extra data" is referenced by the index in norm32.
 152      * It is variable-length data. It is only present, and only those parts
 153      * of it are, as needed for a given character.
 154      * The norm32 extra data index is added to the beginning of extraData[]
 155      * to get to a vector of 16-bit words with data at the following offsets:
 156      *
 157      * [-1]     Combining index for composition.
 158      *              Stored only if norm32&COMBINES_ANY .
 159      * [0]      Lengths of the canonical and compatibility decomposition strings.
 160      *              Stored only if there are decompositions, i.e.,
 161      *              if norm32&(QC_NFD|QC_NFKD)
 162      *          High byte: length of NFKD, or 0 if none
 163      *          Low byte: length of NFD, or 0 if none
 164      *          Each length byte also has another flag:
 165      *              Bit 7 of a length byte is set if there are non-zero
 166      *              combining classes (cc's) associated with the respective
 167      *              decomposition. If this flag is set, then the decomposition
 168      *              is preceded by a 16-bit word that contains the
 169      *              leading and trailing cc's.
 170      *              Bits 6..0 of a length byte are the length of the
 171      *              decomposition string, not counting the cc word.
 172      * [1..n]   NFD
 173      * [n+1..]  NFKD
 174      *
 175      * Each of the two decompositions consists of up to two parts:
 176      * - The 16-bit words with the leading and trailing cc's.
 177      *   This is only stored if bit 7 of the corresponding length byte
 178      *   is set. In this case, at least one of the cc's is not zero.
 179      *   High byte: leading cc==cc of the first code point in the decomposition string
 180      *   Low byte: trailing cc==cc of the last code point in the decomposition string
 181      * - The decomposition string in UTF-16, with length code units.
 182      *
 183      *
 184      * - Combining indexes and combiningTable[]
 185      *
 186      * Combining indexes are stored at the [-1] offset of the extra data
 187      * if the character combines forward or backward with any other characters.
 188      * They are used for (re)composition in NF*C.
 189      * Values of combining indexes are arranged according to whether a character
 190      * combines forward, backward, or both ways:
 191      *    forward-only < both ways < backward-only
 192      *
 193      * The index values for forward-only and both-ways combining characters
 194      * are indexes into the combiningTable[].
 195      * The index values for backward-only combining characters are simply
 196      * incremented from the preceding index values to be unique.
 197      *
 198      * In the combiningTable[], a variable-length list
 199      * of variable-length (back-index, code point) pair entries is stored
 200      * for each forward-combining character.
 201      *
 202      * These back-indexes are the combining indexes of both-ways or backward-only
 203      * combining characters that the forward-combining character combines with.
 204      *
 205      * Each list is sorted in ascending order of back-indexes.
 206      * Each list is terminated with the last back-index having bit 15 set.
 207      *
 208      * Each pair (back-index, code point) takes up either 2 or 3
 209      * 16-bit words.
 210      * The first word of a list entry is the back-index, with its bit 15 set if
 211      * this is the last pair in the list.
 212      *
 213      * The second word contains flags in bits 15..13 that determine
 214      * if there is a third word and how the combined character is encoded:
 215      * 15   set if there is a third word in this list entry
 216      * 14   set if the result is a supplementary character
 217      * 13   set if the result itself combines forward
 218      *
 219      * According to these bits 15..14 of the second word,
 220      * the result character is encoded as follows:
 221      * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
 222      *          the second word.
 223      * 10       The result is 0x2000..0xffff and stored in the third word.
 224      *          Bits 12..0 of the second word are not used.
 225      * 11       The result is a supplementary character.
 226      *          Bits 9..0 of the leading surrogate are in bits 9..0 of
 227      *          the second word.
 228      *          Add 0xd800 to these bits to get the complete surrogate.
 229      *          Bits 12..10 of the second word are not used.
 230      *          The trailing surrogate is stored in the third word.
 231      *
 232      *
 233      * - FCD trie
 234      *
 235      * The FCD trie is very simple.
 236      * It is a folded trie with 16-bit data words.
 237      * In each word, the high byte contains the leading cc of the character,
 238      * and the low byte contains the trailing cc of the character.
 239      * These cc's are the cc's of the first and last code points in the
 240      * canonical decomposition of the character.
 241      *
 242      * Since all 16 bits are used for cc's, lead surrogates must be tested
 243      * by checking the code unit instead of the trie data.
 244      * This is done only if the 16-bit data word is not zero.
 245      * If the code unit is a leading surrogate and the data word is not zero,
 246      * then instead of cc's it contains the offset for the second trie lookup.
 247      *
 248      *
 249      * - Auxiliary trie and data
 250      *
 251      *
 252      * The auxiliary 16-bit trie contains data for additional properties.
 253      * Bits
 254      * 15..13   reserved
 255      *     12   not NFC_Skippable (f) (formatVersion>=2.2)
 256      *     11   flag: not a safe starter for canonical closure
 257      *     10   composition exclusion
 258      *  9.. 0   index into extraData[] to FC_NFKC_Closure string
 259      *          (not for lead surrogate),
 260      *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
 261      *
 262      * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
 263      * (used in NormalizerTransliterator)
 264      *
 265      * A skippable character is
 266      * a) unassigned, or ALL of the following:
 267      * b) of combining class 0.
 268      * c) not decomposed by this normalization form.
 269      * AND if NFC or NFKC,
 270      * d) can never compose with a previous character.
 271      * e) can never compose with a following character.
 272      * f) can never change if another character is added.
 273      *    Example: a-breve might satisfy all but f, but if you
 274      *    add an ogonek it changes to a-ogonek + breve
 275      *
 276      * a)..e) must be tested from norm32.
 277      * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
 278      * into the auxiliary trie.
 279      * The same bit is used for NFC and NFKC; (c) differs for them.
 280      * As usual, we build the "not skippable" flags so that unassigned
 281      * code points get a 0 bit.
 282      * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
 283      * Test Hangul LV syllables entirely in code.
 284      *
 285      *
 286      * - FC_NFKC_Closure strings in extraData[]
 287      *
 288      * Strings are either stored as a single code unit or as the length
 289      * followed by that many units.
 290      *
 291      */
 292 final class NormalizerDataReader implements ICUBinary.Authenticate {
 293 
 294    /**
 295     * <p>Protected constructor.</p>
 296     * @param inputStream ICU uprop.dat file input stream
 297     * @exception IOException throw if data file fails authentication
 298     * @draft 2.1
 299     */
 300     protected NormalizerDataReader(InputStream inputStream)
 301                                         throws IOException{
 302 
 303         unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
 304         dataInputStream = new DataInputStream(inputStream);
 305     }
 306 
 307     // protected methods -------------------------------------------------
 308 
 309     protected int[] readIndexes(int length)throws IOException{
 310         int[] indexes = new int[length];
 311         //Read the indexes
 312         for (int i = 0; i <length ; i++) {
 313              indexes[i] = dataInputStream.readInt();
 314         }
 315         return indexes;
 316     }
 317     /**
 318     * <p>Reads unorm.icu, parse it into blocks of data to be stored in
 319     * NormalizerImpl.</P
 320     * @param normBytes
 321     * @param fcdBytes
 322     * @param auxBytes
 323     * @param extraData
 324     * @param combiningTable
 325     * @exception thrown when data reading fails
 326     * @draft 2.1
 327     */
 328     protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
 329                         char[] extraData, char[] combiningTable)
 330                         throws IOException{
 331 
 332          //Read the bytes that make up the normTrie
 333          dataInputStream.readFully(normBytes);
 334 
 335          //normTrieStream= new ByteArrayInputStream(normBytes);
 336 
 337          //Read the extra data
 338          for(int i=0;i<extraData.length;i++){
 339              extraData[i]=dataInputStream.readChar();
 340          }
 341 
 342          //Read the combining class table
 343          for(int i=0; i<combiningTable.length; i++){
 344              combiningTable[i]=dataInputStream.readChar();
 345          }
 346 
 347          //Read the fcdTrie
 348          dataInputStream.readFully(fcdBytes);
 349 
 350 
 351          //Read the AuxTrie
 352         dataInputStream.readFully(auxBytes);
 353     }
 354 
 355     public byte[] getDataFormatVersion(){
 356         return DATA_FORMAT_VERSION;
 357     }
 358 
 359     public boolean isDataVersionAcceptable(byte version[])
 360     {
 361         return version[0] == DATA_FORMAT_VERSION[0]
 362                && version[2] == DATA_FORMAT_VERSION[2]
 363                && version[3] == DATA_FORMAT_VERSION[3];
 364     }
 365 
 366     public byte[] getUnicodeVersion(){
 367         return unicodeVersion;
 368     }
 369     // private data members -------------------------------------------------
 370 
 371 
 372     /**
 373     * ICU data file input stream
 374     */
 375     private DataInputStream dataInputStream;
 376 
 377     private byte[] unicodeVersion;
 378 
 379     /**
 380     * File format version that this class understands.
 381     * No guarantees are made if a older version is used
 382     * see store.c of gennorm for more information and values
 383     */
 384     private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
 385                                                     (byte)0x72, (byte)0x6D};
 386     private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
 387                                                         (byte)0x5, (byte)0x2};
 388 
 389 }