1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * Copyright (C) 2009-2014, International Business Machines Corporation and
  29  * others. All Rights Reserved.
  30  *******************************************************************************
  31  */
  32 
  33 package jdk.internal.icu.impl;
  34 
  35 import java.io.IOException;
  36 import java.nio.ByteBuffer;
  37 
  38 
  39 /**
  40  * @author aheninger
  41  *
  42  * A read-only Trie2, holding 16 bit data values.
  43  *
  44  * A Trie2 is a highly optimized data structure for mapping from Unicode
  45  * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
  46  *
  47  * See class Trie2 for descriptions of the API for accessing the contents of a trie.
  48  *
  49  * The fundamental data access methods are declared final in this class, with
  50  * the intent that applications might gain a little extra performance, when compared
  51  * with calling the same methods via the abstract UTrie2 base class.
  52  */
  53 public final class Trie2_16 extends Trie2 {
  54 
  55     /**
  56      *  Internal constructor, not for general use.
  57      */
  58     Trie2_16() {
  59     }
  60 
  61 
  62     /**
  63      * Create a Trie2 from its serialized form.  Inverse of utrie2_serialize().
  64      * The serialized format is identical between ICU4C and ICU4J, so this function
  65      * will work with serialized Trie2s from either.
  66      *
  67      * The serialized Trie2 in the bytes may be in either little or big endian byte order.
  68      * This allows using serialized Tries from ICU4C without needing to consider the
  69      * byte order of the system that created them.
  70      *
  71      * @param bytes a byte buffer to the serialized form of a UTrie2.
  72      * @return An unserialized Trie2_16, ready for use.
  73      * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
  74      * @throws IOException if a read error occurs in the buffer.
  75      * @throws ClassCastException if the bytes contain a serialized Trie2_32
  76      */
  77     public static Trie2_16  createFromSerialized(ByteBuffer bytes) throws IOException {
  78         return (Trie2_16) Trie2.createFromSerialized(bytes);
  79     }
  80 
  81     /**
  82      * Get the value for a code point as stored in the Trie2.
  83      *
  84      * @param codePoint the code point
  85      * @return the value
  86      */
  87     @Override
  88     public final int get(int codePoint) {
  89         int value;
  90         int ix;
  91 
  92         if (codePoint >= 0) {
  93             if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
  94                 // Ordinary BMP code point, excluding leading surrogates.
  95                 // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
  96                 // 16 bit data is stored in the index array itself.
  97                 ix = index[codePoint >> UTRIE2_SHIFT_2];
  98                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
  99                 value = index[ix];
 100                 return value;
 101             }
 102             if (codePoint <= 0xffff) {
 103                 // Lead Surrogate Code Point.  A Separate index section is stored for
 104                 // lead surrogate code units and code points.
 105                 //   The main index has the code unit data.
 106                 //   For this function, we need the code point data.
 107                 // Note: this expression could be refactored for slightly improved efficiency, but
 108                 //       surrogate code points will be so rare in practice that it's not worth it.
 109                 ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
 110                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
 111                 value = index[ix];
 112                 return value;
 113             }
 114             if (codePoint < highStart) {
 115                 // Supplemental code point, use two-level lookup.
 116                 ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
 117                 ix = index[ix];
 118                 ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
 119                 ix = index[ix];
 120                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
 121                 value = index[ix];
 122                 return value;
 123             }
 124             if (codePoint <= 0x10ffff) {
 125                 value = index[highValueIndex];
 126                 return value;
 127             }
 128         }
 129 
 130         // Fall through.  The code point is outside of the legal range of 0..0x10ffff.
 131         return errorValue;
 132     }
 133 
 134 
 135     /**
 136      * Get a Trie2 value for a UTF-16 code unit.
 137      *
 138      * This function returns the same value as get() if the input
 139      * character is outside of the lead surrogate range
 140      *
 141      * There are two values stored in a Trie2 for inputs in the lead
 142      * surrogate range.  This function returns the alternate value,
 143      * while Trie2.get() returns the main value.
 144      *
 145      * @param codeUnit a 16 bit code unit or lead surrogate value.
 146      * @return the value
 147      */
 148     @Override
 149     public int getFromU16SingleLead(char codeUnit) {
 150         int value;
 151         int ix;
 152 
 153         // Because the input is a 16 bit char, we can skip the tests for it being in
 154         // the BMP range.  It is.
 155         ix = index[codeUnit >> UTRIE2_SHIFT_2];
 156         ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
 157         value = index[ix];
 158         return value;
 159     }
 160 
 161     /**
 162      * @return the number of bytes of the serialized trie
 163      */
 164     public int getSerializedLength() {
 165         return 16+(header.indexLength+dataLength)*2;
 166     }
 167 }