--- old/src/java.base/share/classes/sun/text/normalizer/Trie2.java 2020-01-10 15:57:37.000000000 -0800 +++ /dev/null 2020-01-10 15:57:37.000000000 -0800 @@ -1,655 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Iterator; -import java.util.NoSuchElementException; - - -/** - * This is the interface and common implementation of a Unicode Trie2. - * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) - * to 16- or 32-bit integer values. It works best when there are ranges of - * characters with the same value, which is generally the case with Unicode - * character properties. - * - * This is the second common version of a Unicode trie (hence the name Trie2). - * - */ -abstract class Trie2 implements Iterable { - - /** - * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). - * - * Reads from the current position and leaves the buffer after the end of the trie. - * - * The serialized format is identical between ICU4C and ICU4J, so this function - * will work with serialized Trie2s from either. - * - * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending - * on the width of the data. - * - * To obtain the width of the Trie2, check the actual class type of the returned Trie2. - * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will - * return only Tries of their specific type/size. - * - * The serialized Trie2 on the stream may be in either little or big endian byte order. - * This allows using serialized Tries from ICU4C without needing to consider the - * byte order of the system that created them. - * - * @param bytes a byte buffer to the serialized form of a UTrie2. - * @return An unserialized Trie2, ready for use. - * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. - * @throws IOException if a read error occurs in the buffer. - * - */ - public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { - // From ICU4C utrie2_impl.h - // * Trie2 data structure in serialized form: - // * - // * UTrie2Header header; - // * uint16_t index[header.index2Length]; - // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] - // * @internal - // */ - // typedef struct UTrie2Header { - // /** "Tri2" in big-endian US-ASCII (0x54726932) */ - // uint32_t signature; - - // /** - // * options bit field: - // * 15.. 4 reserved (0) - // * 3.. 0 UTrie2ValueBits valueBits - // */ - // uint16_t options; - // - // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ - // uint16_t indexLength; - // - // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ - // uint16_t shiftedDataLength; - // - // /** Null index and data blocks, not shifted. */ - // uint16_t index2NullOffset, dataNullOffset; - // - // /** - // * First code point of the single-value range ending with U+10ffff, - // * rounded up and then shifted right by UTRIE2_SHIFT_1. - // */ - // uint16_t shiftedHighStart; - // } UTrie2Header; - - ByteOrder outerByteOrder = bytes.order(); - try { - UTrie2Header header = new UTrie2Header(); - - /* check the signature */ - header.signature = bytes.getInt(); - switch (header.signature) { - case 0x54726932: - // The buffer is already set to the trie data byte order. - break; - case 0x32697254: - // Temporarily reverse the byte order. - boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; - bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); - header.signature = 0x54726932; - break; - default: - throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); - } - - header.options = bytes.getChar(); - header.indexLength = bytes.getChar(); - header.shiftedDataLength = bytes.getChar(); - header.index2NullOffset = bytes.getChar(); - header.dataNullOffset = bytes.getChar(); - header.shiftedHighStart = bytes.getChar(); - - if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { - throw new IllegalArgumentException("UTrie2 serialized format error."); - } - - Trie2 This; - This = new Trie2_16(); - This.header = header; - - /* get the length values and offsets */ - This.indexLength = header.indexLength; - This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; - This.index2NullOffset = header.index2NullOffset; - This.dataNullOffset = header.dataNullOffset; - This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; - This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; - This.highValueIndex += This.indexLength; - - // Allocate the Trie2 index array. If the data width is 16 bits, the array also - // includes the space for the data. - - int indexArraySize = This.indexLength; - indexArraySize += This.dataLength; - This.index = new char[indexArraySize]; - - /* Read in the index */ - int i; - for (i=0; i iterator() { - return iterator(defaultValueMapper); - } - - private static ValueMapper defaultValueMapper = new ValueMapper() { - public int map(int in) { - return in; - } - }; - - /** - * Create an iterator over the value ranges from this Trie2. - * Values from the Trie2 are passed through a caller-supplied remapping function, - * and it is the remapped values that determine the ranges that - * will be produced by the iterator. - * - * - * @param mapper provides a function to remap values obtained from the Trie2. - * @return an Iterator - */ - public Iterator iterator(ValueMapper mapper) { - return new Trie2Iterator(mapper); - } - - /** - * When iterating over the contents of a Trie2, an instance of TrieValueMapper may - * be used to remap the values from the Trie2. The remapped values will be used - * both in determining the ranges of codepoints and as the value to be returned - * for each range. - * - * Example of use, with an anonymous subclass of TrieValueMapper: - * - * - * ValueMapper m = new ValueMapper() { - * int map(int in) {return in & 0x1f;}; - * } - * for (Iterator iter = trie.iterator(m); i.hasNext(); ) { - * Trie2EnumRange r = i.next(); - * ... // Do something with the range r. - * } - * - */ - public interface ValueMapper { - public int map(int originalVal); - } - - //-------------------------------------------------------------------------------- - // - // Below this point are internal implementation items. No further public API. - // - //-------------------------------------------------------------------------------- - - /** - * Trie2 data structure in serialized form: - * - * UTrie2Header header; - * uint16_t index[header.index2Length]; - * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] - * - * For Java, this is read from the stream into an instance of UTrie2Header. - * (The C version just places a struct over the raw serialized data.) - * - * @internal - */ - static class UTrie2Header { - /** "Tri2" in big-endian US-ASCII (0x54726932) */ - int signature; - - /** - * options bit field (uint16_t): - * 15.. 4 reserved (0) - * 3.. 0 UTrie2ValueBits valueBits - */ - int options; - - /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ - int indexLength; - - /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ - int shiftedDataLength; - - /** Null index and data blocks, not shifted. (uint16_t) */ - int index2NullOffset, dataNullOffset; - - /** - * First code point of the single-value range ending with U+10ffff, - * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) - */ - int shiftedHighStart; - } - - // - // Data members of UTrie2. - // - UTrie2Header header; - char index[]; // Index array. Includes data for 16 bit Tries. - int data16; // Offset to data portion of the index array, if 16 bit data. - // zero if 32 bit data. - int data32[]; // NULL if 16b data is used via index - - int indexLength; - int dataLength; - int index2NullOffset; // 0xffff if there is no dedicated index-2 null block - int initialValue; - - /** Value returned for out-of-range code points and illegal UTF-8. */ - int errorValue; - - /* Start of the last range which ends at U+10ffff, and its value. */ - int highStart; - int highValueIndex; - - int dataNullOffset; - - /** - * Trie2 constants, defining shift widths, index array lengths, etc. - * - * These are needed for the runtime macros but users can treat these as - * implementation details and skip to the actual public API further below. - */ - - static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; - - - /** Shift size for getting the index-1 table offset. */ - static final int UTRIE2_SHIFT_1=6+5; - - /** Shift size for getting the index-2 table offset. */ - static final int UTRIE2_SHIFT_2=5; - - /** - * Difference between the two shift sizes, - * for getting an index-1 offset from an index-2 offset. 6=11-5 - */ - static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; - - /** - * Number of index-1 entries for the BMP. 32=0x20 - * This part of the index-1 table is omitted from the serialized form. - */ - static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; - - /** Number of entries in an index-2 block. 64=0x40 */ - static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) - */ - static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; - static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; - - /** Count the lengths of both BMP pieces. 2080=0x820 */ - static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; - - /** - * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. - * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. - */ - static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; - static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ - - /** - * The index-1 table, only used for supplementary code points, at offset 2112=0x840. - * Variable length, for code points up to highStart, where the last single-value range starts. - * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. - * (For 0x100000 supplementary code points U+10000..U+10ffff.) - * - * The part of the index-2 table for supplementary code points starts - * after this index-1 table. - * - * Both the index-1 table and the following part of the index-2 table - * are omitted completely if there is only BMP data. - */ - static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; - - /** - * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. - * Used with linear access for single bytes 0..0xbf for simple error handling. - * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. - */ - static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; - - /** - * Implementation class for an iterator over a Trie2. - * - * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, - * then returns the special alternate values for the lead surrogates - * - * @internal - */ - class Trie2Iterator implements Iterator { - - // The normal constructor that configures the iterator to cover the complete - // contents of the Trie2 - Trie2Iterator(ValueMapper vm) { - mapper = vm; - nextStart = 0; - limitCP = 0x110000; - doLeadSurrogates = true; - } - - /** - * The main next() function for Trie2 iterators - * - */ - public Range next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - if (nextStart >= limitCP) { - // Switch over from iterating normal code point values to - // doing the alternate lead-surrogate values. - doingCodePoints = false; - nextStart = 0xd800; - } - int endOfRange = 0; - int val = 0; - int mappedVal = 0; - - if (doingCodePoints) { - // Iteration over code point values. - val = get(nextStart); - mappedVal = mapper.map(val); - endOfRange = rangeEnd(nextStart, limitCP, val); - // Loop once for each range in the Trie2 with the same raw (unmapped) value. - // Loop continues so long as the mapped values are the same. - for (;;) { - if (endOfRange >= limitCP-1) { - break; - } - val = get(endOfRange+1); - if (mapper.map(val) != mappedVal) { - break; - } - endOfRange = rangeEnd(endOfRange+1, limitCP, val); - } - } else { - // Iteration over the alternate lead surrogate values. - val = getFromU16SingleLead((char)nextStart); - mappedVal = mapper.map(val); - endOfRange = rangeEndLS((char)nextStart); - // Loop once for each range in the Trie2 with the same raw (unmapped) value. - // Loop continues so long as the mapped values are the same. - for (;;) { - if (endOfRange >= 0xdbff) { - break; - } - val = getFromU16SingleLead((char)(endOfRange+1)); - if (mapper.map(val) != mappedVal) { - break; - } - endOfRange = rangeEndLS((char)(endOfRange+1)); - } - } - returnValue.startCodePoint = nextStart; - returnValue.endCodePoint = endOfRange; - returnValue.value = mappedVal; - returnValue.leadSurrogate = !doingCodePoints; - nextStart = endOfRange+1; - return returnValue; - } - - /** - * - */ - public boolean hasNext() { - return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; - } - - private int rangeEndLS(char startingLS) { - if (startingLS >= 0xdbff) { - return 0xdbff; - } - - int c; - int val = getFromU16SingleLead(startingLS); - for (c = startingLS+1; c <= 0x0dbff; c++) { - if (getFromU16SingleLead((char)c) != val) { - break; - } - } - return c-1; - } - - // - // Iteration State Variables - // - private ValueMapper mapper; - private Range returnValue = new Range(); - // The starting code point for the next range to be returned. - private int nextStart; - // The upper limit for the last normal range to be returned. Normally 0x110000, but - // may be lower when iterating over the code points for a single lead surrogate. - private int limitCP; - - // True while iterating over the Trie2 values for code points. - // False while iterating over the alternate values for lead surrogates. - private boolean doingCodePoints = true; - - // True if the iterator should iterate the special values for lead surrogates in - // addition to the normal values for code points. - private boolean doLeadSurrogates = true; - } - - /** - * Find the last character in a contiguous range of characters with the - * same Trie2 value as the input character. - * - * @param c The character to begin with. - * @return The last contiguous character with the same value. - */ - int rangeEnd(int start, int limitp, int val) { - int c; - int limit = Math.min(highStart, limitp); - - for (c = start+1; c < limit; c++) { - if (get(c) != val) { - break; - } - } - if (c >= highStart) { - c = limitp; - } - return c - 1; - } - - - // - // Hashing implementation functions. FNV hash. Respected public domain algorithm. - // - private static int initHash() { - return 0x811c9DC5; // unsigned 2166136261 - } - - private static int hashByte(int h, int b) { - h = h * 16777619; - h = h ^ b; - return h; - } - - private static int hashUChar32(int h, int c) { - h = Trie2.hashByte(h, c & 255); - h = Trie2.hashByte(h, (c>>8) & 255); - h = Trie2.hashByte(h, c>>16); - return h; - } - - private static int hashInt(int h, int i) { - h = Trie2.hashByte(h, i & 255); - h = Trie2.hashByte(h, (i>>8) & 255); - h = Trie2.hashByte(h, (i>>16) & 255); - h = Trie2.hashByte(h, (i>>24) & 255); - return h; - } - -} --- /dev/null 2020-01-10 15:57:37.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Trie2.java 2020-01-10 15:57:37.000000000 -0800 @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * This is the interface and common implementation of a Unicode Trie2. + * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) + * to 16- or 32-bit integer values. It works best when there are ranges of + * characters with the same value, which is generally the case with Unicode + * character properties. + * + * This is the second common version of a Unicode trie (hence the name Trie2). + * + */ +abstract class Trie2 implements Iterable { + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * + * Reads from the current position and leaves the buffer after the end of the trie. + * + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. + * + * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending + * on the width of the data. + * + * To obtain the width of the Trie2, check the actual class type of the returned Trie2. + * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will + * return only Tries of their specific type/size. + * + * The serialized Trie2 on the stream may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2, ready for use. + * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * + */ + public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { + // From ICU4C utrie2_impl.h + // * Trie2 data structure in serialized form: + // * + // * UTrie2Header header; + // * uint16_t index[header.index2Length]; + // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + // * @internal + // */ + // typedef struct UTrie2Header { + // /** "Tri2" in big-endian US-ASCII (0x54726932) */ + // uint32_t signature; + + // /** + // * options bit field: + // * 15.. 4 reserved (0) + // * 3.. 0 UTrie2ValueBits valueBits + // */ + // uint16_t options; + // + // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + // uint16_t indexLength; + // + // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + // uint16_t shiftedDataLength; + // + // /** Null index and data blocks, not shifted. */ + // uint16_t index2NullOffset, dataNullOffset; + // + // /** + // * First code point of the single-value range ending with U+10ffff, + // * rounded up and then shifted right by UTRIE2_SHIFT_1. + // */ + // uint16_t shiftedHighStart; + // } UTrie2Header; + + ByteOrder outerByteOrder = bytes.order(); + try { + UTrie2Header header = new UTrie2Header(); + + /* check the signature */ + header.signature = bytes.getInt(); + switch (header.signature) { + case 0x54726932: + // The buffer is already set to the trie data byte order. + break; + case 0x32697254: + // Temporarily reverse the byte order. + boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; + bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); + header.signature = 0x54726932; + break; + default: + throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); + } + + header.options = bytes.getChar(); + header.indexLength = bytes.getChar(); + header.shiftedDataLength = bytes.getChar(); + header.index2NullOffset = bytes.getChar(); + header.dataNullOffset = bytes.getChar(); + header.shiftedHighStart = bytes.getChar(); + + if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { + throw new IllegalArgumentException("UTrie2 serialized format error."); + } + + Trie2 This; + This = new Trie2_16(); + This.header = header; + + /* get the length values and offsets */ + This.indexLength = header.indexLength; + This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; + This.index2NullOffset = header.index2NullOffset; + This.dataNullOffset = header.dataNullOffset; + This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; + This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; + This.highValueIndex += This.indexLength; + + // Allocate the Trie2 index array. If the data width is 16 bits, the array also + // includes the space for the data. + + int indexArraySize = This.indexLength; + indexArraySize += This.dataLength; + This.index = new char[indexArraySize]; + + /* Read in the index */ + int i; + for (i=0; i iterator() { + return iterator(defaultValueMapper); + } + + private static ValueMapper defaultValueMapper = new ValueMapper() { + public int map(int in) { + return in; + } + }; + + /** + * Create an iterator over the value ranges from this Trie2. + * Values from the Trie2 are passed through a caller-supplied remapping function, + * and it is the remapped values that determine the ranges that + * will be produced by the iterator. + * + * + * @param mapper provides a function to remap values obtained from the Trie2. + * @return an Iterator + */ + public Iterator iterator(ValueMapper mapper) { + return new Trie2Iterator(mapper); + } + + /** + * When iterating over the contents of a Trie2, an instance of TrieValueMapper may + * be used to remap the values from the Trie2. The remapped values will be used + * both in determining the ranges of codepoints and as the value to be returned + * for each range. + * + * Example of use, with an anonymous subclass of TrieValueMapper: + * + * + * ValueMapper m = new ValueMapper() { + * int map(int in) {return in & 0x1f;}; + * } + * for (Iterator iter = trie.iterator(m); i.hasNext(); ) { + * Trie2EnumRange r = i.next(); + * ... // Do something with the range r. + * } + * + */ + public interface ValueMapper { + public int map(int originalVal); + } + + //-------------------------------------------------------------------------------- + // + // Below this point are internal implementation items. No further public API. + // + //-------------------------------------------------------------------------------- + + /** + * Trie2 data structure in serialized form: + * + * UTrie2Header header; + * uint16_t index[header.index2Length]; + * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + * + * For Java, this is read from the stream into an instance of UTrie2Header. + * (The C version just places a struct over the raw serialized data.) + * + * @internal + */ + static class UTrie2Header { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + int signature; + + /** + * options bit field (uint16_t): + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + int options; + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ + int indexLength; + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ + int shiftedDataLength; + + /** Null index and data blocks, not shifted. (uint16_t) */ + int index2NullOffset, dataNullOffset; + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) + */ + int shiftedHighStart; + } + + // + // Data members of UTrie2. + // + UTrie2Header header; + char index[]; // Index array. Includes data for 16 bit Tries. + int data16; // Offset to data portion of the index array, if 16 bit data. + // zero if 32 bit data. + int data32[]; // NULL if 16b data is used via index + + int indexLength; + int dataLength; + int index2NullOffset; // 0xffff if there is no dedicated index-2 null block + int initialValue; + + /** Value returned for out-of-range code points and illegal UTF-8. */ + int errorValue; + + /* Start of the last range which ends at U+10ffff, and its value. */ + int highStart; + int highValueIndex; + + int dataNullOffset; + + /** + * Trie2 constants, defining shift widths, index array lengths, etc. + * + * These are needed for the runtime macros but users can treat these as + * implementation details and skip to the actual public API further below. + */ + + static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; + + + /** Shift size for getting the index-1 table offset. */ + static final int UTRIE2_SHIFT_1=6+5; + + /** Shift size for getting the index-2 table offset. */ + static final int UTRIE2_SHIFT_2=5; + + /** + * Difference between the two shift sizes, + * for getting an index-1 offset from an index-2 offset. 6=11-5 + */ + static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; + + /** + * Number of index-1 entries for the BMP. 32=0x20 + * This part of the index-1 table is omitted from the serialized form. + */ + static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; + + /** Number of entries in an index-2 block. 64=0x40 */ + static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) + */ + static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; + static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; + + /** Count the lengths of both BMP pieces. 2080=0x820 */ + static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; + + /** + * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. + * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. + */ + static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; + static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. + * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. + */ + static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; + + /** + * Implementation class for an iterator over a Trie2. + * + * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, + * then returns the special alternate values for the lead surrogates + * + * @internal + */ + class Trie2Iterator implements Iterator { + + // The normal constructor that configures the iterator to cover the complete + // contents of the Trie2 + Trie2Iterator(ValueMapper vm) { + mapper = vm; + nextStart = 0; + limitCP = 0x110000; + doLeadSurrogates = true; + } + + /** + * The main next() function for Trie2 iterators + * + */ + public Range next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + if (nextStart >= limitCP) { + // Switch over from iterating normal code point values to + // doing the alternate lead-surrogate values. + doingCodePoints = false; + nextStart = 0xd800; + } + int endOfRange = 0; + int val = 0; + int mappedVal = 0; + + if (doingCodePoints) { + // Iteration over code point values. + val = get(nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEnd(nextStart, limitCP, val); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= limitCP-1) { + break; + } + val = get(endOfRange+1); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEnd(endOfRange+1, limitCP, val); + } + } else { + // Iteration over the alternate lead surrogate values. + val = getFromU16SingleLead((char)nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEndLS((char)nextStart); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= 0xdbff) { + break; + } + val = getFromU16SingleLead((char)(endOfRange+1)); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEndLS((char)(endOfRange+1)); + } + } + returnValue.startCodePoint = nextStart; + returnValue.endCodePoint = endOfRange; + returnValue.value = mappedVal; + returnValue.leadSurrogate = !doingCodePoints; + nextStart = endOfRange+1; + return returnValue; + } + + /** + * + */ + public boolean hasNext() { + return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; + } + + private int rangeEndLS(char startingLS) { + if (startingLS >= 0xdbff) { + return 0xdbff; + } + + int c; + int val = getFromU16SingleLead(startingLS); + for (c = startingLS+1; c <= 0x0dbff; c++) { + if (getFromU16SingleLead((char)c) != val) { + break; + } + } + return c-1; + } + + // + // Iteration State Variables + // + private ValueMapper mapper; + private Range returnValue = new Range(); + // The starting code point for the next range to be returned. + private int nextStart; + // The upper limit for the last normal range to be returned. Normally 0x110000, but + // may be lower when iterating over the code points for a single lead surrogate. + private int limitCP; + + // True while iterating over the Trie2 values for code points. + // False while iterating over the alternate values for lead surrogates. + private boolean doingCodePoints = true; + + // True if the iterator should iterate the special values for lead surrogates in + // addition to the normal values for code points. + private boolean doLeadSurrogates = true; + } + + /** + * Find the last character in a contiguous range of characters with the + * same Trie2 value as the input character. + * + * @param c The character to begin with. + * @return The last contiguous character with the same value. + */ + int rangeEnd(int start, int limitp, int val) { + int c; + int limit = Math.min(highStart, limitp); + + for (c = start+1; c < limit; c++) { + if (get(c) != val) { + break; + } + } + if (c >= highStart) { + c = limitp; + } + return c - 1; + } + + + // + // Hashing implementation functions. FNV hash. Respected public domain algorithm. + // + private static int initHash() { + return 0x811c9DC5; // unsigned 2166136261 + } + + private static int hashByte(int h, int b) { + h = h * 16777619; + h = h ^ b; + return h; + } + + private static int hashUChar32(int h, int c) { + h = Trie2.hashByte(h, c & 255); + h = Trie2.hashByte(h, (c>>8) & 255); + h = Trie2.hashByte(h, c>>16); + return h; + } + + private static int hashInt(int h, int i) { + h = Trie2.hashByte(h, i & 255); + h = Trie2.hashByte(h, (i>>8) & 255); + h = Trie2.hashByte(h, (i>>16) & 255); + h = Trie2.hashByte(h, (i>>24) & 255); + return h; + } + +}