1 /* 2 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 // (c) 2018 and later: Unicode, Inc. and others. 26 // License & terms of use: http://www.unicode.org/copyright.html#License 27 28 // created: 2018may04 Markus W. Scherer 29 30 package jdk.internal.icu.util; 31 32 import jdk.internal.icu.impl.ICUBinary; 33 34 import java.io.DataOutputStream; 35 import java.io.IOException; 36 import java.io.UncheckedIOException; 37 import java.io.OutputStream; 38 import java.nio.ByteBuffer; 39 import java.nio.ByteOrder; 40 41 import static jdk.internal.icu.impl.NormalizerImpl.UTF16Plus; 42 43 /** 44 * Immutable Unicode code point trie. 45 * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values. 46 * For details see http://site.icu-project.org/design/struct/utrie 47 * 48 * <p>This class is not intended for public subclassing. 49 * 50 * @see MutableCodePointTrie 51 * @draft ICU 63 52 * @provisional This API might change or be removed in a future release. 53 */ 54 @SuppressWarnings("deprecation") 55 public abstract class CodePointTrie extends CodePointMap { 56 /** 57 * Selectors for the type of a CodePointTrie. 58 * Different trade-offs for size vs. speed. 59 * 60 * <p>Use null for {@link #fromBinary} to accept any type; 61 * {@link #getType} will return the actual type. 62 * 63 * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) 64 * @see #fromBinary 65 * @see #getType 66 * @draft ICU 63 67 * @provisional This API might change or be removed in a future release. 68 */ 69 public enum Type { 70 /** 71 * Fast/simple/larger BMP data structure. 72 * The {@link Fast} subclasses have additional functions for lookup for BMP and supplementary code points. 73 * 74 * @see Fast 75 * @draft ICU 63 76 * @provisional This API might change or be removed in a future release. 77 */ 78 FAST, 79 /** 80 * Small/slower BMP data structure. 81 * 82 * @see Small 83 * @draft ICU 63 84 * @provisional This API might change or be removed in a future release. 85 */ 86 SMALL 87 } 88 89 /** 90 * Selectors for the number of bits in a CodePointTrie data value. 91 * 92 * <p>Use null for {@link #fromBinary} to accept any data value width; 93 * {@link #getValueWidth} will return the actual data value width. 94 * 95 * @draft ICU 63 96 * @provisional This API might change or be removed in a future release. 97 */ 98 public enum ValueWidth { 99 /** 100 * The trie stores 16 bits per data value. 101 * It returns them as unsigned values 0..0xffff=65535. 102 * 103 * @draft ICU 63 104 * @provisional This API might change or be removed in a future release. 105 */ 106 BITS_16, 107 /** 108 * The trie stores 32 bits per data value. 109 * 110 * @draft ICU 63 111 * @provisional This API might change or be removed in a future release. 112 */ 113 BITS_32, 114 /** 115 * The trie stores 8 bits per data value. 116 * It returns them as unsigned values 0..0xff=255. 117 * 118 * @draft ICU 63 119 * @provisional This API might change or be removed in a future release. 120 */ 121 BITS_8 122 } 123 124 private CodePointTrie(char[] index, Data data, int highStart, 125 int index3NullOffset, int dataNullOffset) { 126 this.ascii = new int[ASCII_LIMIT]; 127 this.index = index; 128 this.data = data; 129 this.dataLength = data.getDataLength(); 130 this.highStart = highStart; 131 this.index3NullOffset = index3NullOffset; 132 this.dataNullOffset = dataNullOffset; 133 134 for (int c = 0; c < ASCII_LIMIT; ++c) { 135 ascii[c] = data.getFromIndex(c); 136 } 137 138 int nullValueOffset = dataNullOffset; 139 if (nullValueOffset >= dataLength) { 140 nullValueOffset = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 141 } 142 nullValue = data.getFromIndex(nullValueOffset); 143 } 144 145 /** 146 * Creates a trie from its binary form, 147 * stored in the ByteBuffer starting at the current position. 148 * Advances the buffer position to just after the trie data. 149 * Inverse of {@link #toBinary(OutputStream)}. 150 * 151 * <p>The data is copied from the buffer; 152 * later modification of the buffer will not affect the trie. 153 * 154 * @param type selects the trie type; this method throws an exception 155 * if the type does not match the binary data; 156 * use null to accept any type 157 * @param valueWidth selects the number of bits in a data value; this method throws an exception 158 * if the valueWidth does not match the binary data; 159 * use null to accept any data value width 160 * @param bytes a buffer containing the binary data of a CodePointTrie 161 * @return the trie 162 * @see MutableCodePointTrie#MutableCodePointTrie(int, int) 163 * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) 164 * @see #toBinary(OutputStream) 165 * @draft ICU 63 166 * @provisional This API might change or be removed in a future release. 167 */ 168 public static CodePointTrie fromBinary(Type type, ValueWidth valueWidth, ByteBuffer bytes) { 169 ByteOrder outerByteOrder = bytes.order(); 170 try { 171 // Enough data for a trie header? 172 if (bytes.remaining() < 16 /* sizeof(UCPTrieHeader) */) { 173 throw new InternalError("Buffer too short for a CodePointTrie header"); 174 } 175 176 // struct UCPTrieHeader 177 /** "Tri3" in big-endian US-ASCII (0x54726933) */ 178 int signature = bytes.getInt(); 179 180 // Check the signature. 181 switch (signature) { 182 case 0x54726933: 183 // The buffer is already set to the trie data byte order. 184 break; 185 case 0x33697254: 186 // Temporarily reverse the byte order. 187 boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; 188 bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); 189 signature = 0x54726933; 190 break; 191 default: 192 throw new InternalError("Buffer does not contain a serialized CodePointTrie"); 193 } 194 195 // struct UCPTrieHeader continued 196 /** 197 * Options bit field: 198 * Bits 15..12: Data length bits 19..16. 199 * Bits 11..8: Data null block offset bits 19..16. 200 * Bits 7..6: UCPTrieType 201 * Bits 5..3: Reserved (0). 202 * Bits 2..0: UCPTrieValueWidth 203 */ 204 int options = bytes.getChar(); 205 206 /** Total length of the index tables. */ 207 int indexLength = bytes.getChar(); 208 209 /** Data length bits 15..0. */ 210 int dataLength = bytes.getChar(); 211 212 /** Index-3 null block offset, 0x7fff or 0xffff if none. */ 213 int index3NullOffset = bytes.getChar(); 214 215 /** Data null block offset bits 15..0, 0xfffff if none. */ 216 int dataNullOffset = bytes.getChar(); 217 218 /** 219 * First code point of the single-value range ending with U+10ffff, 220 * rounded up and then shifted right by SHIFT_2. 221 */ 222 int shiftedHighStart = bytes.getChar(); 223 // struct UCPTrieHeader end 224 225 int typeInt = (options >> 6) & 3; 226 Type actualType; 227 switch (typeInt) { 228 case 0: actualType = Type.FAST; break; 229 case 1: actualType = Type.SMALL; break; 230 default: 231 throw new InternalError("CodePointTrie data header has an unsupported type"); 232 } 233 234 int valueWidthInt = options & OPTIONS_VALUE_BITS_MASK; 235 ValueWidth actualValueWidth; 236 switch (valueWidthInt) { 237 case 0: actualValueWidth = ValueWidth.BITS_16; break; 238 case 1: actualValueWidth = ValueWidth.BITS_32; break; 239 case 2: actualValueWidth = ValueWidth.BITS_8; break; 240 default: 241 throw new InternalError("CodePointTrie data header has an unsupported value width"); 242 } 243 244 if ((options & OPTIONS_RESERVED_MASK) != 0) { 245 throw new InternalError("CodePointTrie data header has unsupported options"); 246 } 247 248 if (type == null) { 249 type = actualType; 250 } 251 if (valueWidth == null) { 252 valueWidth = actualValueWidth; 253 } 254 if (type != actualType || valueWidth != actualValueWidth) { 255 throw new InternalError("CodePointTrie data header has a different type or value width than required"); 256 } 257 258 // Get the length values and offsets. 259 dataLength |= ((options & OPTIONS_DATA_LENGTH_MASK) << 4); 260 dataNullOffset |= ((options & OPTIONS_DATA_NULL_OFFSET_MASK) << 8); 261 262 int highStart = shiftedHighStart << SHIFT_2; 263 264 // Calculate the actual length, minus the header. 265 int actualLength = indexLength * 2; 266 if (valueWidth == ValueWidth.BITS_16) { 267 actualLength += dataLength * 2; 268 } else if (valueWidth == ValueWidth.BITS_32) { 269 actualLength += dataLength * 4; 270 } else { 271 actualLength += dataLength; 272 } 273 if (bytes.remaining() < actualLength) { 274 throw new InternalError("Buffer too short for the CodePointTrie data"); 275 } 276 277 char[] index = ICUBinary.getChars(bytes, indexLength, 0); 278 switch (valueWidth) { 279 case BITS_16: { 280 char[] data16 = ICUBinary.getChars(bytes, dataLength, 0); 281 return type == Type.FAST ? 282 new Fast16(index, data16, highStart, index3NullOffset, dataNullOffset) : 283 new Small16(index, data16, highStart, index3NullOffset, dataNullOffset); 284 } 285 case BITS_32: { 286 int[] data32 = ICUBinary.getInts(bytes, dataLength, 0); 287 return type == Type.FAST ? 288 new Fast32(index, data32, highStart, index3NullOffset, dataNullOffset) : 289 new Small32(index, data32, highStart, index3NullOffset, dataNullOffset); 290 } 291 case BITS_8: { 292 byte[] data8 = ICUBinary.getBytes(bytes, dataLength, 0); 293 return type == Type.FAST ? 294 new Fast8(index, data8, highStart, index3NullOffset, dataNullOffset) : 295 new Small8(index, data8, highStart, index3NullOffset, dataNullOffset); 296 } 297 default: 298 throw new AssertionError("should be unreachable"); 299 } 300 } finally { 301 bytes.order(outerByteOrder); 302 } 303 } 304 305 /** 306 * Returns the trie type. 307 * 308 * @return the trie type 309 * @draft ICU 63 310 * @provisional This API might change or be removed in a future release. 311 */ 312 public abstract Type getType(); 313 /** 314 * Returns the number of bits in a trie data value. 315 * 316 * @return the number of bits in a trie data value 317 * @draft ICU 63 318 * @provisional This API might change or be removed in a future release. 319 */ 320 public final ValueWidth getValueWidth() { return data.getValueWidth(); } 321 322 /** 323 * {@inheritDoc} 324 * @draft ICU 63 325 * @provisional This API might change or be removed in a future release. 326 */ 327 @Override 328 public int get(int c) { 329 return data.getFromIndex(cpIndex(c)); 330 } 331 332 /** 333 * Returns a trie value for an ASCII code point, without range checking. 334 * 335 * @param c the input code point; must be U+0000..U+007F 336 * @return The ASCII code point's trie value. 337 * @draft ICU 63 338 * @provisional This API might change or be removed in a future release. 339 */ 340 public final int asciiGet(int c) { 341 return ascii[c]; 342 } 343 344 private static final int MAX_UNICODE = 0x10ffff; 345 346 private static final int ASCII_LIMIT = 0x80; 347 348 private static final int maybeFilterValue(int value, int trieNullValue, int nullValue, 349 ValueFilter filter) { 350 if (value == trieNullValue) { 351 value = nullValue; 352 } else if (filter != null) { 353 value = filter.apply(value); 354 } 355 return value; 356 } 357 358 /** 359 * {@inheritDoc} 360 * @draft ICU 63 361 * @provisional This API might change or be removed in a future release. 362 */ 363 @Override 364 public final boolean getRange(int start, ValueFilter filter, Range range) { 365 if (start < 0 || MAX_UNICODE < start) { 366 return false; 367 } 368 if (start >= highStart) { 369 int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 370 int value = data.getFromIndex(di); 371 if (filter != null) { value = filter.apply(value); } 372 range.set(start, MAX_UNICODE, value); 373 return true; 374 } 375 376 int nullValue = this.nullValue; 377 if (filter != null) { nullValue = filter.apply(nullValue); } 378 Type type = getType(); 379 380 int prevI3Block = -1; 381 int prevBlock = -1; 382 int c = start; 383 // Initialize to make compiler happy. Real value when haveValue is true. 384 int trieValue = 0, value = 0; 385 boolean haveValue = false; 386 do { 387 int i3Block; 388 int i3; 389 int i3BlockLength; 390 int dataBlockLength; 391 if (c <= 0xffff && (type == Type.FAST || c <= SMALL_MAX)) { 392 i3Block = 0; 393 i3 = c >> FAST_SHIFT; 394 i3BlockLength = type == Type.FAST ? BMP_INDEX_LENGTH : SMALL_INDEX_LENGTH; 395 dataBlockLength = FAST_DATA_BLOCK_LENGTH; 396 } else { 397 // Use the multi-stage index. 398 int i1 = c >> SHIFT_1; 399 if (type == Type.FAST) { 400 assert(0xffff < c && c < highStart); 401 i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; 402 } else { 403 assert(c < highStart && highStart > SMALL_LIMIT); 404 i1 += SMALL_INDEX_LENGTH; 405 } 406 i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; 407 if (i3Block == prevI3Block && (c - start) >= CP_PER_INDEX_2_ENTRY) { 408 // The index-3 block is the same as the previous one, and filled with value. 409 assert((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0); 410 c += CP_PER_INDEX_2_ENTRY; 411 continue; 412 } 413 prevI3Block = i3Block; 414 if (i3Block == index3NullOffset) { 415 // This is the index-3 null block. 416 if (haveValue) { 417 if (nullValue != value) { 418 range.set(start, c - 1, value); 419 return true; 420 } 421 } else { 422 trieValue = this.nullValue; 423 value = nullValue; 424 haveValue = true; 425 } 426 prevBlock = dataNullOffset; 427 c = (c + CP_PER_INDEX_2_ENTRY) & ~(CP_PER_INDEX_2_ENTRY - 1); 428 continue; 429 } 430 i3 = (c >> SHIFT_3) & INDEX_3_MASK; 431 i3BlockLength = INDEX_3_BLOCK_LENGTH; 432 dataBlockLength = SMALL_DATA_BLOCK_LENGTH; 433 } 434 // Enumerate data blocks for one index-3 block. 435 do { 436 int block; 437 if ((i3Block & 0x8000) == 0) { 438 block = index[i3Block + i3]; 439 } else { 440 // 18-bit indexes stored in groups of 9 entries per 8 indexes. 441 int group = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); 442 int gi = i3 & 7; 443 block = (index[group++] << (2 + (2 * gi))) & 0x30000; 444 block |= index[group + gi]; 445 } 446 if (block == prevBlock && (c - start) >= dataBlockLength) { 447 // The block is the same as the previous one, and filled with value. 448 assert((c & (dataBlockLength - 1)) == 0); 449 c += dataBlockLength; 450 } else { 451 int dataMask = dataBlockLength - 1; 452 prevBlock = block; 453 if (block == dataNullOffset) { 454 // This is the data null block. 455 if (haveValue) { 456 if (nullValue != value) { 457 range.set(start, c - 1, value); 458 return true; 459 } 460 } else { 461 trieValue = this.nullValue; 462 value = nullValue; 463 haveValue = true; 464 } 465 c = (c + dataBlockLength) & ~dataMask; 466 } else { 467 int di = block + (c & dataMask); 468 int trieValue2 = data.getFromIndex(di); 469 if (haveValue) { 470 if (trieValue2 != trieValue) { 471 if (filter == null || 472 maybeFilterValue(trieValue2, this.nullValue, nullValue, 473 filter) != value) { 474 range.set(start, c - 1, value); 475 return true; 476 } 477 trieValue = trieValue2; // may or may not help 478 } 479 } else { 480 trieValue = trieValue2; 481 value = maybeFilterValue(trieValue2, this.nullValue, nullValue, filter); 482 haveValue = true; 483 } 484 while ((++c & dataMask) != 0) { 485 trieValue2 = data.getFromIndex(++di); 486 if (trieValue2 != trieValue) { 487 if (filter == null || 488 maybeFilterValue(trieValue2, this.nullValue, nullValue, 489 filter) != value) { 490 range.set(start, c - 1, value); 491 return true; 492 } 493 trieValue = trieValue2; // may or may not help 494 } 495 } 496 } 497 } 498 } while (++i3 < i3BlockLength); 499 } while (c < highStart); 500 assert(haveValue); 501 int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 502 int highValue = data.getFromIndex(di); 503 if (maybeFilterValue(highValue, this.nullValue, nullValue, filter) != value) { 504 --c; 505 } else { 506 c = MAX_UNICODE; 507 } 508 range.set(start, c, value); 509 return true; 510 } 511 512 /** 513 * Writes a representation of the trie to the output stream. 514 * Inverse of {@link #fromBinary}. 515 * 516 * @param os the output stream 517 * @return the number of bytes written 518 * @draft ICU 63 519 * @provisional This API might change or be removed in a future release. 520 */ 521 public final int toBinary(OutputStream os) { 522 try { 523 DataOutputStream dos = new DataOutputStream(os); 524 525 // Write the UCPTrieHeader 526 dos.writeInt(0x54726933); // signature="Tri3" 527 dos.writeChar( // options 528 ((dataLength & 0xf0000) >> 4) | 529 ((dataNullOffset & 0xf0000) >> 8) | 530 (getType().ordinal() << 6) | 531 getValueWidth().ordinal()); 532 dos.writeChar(index.length); 533 dos.writeChar(dataLength); 534 dos.writeChar(index3NullOffset); 535 dos.writeChar(dataNullOffset); 536 dos.writeChar(highStart >> SHIFT_2); // shiftedHighStart 537 int length = 16; // sizeof(UCPTrieHeader) 538 539 for (char i : index) { dos.writeChar(i); } 540 length += index.length * 2; 541 length += data.write(dos); 542 return length; 543 } catch (IOException e) { 544 throw new UncheckedIOException(e); 545 } 546 } 547 548 /** @internal */ 549 static final int FAST_SHIFT = 6; 550 551 /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ 552 static final int FAST_DATA_BLOCK_LENGTH = 1 << FAST_SHIFT; 553 554 /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ 555 private static final int FAST_DATA_MASK = FAST_DATA_BLOCK_LENGTH - 1; 556 557 /** @internal */ 558 private static final int SMALL_MAX = 0xfff; 559 560 /** 561 * Offset from dataLength (to be subtracted) for fetching the 562 * value returned for out-of-range code points and ill-formed UTF-8/16. 563 * @internal 564 */ 565 private static final int ERROR_VALUE_NEG_DATA_OFFSET = 1; 566 /** 567 * Offset from dataLength (to be subtracted) for fetching the 568 * value returned for code points highStart..U+10FFFF. 569 * @internal 570 */ 571 private static final int HIGH_VALUE_NEG_DATA_OFFSET = 2; 572 573 // ucptrie_impl.h 574 575 /** The length of the BMP index table. 1024=0x400 */ 576 private static final int BMP_INDEX_LENGTH = 0x10000 >> FAST_SHIFT; 577 578 static final int SMALL_LIMIT = 0x1000; 579 private static final int SMALL_INDEX_LENGTH = SMALL_LIMIT >> FAST_SHIFT; 580 581 /** Shift size for getting the index-3 table offset. */ 582 static final int SHIFT_3 = 4; 583 584 /** Shift size for getting the index-2 table offset. */ 585 private static final int SHIFT_2 = 5 + SHIFT_3; 586 587 /** Shift size for getting the index-1 table offset. */ 588 private static final int SHIFT_1 = 5 + SHIFT_2; 589 590 /** 591 * Difference between two shift sizes, 592 * for getting an index-2 offset from an index-3 offset. 5=9-4 593 */ 594 static final int SHIFT_2_3 = SHIFT_2 - SHIFT_3; 595 596 /** 597 * Difference between two shift sizes, 598 * for getting an index-1 offset from an index-2 offset. 5=14-9 599 */ 600 static final int SHIFT_1_2 = SHIFT_1 - SHIFT_2; 601 602 /** 603 * Number of index-1 entries for the BMP. (4) 604 * This part of the index-1 table is omitted from the serialized form. 605 */ 606 private static final int OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1; 607 608 /** Number of entries in an index-2 block. 32=0x20 */ 609 static final int INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2; 610 611 /** Mask for getting the lower bits for the in-index-2-block offset. */ 612 static final int INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1; 613 614 /** Number of code points per index-2 table entry. 512=0x200 */ 615 static final int CP_PER_INDEX_2_ENTRY = 1 << SHIFT_2; 616 617 /** Number of entries in an index-3 block. 32=0x20 */ 618 static final int INDEX_3_BLOCK_LENGTH = 1 << SHIFT_2_3; 619 620 /** Mask for getting the lower bits for the in-index-3-block offset. */ 621 private static final int INDEX_3_MASK = INDEX_3_BLOCK_LENGTH - 1; 622 623 /** Number of entries in a small data block. 16=0x10 */ 624 static final int SMALL_DATA_BLOCK_LENGTH = 1 << SHIFT_3; 625 626 /** Mask for getting the lower bits for the in-small-data-block offset. */ 627 static final int SMALL_DATA_MASK = SMALL_DATA_BLOCK_LENGTH - 1; 628 629 // ucptrie_impl.h: Constants for use with UCPTrieHeader.options. 630 private static final int OPTIONS_DATA_LENGTH_MASK = 0xf000; 631 private static final int OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00; 632 private static final int OPTIONS_RESERVED_MASK = 0x38; 633 private static final int OPTIONS_VALUE_BITS_MASK = 7; 634 /** 635 * Value for index3NullOffset which indicates that there is no index-3 null block. 636 * Bit 15 is unused for this value because this bit is used if the index-3 contains 637 * 18-bit indexes. 638 */ 639 static final int NO_INDEX3_NULL_OFFSET = 0x7fff; 640 static final int NO_DATA_NULL_OFFSET = 0xfffff; 641 642 private static abstract class Data { 643 abstract ValueWidth getValueWidth(); 644 abstract int getDataLength(); 645 abstract int getFromIndex(int index); 646 abstract int write(DataOutputStream dos) throws IOException; 647 } 648 649 private static final class Data16 extends Data { 650 char[] array; 651 Data16(char[] a) { array = a; } 652 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_16; } 653 @Override int getDataLength() { return array.length; } 654 @Override int getFromIndex(int index) { return array[index]; } 655 @Override int write(DataOutputStream dos) throws IOException { 656 for (char v : array) { dos.writeChar(v); } 657 return array.length * 2; 658 } 659 } 660 661 private static final class Data32 extends Data { 662 int[] array; 663 Data32(int[] a) { array = a; } 664 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_32; } 665 @Override int getDataLength() { return array.length; } 666 @Override int getFromIndex(int index) { return array[index]; } 667 @Override int write(DataOutputStream dos) throws IOException { 668 for (int v : array) { dos.writeInt(v); } 669 return array.length * 4; 670 } 671 } 672 673 private static final class Data8 extends Data { 674 byte[] array; 675 Data8(byte[] a) { array = a; } 676 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_8; } 677 @Override int getDataLength() { return array.length; } 678 @Override int getFromIndex(int index) { return array[index] & 0xff; } 679 @Override int write(DataOutputStream dos) throws IOException { 680 for (byte v : array) { dos.writeByte(v); } 681 return array.length; 682 } 683 } 684 685 /** @internal */ 686 private final int[] ascii; 687 688 /** @internal */ 689 private final char[] index; 690 691 /** 692 * @internal 693 * @deprecated This API is ICU internal only. 694 */ 695 @Deprecated 696 protected final Data data; 697 /** 698 * @internal 699 * @deprecated This API is ICU internal only. 700 */ 701 @Deprecated 702 protected final int dataLength; 703 /** 704 * Start of the last range which ends at U+10FFFF. 705 * @internal 706 * @deprecated This API is ICU internal only. 707 */ 708 @Deprecated 709 protected final int highStart; 710 711 /** 712 * Internal index-3 null block offset. 713 * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. 714 * @internal 715 */ 716 private final int index3NullOffset; 717 /** 718 * Internal data null block offset, not shifted. 719 * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. 720 * @internal 721 */ 722 private final int dataNullOffset; 723 /** @internal */ 724 private final int nullValue; 725 726 /** 727 * @internal 728 * @deprecated This API is ICU internal only. 729 */ 730 @Deprecated 731 protected final int fastIndex(int c) { 732 return index[c >> FAST_SHIFT] + (c & FAST_DATA_MASK); 733 } 734 735 /** 736 * @internal 737 * @deprecated This API is ICU internal only. 738 */ 739 @Deprecated 740 protected final int smallIndex(Type type, int c) { 741 // Split into two methods to make this part inline-friendly. 742 // In C, this part is a macro. 743 if (c >= highStart) { 744 return dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 745 } 746 return internalSmallIndex(type, c); 747 } 748 749 private final int internalSmallIndex(Type type, int c) { 750 int i1 = c >> SHIFT_1; 751 if (type == Type.FAST) { 752 assert(0xffff < c && c < highStart); 753 i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; 754 } else { 755 assert(0 <= c && c < highStart && highStart > SMALL_LIMIT); 756 i1 += SMALL_INDEX_LENGTH; 757 } 758 int i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; 759 int i3 = (c >> SHIFT_3) & INDEX_3_MASK; 760 int dataBlock; 761 if ((i3Block & 0x8000) == 0) { 762 // 16-bit indexes 763 dataBlock = index[i3Block + i3]; 764 } else { 765 // 18-bit indexes stored in groups of 9 entries per 8 indexes. 766 i3Block = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); 767 i3 &= 7; 768 dataBlock = (index[i3Block++] << (2 + (2 * i3))) & 0x30000; 769 dataBlock |= index[i3Block + i3]; 770 } 771 return dataBlock + (c & SMALL_DATA_MASK); 772 } 773 774 /** 775 * @internal 776 * @deprecated This API is ICU internal only. 777 */ 778 @Deprecated 779 protected abstract int cpIndex(int c); 780 781 /** 782 * A CodePointTrie with {@link Type#FAST}. 783 * 784 * @draft ICU 63 785 * @provisional This API might change or be removed in a future release. 786 */ 787 public static abstract class Fast extends CodePointTrie { 788 private Fast(char[] index, Data data, int highStart, 789 int index3NullOffset, int dataNullOffset) { 790 super(index, data, highStart, index3NullOffset, dataNullOffset); 791 } 792 793 /** 794 * Creates a trie from its binary form. 795 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 796 * with {@link Type#FAST}. 797 * 798 * @param valueWidth selects the number of bits in a data value; this method throws an exception 799 * if the valueWidth does not match the binary data; 800 * use null to accept any data value width 801 * @param bytes a buffer containing the binary data of a CodePointTrie 802 * @return the trie 803 * @draft ICU 63 804 * @provisional This API might change or be removed in a future release. 805 */ 806 public static Fast fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { 807 return (Fast) CodePointTrie.fromBinary(Type.FAST, valueWidth, bytes); 808 } 809 810 /** 811 * @return {@link Type#FAST} 812 * @draft ICU 63 813 * @provisional This API might change or be removed in a future release. 814 */ 815 @Override 816 public final Type getType() { return Type.FAST; } 817 818 /** 819 * Returns a trie value for a BMP code point (U+0000..U+FFFF), without range checking. 820 * Can be used to look up a value for a UTF-16 code unit if other parts of 821 * the string processing check for surrogates. 822 * 823 * @param c the input code point, must be U+0000..U+FFFF 824 * @return The BMP code point's trie value. 825 * @draft ICU 63 826 * @provisional This API might change or be removed in a future release. 827 */ 828 public abstract int bmpGet(int c); 829 830 /** 831 * Returns a trie value for a supplementary code point (U+10000..U+10FFFF), 832 * without range checking. 833 * 834 * @param c the input code point, must be U+10000..U+10FFFF 835 * @return The supplementary code point's trie value. 836 * @draft ICU 63 837 * @provisional This API might change or be removed in a future release. 838 */ 839 public abstract int suppGet(int c); 840 841 /** 842 * @internal 843 * @deprecated This API is ICU internal only. 844 */ 845 @Deprecated 846 @Override 847 protected final int cpIndex(int c) { 848 if (c >= 0) { 849 if (c <= 0xffff) { 850 return fastIndex(c); 851 } else if (c <= 0x10ffff) { 852 return smallIndex(Type.FAST, c); 853 } 854 } 855 return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 856 } 857 858 /** 859 * {@inheritDoc} 860 * @draft ICU 63 861 * @provisional This API might change or be removed in a future release. 862 */ 863 @Override 864 public final StringIterator stringIterator(CharSequence s, int sIndex) { 865 return new FastStringIterator(s, sIndex); 866 } 867 868 private final class FastStringIterator extends StringIterator { 869 private FastStringIterator(CharSequence s, int sIndex) { 870 super(s, sIndex); 871 } 872 873 @Override 874 public boolean next() { 875 if (sIndex >= s.length()) { 876 return false; 877 } 878 char lead = s.charAt(sIndex++); 879 c = lead; 880 int dataIndex; 881 if (!Character.isSurrogate(lead)) { 882 dataIndex = fastIndex(c); 883 } else { 884 char trail; 885 if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && 886 Character.isLowSurrogate(trail = s.charAt(sIndex))) { 887 ++sIndex; 888 c = Character.toCodePoint(lead, trail); 889 dataIndex = smallIndex(Type.FAST, c); 890 } else { 891 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 892 } 893 } 894 value = data.getFromIndex(dataIndex); 895 return true; 896 } 897 898 @Override 899 public boolean previous() { 900 if (sIndex <= 0) { 901 return false; 902 } 903 char trail = s.charAt(--sIndex); 904 c = trail; 905 int dataIndex; 906 if (!Character.isSurrogate(trail)) { 907 dataIndex = fastIndex(c); 908 } else { 909 char lead; 910 if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && 911 Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { 912 --sIndex; 913 c = Character.toCodePoint(lead, trail); 914 dataIndex = smallIndex(Type.FAST, c); 915 } else { 916 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 917 } 918 } 919 value = data.getFromIndex(dataIndex); 920 return true; 921 } 922 } 923 } 924 925 /** 926 * A CodePointTrie with {@link Type#SMALL}. 927 * 928 * @draft ICU 63 929 * @provisional This API might change or be removed in a future release. 930 */ 931 public static abstract class Small extends CodePointTrie { 932 private Small(char[] index, Data data, int highStart, 933 int index3NullOffset, int dataNullOffset) { 934 super(index, data, highStart, index3NullOffset, dataNullOffset); 935 } 936 937 /** 938 * Creates a trie from its binary form. 939 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 940 * with {@link Type#SMALL}. 941 * 942 * @param valueWidth selects the number of bits in a data value; this method throws an exception 943 * if the valueWidth does not match the binary data; 944 * use null to accept any data value width 945 * @param bytes a buffer containing the binary data of a CodePointTrie 946 * @return the trie 947 * @draft ICU 63 948 * @provisional This API might change or be removed in a future release. 949 */ 950 public static Small fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { 951 return (Small) CodePointTrie.fromBinary(Type.SMALL, valueWidth, bytes); 952 } 953 954 /** 955 * @return {@link Type#SMALL} 956 * @draft ICU 63 957 * @provisional This API might change or be removed in a future release. 958 */ 959 @Override 960 public final Type getType() { return Type.SMALL; } 961 962 /** 963 * @internal 964 * @deprecated This API is ICU internal only. 965 */ 966 @Deprecated 967 @Override 968 protected final int cpIndex(int c) { 969 if (c >= 0) { 970 if (c <= SMALL_MAX) { 971 return fastIndex(c); 972 } else if (c <= 0x10ffff) { 973 return smallIndex(Type.SMALL, c); 974 } 975 } 976 return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 977 } 978 979 /** 980 * {@inheritDoc} 981 * @draft ICU 63 982 * @provisional This API might change or be removed in a future release. 983 */ 984 @Override 985 public final StringIterator stringIterator(CharSequence s, int sIndex) { 986 return new SmallStringIterator(s, sIndex); 987 } 988 989 private final class SmallStringIterator extends StringIterator { 990 private SmallStringIterator(CharSequence s, int sIndex) { 991 super(s, sIndex); 992 } 993 994 @Override 995 public boolean next() { 996 if (sIndex >= s.length()) { 997 return false; 998 } 999 char lead = s.charAt(sIndex++); 1000 c = lead; 1001 int dataIndex; 1002 if (!Character.isSurrogate(lead)) { 1003 dataIndex = cpIndex(c); 1004 } else { 1005 char trail; 1006 if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && 1007 Character.isLowSurrogate(trail = s.charAt(sIndex))) { 1008 ++sIndex; 1009 c = Character.toCodePoint(lead, trail); 1010 dataIndex = smallIndex(Type.SMALL, c); 1011 } else { 1012 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 1013 } 1014 } 1015 value = data.getFromIndex(dataIndex); 1016 return true; 1017 } 1018 1019 @Override 1020 public boolean previous() { 1021 if (sIndex <= 0) { 1022 return false; 1023 } 1024 char trail = s.charAt(--sIndex); 1025 c = trail; 1026 int dataIndex; 1027 if (!Character.isSurrogate(trail)) { 1028 dataIndex = cpIndex(c); 1029 } else { 1030 char lead; 1031 if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && 1032 Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { 1033 --sIndex; 1034 c = Character.toCodePoint(lead, trail); 1035 dataIndex = smallIndex(Type.SMALL, c); 1036 } else { 1037 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 1038 } 1039 } 1040 value = data.getFromIndex(dataIndex); 1041 return true; 1042 } 1043 } 1044 } 1045 1046 /** 1047 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_16}. 1048 * 1049 * @draft ICU 63 1050 * @provisional This API might change or be removed in a future release. 1051 */ 1052 public static final class Fast16 extends Fast { 1053 private final char[] dataArray; 1054 1055 Fast16(char[] index, char[] data16, int highStart, 1056 int index3NullOffset, int dataNullOffset) { 1057 super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); 1058 this.dataArray = data16; 1059 } 1060 1061 /** 1062 * Creates a trie from its binary form. 1063 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1064 * with {@link Type#FAST} and {@link ValueWidth#BITS_16}. 1065 * 1066 * @param bytes a buffer containing the binary data of a CodePointTrie 1067 * @return the trie 1068 * @draft ICU 63 1069 * @provisional This API might change or be removed in a future release. 1070 */ 1071 public static Fast16 fromBinary(ByteBuffer bytes) { 1072 return (Fast16) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_16, bytes); 1073 } 1074 1075 /** 1076 * {@inheritDoc} 1077 * @draft ICU 63 1078 * @provisional This API might change or be removed in a future release. 1079 */ 1080 @Override 1081 public final int get(int c) { 1082 return dataArray[cpIndex(c)]; 1083 } 1084 1085 /** 1086 * {@inheritDoc} 1087 * @draft ICU 63 1088 * @provisional This API might change or be removed in a future release. 1089 */ 1090 @Override 1091 public final int bmpGet(int c) { 1092 assert 0 <= c && c <= 0xffff; 1093 return dataArray[fastIndex(c)]; 1094 } 1095 1096 /** 1097 * {@inheritDoc} 1098 * @draft ICU 63 1099 * @provisional This API might change or be removed in a future release. 1100 */ 1101 @Override 1102 public final int suppGet(int c) { 1103 assert 0x10000 <= c && c <= 0x10ffff; 1104 return dataArray[smallIndex(Type.FAST, c)]; 1105 } 1106 } 1107 1108 /** 1109 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_32}. 1110 * 1111 * @draft ICU 63 1112 * @provisional This API might change or be removed in a future release. 1113 */ 1114 public static final class Fast32 extends Fast { 1115 private final int[] dataArray; 1116 1117 Fast32(char[] index, int[] data32, int highStart, 1118 int index3NullOffset, int dataNullOffset) { 1119 super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); 1120 this.dataArray = data32; 1121 } 1122 1123 /** 1124 * Creates a trie from its binary form. 1125 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1126 * with {@link Type#FAST} and {@link ValueWidth#BITS_32}. 1127 * 1128 * @param bytes a buffer containing the binary data of a CodePointTrie 1129 * @return the trie 1130 * @draft ICU 63 1131 * @provisional This API might change or be removed in a future release. 1132 */ 1133 public static Fast32 fromBinary(ByteBuffer bytes) { 1134 return (Fast32) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_32, bytes); 1135 } 1136 1137 /** 1138 * {@inheritDoc} 1139 * @draft ICU 63 1140 * @provisional This API might change or be removed in a future release. 1141 */ 1142 @Override 1143 public final int get(int c) { 1144 return dataArray[cpIndex(c)]; 1145 } 1146 1147 /** 1148 * {@inheritDoc} 1149 * @draft ICU 63 1150 * @provisional This API might change or be removed in a future release. 1151 */ 1152 @Override 1153 public final int bmpGet(int c) { 1154 assert 0 <= c && c <= 0xffff; 1155 return dataArray[fastIndex(c)]; 1156 } 1157 1158 /** 1159 * {@inheritDoc} 1160 * @draft ICU 63 1161 * @provisional This API might change or be removed in a future release. 1162 */ 1163 @Override 1164 public final int suppGet(int c) { 1165 assert 0x10000 <= c && c <= 0x10ffff; 1166 return dataArray[smallIndex(Type.FAST, c)]; 1167 } 1168 } 1169 1170 /** 1171 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_8}. 1172 * 1173 * @draft ICU 63 1174 * @provisional This API might change or be removed in a future release. 1175 */ 1176 public static final class Fast8 extends Fast { 1177 private final byte[] dataArray; 1178 1179 Fast8(char[] index, byte[] data8, int highStart, 1180 int index3NullOffset, int dataNullOffset) { 1181 super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); 1182 this.dataArray = data8; 1183 } 1184 1185 /** 1186 * Creates a trie from its binary form. 1187 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1188 * with {@link Type#FAST} and {@link ValueWidth#BITS_8}. 1189 * 1190 * @param bytes a buffer containing the binary data of a CodePointTrie 1191 * @return the trie 1192 * @draft ICU 63 1193 * @provisional This API might change or be removed in a future release. 1194 */ 1195 public static Fast8 fromBinary(ByteBuffer bytes) { 1196 return (Fast8) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_8, bytes); 1197 } 1198 1199 /** 1200 * {@inheritDoc} 1201 * @draft ICU 63 1202 * @provisional This API might change or be removed in a future release. 1203 */ 1204 @Override 1205 public final int get(int c) { 1206 return dataArray[cpIndex(c)] & 0xff; 1207 } 1208 1209 /** 1210 * {@inheritDoc} 1211 * @draft ICU 63 1212 * @provisional This API might change or be removed in a future release. 1213 */ 1214 @Override 1215 public final int bmpGet(int c) { 1216 assert 0 <= c && c <= 0xffff; 1217 return dataArray[fastIndex(c)] & 0xff; 1218 } 1219 1220 /** 1221 * {@inheritDoc} 1222 * @draft ICU 63 1223 * @provisional This API might change or be removed in a future release. 1224 */ 1225 @Override 1226 public final int suppGet(int c) { 1227 assert 0x10000 <= c && c <= 0x10ffff; 1228 return dataArray[smallIndex(Type.FAST, c)] & 0xff; 1229 } 1230 } 1231 1232 /** 1233 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. 1234 * 1235 * @draft ICU 63 1236 * @provisional This API might change or be removed in a future release. 1237 */ 1238 public static final class Small16 extends Small { 1239 Small16(char[] index, char[] data16, int highStart, 1240 int index3NullOffset, int dataNullOffset) { 1241 super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); 1242 } 1243 1244 /** 1245 * Creates a trie from its binary form. 1246 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1247 * with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. 1248 * 1249 * @param bytes a buffer containing the binary data of a CodePointTrie 1250 * @return the trie 1251 * @draft ICU 63 1252 * @provisional This API might change or be removed in a future release. 1253 */ 1254 public static Small16 fromBinary(ByteBuffer bytes) { 1255 return (Small16) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_16, bytes); 1256 } 1257 } 1258 1259 /** 1260 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. 1261 * 1262 * @draft ICU 63 1263 * @provisional This API might change or be removed in a future release. 1264 */ 1265 public static final class Small32 extends Small { 1266 Small32(char[] index, int[] data32, int highStart, 1267 int index3NullOffset, int dataNullOffset) { 1268 super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); 1269 } 1270 1271 /** 1272 * Creates a trie from its binary form. 1273 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1274 * with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. 1275 * 1276 * @param bytes a buffer containing the binary data of a CodePointTrie 1277 * @return the trie 1278 * @draft ICU 63 1279 * @provisional This API might change or be removed in a future release. 1280 */ 1281 public static Small32 fromBinary(ByteBuffer bytes) { 1282 return (Small32) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_32, bytes); 1283 } 1284 } 1285 1286 /** 1287 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. 1288 * 1289 * @draft ICU 63 1290 * @provisional This API might change or be removed in a future release. 1291 */ 1292 public static final class Small8 extends Small { 1293 Small8(char[] index, byte[] data8, int highStart, 1294 int index3NullOffset, int dataNullOffset) { 1295 super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); 1296 } 1297 1298 /** 1299 * Creates a trie from its binary form. 1300 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1301 * with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. 1302 * 1303 * @param bytes a buffer containing the binary data of a CodePointTrie 1304 * @return the trie 1305 * @draft ICU 63 1306 * @provisional This API might change or be removed in a future release. 1307 */ 1308 public static Small8 fromBinary(ByteBuffer bytes) { 1309 return (Small8) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_8, bytes); 1310 } 1311 } 1312 }