1 /* 2 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 // (c) 2018 and later: Unicode, Inc. and others. 26 // License & terms of use: http://www.unicode.org/copyright.html#License 27 28 // created: 2018may04 Markus W. Scherer 29 30 package sun.text.normalizer; 31 32 import java.io.DataOutputStream; 33 import java.io.IOException; 34 import java.io.UncheckedIOException; 35 import java.io.OutputStream; 36 import java.nio.ByteBuffer; 37 import java.nio.ByteOrder; 38 39 import static sun.text.normalizer.NormalizerImpl.UTF16Plus; 40 41 /** 42 * Immutable Unicode code point trie. 43 * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values. 44 * For details see http://site.icu-project.org/design/struct/utrie 45 * 46 * <p>This class is not intended for public subclassing. 47 * 48 * @see MutableCodePointTrie 49 * @draft ICU 63 50 * @provisional This API might change or be removed in a future release. 51 */ 52 @SuppressWarnings("deprecation") 53 public abstract class CodePointTrie extends CodePointMap { 54 /** 55 * Selectors for the type of a CodePointTrie. 56 * Different trade-offs for size vs. speed. 57 * 58 * <p>Use null for {@link #fromBinary} to accept any type; 59 * {@link #getType} will return the actual type. 60 * 61 * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) 62 * @see #fromBinary 63 * @see #getType 64 * @draft ICU 63 65 * @provisional This API might change or be removed in a future release. 66 */ 67 public enum Type { 68 /** 69 * Fast/simple/larger BMP data structure. 70 * The {@link Fast} subclasses have additional functions for lookup for BMP and supplementary code points. 71 * 72 * @see Fast 73 * @draft ICU 63 74 * @provisional This API might change or be removed in a future release. 75 */ 76 FAST, 77 /** 78 * Small/slower BMP data structure. 79 * 80 * @see Small 81 * @draft ICU 63 82 * @provisional This API might change or be removed in a future release. 83 */ 84 SMALL 85 } 86 87 /** 88 * Selectors for the number of bits in a CodePointTrie data value. 89 * 90 * <p>Use null for {@link #fromBinary} to accept any data value width; 91 * {@link #getValueWidth} will return the actual data value width. 92 * 93 * @draft ICU 63 94 * @provisional This API might change or be removed in a future release. 95 */ 96 public enum ValueWidth { 97 /** 98 * The trie stores 16 bits per data value. 99 * It returns them as unsigned values 0..0xffff=65535. 100 * 101 * @draft ICU 63 102 * @provisional This API might change or be removed in a future release. 103 */ 104 BITS_16, 105 /** 106 * The trie stores 32 bits per data value. 107 * 108 * @draft ICU 63 109 * @provisional This API might change or be removed in a future release. 110 */ 111 BITS_32, 112 /** 113 * The trie stores 8 bits per data value. 114 * It returns them as unsigned values 0..0xff=255. 115 * 116 * @draft ICU 63 117 * @provisional This API might change or be removed in a future release. 118 */ 119 BITS_8 120 } 121 122 private CodePointTrie(char[] index, Data data, int highStart, 123 int index3NullOffset, int dataNullOffset) { 124 this.ascii = new int[ASCII_LIMIT]; 125 this.index = index; 126 this.data = data; 127 this.dataLength = data.getDataLength(); 128 this.highStart = highStart; 129 this.index3NullOffset = index3NullOffset; 130 this.dataNullOffset = dataNullOffset; 131 132 for (int c = 0; c < ASCII_LIMIT; ++c) { 133 ascii[c] = data.getFromIndex(c); 134 } 135 136 int nullValueOffset = dataNullOffset; 137 if (nullValueOffset >= dataLength) { 138 nullValueOffset = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 139 } 140 nullValue = data.getFromIndex(nullValueOffset); 141 } 142 143 /** 144 * Creates a trie from its binary form, 145 * stored in the ByteBuffer starting at the current position. 146 * Advances the buffer position to just after the trie data. 147 * Inverse of {@link #toBinary(OutputStream)}. 148 * 149 * <p>The data is copied from the buffer; 150 * later modification of the buffer will not affect the trie. 151 * 152 * @param type selects the trie type; this method throws an exception 153 * if the type does not match the binary data; 154 * use null to accept any type 155 * @param valueWidth selects the number of bits in a data value; this method throws an exception 156 * if the valueWidth does not match the binary data; 157 * use null to accept any data value width 158 * @param bytes a buffer containing the binary data of a CodePointTrie 159 * @return the trie 160 * @see MutableCodePointTrie#MutableCodePointTrie(int, int) 161 * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) 162 * @see #toBinary(OutputStream) 163 * @draft ICU 63 164 * @provisional This API might change or be removed in a future release. 165 */ 166 public static CodePointTrie fromBinary(Type type, ValueWidth valueWidth, ByteBuffer bytes) { 167 ByteOrder outerByteOrder = bytes.order(); 168 try { 169 // Enough data for a trie header? 170 if (bytes.remaining() < 16 /* sizeof(UCPTrieHeader) */) { 171 throw new InternalError("Buffer too short for a CodePointTrie header"); 172 } 173 174 // struct UCPTrieHeader 175 /** "Tri3" in big-endian US-ASCII (0x54726933) */ 176 int signature = bytes.getInt(); 177 178 // Check the signature. 179 switch (signature) { 180 case 0x54726933: 181 // The buffer is already set to the trie data byte order. 182 break; 183 case 0x33697254: 184 // Temporarily reverse the byte order. 185 boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; 186 bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); 187 signature = 0x54726933; 188 break; 189 default: 190 throw new InternalError("Buffer does not contain a serialized CodePointTrie"); 191 } 192 193 // struct UCPTrieHeader continued 194 /** 195 * Options bit field: 196 * Bits 15..12: Data length bits 19..16. 197 * Bits 11..8: Data null block offset bits 19..16. 198 * Bits 7..6: UCPTrieType 199 * Bits 5..3: Reserved (0). 200 * Bits 2..0: UCPTrieValueWidth 201 */ 202 int options = bytes.getChar(); 203 204 /** Total length of the index tables. */ 205 int indexLength = bytes.getChar(); 206 207 /** Data length bits 15..0. */ 208 int dataLength = bytes.getChar(); 209 210 /** Index-3 null block offset, 0x7fff or 0xffff if none. */ 211 int index3NullOffset = bytes.getChar(); 212 213 /** Data null block offset bits 15..0, 0xfffff if none. */ 214 int dataNullOffset = bytes.getChar(); 215 216 /** 217 * First code point of the single-value range ending with U+10ffff, 218 * rounded up and then shifted right by SHIFT_2. 219 */ 220 int shiftedHighStart = bytes.getChar(); 221 // struct UCPTrieHeader end 222 223 int typeInt = (options >> 6) & 3; 224 Type actualType; 225 switch (typeInt) { 226 case 0: actualType = Type.FAST; break; 227 case 1: actualType = Type.SMALL; break; 228 default: 229 throw new InternalError("CodePointTrie data header has an unsupported type"); 230 } 231 232 int valueWidthInt = options & OPTIONS_VALUE_BITS_MASK; 233 ValueWidth actualValueWidth; 234 switch (valueWidthInt) { 235 case 0: actualValueWidth = ValueWidth.BITS_16; break; 236 case 1: actualValueWidth = ValueWidth.BITS_32; break; 237 case 2: actualValueWidth = ValueWidth.BITS_8; break; 238 default: 239 throw new InternalError("CodePointTrie data header has an unsupported value width"); 240 } 241 242 if ((options & OPTIONS_RESERVED_MASK) != 0) { 243 throw new InternalError("CodePointTrie data header has unsupported options"); 244 } 245 246 if (type == null) { 247 type = actualType; 248 } 249 if (valueWidth == null) { 250 valueWidth = actualValueWidth; 251 } 252 if (type != actualType || valueWidth != actualValueWidth) { 253 throw new InternalError("CodePointTrie data header has a different type or value width than required"); 254 } 255 256 // Get the length values and offsets. 257 dataLength |= ((options & OPTIONS_DATA_LENGTH_MASK) << 4); 258 dataNullOffset |= ((options & OPTIONS_DATA_NULL_OFFSET_MASK) << 8); 259 260 int highStart = shiftedHighStart << SHIFT_2; 261 262 // Calculate the actual length, minus the header. 263 int actualLength = indexLength * 2; 264 if (valueWidth == ValueWidth.BITS_16) { 265 actualLength += dataLength * 2; 266 } else if (valueWidth == ValueWidth.BITS_32) { 267 actualLength += dataLength * 4; 268 } else { 269 actualLength += dataLength; 270 } 271 if (bytes.remaining() < actualLength) { 272 throw new InternalError("Buffer too short for the CodePointTrie data"); 273 } 274 275 char[] index = ICUBinary.getChars(bytes, indexLength, 0); 276 switch (valueWidth) { 277 case BITS_16: { 278 char[] data16 = ICUBinary.getChars(bytes, dataLength, 0); 279 return type == Type.FAST ? 280 new Fast16(index, data16, highStart, index3NullOffset, dataNullOffset) : 281 new Small16(index, data16, highStart, index3NullOffset, dataNullOffset); 282 } 283 case BITS_32: { 284 int[] data32 = ICUBinary.getInts(bytes, dataLength, 0); 285 return type == Type.FAST ? 286 new Fast32(index, data32, highStart, index3NullOffset, dataNullOffset) : 287 new Small32(index, data32, highStart, index3NullOffset, dataNullOffset); 288 } 289 case BITS_8: { 290 byte[] data8 = ICUBinary.getBytes(bytes, dataLength, 0); 291 return type == Type.FAST ? 292 new Fast8(index, data8, highStart, index3NullOffset, dataNullOffset) : 293 new Small8(index, data8, highStart, index3NullOffset, dataNullOffset); 294 } 295 default: 296 throw new AssertionError("should be unreachable"); 297 } 298 } finally { 299 bytes.order(outerByteOrder); 300 } 301 } 302 303 /** 304 * Returns the trie type. 305 * 306 * @return the trie type 307 * @draft ICU 63 308 * @provisional This API might change or be removed in a future release. 309 */ 310 public abstract Type getType(); 311 /** 312 * Returns the number of bits in a trie data value. 313 * 314 * @return the number of bits in a trie data value 315 * @draft ICU 63 316 * @provisional This API might change or be removed in a future release. 317 */ 318 public final ValueWidth getValueWidth() { return data.getValueWidth(); } 319 320 /** 321 * {@inheritDoc} 322 * @draft ICU 63 323 * @provisional This API might change or be removed in a future release. 324 */ 325 @Override 326 public int get(int c) { 327 return data.getFromIndex(cpIndex(c)); 328 } 329 330 /** 331 * Returns a trie value for an ASCII code point, without range checking. 332 * 333 * @param c the input code point; must be U+0000..U+007F 334 * @return The ASCII code point's trie value. 335 * @draft ICU 63 336 * @provisional This API might change or be removed in a future release. 337 */ 338 public final int asciiGet(int c) { 339 return ascii[c]; 340 } 341 342 private static final int MAX_UNICODE = 0x10ffff; 343 344 private static final int ASCII_LIMIT = 0x80; 345 346 private static final int maybeFilterValue(int value, int trieNullValue, int nullValue, 347 ValueFilter filter) { 348 if (value == trieNullValue) { 349 value = nullValue; 350 } else if (filter != null) { 351 value = filter.apply(value); 352 } 353 return value; 354 } 355 356 /** 357 * {@inheritDoc} 358 * @draft ICU 63 359 * @provisional This API might change or be removed in a future release. 360 */ 361 @Override 362 public final boolean getRange(int start, ValueFilter filter, Range range) { 363 if (start < 0 || MAX_UNICODE < start) { 364 return false; 365 } 366 if (start >= highStart) { 367 int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 368 int value = data.getFromIndex(di); 369 if (filter != null) { value = filter.apply(value); } 370 range.set(start, MAX_UNICODE, value); 371 return true; 372 } 373 374 int nullValue = this.nullValue; 375 if (filter != null) { nullValue = filter.apply(nullValue); } 376 Type type = getType(); 377 378 int prevI3Block = -1; 379 int prevBlock = -1; 380 int c = start; 381 // Initialize to make compiler happy. Real value when haveValue is true. 382 int trieValue = 0, value = 0; 383 boolean haveValue = false; 384 do { 385 int i3Block; 386 int i3; 387 int i3BlockLength; 388 int dataBlockLength; 389 if (c <= 0xffff && (type == Type.FAST || c <= SMALL_MAX)) { 390 i3Block = 0; 391 i3 = c >> FAST_SHIFT; 392 i3BlockLength = type == Type.FAST ? BMP_INDEX_LENGTH : SMALL_INDEX_LENGTH; 393 dataBlockLength = FAST_DATA_BLOCK_LENGTH; 394 } else { 395 // Use the multi-stage index. 396 int i1 = c >> SHIFT_1; 397 if (type == Type.FAST) { 398 assert(0xffff < c && c < highStart); 399 i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; 400 } else { 401 assert(c < highStart && highStart > SMALL_LIMIT); 402 i1 += SMALL_INDEX_LENGTH; 403 } 404 i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; 405 if (i3Block == prevI3Block && (c - start) >= CP_PER_INDEX_2_ENTRY) { 406 // The index-3 block is the same as the previous one, and filled with value. 407 assert((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0); 408 c += CP_PER_INDEX_2_ENTRY; 409 continue; 410 } 411 prevI3Block = i3Block; 412 if (i3Block == index3NullOffset) { 413 // This is the index-3 null block. 414 if (haveValue) { 415 if (nullValue != value) { 416 range.set(start, c - 1, value); 417 return true; 418 } 419 } else { 420 trieValue = this.nullValue; 421 value = nullValue; 422 haveValue = true; 423 } 424 prevBlock = dataNullOffset; 425 c = (c + CP_PER_INDEX_2_ENTRY) & ~(CP_PER_INDEX_2_ENTRY - 1); 426 continue; 427 } 428 i3 = (c >> SHIFT_3) & INDEX_3_MASK; 429 i3BlockLength = INDEX_3_BLOCK_LENGTH; 430 dataBlockLength = SMALL_DATA_BLOCK_LENGTH; 431 } 432 // Enumerate data blocks for one index-3 block. 433 do { 434 int block; 435 if ((i3Block & 0x8000) == 0) { 436 block = index[i3Block + i3]; 437 } else { 438 // 18-bit indexes stored in groups of 9 entries per 8 indexes. 439 int group = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); 440 int gi = i3 & 7; 441 block = (index[group++] << (2 + (2 * gi))) & 0x30000; 442 block |= index[group + gi]; 443 } 444 if (block == prevBlock && (c - start) >= dataBlockLength) { 445 // The block is the same as the previous one, and filled with value. 446 assert((c & (dataBlockLength - 1)) == 0); 447 c += dataBlockLength; 448 } else { 449 int dataMask = dataBlockLength - 1; 450 prevBlock = block; 451 if (block == dataNullOffset) { 452 // This is the data null block. 453 if (haveValue) { 454 if (nullValue != value) { 455 range.set(start, c - 1, value); 456 return true; 457 } 458 } else { 459 trieValue = this.nullValue; 460 value = nullValue; 461 haveValue = true; 462 } 463 c = (c + dataBlockLength) & ~dataMask; 464 } else { 465 int di = block + (c & dataMask); 466 int trieValue2 = data.getFromIndex(di); 467 if (haveValue) { 468 if (trieValue2 != trieValue) { 469 if (filter == null || 470 maybeFilterValue(trieValue2, this.nullValue, nullValue, 471 filter) != value) { 472 range.set(start, c - 1, value); 473 return true; 474 } 475 trieValue = trieValue2; // may or may not help 476 } 477 } else { 478 trieValue = trieValue2; 479 value = maybeFilterValue(trieValue2, this.nullValue, nullValue, filter); 480 haveValue = true; 481 } 482 while ((++c & dataMask) != 0) { 483 trieValue2 = data.getFromIndex(++di); 484 if (trieValue2 != trieValue) { 485 if (filter == null || 486 maybeFilterValue(trieValue2, this.nullValue, nullValue, 487 filter) != value) { 488 range.set(start, c - 1, value); 489 return true; 490 } 491 trieValue = trieValue2; // may or may not help 492 } 493 } 494 } 495 } 496 } while (++i3 < i3BlockLength); 497 } while (c < highStart); 498 assert(haveValue); 499 int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 500 int highValue = data.getFromIndex(di); 501 if (maybeFilterValue(highValue, this.nullValue, nullValue, filter) != value) { 502 --c; 503 } else { 504 c = MAX_UNICODE; 505 } 506 range.set(start, c, value); 507 return true; 508 } 509 510 /** 511 * Writes a representation of the trie to the output stream. 512 * Inverse of {@link #fromBinary}. 513 * 514 * @param os the output stream 515 * @return the number of bytes written 516 * @draft ICU 63 517 * @provisional This API might change or be removed in a future release. 518 */ 519 public final int toBinary(OutputStream os) { 520 try { 521 DataOutputStream dos = new DataOutputStream(os); 522 523 // Write the UCPTrieHeader 524 dos.writeInt(0x54726933); // signature="Tri3" 525 dos.writeChar( // options 526 ((dataLength & 0xf0000) >> 4) | 527 ((dataNullOffset & 0xf0000) >> 8) | 528 (getType().ordinal() << 6) | 529 getValueWidth().ordinal()); 530 dos.writeChar(index.length); 531 dos.writeChar(dataLength); 532 dos.writeChar(index3NullOffset); 533 dos.writeChar(dataNullOffset); 534 dos.writeChar(highStart >> SHIFT_2); // shiftedHighStart 535 int length = 16; // sizeof(UCPTrieHeader) 536 537 for (char i : index) { dos.writeChar(i); } 538 length += index.length * 2; 539 length += data.write(dos); 540 return length; 541 } catch (IOException e) { 542 throw new UncheckedIOException(e); 543 } 544 } 545 546 /** @internal */ 547 static final int FAST_SHIFT = 6; 548 549 /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ 550 static final int FAST_DATA_BLOCK_LENGTH = 1 << FAST_SHIFT; 551 552 /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ 553 private static final int FAST_DATA_MASK = FAST_DATA_BLOCK_LENGTH - 1; 554 555 /** @internal */ 556 private static final int SMALL_MAX = 0xfff; 557 558 /** 559 * Offset from dataLength (to be subtracted) for fetching the 560 * value returned for out-of-range code points and ill-formed UTF-8/16. 561 * @internal 562 */ 563 private static final int ERROR_VALUE_NEG_DATA_OFFSET = 1; 564 /** 565 * Offset from dataLength (to be subtracted) for fetching the 566 * value returned for code points highStart..U+10FFFF. 567 * @internal 568 */ 569 private static final int HIGH_VALUE_NEG_DATA_OFFSET = 2; 570 571 // ucptrie_impl.h 572 573 /** The length of the BMP index table. 1024=0x400 */ 574 private static final int BMP_INDEX_LENGTH = 0x10000 >> FAST_SHIFT; 575 576 static final int SMALL_LIMIT = 0x1000; 577 private static final int SMALL_INDEX_LENGTH = SMALL_LIMIT >> FAST_SHIFT; 578 579 /** Shift size for getting the index-3 table offset. */ 580 static final int SHIFT_3 = 4; 581 582 /** Shift size for getting the index-2 table offset. */ 583 private static final int SHIFT_2 = 5 + SHIFT_3; 584 585 /** Shift size for getting the index-1 table offset. */ 586 private static final int SHIFT_1 = 5 + SHIFT_2; 587 588 /** 589 * Difference between two shift sizes, 590 * for getting an index-2 offset from an index-3 offset. 5=9-4 591 */ 592 static final int SHIFT_2_3 = SHIFT_2 - SHIFT_3; 593 594 /** 595 * Difference between two shift sizes, 596 * for getting an index-1 offset from an index-2 offset. 5=14-9 597 */ 598 static final int SHIFT_1_2 = SHIFT_1 - SHIFT_2; 599 600 /** 601 * Number of index-1 entries for the BMP. (4) 602 * This part of the index-1 table is omitted from the serialized form. 603 */ 604 private static final int OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1; 605 606 /** Number of entries in an index-2 block. 32=0x20 */ 607 static final int INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2; 608 609 /** Mask for getting the lower bits for the in-index-2-block offset. */ 610 static final int INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1; 611 612 /** Number of code points per index-2 table entry. 512=0x200 */ 613 static final int CP_PER_INDEX_2_ENTRY = 1 << SHIFT_2; 614 615 /** Number of entries in an index-3 block. 32=0x20 */ 616 static final int INDEX_3_BLOCK_LENGTH = 1 << SHIFT_2_3; 617 618 /** Mask for getting the lower bits for the in-index-3-block offset. */ 619 private static final int INDEX_3_MASK = INDEX_3_BLOCK_LENGTH - 1; 620 621 /** Number of entries in a small data block. 16=0x10 */ 622 static final int SMALL_DATA_BLOCK_LENGTH = 1 << SHIFT_3; 623 624 /** Mask for getting the lower bits for the in-small-data-block offset. */ 625 static final int SMALL_DATA_MASK = SMALL_DATA_BLOCK_LENGTH - 1; 626 627 // ucptrie_impl.h: Constants for use with UCPTrieHeader.options. 628 private static final int OPTIONS_DATA_LENGTH_MASK = 0xf000; 629 private static final int OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00; 630 private static final int OPTIONS_RESERVED_MASK = 0x38; 631 private static final int OPTIONS_VALUE_BITS_MASK = 7; 632 /** 633 * Value for index3NullOffset which indicates that there is no index-3 null block. 634 * Bit 15 is unused for this value because this bit is used if the index-3 contains 635 * 18-bit indexes. 636 */ 637 static final int NO_INDEX3_NULL_OFFSET = 0x7fff; 638 static final int NO_DATA_NULL_OFFSET = 0xfffff; 639 640 private static abstract class Data { 641 abstract ValueWidth getValueWidth(); 642 abstract int getDataLength(); 643 abstract int getFromIndex(int index); 644 abstract int write(DataOutputStream dos) throws IOException; 645 } 646 647 private static final class Data16 extends Data { 648 char[] array; 649 Data16(char[] a) { array = a; } 650 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_16; } 651 @Override int getDataLength() { return array.length; } 652 @Override int getFromIndex(int index) { return array[index]; } 653 @Override int write(DataOutputStream dos) throws IOException { 654 for (char v : array) { dos.writeChar(v); } 655 return array.length * 2; 656 } 657 } 658 659 private static final class Data32 extends Data { 660 int[] array; 661 Data32(int[] a) { array = a; } 662 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_32; } 663 @Override int getDataLength() { return array.length; } 664 @Override int getFromIndex(int index) { return array[index]; } 665 @Override int write(DataOutputStream dos) throws IOException { 666 for (int v : array) { dos.writeInt(v); } 667 return array.length * 4; 668 } 669 } 670 671 private static final class Data8 extends Data { 672 byte[] array; 673 Data8(byte[] a) { array = a; } 674 @Override ValueWidth getValueWidth() { return ValueWidth.BITS_8; } 675 @Override int getDataLength() { return array.length; } 676 @Override int getFromIndex(int index) { return array[index] & 0xff; } 677 @Override int write(DataOutputStream dos) throws IOException { 678 for (byte v : array) { dos.writeByte(v); } 679 return array.length; 680 } 681 } 682 683 /** @internal */ 684 private final int[] ascii; 685 686 /** @internal */ 687 private final char[] index; 688 689 /** 690 * @internal 691 * @deprecated This API is ICU internal only. 692 */ 693 @Deprecated 694 protected final Data data; 695 /** 696 * @internal 697 * @deprecated This API is ICU internal only. 698 */ 699 @Deprecated 700 protected final int dataLength; 701 /** 702 * Start of the last range which ends at U+10FFFF. 703 * @internal 704 * @deprecated This API is ICU internal only. 705 */ 706 @Deprecated 707 protected final int highStart; 708 709 /** 710 * Internal index-3 null block offset. 711 * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. 712 * @internal 713 */ 714 private final int index3NullOffset; 715 /** 716 * Internal data null block offset, not shifted. 717 * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. 718 * @internal 719 */ 720 private final int dataNullOffset; 721 /** @internal */ 722 private final int nullValue; 723 724 /** 725 * @internal 726 * @deprecated This API is ICU internal only. 727 */ 728 @Deprecated 729 protected final int fastIndex(int c) { 730 return index[c >> FAST_SHIFT] + (c & FAST_DATA_MASK); 731 } 732 733 /** 734 * @internal 735 * @deprecated This API is ICU internal only. 736 */ 737 @Deprecated 738 protected final int smallIndex(Type type, int c) { 739 // Split into two methods to make this part inline-friendly. 740 // In C, this part is a macro. 741 if (c >= highStart) { 742 return dataLength - HIGH_VALUE_NEG_DATA_OFFSET; 743 } 744 return internalSmallIndex(type, c); 745 } 746 747 private final int internalSmallIndex(Type type, int c) { 748 int i1 = c >> SHIFT_1; 749 if (type == Type.FAST) { 750 assert(0xffff < c && c < highStart); 751 i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; 752 } else { 753 assert(0 <= c && c < highStart && highStart > SMALL_LIMIT); 754 i1 += SMALL_INDEX_LENGTH; 755 } 756 int i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; 757 int i3 = (c >> SHIFT_3) & INDEX_3_MASK; 758 int dataBlock; 759 if ((i3Block & 0x8000) == 0) { 760 // 16-bit indexes 761 dataBlock = index[i3Block + i3]; 762 } else { 763 // 18-bit indexes stored in groups of 9 entries per 8 indexes. 764 i3Block = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); 765 i3 &= 7; 766 dataBlock = (index[i3Block++] << (2 + (2 * i3))) & 0x30000; 767 dataBlock |= index[i3Block + i3]; 768 } 769 return dataBlock + (c & SMALL_DATA_MASK); 770 } 771 772 /** 773 * @internal 774 * @deprecated This API is ICU internal only. 775 */ 776 @Deprecated 777 protected abstract int cpIndex(int c); 778 779 /** 780 * A CodePointTrie with {@link Type#FAST}. 781 * 782 * @draft ICU 63 783 * @provisional This API might change or be removed in a future release. 784 */ 785 public static abstract class Fast extends CodePointTrie { 786 private Fast(char[] index, Data data, int highStart, 787 int index3NullOffset, int dataNullOffset) { 788 super(index, data, highStart, index3NullOffset, dataNullOffset); 789 } 790 791 /** 792 * Creates a trie from its binary form. 793 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 794 * with {@link Type#FAST}. 795 * 796 * @param valueWidth selects the number of bits in a data value; this method throws an exception 797 * if the valueWidth does not match the binary data; 798 * use null to accept any data value width 799 * @param bytes a buffer containing the binary data of a CodePointTrie 800 * @return the trie 801 * @draft ICU 63 802 * @provisional This API might change or be removed in a future release. 803 */ 804 public static Fast fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { 805 return (Fast) CodePointTrie.fromBinary(Type.FAST, valueWidth, bytes); 806 } 807 808 /** 809 * @return {@link Type#FAST} 810 * @draft ICU 63 811 * @provisional This API might change or be removed in a future release. 812 */ 813 @Override 814 public final Type getType() { return Type.FAST; } 815 816 /** 817 * Returns a trie value for a BMP code point (U+0000..U+FFFF), without range checking. 818 * Can be used to look up a value for a UTF-16 code unit if other parts of 819 * the string processing check for surrogates. 820 * 821 * @param c the input code point, must be U+0000..U+FFFF 822 * @return The BMP code point's trie value. 823 * @draft ICU 63 824 * @provisional This API might change or be removed in a future release. 825 */ 826 public abstract int bmpGet(int c); 827 828 /** 829 * Returns a trie value for a supplementary code point (U+10000..U+10FFFF), 830 * without range checking. 831 * 832 * @param c the input code point, must be U+10000..U+10FFFF 833 * @return The supplementary code point's trie value. 834 * @draft ICU 63 835 * @provisional This API might change or be removed in a future release. 836 */ 837 public abstract int suppGet(int c); 838 839 /** 840 * @internal 841 * @deprecated This API is ICU internal only. 842 */ 843 @Deprecated 844 @Override 845 protected final int cpIndex(int c) { 846 if (c >= 0) { 847 if (c <= 0xffff) { 848 return fastIndex(c); 849 } else if (c <= 0x10ffff) { 850 return smallIndex(Type.FAST, c); 851 } 852 } 853 return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 854 } 855 856 /** 857 * {@inheritDoc} 858 * @draft ICU 63 859 * @provisional This API might change or be removed in a future release. 860 */ 861 @Override 862 public final StringIterator stringIterator(CharSequence s, int sIndex) { 863 return new FastStringIterator(s, sIndex); 864 } 865 866 private final class FastStringIterator extends StringIterator { 867 private FastStringIterator(CharSequence s, int sIndex) { 868 super(s, sIndex); 869 } 870 871 @Override 872 public boolean next() { 873 if (sIndex >= s.length()) { 874 return false; 875 } 876 char lead = s.charAt(sIndex++); 877 c = lead; 878 int dataIndex; 879 if (!Character.isSurrogate(lead)) { 880 dataIndex = fastIndex(c); 881 } else { 882 char trail; 883 if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && 884 Character.isLowSurrogate(trail = s.charAt(sIndex))) { 885 ++sIndex; 886 c = Character.toCodePoint(lead, trail); 887 dataIndex = smallIndex(Type.FAST, c); 888 } else { 889 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 890 } 891 } 892 value = data.getFromIndex(dataIndex); 893 return true; 894 } 895 896 @Override 897 public boolean previous() { 898 if (sIndex <= 0) { 899 return false; 900 } 901 char trail = s.charAt(--sIndex); 902 c = trail; 903 int dataIndex; 904 if (!Character.isSurrogate(trail)) { 905 dataIndex = fastIndex(c); 906 } else { 907 char lead; 908 if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && 909 Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { 910 --sIndex; 911 c = Character.toCodePoint(lead, trail); 912 dataIndex = smallIndex(Type.FAST, c); 913 } else { 914 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 915 } 916 } 917 value = data.getFromIndex(dataIndex); 918 return true; 919 } 920 } 921 } 922 923 /** 924 * A CodePointTrie with {@link Type#SMALL}. 925 * 926 * @draft ICU 63 927 * @provisional This API might change or be removed in a future release. 928 */ 929 public static abstract class Small extends CodePointTrie { 930 private Small(char[] index, Data data, int highStart, 931 int index3NullOffset, int dataNullOffset) { 932 super(index, data, highStart, index3NullOffset, dataNullOffset); 933 } 934 935 /** 936 * Creates a trie from its binary form. 937 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 938 * with {@link Type#SMALL}. 939 * 940 * @param valueWidth selects the number of bits in a data value; this method throws an exception 941 * if the valueWidth does not match the binary data; 942 * use null to accept any data value width 943 * @param bytes a buffer containing the binary data of a CodePointTrie 944 * @return the trie 945 * @draft ICU 63 946 * @provisional This API might change or be removed in a future release. 947 */ 948 public static Small fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { 949 return (Small) CodePointTrie.fromBinary(Type.SMALL, valueWidth, bytes); 950 } 951 952 /** 953 * @return {@link Type#SMALL} 954 * @draft ICU 63 955 * @provisional This API might change or be removed in a future release. 956 */ 957 @Override 958 public final Type getType() { return Type.SMALL; } 959 960 /** 961 * @internal 962 * @deprecated This API is ICU internal only. 963 */ 964 @Deprecated 965 @Override 966 protected final int cpIndex(int c) { 967 if (c >= 0) { 968 if (c <= SMALL_MAX) { 969 return fastIndex(c); 970 } else if (c <= 0x10ffff) { 971 return smallIndex(Type.SMALL, c); 972 } 973 } 974 return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 975 } 976 977 /** 978 * {@inheritDoc} 979 * @draft ICU 63 980 * @provisional This API might change or be removed in a future release. 981 */ 982 @Override 983 public final StringIterator stringIterator(CharSequence s, int sIndex) { 984 return new SmallStringIterator(s, sIndex); 985 } 986 987 private final class SmallStringIterator extends StringIterator { 988 private SmallStringIterator(CharSequence s, int sIndex) { 989 super(s, sIndex); 990 } 991 992 @Override 993 public boolean next() { 994 if (sIndex >= s.length()) { 995 return false; 996 } 997 char lead = s.charAt(sIndex++); 998 c = lead; 999 int dataIndex; 1000 if (!Character.isSurrogate(lead)) { 1001 dataIndex = cpIndex(c); 1002 } else { 1003 char trail; 1004 if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && 1005 Character.isLowSurrogate(trail = s.charAt(sIndex))) { 1006 ++sIndex; 1007 c = Character.toCodePoint(lead, trail); 1008 dataIndex = smallIndex(Type.SMALL, c); 1009 } else { 1010 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 1011 } 1012 } 1013 value = data.getFromIndex(dataIndex); 1014 return true; 1015 } 1016 1017 @Override 1018 public boolean previous() { 1019 if (sIndex <= 0) { 1020 return false; 1021 } 1022 char trail = s.charAt(--sIndex); 1023 c = trail; 1024 int dataIndex; 1025 if (!Character.isSurrogate(trail)) { 1026 dataIndex = cpIndex(c); 1027 } else { 1028 char lead; 1029 if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && 1030 Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { 1031 --sIndex; 1032 c = Character.toCodePoint(lead, trail); 1033 dataIndex = smallIndex(Type.SMALL, c); 1034 } else { 1035 dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; 1036 } 1037 } 1038 value = data.getFromIndex(dataIndex); 1039 return true; 1040 } 1041 } 1042 } 1043 1044 /** 1045 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_16}. 1046 * 1047 * @draft ICU 63 1048 * @provisional This API might change or be removed in a future release. 1049 */ 1050 public static final class Fast16 extends Fast { 1051 private final char[] dataArray; 1052 1053 Fast16(char[] index, char[] data16, int highStart, 1054 int index3NullOffset, int dataNullOffset) { 1055 super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); 1056 this.dataArray = data16; 1057 } 1058 1059 /** 1060 * Creates a trie from its binary form. 1061 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1062 * with {@link Type#FAST} and {@link ValueWidth#BITS_16}. 1063 * 1064 * @param bytes a buffer containing the binary data of a CodePointTrie 1065 * @return the trie 1066 * @draft ICU 63 1067 * @provisional This API might change or be removed in a future release. 1068 */ 1069 public static Fast16 fromBinary(ByteBuffer bytes) { 1070 return (Fast16) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_16, bytes); 1071 } 1072 1073 /** 1074 * {@inheritDoc} 1075 * @draft ICU 63 1076 * @provisional This API might change or be removed in a future release. 1077 */ 1078 @Override 1079 public final int get(int c) { 1080 return dataArray[cpIndex(c)]; 1081 } 1082 1083 /** 1084 * {@inheritDoc} 1085 * @draft ICU 63 1086 * @provisional This API might change or be removed in a future release. 1087 */ 1088 @Override 1089 public final int bmpGet(int c) { 1090 assert 0 <= c && c <= 0xffff; 1091 return dataArray[fastIndex(c)]; 1092 } 1093 1094 /** 1095 * {@inheritDoc} 1096 * @draft ICU 63 1097 * @provisional This API might change or be removed in a future release. 1098 */ 1099 @Override 1100 public final int suppGet(int c) { 1101 assert 0x10000 <= c && c <= 0x10ffff; 1102 return dataArray[smallIndex(Type.FAST, c)]; 1103 } 1104 } 1105 1106 /** 1107 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_32}. 1108 * 1109 * @draft ICU 63 1110 * @provisional This API might change or be removed in a future release. 1111 */ 1112 public static final class Fast32 extends Fast { 1113 private final int[] dataArray; 1114 1115 Fast32(char[] index, int[] data32, int highStart, 1116 int index3NullOffset, int dataNullOffset) { 1117 super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); 1118 this.dataArray = data32; 1119 } 1120 1121 /** 1122 * Creates a trie from its binary form. 1123 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1124 * with {@link Type#FAST} and {@link ValueWidth#BITS_32}. 1125 * 1126 * @param bytes a buffer containing the binary data of a CodePointTrie 1127 * @return the trie 1128 * @draft ICU 63 1129 * @provisional This API might change or be removed in a future release. 1130 */ 1131 public static Fast32 fromBinary(ByteBuffer bytes) { 1132 return (Fast32) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_32, bytes); 1133 } 1134 1135 /** 1136 * {@inheritDoc} 1137 * @draft ICU 63 1138 * @provisional This API might change or be removed in a future release. 1139 */ 1140 @Override 1141 public final int get(int c) { 1142 return dataArray[cpIndex(c)]; 1143 } 1144 1145 /** 1146 * {@inheritDoc} 1147 * @draft ICU 63 1148 * @provisional This API might change or be removed in a future release. 1149 */ 1150 @Override 1151 public final int bmpGet(int c) { 1152 assert 0 <= c && c <= 0xffff; 1153 return dataArray[fastIndex(c)]; 1154 } 1155 1156 /** 1157 * {@inheritDoc} 1158 * @draft ICU 63 1159 * @provisional This API might change or be removed in a future release. 1160 */ 1161 @Override 1162 public final int suppGet(int c) { 1163 assert 0x10000 <= c && c <= 0x10ffff; 1164 return dataArray[smallIndex(Type.FAST, c)]; 1165 } 1166 } 1167 1168 /** 1169 * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_8}. 1170 * 1171 * @draft ICU 63 1172 * @provisional This API might change or be removed in a future release. 1173 */ 1174 public static final class Fast8 extends Fast { 1175 private final byte[] dataArray; 1176 1177 Fast8(char[] index, byte[] data8, int highStart, 1178 int index3NullOffset, int dataNullOffset) { 1179 super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); 1180 this.dataArray = data8; 1181 } 1182 1183 /** 1184 * Creates a trie from its binary form. 1185 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1186 * with {@link Type#FAST} and {@link ValueWidth#BITS_8}. 1187 * 1188 * @param bytes a buffer containing the binary data of a CodePointTrie 1189 * @return the trie 1190 * @draft ICU 63 1191 * @provisional This API might change or be removed in a future release. 1192 */ 1193 public static Fast8 fromBinary(ByteBuffer bytes) { 1194 return (Fast8) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_8, bytes); 1195 } 1196 1197 /** 1198 * {@inheritDoc} 1199 * @draft ICU 63 1200 * @provisional This API might change or be removed in a future release. 1201 */ 1202 @Override 1203 public final int get(int c) { 1204 return dataArray[cpIndex(c)] & 0xff; 1205 } 1206 1207 /** 1208 * {@inheritDoc} 1209 * @draft ICU 63 1210 * @provisional This API might change or be removed in a future release. 1211 */ 1212 @Override 1213 public final int bmpGet(int c) { 1214 assert 0 <= c && c <= 0xffff; 1215 return dataArray[fastIndex(c)] & 0xff; 1216 } 1217 1218 /** 1219 * {@inheritDoc} 1220 * @draft ICU 63 1221 * @provisional This API might change or be removed in a future release. 1222 */ 1223 @Override 1224 public final int suppGet(int c) { 1225 assert 0x10000 <= c && c <= 0x10ffff; 1226 return dataArray[smallIndex(Type.FAST, c)] & 0xff; 1227 } 1228 } 1229 1230 /** 1231 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. 1232 * 1233 * @draft ICU 63 1234 * @provisional This API might change or be removed in a future release. 1235 */ 1236 public static final class Small16 extends Small { 1237 Small16(char[] index, char[] data16, int highStart, 1238 int index3NullOffset, int dataNullOffset) { 1239 super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); 1240 } 1241 1242 /** 1243 * Creates a trie from its binary form. 1244 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1245 * with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. 1246 * 1247 * @param bytes a buffer containing the binary data of a CodePointTrie 1248 * @return the trie 1249 * @draft ICU 63 1250 * @provisional This API might change or be removed in a future release. 1251 */ 1252 public static Small16 fromBinary(ByteBuffer bytes) { 1253 return (Small16) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_16, bytes); 1254 } 1255 } 1256 1257 /** 1258 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. 1259 * 1260 * @draft ICU 63 1261 * @provisional This API might change or be removed in a future release. 1262 */ 1263 public static final class Small32 extends Small { 1264 Small32(char[] index, int[] data32, int highStart, 1265 int index3NullOffset, int dataNullOffset) { 1266 super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); 1267 } 1268 1269 /** 1270 * Creates a trie from its binary form. 1271 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1272 * with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. 1273 * 1274 * @param bytes a buffer containing the binary data of a CodePointTrie 1275 * @return the trie 1276 * @draft ICU 63 1277 * @provisional This API might change or be removed in a future release. 1278 */ 1279 public static Small32 fromBinary(ByteBuffer bytes) { 1280 return (Small32) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_32, bytes); 1281 } 1282 } 1283 1284 /** 1285 * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. 1286 * 1287 * @draft ICU 63 1288 * @provisional This API might change or be removed in a future release. 1289 */ 1290 public static final class Small8 extends Small { 1291 Small8(char[] index, byte[] data8, int highStart, 1292 int index3NullOffset, int dataNullOffset) { 1293 super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); 1294 } 1295 1296 /** 1297 * Creates a trie from its binary form. 1298 * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} 1299 * with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. 1300 * 1301 * @param bytes a buffer containing the binary data of a CodePointTrie 1302 * @return the trie 1303 * @draft ICU 63 1304 * @provisional This API might change or be removed in a future release. 1305 */ 1306 public static Small8 fromBinary(ByteBuffer bytes) { 1307 return (Small8) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_8, bytes); 1308 } 1309 } 1310 }