1 /* 2 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ****************************************************************************** 28 * 29 * Copyright (C) 2009-2014, International Business Machines 30 * Corporation and others. All Rights Reserved. 31 * 32 ****************************************************************************** 33 */ 34 35 package jdk.internal.icu.impl; 36 37 import jdk.internal.icu.text.UnicodeSet.SpanCondition; 38 import jdk.internal.icu.util.OutputInt; 39 40 /** 41 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 42 * 43 * Latin-1: Look up bytes. 44 * 2-byte characters: Bits organized vertically. 45 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. 46 * Supplementary characters: Call contains() on the parent set. 47 */ 48 public final class BMPSet { 49 50 /** 51 * One boolean ('true' or 'false') per Latin-1 character. 52 */ 53 private boolean[] latin1Contains; 54 55 /** 56 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points 57 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} 58 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) 59 * 60 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at 61 * runtime. 62 */ 63 private int[] table7FF; 64 65 /** 66 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks 67 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} 68 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit 69 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed 70 * and set.contains(c) must be called. 71 * 72 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster 73 * validity checking at runtime. 74 */ 75 private int[] bmpBlockBits; 76 77 /** 78 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, 79 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 80 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. 81 */ 82 private int[] list4kStarts; 83 84 /** 85 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for 86 * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 87 */ 88 private final int[] list; 89 private final int listLength; // length used; list may be longer to minimize reallocs 90 91 public BMPSet(final int[] parentList, int parentListLength) { 92 list = parentList; 93 listLength = parentListLength; 94 latin1Contains = new boolean[0x100]; 95 table7FF = new int[64]; 96 bmpBlockBits = new int[64]; 97 list4kStarts = new int[18]; 98 99 /* 100 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the 101 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of 102 * indexes is for finding supplementary code points. 103 */ 104 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); 105 int i; 106 for (i = 1; i <= 0x10; ++i) { 107 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); 108 } 109 list4kStarts[0x11] = listLength - 1; 110 111 initBits(); 112 } 113 114 public boolean contains(int c) { 115 if (c <= 0xff) { 116 return (latin1Contains[c]); 117 } else if (c <= 0x7ff) { 118 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); 119 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 120 int lead = c >> 12; 121 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 122 if (twoBits <= 1) { 123 // All 64 code points with the same bits 15..6 124 // are either in the set or not. 125 return (0 != twoBits); 126 } else { 127 // Look up the code point in its 4k block of code points. 128 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); 129 } 130 } else if (c <= 0x10ffff) { 131 // surrogate or supplementary code point 132 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 133 } else { 134 // Out-of-range code points get false, consistent with long-standing 135 // behavior of UnicodeSet.contains(c). 136 return false; 137 } 138 } 139 140 /** 141 * Span the initial substring for which each character c has spanCondition==contains(c). It must be 142 * spanCondition==0 or 1. 143 * 144 * @param start The start index 145 * @param outCount If not null: Receives the number of code points in the span. 146 * @return the limit (exclusive end) of the span 147 * 148 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for 149 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points 150 * as usual in ICU. 151 */ 152 public final int span(CharSequence s, int start, SpanCondition spanCondition, 153 OutputInt outCount) { 154 char c, c2; 155 int i = start; 156 int limit = s.length(); 157 int numSupplementary = 0; 158 if (SpanCondition.NOT_CONTAINED != spanCondition) { 159 // span 160 while (i < limit) { 161 c = s.charAt(i); 162 if (c <= 0xff) { 163 if (!latin1Contains[c]) { 164 break; 165 } 166 } else if (c <= 0x7ff) { 167 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 168 break; 169 } 170 } else if (c < 0xd800 || 171 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 172 int lead = c >> 12; 173 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 174 if (twoBits <= 1) { 175 // All 64 code points with the same bits 15..6 176 // are either in the set or not. 177 if (twoBits == 0) { 178 break; 179 } 180 } else { 181 // Look up the code point in its 4k block of code points. 182 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 183 break; 184 } 185 } 186 } else { 187 // surrogate pair 188 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 189 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 190 break; 191 } 192 ++numSupplementary; 193 ++i; 194 } 195 ++i; 196 } 197 } else { 198 // span not 199 while (i < limit) { 200 c = s.charAt(i); 201 if (c <= 0xff) { 202 if (latin1Contains[c]) { 203 break; 204 } 205 } else if (c <= 0x7ff) { 206 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 207 break; 208 } 209 } else if (c < 0xd800 || 210 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 211 int lead = c >> 12; 212 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 213 if (twoBits <= 1) { 214 // All 64 code points with the same bits 15..6 215 // are either in the set or not. 216 if (twoBits != 0) { 217 break; 218 } 219 } else { 220 // Look up the code point in its 4k block of code points. 221 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 222 break; 223 } 224 } 225 } else { 226 // surrogate pair 227 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 228 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 229 break; 230 } 231 ++numSupplementary; 232 ++i; 233 } 234 ++i; 235 } 236 } 237 if (outCount != null) { 238 int spanLength = i - start; 239 outCount.value = spanLength - numSupplementary; // number of code points 240 } 241 return i; 242 } 243 244 /** 245 * Symmetrical with span(). 246 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= 247 * limit and spanCondition==0 or 1. 248 * 249 * @return The string index which starts the span (i.e. inclusive). 250 */ 251 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { 252 char c, c2; 253 254 if (SpanCondition.NOT_CONTAINED != spanCondition) { 255 // span 256 for (;;) { 257 c = s.charAt(--limit); 258 if (c <= 0xff) { 259 if (!latin1Contains[c]) { 260 break; 261 } 262 } else if (c <= 0x7ff) { 263 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 264 break; 265 } 266 } else if (c < 0xd800 || 267 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 268 int lead = c >> 12; 269 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 270 if (twoBits <= 1) { 271 // All 64 code points with the same bits 15..6 272 // are either in the set or not. 273 if (twoBits == 0) { 274 break; 275 } 276 } else { 277 // Look up the code point in its 4k block of code points. 278 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 279 break; 280 } 281 } 282 } else { 283 // surrogate pair 284 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 285 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 286 break; 287 } 288 --limit; 289 } 290 if (0 == limit) { 291 return 0; 292 } 293 } 294 } else { 295 // span not 296 for (;;) { 297 c = s.charAt(--limit); 298 if (c <= 0xff) { 299 if (latin1Contains[c]) { 300 break; 301 } 302 } else if (c <= 0x7ff) { 303 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 304 break; 305 } 306 } else if (c < 0xd800 || 307 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 308 int lead = c >> 12; 309 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 310 if (twoBits <= 1) { 311 // All 64 code points with the same bits 15..6 312 // are either in the set or not. 313 if (twoBits != 0) { 314 break; 315 } 316 } else { 317 // Look up the code point in its 4k block of code points. 318 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 319 break; 320 } 321 } 322 } else { 323 // surrogate pair 324 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 325 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 326 break; 327 } 328 --limit; 329 } 330 if (0 == limit) { 331 return 0; 332 } 333 } 334 } 335 return limit + 1; 336 } 337 338 /** 339 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 340 */ 341 private static void set32x64Bits(int[] table, int start, int limit) { 342 assert (64 == table.length); 343 int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 344 int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 345 346 // Set one bit indicating an all-one block. 347 int bits = 1 << lead; 348 if ((start + 1) == limit) { // Single-character shortcut. 349 table[trail] |= bits; 350 return; 351 } 352 353 int limitLead = limit >> 6; 354 int limitTrail = limit & 0x3f; 355 356 if (lead == limitLead) { 357 // Partial vertical bit column. 358 while (trail < limitTrail) { 359 table[trail++] |= bits; 360 } 361 } else { 362 // Partial vertical bit column, 363 // followed by a bit rectangle, 364 // followed by another partial vertical bit column. 365 if (trail > 0) { 366 do { 367 table[trail++] |= bits; 368 } while (trail < 64); 369 ++lead; 370 } 371 if (lead < limitLead) { 372 bits = ~((1 << lead) - 1); 373 if (limitLead < 0x20) { 374 bits &= (1 << limitLead) - 1; 375 } 376 for (trail = 0; trail < 64; ++trail) { 377 table[trail] |= bits; 378 } 379 } 380 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 381 // In that case, bits=1<<limitLead == 1<<0 == 1 382 // (because Java << uses only the lower 5 bits of the shift operand) 383 // but the bits value is not used because trail<limitTrail is already false. 384 bits = 1 << limitLead; 385 for (trail = 0; trail < limitTrail; ++trail) { 386 table[trail] |= bits; 387 } 388 } 389 } 390 391 private void initBits() { 392 int start, limit; 393 int listIndex = 0; 394 395 // Set latin1Contains[]. 396 do { 397 start = list[listIndex++]; 398 if (listIndex < listLength) { 399 limit = list[listIndex++]; 400 } else { 401 limit = 0x110000; 402 } 403 if (start >= 0x100) { 404 break; 405 } 406 do { 407 latin1Contains[start++] = true; 408 } while (start < limit && start < 0x100); 409 } while (limit <= 0x100); 410 411 // Set table7FF[]. 412 while (start < 0x800) { 413 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); 414 if (limit > 0x800) { 415 start = 0x800; 416 break; 417 } 418 419 start = list[listIndex++]; 420 if (listIndex < listLength) { 421 limit = list[listIndex++]; 422 } else { 423 limit = 0x110000; 424 } 425 } 426 427 // Set bmpBlockBits[]. 428 int minStart = 0x800; 429 while (start < 0x10000) { 430 if (limit > 0x10000) { 431 limit = 0x10000; 432 } 433 434 if (start < minStart) { 435 start = minStart; 436 } 437 if (start < limit) { // Else: Another range entirely in a known mixed-value block. 438 if (0 != (start & 0x3f)) { 439 // Mixed-value block of 64 code points. 440 start >>= 6; 441 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); 442 start = (start + 1) << 6; // Round up to the next block boundary. 443 minStart = start; // Ignore further ranges in this block. 444 } 445 if (start < limit) { 446 if (start < (limit & ~0x3f)) { 447 // Multiple all-ones blocks of 64 code points each. 448 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); 449 } 450 451 if (0 != (limit & 0x3f)) { 452 // Mixed-value block of 64 code points. 453 limit >>= 6; 454 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); 455 limit = (limit + 1) << 6; // Round up to the next block boundary. 456 minStart = limit; // Ignore further ranges in this block. 457 } 458 } 459 } 460 461 if (limit == 0x10000) { 462 break; 463 } 464 465 start = list[listIndex++]; 466 if (listIndex < listLength) { 467 limit = list[listIndex++]; 468 } else { 469 limit = 0x110000; 470 } 471 } 472 } 473 474 /** 475 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code 476 * points in a certain range. 477 * 478 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and 479 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. 480 * 481 * @param c 482 * a character in a subrange of MIN_VALUE..MAX_VALUE 483 * @param lo 484 * The lowest index to be returned. 485 * @param hi 486 * The highest index to be returned. 487 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] 488 */ 489 private int findCodePoint(int c, int lo, int hi) { 490 /* Examples: 491 findCodePoint(c) 492 set list[] c=0 1 3 4 7 8 493 === ============== =========== 494 [] [110000] 0 0 0 0 0 0 495 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 496 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 497 [:Any:] [0, 110000] 1 1 1 1 1 1 498 */ 499 500 // Return the smallest i such that c < list[i]. Assume 501 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 502 if (c < list[lo]) 503 return lo; 504 // High runner test. c is often after the last range, so an 505 // initial check for this condition pays off. 506 if (lo >= hi || c >= list[hi - 1]) 507 return hi; 508 // invariant: c >= list[lo] 509 // invariant: c < list[hi] 510 for (;;) { 511 int i = (lo + hi) >>> 1; 512 if (i == lo) { 513 break; // Found! 514 } else if (c < list[i]) { 515 hi = i; 516 } else { 517 lo = i; 518 } 519 } 520 return hi; 521 } 522 523 private final boolean containsSlow(int c, int lo, int hi) { 524 return (0 != (findCodePoint(c, lo, hi) & 1)); 525 } 526 } 527