1 /* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ****************************************************************************** 28 * 29 * Copyright (C) 2009-2014, International Business Machines 30 * Corporation and others. All Rights Reserved. 31 * 32 ****************************************************************************** 33 */ 34 35 package sun.text.normalizer; 36 37 import sun.text.normalizer.UnicodeSet.SpanCondition; 38 39 /** 40 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 41 * 42 * Latin-1: Look up bytes. 43 * 2-byte characters: Bits organized vertically. 44 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. 45 * Supplementary characters: Call contains() on the parent set. 46 */ 47 final class BMPSet { 48 49 /** 50 * One boolean ('true' or 'false') per Latin-1 character. 51 */ 52 private boolean[] latin1Contains; 53 54 /** 55 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points 56 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} 57 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) 58 * 59 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at 60 * runtime. 61 */ 62 private int[] table7FF; 63 64 /** 65 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks 66 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} 67 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit 68 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed 69 * and set.contains(c) must be called. 70 * 71 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster 72 * validity checking at runtime. 73 */ 74 private int[] bmpBlockBits; 75 76 /** 77 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, 78 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 79 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. 80 */ 81 private int[] list4kStarts; 82 83 /** 84 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for 85 * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 86 */ 87 private final int[] list; 88 private final int listLength; // length used; list may be longer to minimize reallocs 89 90 public BMPSet(final int[] parentList, int parentListLength) { 91 list = parentList; 92 listLength = parentListLength; 93 latin1Contains = new boolean[0x100]; 94 table7FF = new int[64]; 95 bmpBlockBits = new int[64]; 96 list4kStarts = new int[18]; 97 98 /* 99 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the 100 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of 101 * indexes is for finding supplementary code points. 102 */ 103 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); 104 int i; 105 for (i = 1; i <= 0x10; ++i) { 106 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); 107 } 108 list4kStarts[0x11] = listLength - 1; 109 110 initBits(); 111 } 112 113 public boolean contains(int c) { 114 if (c <= 0xff) { 115 return (latin1Contains[c]); 116 } else if (c <= 0x7ff) { 117 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); 118 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 119 int lead = c >> 12; 120 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 121 if (twoBits <= 1) { 122 // All 64 code points with the same bits 15..6 123 // are either in the set or not. 124 return (0 != twoBits); 125 } else { 126 // Look up the code point in its 4k block of code points. 127 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); 128 } 129 } else if (c <= 0x10ffff) { 130 // surrogate or supplementary code point 131 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 132 } else { 133 // Out-of-range code points get false, consistent with long-standing 134 // behavior of UnicodeSet.contains(c). 135 return false; 136 } 137 } 138 139 /** 140 * Span the initial substring for which each character c has spanCondition==contains(c). It must be 141 * spanCondition==0 or 1. 142 * 143 * @param start The start index 144 * @param outCount If not null: Receives the number of code points in the span. 145 * @return the limit (exclusive end) of the span 146 * 147 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for 148 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points 149 * as usual in ICU. 150 */ 151 public final int span(CharSequence s, int start, SpanCondition spanCondition, 152 OutputInt outCount) { 153 char c, c2; 154 int i = start; 155 int limit = s.length(); 156 int numSupplementary = 0; 157 if (SpanCondition.NOT_CONTAINED != spanCondition) { 158 // span 159 while (i < limit) { 160 c = s.charAt(i); 161 if (c <= 0xff) { 162 if (!latin1Contains[c]) { 163 break; 164 } 165 } else if (c <= 0x7ff) { 166 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 167 break; 168 } 169 } else if (c < 0xd800 || 170 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 171 int lead = c >> 12; 172 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 173 if (twoBits <= 1) { 174 // All 64 code points with the same bits 15..6 175 // are either in the set or not. 176 if (twoBits == 0) { 177 break; 178 } 179 } else { 180 // Look up the code point in its 4k block of code points. 181 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 182 break; 183 } 184 } 185 } else { 186 // surrogate pair 187 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 188 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 189 break; 190 } 191 ++numSupplementary; 192 ++i; 193 } 194 ++i; 195 } 196 } else { 197 // span not 198 while (i < limit) { 199 c = s.charAt(i); 200 if (c <= 0xff) { 201 if (latin1Contains[c]) { 202 break; 203 } 204 } else if (c <= 0x7ff) { 205 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 206 break; 207 } 208 } else if (c < 0xd800 || 209 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 210 int lead = c >> 12; 211 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 212 if (twoBits <= 1) { 213 // All 64 code points with the same bits 15..6 214 // are either in the set or not. 215 if (twoBits != 0) { 216 break; 217 } 218 } else { 219 // Look up the code point in its 4k block of code points. 220 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 221 break; 222 } 223 } 224 } else { 225 // surrogate pair 226 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 227 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 228 break; 229 } 230 ++numSupplementary; 231 ++i; 232 } 233 ++i; 234 } 235 } 236 if (outCount != null) { 237 int spanLength = i - start; 238 outCount.value = spanLength - numSupplementary; // number of code points 239 } 240 return i; 241 } 242 243 /** 244 * Symmetrical with span(). 245 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= 246 * limit and spanCondition==0 or 1. 247 * 248 * @return The string index which starts the span (i.e. inclusive). 249 */ 250 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { 251 char c, c2; 252 253 if (SpanCondition.NOT_CONTAINED != spanCondition) { 254 // span 255 for (;;) { 256 c = s.charAt(--limit); 257 if (c <= 0xff) { 258 if (!latin1Contains[c]) { 259 break; 260 } 261 } else if (c <= 0x7ff) { 262 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 263 break; 264 } 265 } else if (c < 0xd800 || 266 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 267 int lead = c >> 12; 268 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 269 if (twoBits <= 1) { 270 // All 64 code points with the same bits 15..6 271 // are either in the set or not. 272 if (twoBits == 0) { 273 break; 274 } 275 } else { 276 // Look up the code point in its 4k block of code points. 277 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 278 break; 279 } 280 } 281 } else { 282 // surrogate pair 283 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 284 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 285 break; 286 } 287 --limit; 288 } 289 if (0 == limit) { 290 return 0; 291 } 292 } 293 } else { 294 // span not 295 for (;;) { 296 c = s.charAt(--limit); 297 if (c <= 0xff) { 298 if (latin1Contains[c]) { 299 break; 300 } 301 } else if (c <= 0x7ff) { 302 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 303 break; 304 } 305 } else if (c < 0xd800 || 306 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 307 int lead = c >> 12; 308 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 309 if (twoBits <= 1) { 310 // All 64 code points with the same bits 15..6 311 // are either in the set or not. 312 if (twoBits != 0) { 313 break; 314 } 315 } else { 316 // Look up the code point in its 4k block of code points. 317 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 318 break; 319 } 320 } 321 } else { 322 // surrogate pair 323 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 324 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 325 break; 326 } 327 --limit; 328 } 329 if (0 == limit) { 330 return 0; 331 } 332 } 333 } 334 return limit + 1; 335 } 336 337 /** 338 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 339 */ 340 private static void set32x64Bits(int[] table, int start, int limit) { 341 assert (64 == table.length); 342 int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 343 int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 344 345 // Set one bit indicating an all-one block. 346 int bits = 1 << lead; 347 if ((start + 1) == limit) { // Single-character shortcut. 348 table[trail] |= bits; 349 return; 350 } 351 352 int limitLead = limit >> 6; 353 int limitTrail = limit & 0x3f; 354 355 if (lead == limitLead) { 356 // Partial vertical bit column. 357 while (trail < limitTrail) { 358 table[trail++] |= bits; 359 } 360 } else { 361 // Partial vertical bit column, 362 // followed by a bit rectangle, 363 // followed by another partial vertical bit column. 364 if (trail > 0) { 365 do { 366 table[trail++] |= bits; 367 } while (trail < 64); 368 ++lead; 369 } 370 if (lead < limitLead) { 371 bits = ~((1 << lead) - 1); 372 if (limitLead < 0x20) { 373 bits &= (1 << limitLead) - 1; 374 } 375 for (trail = 0; trail < 64; ++trail) { 376 table[trail] |= bits; 377 } 378 } 379 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 380 // In that case, bits=1<<limitLead == 1<<0 == 1 381 // (because Java << uses only the lower 5 bits of the shift operand) 382 // but the bits value is not used because trail<limitTrail is already false. 383 bits = 1 << limitLead; 384 for (trail = 0; trail < limitTrail; ++trail) { 385 table[trail] |= bits; 386 } 387 } 388 } 389 390 private void initBits() { 391 int start, limit; 392 int listIndex = 0; 393 394 // Set latin1Contains[]. 395 do { 396 start = list[listIndex++]; 397 if (listIndex < listLength) { 398 limit = list[listIndex++]; 399 } else { 400 limit = 0x110000; 401 } 402 if (start >= 0x100) { 403 break; 404 } 405 do { 406 latin1Contains[start++] = true; 407 } while (start < limit && start < 0x100); 408 } while (limit <= 0x100); 409 410 // Set table7FF[]. 411 while (start < 0x800) { 412 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); 413 if (limit > 0x800) { 414 start = 0x800; 415 break; 416 } 417 418 start = list[listIndex++]; 419 if (listIndex < listLength) { 420 limit = list[listIndex++]; 421 } else { 422 limit = 0x110000; 423 } 424 } 425 426 // Set bmpBlockBits[]. 427 int minStart = 0x800; 428 while (start < 0x10000) { 429 if (limit > 0x10000) { 430 limit = 0x10000; 431 } 432 433 if (start < minStart) { 434 start = minStart; 435 } 436 if (start < limit) { // Else: Another range entirely in a known mixed-value block. 437 if (0 != (start & 0x3f)) { 438 // Mixed-value block of 64 code points. 439 start >>= 6; 440 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); 441 start = (start + 1) << 6; // Round up to the next block boundary. 442 minStart = start; // Ignore further ranges in this block. 443 } 444 if (start < limit) { 445 if (start < (limit & ~0x3f)) { 446 // Multiple all-ones blocks of 64 code points each. 447 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); 448 } 449 450 if (0 != (limit & 0x3f)) { 451 // Mixed-value block of 64 code points. 452 limit >>= 6; 453 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); 454 limit = (limit + 1) << 6; // Round up to the next block boundary. 455 minStart = limit; // Ignore further ranges in this block. 456 } 457 } 458 } 459 460 if (limit == 0x10000) { 461 break; 462 } 463 464 start = list[listIndex++]; 465 if (listIndex < listLength) { 466 limit = list[listIndex++]; 467 } else { 468 limit = 0x110000; 469 } 470 } 471 } 472 473 /** 474 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code 475 * points in a certain range. 476 * 477 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and 478 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. 479 * 480 * @param c 481 * a character in a subrange of MIN_VALUE..MAX_VALUE 482 * @param lo 483 * The lowest index to be returned. 484 * @param hi 485 * The highest index to be returned. 486 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] 487 */ 488 private int findCodePoint(int c, int lo, int hi) { 489 /* Examples: 490 findCodePoint(c) 491 set list[] c=0 1 3 4 7 8 492 === ============== =========== 493 [] [110000] 0 0 0 0 0 0 494 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 495 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 496 [:Any:] [0, 110000] 1 1 1 1 1 1 497 */ 498 499 // Return the smallest i such that c < list[i]. Assume 500 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 501 if (c < list[lo]) 502 return lo; 503 // High runner test. c is often after the last range, so an 504 // initial check for this condition pays off. 505 if (lo >= hi || c >= list[hi - 1]) 506 return hi; 507 // invariant: c >= list[lo] 508 // invariant: c < list[hi] 509 for (;;) { 510 int i = (lo + hi) >>> 1; 511 if (i == lo) { 512 break; // Found! 513 } else if (c < list[i]) { 514 hi = i; 515 } else { 516 lo = i; 517 } 518 } 519 return hi; 520 } 521 522 private final boolean containsSlow(int c, int lo, int hi) { 523 return (0 != (findCodePoint(c, lo, hi) & 1)); 524 } 525 } 526