1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  *
  29  *   Copyright (C) 2009-2014, International Business Machines
  30  *   Corporation and others.  All Rights Reserved.
  31  *
  32  ******************************************************************************
  33  */
  34 
  35 package jdk.internal.icu.impl;
  36 
  37 import jdk.internal.icu.text.UnicodeSet.SpanCondition;
  38 import jdk.internal.icu.util.OutputInt;
  39 
  40 /**
  41  * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
  42  *
  43  * Latin-1: Look up bytes.
  44  * 2-byte characters: Bits organized vertically.
  45  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
  46  * Supplementary characters: Call contains() on the parent set.
  47  */
  48 public final class BMPSet {
  49 
  50     /**
  51      * One boolean ('true' or 'false') per Latin-1 character.
  52      */
  53     private boolean[] latin1Contains;
  54 
  55     /**
  56      * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
  57      * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
  58      * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
  59      *
  60      * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
  61      * runtime.
  62      */
  63     private int[] table7FF;
  64 
  65     /**
  66      * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
  67      * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
  68      * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
  69      * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
  70      * and set.contains(c) must be called.
  71      *
  72      * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
  73      * validity checking at runtime.
  74      */
  75     private int[] bmpBlockBits;
  76 
  77     /**
  78      * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
  79      * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
  80      * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
  81      */
  82     private int[] list4kStarts;
  83 
  84     /**
  85      * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
  86      * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
  87      */
  88     private final int[] list;
  89     private final int listLength; // length used; list may be longer to minimize reallocs
  90 
  91     public BMPSet(final int[] parentList, int parentListLength) {
  92         list = parentList;
  93         listLength = parentListLength;
  94         latin1Contains = new boolean[0x100];
  95         table7FF = new int[64];
  96         bmpBlockBits = new int[64];
  97         list4kStarts = new int[18];
  98 
  99         /*
 100          * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
 101          * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
 102          * indexes is for finding supplementary code points.
 103          */
 104         list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
 105         int i;
 106         for (i = 1; i <= 0x10; ++i) {
 107             list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
 108         }
 109         list4kStarts[0x11] = listLength - 1;
 110 
 111         initBits();
 112     }
 113 
 114     public boolean contains(int c) {
 115         if (c <= 0xff) {
 116             return (latin1Contains[c]);
 117         } else if (c <= 0x7ff) {
 118             return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
 119         } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
 120             int lead = c >> 12;
 121             int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 122             if (twoBits <= 1) {
 123                 // All 64 code points with the same bits 15..6
 124                 // are either in the set or not.
 125                 return (0 != twoBits);
 126             } else {
 127                 // Look up the code point in its 4k block of code points.
 128                 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
 129             }
 130         } else if (c <= 0x10ffff) {
 131             // surrogate or supplementary code point
 132             return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
 133         } else {
 134             // Out-of-range code points get false, consistent with long-standing
 135             // behavior of UnicodeSet.contains(c).
 136             return false;
 137         }
 138     }
 139 
 140     /**
 141      * Span the initial substring for which each character c has spanCondition==contains(c). It must be
 142      * spanCondition==0 or 1.
 143      *
 144      * @param start The start index
 145      * @param outCount If not null: Receives the number of code points in the span.
 146      * @return the limit (exclusive end) of the span
 147      *
 148      * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
 149      * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
 150      * as usual in ICU.
 151      */
 152     public final int span(CharSequence s, int start, SpanCondition spanCondition,
 153             OutputInt outCount) {
 154         char c, c2;
 155         int i = start;
 156         int limit = s.length();
 157         int numSupplementary = 0;
 158         if (SpanCondition.NOT_CONTAINED != spanCondition) {
 159             // span
 160             while (i < limit) {
 161                 c = s.charAt(i);
 162                 if (c <= 0xff) {
 163                     if (!latin1Contains[c]) {
 164                         break;
 165                     }
 166                 } else if (c <= 0x7ff) {
 167                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
 168                         break;
 169                     }
 170                 } else if (c < 0xd800 ||
 171                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
 172                     int lead = c >> 12;
 173                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 174                     if (twoBits <= 1) {
 175                         // All 64 code points with the same bits 15..6
 176                         // are either in the set or not.
 177                         if (twoBits == 0) {
 178                             break;
 179                         }
 180                     } else {
 181                         // Look up the code point in its 4k block of code points.
 182                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 183                             break;
 184                         }
 185                     }
 186                 } else {
 187                     // surrogate pair
 188                     int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 189                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 190                         break;
 191                     }
 192                     ++numSupplementary;
 193                     ++i;
 194                 }
 195                 ++i;
 196             }
 197         } else {
 198             // span not
 199             while (i < limit) {
 200                 c = s.charAt(i);
 201                 if (c <= 0xff) {
 202                     if (latin1Contains[c]) {
 203                         break;
 204                     }
 205                 } else if (c <= 0x7ff) {
 206                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
 207                         break;
 208                     }
 209                 } else if (c < 0xd800 ||
 210                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
 211                     int lead = c >> 12;
 212                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 213                     if (twoBits <= 1) {
 214                         // All 64 code points with the same bits 15..6
 215                         // are either in the set or not.
 216                         if (twoBits != 0) {
 217                             break;
 218                         }
 219                     } else {
 220                         // Look up the code point in its 4k block of code points.
 221                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 222                             break;
 223                         }
 224                     }
 225                 } else {
 226                     // surrogate pair
 227                     int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 228                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 229                         break;
 230                     }
 231                     ++numSupplementary;
 232                     ++i;
 233                 }
 234                 ++i;
 235             }
 236         }
 237         if (outCount != null) {
 238             int spanLength = i - start;
 239             outCount.value = spanLength - numSupplementary;  // number of code points
 240         }
 241         return i;
 242     }
 243 
 244     /**
 245      * Symmetrical with span().
 246      * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
 247      * limit and spanCondition==0 or 1.
 248      *
 249      * @return The string index which starts the span (i.e. inclusive).
 250      */
 251     public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
 252         char c, c2;
 253 
 254         if (SpanCondition.NOT_CONTAINED != spanCondition) {
 255             // span
 256             for (;;) {
 257                 c = s.charAt(--limit);
 258                 if (c <= 0xff) {
 259                     if (!latin1Contains[c]) {
 260                         break;
 261                     }
 262                 } else if (c <= 0x7ff) {
 263                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
 264                         break;
 265                     }
 266                 } else if (c < 0xd800 ||
 267                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
 268                     int lead = c >> 12;
 269                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 270                     if (twoBits <= 1) {
 271                         // All 64 code points with the same bits 15..6
 272                         // are either in the set or not.
 273                         if (twoBits == 0) {
 274                             break;
 275                         }
 276                     } else {
 277                         // Look up the code point in its 4k block of code points.
 278                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 279                             break;
 280                         }
 281                     }
 282                 } else {
 283                     // surrogate pair
 284                     int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 285                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 286                         break;
 287                     }
 288                     --limit;
 289                 }
 290                 if (0 == limit) {
 291                     return 0;
 292                 }
 293             }
 294         } else {
 295             // span not
 296             for (;;) {
 297                 c = s.charAt(--limit);
 298                 if (c <= 0xff) {
 299                     if (latin1Contains[c]) {
 300                         break;
 301                     }
 302                 } else if (c <= 0x7ff) {
 303                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
 304                         break;
 305                     }
 306                 } else if (c < 0xd800 ||
 307                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
 308                     int lead = c >> 12;
 309                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 310                     if (twoBits <= 1) {
 311                         // All 64 code points with the same bits 15..6
 312                         // are either in the set or not.
 313                         if (twoBits != 0) {
 314                             break;
 315                         }
 316                     } else {
 317                         // Look up the code point in its 4k block of code points.
 318                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 319                             break;
 320                         }
 321                     }
 322                 } else {
 323                     // surrogate pair
 324                     int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 325                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 326                         break;
 327                     }
 328                     --limit;
 329                 }
 330                 if (0 == limit) {
 331                     return 0;
 332                 }
 333             }
 334         }
 335         return limit + 1;
 336     }
 337 
 338     /**
 339      * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
 340      */
 341     private static void set32x64Bits(int[] table, int start, int limit) {
 342         assert (64 == table.length);
 343         int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
 344         int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
 345 
 346         // Set one bit indicating an all-one block.
 347         int bits = 1 << lead;
 348         if ((start + 1) == limit) { // Single-character shortcut.
 349             table[trail] |= bits;
 350             return;
 351         }
 352 
 353         int limitLead = limit >> 6;
 354         int limitTrail = limit & 0x3f;
 355 
 356         if (lead == limitLead) {
 357             // Partial vertical bit column.
 358             while (trail < limitTrail) {
 359                 table[trail++] |= bits;
 360             }
 361         } else {
 362             // Partial vertical bit column,
 363             // followed by a bit rectangle,
 364             // followed by another partial vertical bit column.
 365             if (trail > 0) {
 366                 do {
 367                     table[trail++] |= bits;
 368                 } while (trail < 64);
 369                 ++lead;
 370             }
 371             if (lead < limitLead) {
 372                 bits = ~((1 << lead) - 1);
 373                 if (limitLead < 0x20) {
 374                     bits &= (1 << limitLead) - 1;
 375                 }
 376                 for (trail = 0; trail < 64; ++trail) {
 377                     table[trail] |= bits;
 378                 }
 379             }
 380             // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
 381             // In that case, bits=1<<limitLead == 1<<0 == 1
 382             // (because Java << uses only the lower 5 bits of the shift operand)
 383             // but the bits value is not used because trail<limitTrail is already false.
 384             bits = 1 << limitLead;
 385             for (trail = 0; trail < limitTrail; ++trail) {
 386                 table[trail] |= bits;
 387             }
 388         }
 389     }
 390 
 391     private void initBits() {
 392         int start, limit;
 393         int listIndex = 0;
 394 
 395         // Set latin1Contains[].
 396         do {
 397             start = list[listIndex++];
 398             if (listIndex < listLength) {
 399                 limit = list[listIndex++];
 400             } else {
 401                 limit = 0x110000;
 402             }
 403             if (start >= 0x100) {
 404                 break;
 405             }
 406             do {
 407                 latin1Contains[start++] = true;
 408             } while (start < limit && start < 0x100);
 409         } while (limit <= 0x100);
 410 
 411         // Set table7FF[].
 412         while (start < 0x800) {
 413             set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
 414             if (limit > 0x800) {
 415                 start = 0x800;
 416                 break;
 417             }
 418 
 419             start = list[listIndex++];
 420             if (listIndex < listLength) {
 421                 limit = list[listIndex++];
 422             } else {
 423                 limit = 0x110000;
 424             }
 425         }
 426 
 427         // Set bmpBlockBits[].
 428         int minStart = 0x800;
 429         while (start < 0x10000) {
 430             if (limit > 0x10000) {
 431                 limit = 0x10000;
 432             }
 433 
 434             if (start < minStart) {
 435                 start = minStart;
 436             }
 437             if (start < limit) { // Else: Another range entirely in a known mixed-value block.
 438                 if (0 != (start & 0x3f)) {
 439                     // Mixed-value block of 64 code points.
 440                     start >>= 6;
 441                     bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
 442                     start = (start + 1) << 6; // Round up to the next block boundary.
 443                     minStart = start; // Ignore further ranges in this block.
 444                 }
 445                 if (start < limit) {
 446                     if (start < (limit & ~0x3f)) {
 447                         // Multiple all-ones blocks of 64 code points each.
 448                         set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
 449                     }
 450 
 451                     if (0 != (limit & 0x3f)) {
 452                         // Mixed-value block of 64 code points.
 453                         limit >>= 6;
 454                         bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
 455                       limit = (limit + 1) << 6; // Round up to the next block boundary.
 456                         minStart = limit; // Ignore further ranges in this block.
 457                     }
 458                 }
 459             }
 460 
 461             if (limit == 0x10000) {
 462                 break;
 463           }
 464 
 465             start = list[listIndex++];
 466             if (listIndex < listLength) {
 467                 limit = list[listIndex++];
 468             } else {
 469                 limit = 0x110000;
 470             }
 471         }
 472     }
 473 
 474     /**
 475      * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
 476      * points in a certain range.
 477      *
 478      * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
 479      * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
 480      *
 481      * @param c
 482      *            a character in a subrange of MIN_VALUE..MAX_VALUE
 483      * @param lo
 484      *            The lowest index to be returned.
 485      * @param hi
 486      *            The highest index to be returned.
 487      * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
 488      */
 489     private int findCodePoint(int c, int lo, int hi) {
 490         /* Examples:
 491                                            findCodePoint(c)
 492            set              list[]         c=0 1 3 4 7 8
 493            ===              ==============   ===========
 494            []               [110000]         0 0 0 0 0 0
 495            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 496            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 497            [:Any:]          [0, 110000]      1 1 1 1 1 1
 498          */
 499 
 500         // Return the smallest i such that c < list[i]. Assume
 501         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 502         if (c < list[lo])
 503             return lo;
 504         // High runner test. c is often after the last range, so an
 505         // initial check for this condition pays off.
 506         if (lo >= hi || c >= list[hi - 1])
 507             return hi;
 508         // invariant: c >= list[lo]
 509         // invariant: c < list[hi]
 510         for (;;) {
 511             int i = (lo + hi) >>> 1;
 512             if (i == lo) {
 513                 break; // Found!
 514             } else if (c < list[i]) {
 515                 hi = i;
 516             } else {
 517                 lo = i;
 518             }
 519         }
 520         return hi;
 521     }
 522 
 523     private final boolean containsSlow(int c, int lo, int hi) {
 524         return (0 != (findCodePoint(c, lo, hi) & 1));
 525     }
 526 }
 527