1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  *
  29  *   Copyright (C) 2009-2014, International Business Machines
  30  *   Corporation and others.  All Rights Reserved.
  31  *
  32  ******************************************************************************
  33  */
  34 
  35 package sun.text.normalizer;
  36 
  37 import sun.text.normalizer.UnicodeSet.SpanCondition;
  38 
  39 /**
  40  * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
  41  *
  42  * Latin-1: Look up bytes.
  43  * 2-byte characters: Bits organized vertically.
  44  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
  45  * Supplementary characters: Call contains() on the parent set.
  46  */
  47 final class BMPSet {
  48 
  49     /**
  50      * One boolean ('true' or 'false') per Latin-1 character.
  51      */
  52     private boolean[] latin1Contains;
  53 
  54     /**
  55      * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
  56      * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
  57      * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
  58      *
  59      * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
  60      * runtime.
  61      */
  62     private int[] table7FF;
  63 
  64     /**
  65      * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
  66      * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
  67      * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
  68      * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
  69      * and set.contains(c) must be called.
  70      *
  71      * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
  72      * validity checking at runtime.
  73      */
  74     private int[] bmpBlockBits;
  75 
  76     /**
  77      * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
  78      * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
  79      * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
  80      */
  81     private int[] list4kStarts;
  82 
  83     /**
  84      * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
  85      * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
  86      */
  87     private final int[] list;
  88     private final int listLength; // length used; list may be longer to minimize reallocs
  89 
  90     public BMPSet(final int[] parentList, int parentListLength) {
  91         list = parentList;
  92         listLength = parentListLength;
  93         latin1Contains = new boolean[0x100];
  94         table7FF = new int[64];
  95         bmpBlockBits = new int[64];
  96         list4kStarts = new int[18];
  97 
  98         /*
  99          * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
 100          * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
 101          * indexes is for finding supplementary code points.
 102          */
 103         list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
 104         int i;
 105         for (i = 1; i <= 0x10; ++i) {
 106             list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
 107         }
 108         list4kStarts[0x11] = listLength - 1;
 109 
 110         initBits();
 111     }
 112 
 113     public boolean contains(int c) {
 114         if (c <= 0xff) {
 115             return (latin1Contains[c]);
 116         } else if (c <= 0x7ff) {
 117             return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
 118         } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
 119             int lead = c >> 12;
 120             int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 121             if (twoBits <= 1) {
 122                 // All 64 code points with the same bits 15..6
 123                 // are either in the set or not.
 124                 return (0 != twoBits);
 125             } else {
 126                 // Look up the code point in its 4k block of code points.
 127                 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
 128             }
 129         } else if (c <= 0x10ffff) {
 130             // surrogate or supplementary code point
 131             return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
 132         } else {
 133             // Out-of-range code points get false, consistent with long-standing
 134             // behavior of UnicodeSet.contains(c).
 135             return false;
 136         }
 137     }
 138 
 139     /**
 140      * Span the initial substring for which each character c has spanCondition==contains(c). It must be
 141      * spanCondition==0 or 1.
 142      *
 143      * @param start The start index
 144      * @param outCount If not null: Receives the number of code points in the span.
 145      * @return the limit (exclusive end) of the span
 146      *
 147      * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
 148      * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
 149      * as usual in ICU.
 150      */
 151     public final int span(CharSequence s, int start, SpanCondition spanCondition,
 152             OutputInt outCount) {
 153         char c, c2;
 154         int i = start;
 155         int limit = s.length();
 156         int numSupplementary = 0;
 157         if (SpanCondition.NOT_CONTAINED != spanCondition) {
 158             // span
 159             while (i < limit) {
 160                 c = s.charAt(i);
 161                 if (c <= 0xff) {
 162                     if (!latin1Contains[c]) {
 163                         break;
 164                     }
 165                 } else if (c <= 0x7ff) {
 166                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
 167                         break;
 168                     }
 169                 } else if (c < 0xd800 ||
 170                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
 171                     int lead = c >> 12;
 172                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 173                     if (twoBits <= 1) {
 174                         // All 64 code points with the same bits 15..6
 175                         // are either in the set or not.
 176                         if (twoBits == 0) {
 177                             break;
 178                         }
 179                     } else {
 180                         // Look up the code point in its 4k block of code points.
 181                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 182                             break;
 183                         }
 184                     }
 185                 } else {
 186                     // surrogate pair
 187                     int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 188                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 189                         break;
 190                     }
 191                     ++numSupplementary;
 192                     ++i;
 193                 }
 194                 ++i;
 195             }
 196         } else {
 197             // span not
 198             while (i < limit) {
 199                 c = s.charAt(i);
 200                 if (c <= 0xff) {
 201                     if (latin1Contains[c]) {
 202                         break;
 203                     }
 204                 } else if (c <= 0x7ff) {
 205                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
 206                         break;
 207                     }
 208                 } else if (c < 0xd800 ||
 209                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
 210                     int lead = c >> 12;
 211                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 212                     if (twoBits <= 1) {
 213                         // All 64 code points with the same bits 15..6
 214                         // are either in the set or not.
 215                         if (twoBits != 0) {
 216                             break;
 217                         }
 218                     } else {
 219                         // Look up the code point in its 4k block of code points.
 220                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 221                             break;
 222                         }
 223                     }
 224                 } else {
 225                     // surrogate pair
 226                     int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 227                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 228                         break;
 229                     }
 230                     ++numSupplementary;
 231                     ++i;
 232                 }
 233                 ++i;
 234             }
 235         }
 236         if (outCount != null) {
 237             int spanLength = i - start;
 238             outCount.value = spanLength - numSupplementary;  // number of code points
 239         }
 240         return i;
 241     }
 242 
 243     /**
 244      * Symmetrical with span().
 245      * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
 246      * limit and spanCondition==0 or 1.
 247      *
 248      * @return The string index which starts the span (i.e. inclusive).
 249      */
 250     public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
 251         char c, c2;
 252 
 253         if (SpanCondition.NOT_CONTAINED != spanCondition) {
 254             // span
 255             for (;;) {
 256                 c = s.charAt(--limit);
 257                 if (c <= 0xff) {
 258                     if (!latin1Contains[c]) {
 259                         break;
 260                     }
 261                 } else if (c <= 0x7ff) {
 262                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
 263                         break;
 264                     }
 265                 } else if (c < 0xd800 ||
 266                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
 267                     int lead = c >> 12;
 268                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 269                     if (twoBits <= 1) {
 270                         // All 64 code points with the same bits 15..6
 271                         // are either in the set or not.
 272                         if (twoBits == 0) {
 273                             break;
 274                         }
 275                     } else {
 276                         // Look up the code point in its 4k block of code points.
 277                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 278                             break;
 279                         }
 280                     }
 281                 } else {
 282                     // surrogate pair
 283                     int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 284                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 285                         break;
 286                     }
 287                     --limit;
 288                 }
 289                 if (0 == limit) {
 290                     return 0;
 291                 }
 292             }
 293         } else {
 294             // span not
 295             for (;;) {
 296                 c = s.charAt(--limit);
 297                 if (c <= 0xff) {
 298                     if (latin1Contains[c]) {
 299                         break;
 300                     }
 301                 } else if (c <= 0x7ff) {
 302                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
 303                         break;
 304                     }
 305                 } else if (c < 0xd800 ||
 306                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
 307                     int lead = c >> 12;
 308                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
 309                     if (twoBits <= 1) {
 310                         // All 64 code points with the same bits 15..6
 311                         // are either in the set or not.
 312                         if (twoBits != 0) {
 313                             break;
 314                         }
 315                     } else {
 316                         // Look up the code point in its 4k block of code points.
 317                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
 318                             break;
 319                         }
 320                     }
 321                 } else {
 322                     // surrogate pair
 323                     int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 324                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
 325                         break;
 326                     }
 327                     --limit;
 328                 }
 329                 if (0 == limit) {
 330                     return 0;
 331                 }
 332             }
 333         }
 334         return limit + 1;
 335     }
 336 
 337     /**
 338      * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
 339      */
 340     private static void set32x64Bits(int[] table, int start, int limit) {
 341         assert (64 == table.length);
 342         int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
 343         int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
 344 
 345         // Set one bit indicating an all-one block.
 346         int bits = 1 << lead;
 347         if ((start + 1) == limit) { // Single-character shortcut.
 348             table[trail] |= bits;
 349             return;
 350         }
 351 
 352         int limitLead = limit >> 6;
 353         int limitTrail = limit & 0x3f;
 354 
 355         if (lead == limitLead) {
 356             // Partial vertical bit column.
 357             while (trail < limitTrail) {
 358                 table[trail++] |= bits;
 359             }
 360         } else {
 361             // Partial vertical bit column,
 362             // followed by a bit rectangle,
 363             // followed by another partial vertical bit column.
 364             if (trail > 0) {
 365                 do {
 366                     table[trail++] |= bits;
 367                 } while (trail < 64);
 368                 ++lead;
 369             }
 370             if (lead < limitLead) {
 371                 bits = ~((1 << lead) - 1);
 372                 if (limitLead < 0x20) {
 373                     bits &= (1 << limitLead) - 1;
 374                 }
 375                 for (trail = 0; trail < 64; ++trail) {
 376                     table[trail] |= bits;
 377                 }
 378             }
 379             // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
 380             // In that case, bits=1<<limitLead == 1<<0 == 1
 381             // (because Java << uses only the lower 5 bits of the shift operand)
 382             // but the bits value is not used because trail<limitTrail is already false.
 383             bits = 1 << limitLead;
 384             for (trail = 0; trail < limitTrail; ++trail) {
 385                 table[trail] |= bits;
 386             }
 387         }
 388     }
 389 
 390     private void initBits() {
 391         int start, limit;
 392         int listIndex = 0;
 393 
 394         // Set latin1Contains[].
 395         do {
 396             start = list[listIndex++];
 397             if (listIndex < listLength) {
 398                 limit = list[listIndex++];
 399             } else {
 400                 limit = 0x110000;
 401             }
 402             if (start >= 0x100) {
 403                 break;
 404             }
 405             do {
 406                 latin1Contains[start++] = true;
 407             } while (start < limit && start < 0x100);
 408         } while (limit <= 0x100);
 409 
 410         // Set table7FF[].
 411         while (start < 0x800) {
 412             set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
 413             if (limit > 0x800) {
 414                 start = 0x800;
 415                 break;
 416             }
 417 
 418             start = list[listIndex++];
 419             if (listIndex < listLength) {
 420                 limit = list[listIndex++];
 421             } else {
 422                 limit = 0x110000;
 423             }
 424         }
 425 
 426         // Set bmpBlockBits[].
 427         int minStart = 0x800;
 428         while (start < 0x10000) {
 429             if (limit > 0x10000) {
 430                 limit = 0x10000;
 431             }
 432 
 433             if (start < minStart) {
 434                 start = minStart;
 435             }
 436             if (start < limit) { // Else: Another range entirely in a known mixed-value block.
 437                 if (0 != (start & 0x3f)) {
 438                     // Mixed-value block of 64 code points.
 439                     start >>= 6;
 440                     bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
 441                     start = (start + 1) << 6; // Round up to the next block boundary.
 442                     minStart = start; // Ignore further ranges in this block.
 443                 }
 444                 if (start < limit) {
 445                     if (start < (limit & ~0x3f)) {
 446                         // Multiple all-ones blocks of 64 code points each.
 447                         set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
 448                     }
 449 
 450                     if (0 != (limit & 0x3f)) {
 451                         // Mixed-value block of 64 code points.
 452                         limit >>= 6;
 453                         bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
 454                       limit = (limit + 1) << 6; // Round up to the next block boundary.
 455                         minStart = limit; // Ignore further ranges in this block.
 456                     }
 457                 }
 458             }
 459 
 460             if (limit == 0x10000) {
 461                 break;
 462           }
 463 
 464             start = list[listIndex++];
 465             if (listIndex < listLength) {
 466                 limit = list[listIndex++];
 467             } else {
 468                 limit = 0x110000;
 469             }
 470         }
 471     }
 472 
 473     /**
 474      * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
 475      * points in a certain range.
 476      *
 477      * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
 478      * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
 479      *
 480      * @param c
 481      *            a character in a subrange of MIN_VALUE..MAX_VALUE
 482      * @param lo
 483      *            The lowest index to be returned.
 484      * @param hi
 485      *            The highest index to be returned.
 486      * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
 487      */
 488     private int findCodePoint(int c, int lo, int hi) {
 489         /* Examples:
 490                                            findCodePoint(c)
 491            set              list[]         c=0 1 3 4 7 8
 492            ===              ==============   ===========
 493            []               [110000]         0 0 0 0 0 0
 494            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 495            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 496            [:Any:]          [0, 110000]      1 1 1 1 1 1
 497          */
 498 
 499         // Return the smallest i such that c < list[i]. Assume
 500         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 501         if (c < list[lo])
 502             return lo;
 503         // High runner test. c is often after the last range, so an
 504         // initial check for this condition pays off.
 505         if (lo >= hi || c >= list[hi - 1])
 506             return hi;
 507         // invariant: c >= list[lo]
 508         // invariant: c < list[hi]
 509         for (;;) {
 510             int i = (lo + hi) >>> 1;
 511             if (i == lo) {
 512                 break; // Found!
 513             } else if (c < list[i]) {
 514                 hi = i;
 515             } else {
 516                 lo = i;
 517             }
 518         }
 519         return hi;
 520     }
 521 
 522     private final boolean containsSlow(int c, int lo, int hi) {
 523         return (0 != (findCodePoint(c, lo, hi) & 1));
 524     }
 525 }
 526