New src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java

   1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  *
  29  *   Copyright (C) 2009-2014, International Business Machines
  30  *   Corporation and others.  All Rights Reserved.
  31  *
  32  ******************************************************************************
  33  */
  34 
  35 package jdk.internal.icu.impl;
  36 
  37 import java.util.ArrayList;
  38 
  39 import jdk.internal.icu.text.UTF16;
  40 import jdk.internal.icu.text.UnicodeSet;
  41 import jdk.internal.icu.text.UnicodeSet.SpanCondition;
  42 import jdk.internal.icu.util.OutputInt;
  43 
  44 /*
  45  * Implement span() etc. for a set with strings.
  46  * Avoid recursion because of its exponential complexity.
  47  * Instead, try multiple paths at once and track them with an IndexList.
  48  */
  49 public class UnicodeSetStringSpan {
  50 
  51     /*
  52      * Bit flags for the constructor's "which" argument: Which span() variant will be used?
  53      * The object is either built for one variant and used once, or built for all and may be used many times.
  54      */
  55     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
  56     public static final int FWD           = 0x20;  // forward span lengths / string start code points are prepared
  57     public static final int BACK          = 0x10;  // backward span lengths / string end code points are prepared
  58     // public static final int UTF16      = 8;
  59     public static final int CONTAINED     = 2;     // span lengths for span(while contained) are stored
  60     public static final int NOT_CONTAINED = 1;     // spanNotSet is prepared for span(while not contained)
  61 
  62     public static final int ALL = 0x7f;  // every flag bit set; the constructor then prepares all span() variants
  63 
  64     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
  65     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
  66     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
  67     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
  68 
  69     /**
  70      * Special spanLength values, stored as short because Java has no unsigned byte type.
  71      * ALL_CP_CONTAINED: All code points in the string are contained in the parent set.
  72      */
  73     static final short ALL_CP_CONTAINED = 0xff;
  74 
  75     /** The spanLength is >=0xfe; the span methods then derive the overlap from the string length instead. */
  76     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
  77 
  78     /** Set for span(). Same as parent but without strings. */
  79     private UnicodeSet spanSet;  // frozen by the constructor when it is built for all variants
  80 
  81     /**
  82      * Set for span(not contained).
  83      * Same as spanSet, plus characters that start or end strings.
  84      */
  85     private UnicodeSet spanNotSet;  // null unless NOT_CONTAINED was requested; may be the same object as spanSet
  86 
  87     /** The strings of the parent set. */
  88     private ArrayList<String> strings;  // the list passed to the constructor, kept by reference
  89 
  90     /** The lengths of span(), spanBack() etc. for each string: forward at [i]; if built for all, backward at [size + i]. */
  91     private short[] spanLengths;  // left null when the constructor returns early
  92 
  93     /** Maximum length (in UTF-16 code units) of the strings, taken over all strings by the constructor. */
  94     private int maxLength16;
  95 
  96     /** Are there strings that are not fully contained in the code point set? */
  97     private boolean someRelevant;
  98 
  99     /** Set up for all variants of span()? True when the constructor's which argument equals ALL. */
 100     private boolean all;
 101 
 102     /** Span helper: the offset list used by the synchronized span methods. */
 103     private OffsetList offsets;
 104 
 105     /**
 106      * Constructs for all variants of span(), or only for any one variant.
 107      * Initializes as little as possible, for single use.
 108      */
 109     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
 110         spanSet = new UnicodeSet(0, 0x10ffff); // All code points for now; narrowed by retainAll(set) below.
 111         // TODO: With Java 6, just take the parent set's strings as is,
 112         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
 113         // Then iterate via the first() and higher() methods.
 114         // (We do not want to create multiple Iterator objects in each span().)
 115         // See ICU ticket #7454.
 116         strings = setStrings; // Kept by reference; this constructor does not copy the list.
 117         all = (which == ALL); // which is a combination of the WITH_COUNT/FWD/BACK/CONTAINED/NOT_CONTAINED flags.
 118         spanSet.retainAll(set);
 119         if (0 != (which & NOT_CONTAINED)) {
 120             // Default to the same sets.
 121             // addToSpanNotSet() will create a separate set if necessary.
 122             spanNotSet = spanSet;
 123         }
 124         offsets = new OffsetList();
 125 
 126         // Determine if the strings even need to be taken into account at all for span() etc.
 127         // If any string is relevant, then all strings need to be used for
 128         // span(longest match) but only the relevant ones for span(while contained).
 129         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
 130         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
 131         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
 132         // Also determine the maximum UTF-16 length of the strings (maxLength16).
 133         int stringsLength = strings.size();
 134 
 135         int i, spanLength;
 136         someRelevant = false;
 137         for (i = 0; i < stringsLength; ++i) {
 138             String string = strings.get(i);
 139             int length16 = string.length();
 140             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
 141             if (spanLength < length16) { // Relevant string: spanSet alone does not cover all of it.
 142                 someRelevant = true;
 143             }
 144             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
 145                 maxLength16 = length16;
 146             }
 147         }
 148         if (!someRelevant && (which & WITH_COUNT) == 0) {
 149             return; // No string matters and spanAndCount() was not requested: spanLengths stays unallocated.
 150         }
 151 
 152         // Freeze after checking for the need to use strings at all because freezing
 153         // a set takes some time and memory which are wasted if there are no relevant strings.
 154         if (all) {
 155             spanSet.freeze();
 156         }
 157 
 158         int spanBackLengthsOffset;
 159 
 160         // Allocate a block of meta data.
 161         int allocSize;
 162         if (all) {
 163             // 2 sets of span lengths
 164             allocSize = stringsLength * (2);
 165         } else {
 166             allocSize = stringsLength; // One set of span lengths.
 167         }
 168         spanLengths = new short[allocSize];
 169 
 170         if (all) {
 171             // Store span lengths for all span() variants.
 172             spanBackLengthsOffset = stringsLength;
 173         } else {
 174             // Store span lengths for only one span() variant.
 175             spanBackLengthsOffset = 0;
 176         }
 177 
 178         // Set the meta data (span lengths) and spanNotSet. No UTF-8 strings are written in this Java code.
 179 
 180         for (i = 0; i < stringsLength; ++i) {
 181             String string = strings.get(i);
 182             int length16 = string.length();
 183             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
 184             if (spanLength < length16) { // Relevant string.
 185                 if (true /* 0 != (which & UTF16) */) {
 186                     if (0 != (which & CONTAINED)) {
 187                         if (0 != (which & FWD)) {
 188                             spanLengths[i] = makeSpanLengthByte(spanLength);
 189                         }
 190                         if (0 != (which & BACK)) {
 191                             spanLength = length16
 192                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
 193                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
 194                         }
 195                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
 196                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
 197                                                                                      // flag.
 198                     }
 199                 }
 200                 if (0 != (which & NOT_CONTAINED)) {
 201                     // Add string start and end code points to the spanNotSet so that
 202                     // a span(while not contained) stops before any string.
 203                     int c;
 204                     if (0 != (which & FWD)) {
 205                         c = string.codePointAt(0);
 206                         addToSpanNotSet(c);
 207                     }
 208                     if (0 != (which & BACK)) {
 209                         c = string.codePointBefore(length16);
 210                         addToSpanNotSet(c);
 211                     }
 212                 }
 213             } else { // Irrelevant string.
 214                 if (all) {
 215                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
 216                 } else {
 217                     // Only one set of span lengths is stored here (spanBackLengthsOffset == 0).
 218                     spanLengths[i] = ALL_CP_CONTAINED;
 219                 }
 220             }
 221         }
 222 
 223         // Finish.
 224         if (all) {
 225             spanNotSet.freeze(); // Non-null here: ALL includes the NOT_CONTAINED bit, so it was assigned above.
 226         }
 227     }
 228 
 229     /**
 230      * Do the strings need to be checked in span() etc.?
 231      *
 232      * @return true if strings need to be checked (call span() here),
 233      *         false if not (use a BMPSet for best performance).
 234      */
 235     public boolean needsStringSpanUTF16() {
 236         return this.someRelevant; // Set by the constructor when some string is not fully covered by spanSet.
 237     }
 238 
 239     /** For fast UnicodeSet::contains(c). */
 240     public boolean contains(int c) {
 241         return this.spanSet.contains(c); // Only spanSet is consulted here, not the strings list.
 242     }
 243 
 244     /**
 245      * Adds a starting or ending string character to the spanNotSet
 246      * so that a character span ends before any string.
 247      */
 248     private void addToSpanNotSet(int c) {
 249         if (spanNotSet == null || spanNotSet == spanSet) {
 250             if (spanSet.contains(c)) {
 251                 return; // Nothing to do.
 252             }
 253             spanNotSet = spanSet.cloneAsThawed();
 254         }
 255         spanNotSet.add(c);
 256     }
 257 
 258     /*
 259      * Note: In span() when spanLength==0
 260      * (after a string match, or at the beginning after an empty code point span)
 261      * and in spanNot() and spanNotUTF8(),
 262      * string matching could use a binary search because all string matches are done
 263      * from the same start index.
 264      *
 265      * For UTF-8, this would require a comparison function that returns UTF-16 order.
 266      *
 267      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
 268      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
 269      * API and implementation with a DFA (state machine).
 270      */
 271 
 272     /*
 273      * Algorithm for span(SpanCondition.CONTAINED)
 274      *
 275      * Theoretical algorithm:
 276      * - Iterate through the string, and at each code point boundary:
 277      *   + If the code point there is in the set, then remember to continue after it.
 278      *   + If a set string matches at the current position, then remember to continue after it.
 279      *   + Either recursively span for each code point or string match, or recursively span
 280      *     for all but the shortest one and iteratively continue the span with the shortest local match.
 281      *   + Remember the longest recursive span (the farthest end point).
 282      *   + If there is no match at the current position,
 283      *     neither for the code point there nor for any set string,
 284      *     then stop and return the longest recursive span length.
 285      *
 286      * Optimized implementation:
 287      *
 288      * (We assume that most sets will have very few very short strings.
 289      * A span using a string-less set is extremely fast.)
 290      *
 291      * Create and cache a spanSet which contains all of the single code points of the original set
 292      * but none of its strings.
 293      *
 294      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
 295      * - Loop:
 296      *   + Try to match each set string at the end of the spanLength.
 297      *     ~ Set strings that start with set-contained code points
 298      *       must be matched with a partial overlap
 299      *       because the recursive algorithm would have tried to match them at every position.
 300      *     ~ Set strings that entirely consist of set-contained code points
 301      *       are irrelevant for span(SpanCondition.CONTAINED)
 302      *       because the recursive algorithm would continue after them anyway and
 303      *       find the longest recursive match from their end.
 304      *     ~ Rather than recursing, note each end point of a set string match.
 305      *   + If no set string matched after spanSet.span(),
 306      *     then return with where the spanSet.span() ended.
 307      *   + If at least one set string matched after spanSet.span(),
 308      *     then pop the shortest string match end point and continue the loop,
 309      *     trying to match all set strings from there.
 310      *   + If at least one more set string matched after a previous string match, then test if the
 311      *     code point after the previous string match is also contained in the set.
 312      *     Continue the loop with the shortest end point of
 313      *     either this code point or a matching set string.
 314      *   + If no more set string matched after a previous string match,
 315      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
 316      *     Stop if spanLength==0, otherwise continue the loop.
 317      *
 318      * By noting each end point of a set string match, the function visits each string position at most once and
 319      * finishes in linear time.
 320      *
 321      * The recursive algorithm may visit the same string position many times
 322      * if multiple paths lead to it and finishes in exponential time.
 323      */
 324 
 325     /*
 326      * Algorithm for span(SIMPLE)
 327      *
 328      * Theoretical algorithm:
 329      * - Iterate through the string, and at each code point boundary:
 330      *   + If the code point there is in the set, then remember to continue after it.
 331      *   + If a set string matches at the current position, then remember to continue after it.
 332      *   + Continue from the farthest match position and ignore all others.
 333      *   + If there is no match at the current position, then stop and return the current position.
 334      *
 335      * Optimized implementation:
 336      *
 337      * (Same assumption and spanSet as above.)
 338      *
 339      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
 340      * - Loop:
 341      *   + Try to match each set string at the end of the spanLength.
 342      *     ~ Set strings that start with set-contained code points
 343      *       must be matched with a partial overlap
 344      *       because the standard algorithm would have tried to match them earlier.
 345      *     ~ Set strings that entirely consist of set-contained code points
 346      *       must be matched with a full overlap because the longest-match algorithm
 347      *       would hide set string matches that end earlier.
 348      *       Such set strings need not be matched earlier inside the code point span
 349      *       because the standard algorithm would then have
 350      *       continued after the set string match anyway.
 351      *     ~ Remember the longest set string match (farthest end point)
 352      *       from the earliest starting point.
 353      *   + If no set string matched after spanSet.span(),
 354      *     then return with where the spanSet.span() ended.
 355      *   + If at least one set string matched,
 356      *     then continue the loop after the longest match from the earliest position.
 357      *   + If no more set string matched after a previous string match,
 358      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
 359      *     Stop if spanLength==0, otherwise continue the loop.
 360      */
 361     /**
 362      * Spans a string.
 363      *
 364      * @param s The string to be spanned
 365      * @param start The start index that the span begins
 366      * @param spanCondition The span condition
 367      * @return the limit (exclusive end) of the span
 368      */
 369     public int span(CharSequence s, int start, SpanCondition spanCondition) {
 370         if (SpanCondition.NOT_CONTAINED == spanCondition) {
 371             return spanNot(s, start, null);
 372         }
 373         // Unsynchronized fast path: if the plain code point span already reaches the end, strings are not needed.
 374         final int cpSpanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
 375         return (cpSpanLimit == s.length())
 376                 ? cpSpanLimit
 377                 : spanWithStrings(s, start, cpSpanLimit, spanCondition);
 378     }
 379 
 380     /**
 381      * Synchronized method for complicated spans using the offsets.
 382      * Avoids synchronization for simple cases.
 383      *
 384      * @param spanLimit = spanSet.span(s, start, CONTAINED)
 385      */
 386     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
 387             SpanCondition spanCondition) {
 388         // Consider strings; they may overlap with the span.
 389         int initSize = 0;
 390         if (spanCondition == SpanCondition.CONTAINED) {
 391             // Use offset list to try all possibilities.
 392             initSize = maxLength16;
 393         }
 394         offsets.setMaxLength(initSize); // initSize stays 0 for SIMPLE; that branch below never adds offsets.
 395         int length = s.length();
 396         int pos = spanLimit, rest = length - spanLimit; // pos: current position; rest: chars from pos to the end.
 397         int spanLength = spanLimit - start; // Length of the code point span that ends at pos.
 398         int i, stringsLength = strings.size();
 399         for (;;) {
 400             if (spanCondition == SpanCondition.CONTAINED) {
 401                 for (i = 0; i < stringsLength; ++i) {
 402                     int overlap = spanLengths[i];
 403                     if (overlap == ALL_CP_CONTAINED) {
 404                         continue; // Irrelevant string.
 405                     }
 406                     String string = strings.get(i);
 407 
 408                     int length16 = string.length();
 409 
 410                     // Try to match this string at pos-overlap..pos.
 411                     if (overlap >= LONG_SPAN) {
 412                         overlap = length16;
 413                         // While contained: No point matching fully inside the code point span.
 414                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
 415                                                                           // point.
 416                     }
 417                     if (overlap > spanLength) {
 418                         overlap = spanLength; // Cannot reach back before the start of the current code point span.
 419                     }
 420                     int inc = length16 - overlap; // Keep overlap+inc==length16.
 421                     for (;;) {
 422                         if (inc > rest) {
 423                             break; // The string would extend past the end of s.
 424                         }
 425                         // Try to match if the increment is not listed already.
 426                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
 427                             if (inc == rest) {
 428                                 return length; // Reached the end of the string.
 429                             }
 430                             offsets.addOffset(inc);
 431                         }
 432                         if (overlap == 0) {
 433                             break;
 434                         }
 435                         --overlap;
 436                         ++inc;
 437                     }
 438                 }
 439             } else /* SIMPLE */{
 440                 int maxInc = 0, maxOverlap = 0;
 441                 for (i = 0; i < stringsLength; ++i) {
 442                     int overlap = spanLengths[i];
 443                     // For longest match, we do need to try to match even an all-contained string
 444                     // to find the match from the earliest start.
 445 
 446                     String string = strings.get(i);
 447 
 448                     int length16 = string.length();
 449 
 450                     // Try to match this string at pos-overlap..pos.
 451                     if (overlap >= LONG_SPAN) {
 452                         overlap = length16;
 453                         // Longest match: Need to match fully inside the code point span
 454                         // to find the match from the earliest start.
 455                     }
 456                     if (overlap > spanLength) {
 457                         overlap = spanLength;
 458                     }
 459                     int inc = length16 - overlap; // Keep overlap+inc==length16.
 460                     for (;;) {
 461                         if (inc > rest || overlap < maxOverlap) {
 462                             break; // Past the end of s, or it would start later than the best match so far.
 463                         }
 464                         // Try to match if the string is longer or starts earlier.
 465                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
 466                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
 467                             maxInc = inc; // Longest match from earliest start.
 468                             maxOverlap = overlap;
 469                             break;
 470                         }
 471                         --overlap;
 472                         ++inc;
 473                     }
 474                 }
 475 
 476                 if (maxInc != 0 || maxOverlap != 0) {
 477                     // Longest-match algorithm, and there was a string match.
 478                     // Simply continue after it.
 479                     pos += maxInc;
 480                     rest -= maxInc;
 481                     if (rest == 0) {
 482                         return length; // Reached the end of the string.
 483                     }
 484                     spanLength = 0; // Match strings from after a string match.
 485                     continue;
 486                 }
 487             }
 488             // Finished trying to match all strings at pos.
 489 
 490             if (spanLength != 0 || pos == 0) {
 491                 // The position is after an unlimited code point span (spanLength!=0),
 492                 // not after a string match.
 493                 // The only position where spanLength==0 after a span is pos==0.
 494                 // Otherwise, an unlimited code point span is only tried again when no
 495                 // strings match, and if such a non-initial span fails we stop.
 496                 if (offsets.isEmpty()) {
 497                     return pos; // No strings matched after a span.
 498                 }
 499                 // Match strings from after the next string match.
 500             } else {
 501                 // The position is after a string match (or a single code point).
 502                 if (offsets.isEmpty()) {
 503                     // No more strings matched after a previous string match.
 504                     // Try another code point span from after the last string match.
 505                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
 506                     spanLength = spanLimit - pos;
 507                     if (spanLength == rest || // Reached the end of the string, or
 508                             spanLength == 0 // neither strings nor span progressed.
 509                     ) {
 510                         return spanLimit;
 511                     }
 512                     pos += spanLength;
 513                     rest -= spanLength;
 514                     continue; // spanLength>0: Match strings from after a span.
 515                 } else {
 516                     // Try to match only one code point from after a string match if some
 517                     // string matched beyond it, so that we try all possible positions
 518                     // and don't overshoot.
 519                     spanLength = spanOne(spanSet, s, pos, rest);
 520                     if (spanLength > 0) {
 521                         if (spanLength == rest) {
 522                             return length; // Reached the end of the string.
 523                         }
 524                         // Match strings after this code point.
 525                         // There cannot be any increments below it because UnicodeSet strings
 526                         // contain multiple code points.
 527                         pos += spanLength;
 528                         rest -= spanLength;
 529                         offsets.shift(spanLength); // Keep the pending offsets relative to the new pos.
 530                         spanLength = 0;
 531                         continue; // Match strings from after a single code point.
 532                     }
 533                     // Match strings from after the next string match.
 534                 }
 535             }
 536             int minOffset = offsets.popMinimum(null); // No count is needed here, so no OutputInt is passed.
 537             pos += minOffset;
 538             rest -= minOffset;
 539             spanLength = 0; // Match strings from after a string match.
 540         }
 541     }
 542 
 543     /**
 544      * Spans a string and counts the smallest number of set elements on any path across the span.
 545      *
 546      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
 547      *
 548      * <p>If the set does not have any fully-contained strings, then we could optimize this
 549      * like span(), but such sets are likely rare, and this is at least still linear.
 550      *
 551      * @param s The string to be spanned
 552      * @param start The start index that the span begins
 553      * @param spanCondition The span condition
 554      * @param outCount The count
 555      * @return the limit (exclusive end) of the span
 556      */
 557     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
 558             OutputInt outCount) {
 559         // NOT_CONTAINED and CONTAINED have dedicated implementations; anything else is handled as SIMPLE.
 560         if (SpanCondition.NOT_CONTAINED == spanCondition) {
 561             return spanNot(s, start, outCount);
 562         } else if (SpanCondition.CONTAINED == spanCondition) {
 563             // Strings may overlap with the span, and they may give a smaller count than code points alone.
 564             return spanContainedAndCount(s, start, outCount);
 565         }
 566         // SIMPLE: take the longest match at each position (not synchronized, does not use offsets).
 567         final int numStrings = strings.size();
 568         final int limit = s.length();
 569         int index = start;
 570         int remaining = limit - start;
 571         int elements = 0;
 572         // index and remaining always move by the same amount, so remaining == limit - index throughout.
 573         while (remaining != 0) {
 574             // Try to match the code point at index; a positive spanOne() result is used as its length.
 575             final int cpLength = spanOne(spanSet, s, index, remaining);
 576             int longest = Math.max(cpLength, 0);
 577             // A string is only tried if it is strictly longer than the best match so far and still fits.
 578             for (int k = 0; k < numStrings; ++k) {
 579                 final String candidate = strings.get(k);
 580                 final int candidateLength = candidate.length();
 581                 final boolean couldWin = candidateLength > longest && candidateLength <= remaining;
 582                 if (couldWin && matches16CPB(s, index, limit, candidate, candidateLength)) {
 583                     longest = candidateLength;
 584                 }
 585             }
 586             // We are done if there is no match beyond index.
 587             if (longest == 0) {
 588                 break;
 589             }
 590             // Count one set element and continue from the end of the longest match.
 591             ++elements;
 592             index += longest;
 593             remaining -= longest;
 594         }
 595         // Both exits (end of s, or no match at index) report the count and the current position.
 596         outCount.value = elements;
 597         return index;
 598     }
 599 
 600     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
 601         // Use offset list to try all possibilities.
 602         offsets.setMaxLength(maxLength16);
 603         int stringsLength = strings.size();
 604         int length = s.length();
 605         int pos = start;
 606         int rest = length - start;
 607         int count = 0;
 608         while (rest != 0) {
 609             // Try to match the next code point.
 610             int cpLength = spanOne(spanSet, s, pos, rest);
 611             if (cpLength > 0) {
 612                 offsets.addOffsetAndCount(cpLength, count + 1);
 613             }
 614             // Try to match all of the strings.
 615             for (int i = 0; i < stringsLength; ++i) {
 616                 String string = strings.get(i);
 617                 int length16 = string.length();
 618                 // Note: If the strings were sorted by length, then we could also
 619                 // avoid trying to match if there is already a match of the same length.
 620                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
 621                         matches16CPB(s, pos, length, string, length16)) {
 622                     offsets.addOffsetAndCount(length16, count + 1);
 623                 }
 624             }
 625             // We are done if there is no match beyond pos.
 626             if (offsets.isEmpty()) {
 627                 outCount.value = count;
 628                 return pos;
 629             }
 630             // Continue from the nearest match.
 631             int minOffset = offsets.popMinimum(outCount);
 632             count = outCount.value;
 633             pos += minOffset;
 634             rest -= minOffset;
 635         }
 636         outCount.value = count;
 637         return pos;
 638     }
 639 
 640     /**
 641      * Span a string backwards.
          *
          * <p>For {@code SpanCondition.NOT_CONTAINED} this delegates to {@code spanNotBack()}.
          * Otherwise it first spans back over code points of {@code spanSet} and then
          * tries to match the set strings before/overlapping that span. For
          * {@code SpanCondition.CONTAINED} every matching string start is recorded in the
          * {@code offsets} list and tried in turn; for any other condition (the branch
          * marked "SIMPLE" below) only the longest match from the latest end is followed.
          *
          * <p>The method is synchronized and mutates this instance's {@code offsets} list.
 642      *
 643      * @param s The string to be spanned
          * @param length The limit (exclusive end index) in {@code s} from which to span backwards
 644      * @param spanCondition The span condition
 645      * @return The string index which starts the span (i.e. inclusive).
 646      */
 647     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
 648         if (spanCondition == SpanCondition.NOT_CONTAINED) {
 649             return spanNotBack(s, length);
 650         }
              // Start with a plain code point span; strings are considered afterwards.
 651         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
 652         if (pos == 0) {
                  // The code point span alone already reaches the start of the string.
 653             return 0;
 654         }
              // Number of code units covered by the code point span that ends at length.
 655         int spanLength = length - pos;
 656
 657         // Consider strings; they may overlap with the span.
 658         int initSize = 0;
 659         if (spanCondition == SpanCondition.CONTAINED) {
 660             // Use offset list to try all possibilities.
 661             initSize = maxLength16;
 662         }
              // setMaxLength() also clears the list, so each call starts with no stored offsets.
 663         offsets.setMaxLength(initSize);
 664         int i, stringsLength = strings.size();
              // With `all` set, this method reads spanLengths[stringsLength + i] instead of spanLengths[i].
 665         int spanBackLengthsOffset = 0;
 666         if (all) {
 667             spanBackLengthsOffset = stringsLength;
 668         }
              // Each iteration tries all strings at the current pos, then moves pos further back
              // or returns.
 669         for (;;) {
 670             if (spanCondition == SpanCondition.CONTAINED) {
 671                 for (i = 0; i < stringsLength; ++i) {
 672                     int overlap = spanLengths[spanBackLengthsOffset + i];
 673                     if (overlap == ALL_CP_CONTAINED) {
 674                         continue; // Irrelevant string.
 675                     }
 676                     String string = strings.get(i);
 677
 678                     int length16 = string.length();
 679
 680                     // Try to match this string at pos-(length16-overlap)..pos-length16.
 681                     if (overlap >= LONG_SPAN) {
 682                         overlap = length16;
 683                         // While contained: No point matching fully inside the code point span.
                              // (The 0 initializer below is overwritten by the next statement.)
 684                         int len1 = 0;
 685                         len1 = string.offsetByCodePoints(0, 1);
 686                         overlap -= len1; // Length of the string minus the first code point.
 687                     }
                          // The overlap cannot exceed the span that actually precedes pos..length.
 688                     if (overlap > spanLength) {
 689                         overlap = spanLength;
 690                     }
 691                     int dec = length16 - overlap; // Keep dec+overlap==length16.
                          // Slide the candidate start backwards one code unit at a time
                          // (dec grows, overlap shrinks) until the overlap is used up.
 692                     for (;;) {
 693                         if (dec > pos) {
                                  // The string would start before index 0.
 694                             break;
 695                         }
 696                         // Try to match if the decrement is not listed already.
 697                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
 698                             if (dec == pos) {
 699                                 return 0; // Reached the start of the string.
 700                             }
 701                             offsets.addOffset(dec);
 702                         }
 703                         if (overlap == 0) {
 704                             break;
 705                         }
 706                         --overlap;
 707                         ++dec;
 708                     }
 709                 }
 710             } else /* SIMPLE */{
                      // Best match found so far at this pos: largest overlap, then largest decrement.
 711                 int maxDec = 0, maxOverlap = 0;
 712                 for (i = 0; i < stringsLength; ++i) {
 713                     int overlap = spanLengths[spanBackLengthsOffset + i];
 714                     // For longest match, we do need to try to match even an all-contained string
 715                     // to find the match from the latest end.
 716
 717                     String string = strings.get(i);
 718
 719                     int length16 = string.length();
 720
 721                     // Try to match this string at pos-(length16-overlap)..pos-length16.
 722                     if (overlap >= LONG_SPAN) {
 723                         overlap = length16;
 724                         // Longest match: Need to match fully inside the code point span
 725                         // to find the match from the latest end.
 726                     }
 727                     if (overlap > spanLength) {
 728                       overlap = spanLength;
 729                     }
 730                     int dec = length16 - overlap; // Keep dec+overlap==length16.
 731                     for (;;) {
                              // Stop when the string would start before index 0, or when this
                              // candidate can no longer beat the best overlap found so far.
 732                         if (dec > pos || overlap < maxOverlap) {
 733                             break;
 734                         }
 735                         // Try to match if the string is longer or ends later.
 736                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
 737                                 && matches16CPB(s, pos - dec, length, string, length16)) {
 738                             maxDec = dec; // Longest match from latest end.
 739                             maxOverlap = overlap;
 740                             break;
 741                         }
 742                         --overlap;
 743                         ++dec;
 744                     }
 745                 }
 746
 747                 if (maxDec != 0 || maxOverlap != 0) {
 748                     // Longest-match algorithm, and there was a string match.
 749                     // Simply continue before it.
 750                     pos -= maxDec;
 751                     if (pos == 0) {
 752                         return 0; // Reached the start of the string.
 753                     }
 754                     spanLength = 0; // Match strings from before a string match.
 755                     continue;
 756                 }
 757             }
 758             // Finished trying to match all strings at pos.
 759
 760             if (spanLength != 0 || pos == length) {
 761                 // The position is before an unlimited code point span (spanLength!=0),
 762                 // not before a string match.
 763                 // The only position where spanLength==0 before a span is pos==length.
 764                 // Otherwise, an unlimited code point span is only tried again when no
 765                 // strings match, and if such a non-initial span fails we stop.
 766                 if (offsets.isEmpty()) {
 767                     return pos; // No strings matched before a span.
 768                 }
 769                 // Match strings from before the next string match.
 770             } else {
 771                 // The position is before a string match (or a single code point).
 772                 if (offsets.isEmpty()) {
 773                     // No more strings matched before a previous string match.
 774                     // Try another code point span from before the last string match.
 775                     int oldPos = pos;
 776                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
 777                     spanLength = oldPos - pos;
 778                     if (pos == 0 || // Reached the start of the string, or
 779                             spanLength == 0 // neither strings nor span progressed.
 780                     ) {
 781                         return pos;
 782                     }
 783                     continue; // spanLength>0: Match strings from before a span.
 784                 } else {
 785                     // Try to match only one code point from before a string match if some
 786                     // string matched beyond it, so that we try all possible positions
 787                     // and don't overshoot.
 788                     spanLength = spanOneBack(spanSet, s, pos);
 789                     if (spanLength > 0) {
 790                         if (spanLength == pos) {
 791                             return 0; // Reached the start of the string.
 792                         }
 793                         // Match strings before this code point.
 794                         // There cannot be any decrements below it because UnicodeSet strings
 795                         // contain multiple code points.
 796                         pos -= spanLength;
                              // Keep the stored offsets relative to the new pos.
 797                         offsets.shift(spanLength);
 798                         spanLength = 0;
 799                         continue; // Match strings from before a single code point.
 800                     }
 801                     // Match strings from before the next string match.
 802                 }
 803             }
                  // Move back to the nearest recorded string match; no count is requested (null).
 804             pos -= offsets.popMinimum(null);
 805             spanLength = 0; // Match strings from before a string match.
 806         }
 807     }
 808 
 809     /**
 810      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
 811      *
 812      * Theoretical algorithm:
 813      * - Iterate through the string, and at each code point boundary:
 814      *   + If the code point there is in the set, then return with the current position.
 815      *   + If a set string matches at the current position, then return with the current position.
 816      *
 817      * Optimized implementation:
 818      *
 819      * (Same assumption as for span() above.)
 820      *
 821      * Create and cache a spanNotSet which contains
 822      * all of the single code points of the original set but none of its strings.
 823      * For each set string add its initial code point to the spanNotSet.
 824      * (Also add its final code point for spanNotBack().)
 825      *
 826      * - Loop:
 827      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
 828      *   + If the current code point is in the original set, then return the current position.
 829      *   + If any set string matches at the current position, then return the current position.
 830      *   + If there is no match at the current position, neither for the code point
 831      *     there nor for any set string, then skip this code point and continue the loop.
 832      *     This happens for set-string-initial code points that were added to spanNotSet
 833      *     when there is not actually a match for such a set string.
 834      *
 835      * @param s The string to be spanned
 836      * @param start The start index that the span begins
 837      * @param outCount If not null: Receives the number of code points across the span.
 838      * @return the limit (exclusive end) of the span
 839      */
 840     private int spanNot(CharSequence s, int start, OutputInt outCount) {
 841         int length = s.length();
 842         int pos = start, rest = length - start;
 843         int stringsLength = strings.size();
 844         int count = 0;
 845         do {
 846             // Span until we find a code point from the set,
 847             // or a code point that starts or ends some string.
 848             int spanLimit;
 849             if (outCount == null) {
 850                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
 851             } else {
 852                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
 853                 outCount.value = count = count + outCount.value;
 854             }
 855             if (spanLimit == length) {
 856                 return length; // Reached the end of the string.
 857             }
 858             pos = spanLimit;
 859             rest = length - spanLimit;
 860 
 861             // Check whether the current code point is in the original set,
 862             // without the string starts and ends.
 863             int cpLength = spanOne(spanSet, s, pos, rest);
 864             if (cpLength > 0) {
 865                 return pos; // There is a set element at pos.
 866             }
 867 
 868             // Try to match the strings at pos.
 869             for (int i = 0; i < stringsLength; ++i) {
 870                 if (spanLengths[i] == ALL_CP_CONTAINED) {
 871                     continue; // Irrelevant string.
 872                 }
 873                 String string = strings.get(i);
 874 
 875                 int length16 = string.length();
 876                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
 877                     return pos; // There is a set element at pos.
 878                 }
 879             }
 880 
 881             // The span(while not contained) ended on a string start/end which is
 882             // not in the original set. Skip this code point and continue.
 883             // cpLength<0
 884             pos -= cpLength;
 885             rest += cpLength;
 886             ++count;
 887         } while (rest != 0);
 888         if (outCount != null) {
 889             outCount.value = count;
 890         }
 891         return length; // Reached the end of the string.
 892     }
 893 
 894     private int spanNotBack(CharSequence s, int length) {
 895         int pos = length;
 896         int i, stringsLength = strings.size();
 897         do {
 898             // Span until we find a code point from the set,
 899             // or a code point that starts or ends some string.
 900             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
 901             if (pos == 0) {
 902                 return 0; // Reached the start of the string.
 903             }
 904 
 905             // Check whether the current code point is in the original set,
 906             // without the string starts and ends.
 907             int cpLength = spanOneBack(spanSet, s, pos);
 908             if (cpLength > 0) {
 909                 return pos; // There is a set element at pos.
 910             }
 911 
 912             // Try to match the strings at pos.
 913             for (i = 0; i < stringsLength; ++i) {
 914                 // Use spanLengths rather than a spanLengths pointer because
 915                 // it is easier and we only need to know whether the string is irrelevant
 916                 // which is the same in either array.
 917                 if (spanLengths[i] == ALL_CP_CONTAINED) {
 918                     continue; // Irrelevant string.
 919                 }
 920                 String string = strings.get(i);
 921 
 922                 int length16 = string.length();
 923                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
 924                     return pos; // There is a set element at pos.
 925                 }
 926             }
 927 
 928             // The span(while not contained) ended on a string start/end which is
 929             // not in the original set. Skip this code point and continue.
 930             // cpLength<0
 931             pos += cpLength;
 932         } while (pos != 0);
 933         return 0; // Reached the start of the string.
 934     }
 935 
 936     static short makeSpanLengthByte(int spanLength) {
 937         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
 938         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
 939     }
 940 
 941     // Compare strings without any argument checks. Requires length>0.
 942     private static boolean matches16(CharSequence s, int start, final String t, int length) {
 943         int end = start + length;
 944         while (length-- > 0) {
 945             if (s.charAt(--end) != t.charAt(length)) {
 946                 return false;
 947             }
 948         }
 949         return true;
 950     }
 951 
 952     /**
 953      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
 954      * at code point boundaries.
 955      * That is, each edge of a match must not be in the middle of a surrogate pair.
 956      * @param s       The string to match in.
 957      * @param start   The start index of s.
 958      * @param limit   The limit of the subsequence of s being spanned.
 959      * @param t       The substring to be matched in s.
 960      * @param tlength The length of t.
 961      */
 962     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
 963         return matches16(s, start, t, tlength)
 964                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
 965                         Character.isLowSurrogate(s.charAt(start)))
 966                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
 967                         Character.isLowSurrogate(s.charAt(start + tlength)));
 968     }
 969 
 970     /**
 971      * Does the set contain the next code point?
 972      * If so, return its length; otherwise return its negative length.
 973      */
 974     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
 975         char c = s.charAt(start);
 976         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
 977             char c2 = s.charAt(start + 1);
 978             if (UTF16.isTrailSurrogate(c2)) {
 979                 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 980                 return set.contains(supplementary) ? 2 : -2;
 981             }
 982         }
 983         return set.contains(c) ? 1 : -1;
 984     }
 985 
 986     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
 987         char c = s.charAt(length - 1);
 988         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
 989             char c2 = s.charAt(length - 2);
 990             if (UTF16.isLeadSurrogate(c2)) {
 991                 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 992                 return set.contains(supplementary) ? 2 : -2;
 993             }
 994         }
 995         return set.contains(c) ? 1 : -1;
 996     }
 997 
 998     /**
 999      * Helper class for UnicodeSetStringSpan.
1000      *
1001      * <p>List of offsets from the current position from where to try matching
1002      * a code point or a string.
1003      * Stores offsets rather than indexes to simplify the code and use the same list
1004      * for both increments (in span()) and decrements (in spanBack()).
1005      *
1006      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1007      * are relatively dense, that is,
1008      * there are normally no gaps of hundreds or thousands of offset values.
1009      *
1010      * <p>This class optionally also tracks the minimum non-negative count for each position,
1011      * intended to count the smallest number of elements of any path leading to that position.
1012      *
1013      * <p>The implementation uses a circular buffer of count integers,
1014      * each indicating whether the corresponding offset is in the list,
1015      * and its path element count.
1016      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1017      * and physically moving part of the list.
1018      *
1019      * <p>Note: In principle, the caller should setMaxLength() to
1020      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1021      * to account for "long" single code points.
1022      *
1023      * <p>Note: An earlier version did not track counts and stored only byte flags.
1024      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1025      * the list could be stored as bit flags in a single integer.
1026      * Rather than handling a circular buffer with a start list index,
1027      * the integer would simply be shifted when lower offsets are removed.
1028      * UnicodeSet does not have a limit on the lengths of strings.
1029      */
1030     private static final class OffsetList {
1031         private int[] list;
1032         private int length;
1033         private int start;
1034 
1035         public OffsetList() {
1036             list = new int[16];  // default size
1037         }
1038 
1039         public void setMaxLength(int maxLength) {
1040             if (maxLength > list.length) {
1041                 list = new int[maxLength];
1042             }
1043             clear();
1044         }
1045 
1046         public void clear() {
1047             for (int i = list.length; i-- > 0;) {
1048                 list[i] = 0;
1049             }
1050             start = length = 0;
1051         }
1052 
1053         public boolean isEmpty() {
1054             return (length == 0);
1055         }
1056 
1057         /**
1058          * Reduces all stored offsets by delta, used when the current position moves by delta.
1059          * There must not be any offsets lower than delta.
1060          * If there is an offset equal to delta, it is removed.
1061          *
1062          * @param delta [1..maxLength]
1063          */
1064         public void shift(int delta) {
1065             int i = start + delta;
1066             if (i >= list.length) {
1067                 i -= list.length;
1068             }
1069             if (list[i] != 0) {
1070                 list[i] = 0;
1071                 --length;
1072             }
1073             start = i;
1074         }
1075 
1076         /**
1077          * Adds an offset. The list must not contain it yet.
1078          * @param offset [1..maxLength]
1079          */
1080         public void addOffset(int offset) {
1081             int i = start + offset;
1082             if (i >= list.length) {
1083                 i -= list.length;
1084             }
1085             assert list[i] == 0;
1086             list[i] = 1;
1087             ++length;
1088         }
1089 
1090         /**
1091          * Adds an offset and updates its count.
1092          * The list may already contain the offset.
1093          * @param offset [1..maxLength]
1094          */
1095         public void addOffsetAndCount(int offset, int count) {
1096             assert count > 0;
1097             int i = start + offset;
1098             if (i >= list.length) {
1099                 i -= list.length;
1100             }
1101             if (list[i] == 0) {
1102                 list[i] = count;
1103                 ++length;
1104             } else if (count < list[i]) {
1105                 list[i] = count;
1106             }
1107         }
1108 
1109         /**
1110          * @param offset [1..maxLength]
1111          */
1112         public boolean containsOffset(int offset) {
1113             int i = start + offset;
1114             if (i >= list.length) {
1115                 i -= list.length;
1116             }
1117             return list[i] != 0;
1118         }
1119 
1120         /**
1121          * @param offset [1..maxLength]
1122          */
1123         public boolean hasCountAtOffset(int offset, int count) {
1124             int i = start + offset;
1125             if (i >= list.length) {
1126                 i -= list.length;
1127             }
1128             int oldCount = list[i];
1129             return oldCount != 0 && oldCount <= count;
1130         }
1131 
1132         /**
1133          * Finds the lowest stored offset from a non-empty list, removes it,
1134          * and reduces all other offsets by this minimum.
1135          * @return min=[1..maxLength]
1136          */
1137         public int popMinimum(OutputInt outCount) {
1138             // Look for the next offset in list[start+1..list.length-1].
1139             int i = start, result;
1140             while (++i < list.length) {
1141                 int count = list[i];
1142                 if (count != 0) {
1143                     list[i] = 0;
1144                     --length;
1145                     result = i - start;
1146                     start = i;
1147                     if (outCount != null) { outCount.value = count; }
1148                     return result;
1149                 }
1150             }
1151             // i==list.length
1152 
1153             // Wrap around and look for the next offset in list[0..start].
1154             // Since the list is not empty, there will be one.
1155             result = list.length - start;
1156             i = 0;
1157             int count;
1158             while ((count = list[i]) == 0) {
1159                 ++i;
1160             }
1161             list[i] = 0;
1162             --length;
1163             start = i;
1164             if (outCount != null) { outCount.value = count; }
1165             return result + i;
1166         }
1167     }
1168 }