1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  ******************************************************************************
  28  *
  29  *   Copyright (C) 2009-2014, International Business Machines
  30  *   Corporation and others.  All Rights Reserved.
  31  *
  32  ******************************************************************************
  33  */
  34 
  35 package sun.text.normalizer;
  36 
  37 import java.util.ArrayList;
  38 
  39 import sun.text.normalizer.UnicodeSet.SpanCondition;
  40 
  41 /*
  42  * Implement span() etc. for a set with strings.
  43  * Avoid recursion because of its exponential complexity.
  44  * Instead, try multiple paths at once and track them with an IndexList.
  45  */
  46 class UnicodeSetStringSpan {
  47 
  48     /*
  49      * Which span() variant will be used? The object is either built for one variant and used once,
  50      * or built for all and may be used many times.
  51      */
  52     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
  53     public static final int FWD           = 0x20;
  54     public static final int BACK          = 0x10;
  55     // public static final int UTF16      = 8;
  56     public static final int CONTAINED     = 2;
  57     public static final int NOT_CONTAINED = 1;
  58 
  59     public static final int ALL = 0x7f;
  60 
  61     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
  62     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
  63     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
  64     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
  65 
  66     /**
  67      * Special spanLength short values. (since Java has not unsigned byte type)
  68      * All code points in the string are contained in the parent set.
  69      */
  70     static final short ALL_CP_CONTAINED = 0xff;
  71 
  72     /** The spanLength is >=0xfe. */
  73     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
  74 
  75     /** Set for span(). Same as parent but without strings. */
  76     private UnicodeSet spanSet;
  77 
  78     /**
  79      * Set for span(not contained).
  80      * Same as spanSet, plus characters that start or end strings.
  81      */
  82     private UnicodeSet spanNotSet;
  83 
  84     /** The strings of the parent set. */
  85     private ArrayList<String> strings;
  86 
  87     /** The lengths of span(), spanBack() etc. for each string. */
  88     private short[] spanLengths;
  89 
  90     /** Maximum lengths of relevant strings. */
  91     private int maxLength16;
  92 
  93     /** Are there strings that are not fully contained in the code point set? */
  94     private boolean someRelevant;
  95 
  96     /** Set up for all variants of span()? */
  97     private boolean all;
  98 
  99     /** Span helper */
 100     private OffsetList offsets;
 101 
 102     /**
 103      * Constructs for all variants of span(), or only for any one variant.
 104      * Initializes as little as possible, for single use.
 105      */
 106     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
 107         spanSet = new UnicodeSet(0, 0x10ffff);
 108         // TODO: With Java 6, just take the parent set's strings as is,
 109         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
 110         // Then iterate via the first() and higher() methods.
 111         // (We do not want to create multiple Iterator objects in each span().)
 112         // See ICU ticket #7454.
 113         strings = setStrings;
 114         all = (which == ALL);
 115         spanSet.retainAll(set);
 116         if (0 != (which & NOT_CONTAINED)) {
 117             // Default to the same sets.
 118             // addToSpanNotSet() will create a separate set if necessary.
 119             spanNotSet = spanSet;
 120         }
 121         offsets = new OffsetList();
 122 
 123         // Determine if the strings even need to be taken into account at all for span() etc.
 124         // If any string is relevant, then all strings need to be used for
 125         // span(longest match) but only the relevant ones for span(while contained).
 126         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
 127         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
 128         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
 129         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
 130         int stringsLength = strings.size();
 131 
 132         int i, spanLength;
 133         someRelevant = false;
 134         for (i = 0; i < stringsLength; ++i) {
 135             String string = strings.get(i);
 136             int length16 = string.length();
 137             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
 138             if (spanLength < length16) { // Relevant string.
 139                 someRelevant = true;
 140             }
 141             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
 142                 maxLength16 = length16;
 143             }
 144         }
 145         if (!someRelevant && (which & WITH_COUNT) == 0) {
 146             return;
 147         }
 148 
 149         // Freeze after checking for the need to use strings at all because freezing
 150         // a set takes some time and memory which are wasted if there are no relevant strings.
 151         if (all) {
 152             spanSet.freeze();
 153         }
 154 
 155         int spanBackLengthsOffset;
 156 
 157         // Allocate a block of meta data.
 158         int allocSize;
 159         if (all) {
 160             // 2 sets of span lengths
 161             allocSize = stringsLength * (2);
 162         } else {
 163             allocSize = stringsLength; // One set of span lengths.
 164         }
 165         spanLengths = new short[allocSize];
 166 
 167         if (all) {
 168             // Store span lengths for all span() variants.
 169             spanBackLengthsOffset = stringsLength;
 170         } else {
 171             // Store span lengths for only one span() variant.
 172             spanBackLengthsOffset = 0;
 173         }
 174 
 175         // Set the meta data and spanNotSet and write the UTF-8 strings.
 176 
 177         for (i = 0; i < stringsLength; ++i) {
 178             String string = strings.get(i);
 179             int length16 = string.length();
 180             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
 181             if (spanLength < length16) { // Relevant string.
 182                 if (true /* 0 != (which & UTF16) */) {
 183                     if (0 != (which & CONTAINED)) {
 184                         if (0 != (which & FWD)) {
 185                             spanLengths[i] = makeSpanLengthByte(spanLength);
 186                         }
 187                         if (0 != (which & BACK)) {
 188                             spanLength = length16
 189                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
 190                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
 191                         }
 192                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
 193                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
 194                                                                                      // flag.
 195                     }
 196                 }
 197                 if (0 != (which & NOT_CONTAINED)) {
 198                     // Add string start and end code points to the spanNotSet so that
 199                     // a span(while not contained) stops before any string.
 200                     int c;
 201                     if (0 != (which & FWD)) {
 202                         c = string.codePointAt(0);
 203                         addToSpanNotSet(c);
 204                     }
 205                     if (0 != (which & BACK)) {
 206                         c = string.codePointBefore(length16);
 207                         addToSpanNotSet(c);
 208                     }
 209                 }
 210             } else { // Irrelevant string.
 211                 if (all) {
 212                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
 213                 } else {
 214                     // All spanXYZLengths pointers contain the same address.
 215                     spanLengths[i] = ALL_CP_CONTAINED;
 216                 }
 217             }
 218         }
 219 
 220         // Finish.
 221         if (all) {
 222             spanNotSet.freeze();
 223         }
 224     }
 225 
 226     /**
 227      * Do the strings need to be checked in span() etc.?
 228      *
 229      * @return true if strings need to be checked (call span() here),
 230      *         false if not (use a BMPSet for best performance).
 231      */
 232     public boolean needsStringSpanUTF16() {
 233         return someRelevant;
 234     }
 235 
 236     /** For fast UnicodeSet::contains(c). */
 237     public boolean contains(int c) {
 238         return spanSet.contains(c);
 239     }
 240 
 241     /**
 242      * Adds a starting or ending string character to the spanNotSet
 243      * so that a character span ends before any string.
 244      */
 245     private void addToSpanNotSet(int c) {
 246         if (spanNotSet == null || spanNotSet == spanSet) {
 247             if (spanSet.contains(c)) {
 248                 return; // Nothing to do.
 249             }
 250             spanNotSet = spanSet.cloneAsThawed();
 251         }
 252         spanNotSet.add(c);
 253     }
 254 
 255     /*
 256      * Note: In span() when spanLength==0
 257      * (after a string match, or at the beginning after an empty code point span)
 258      * and in spanNot() and spanNotUTF8(),
 259      * string matching could use a binary search because all string matches are done
 260      * from the same start index.
 261      *
 262      * For UTF-8, this would require a comparison function that returns UTF-16 order.
 263      *
 264      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
 265      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
 266      * API and implementation with a DFA (state machine).
 267      */
 268 
 269     /*
 270      * Algorithm for span(SpanCondition.CONTAINED)
 271      *
 272      * Theoretical algorithm:
 273      * - Iterate through the string, and at each code point boundary:
 274      *   + If the code point there is in the set, then remember to continue after it.
 275      *   + If a set string matches at the current position, then remember to continue after it.
 276      *   + Either recursively span for each code point or string match, or recursively span
 277      *     for all but the shortest one and iteratively continue the span with the shortest local match.
 278      *   + Remember the longest recursive span (the farthest end point).
 279      *   + If there is no match at the current position,
 280      *     neither for the code point there nor for any set string,
 281      *     then stop and return the longest recursive span length.
 282      *
 283      * Optimized implementation:
 284      *
 285      * (We assume that most sets will have very few very short strings.
 286      * A span using a string-less set is extremely fast.)
 287      *
 288      * Create and cache a spanSet which contains all of the single code points of the original set
 289      * but none of its strings.
 290      *
 291      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
 292      * - Loop:
 293      *   + Try to match each set string at the end of the spanLength.
 294      *     ~ Set strings that start with set-contained code points
 295      *       must be matched with a partial overlap
 296      *       because the recursive algorithm would have tried to match them at every position.
 297      *     ~ Set strings that entirely consist of set-contained code points
 298      *       are irrelevant for span(SpanCondition.CONTAINED)
 299      *       because the recursive algorithm would continue after them anyway and
 300      *       find the longest recursive match from their end.
 301      *     ~ Rather than recursing, note each end point of a set string match.
 302      *   + If no set string matched after spanSet.span(),
 303      *     then return with where the spanSet.span() ended.
 304      *   + If at least one set string matched after spanSet.span(),
 305      *     then pop the shortest string match end point and continue the loop,
 306      *     trying to match all set strings from there.
 307      *   + If at least one more set string matched after a previous string match, then test if the
 308      *     code point after the previous string match is also contained in the set.
 309      *     Continue the loop with the shortest end point of
 310      *     either this code point or a matching set string.
 311      *   + If no more set string matched after a previous string match,
 312      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
 313      *     Stop if spanLength==0, otherwise continue the loop.
 314      *
 315      * By noting each end point of a set string match, the function visits each string position at most once and
 316      * finishes in linear time.
 317      *
 318      * The recursive algorithm may visit the same string position many times
 319      * if multiple paths lead to it and finishes in exponential time.
 320      */
 321 
 322     /*
 323      * Algorithm for span(SIMPLE)
 324      *
 325      * Theoretical algorithm:
 326      * - Iterate through the string, and at each code point boundary:
 327      *   + If the code point there is in the set, then remember to continue after it.
 328      *   + If a set string matches at the current position, then remember to continue after it.
 329      *   + Continue from the farthest match position and ignore all others.
 330      *   + If there is no match at the current position, then stop and return the current position.
 331      *
 332      * Optimized implementation:
 333      *
 334      * (Same assumption and spanSet as above.)
 335      *
 336      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
 337      * - Loop:
 338      *   + Try to match each set string at the end of the spanLength.
 339      *     ~ Set strings that start with set-contained code points
 340      *       must be matched with a partial overlap
 341      *       because the standard algorithm would have tried to match them earlier.
 342      *     ~ Set strings that entirely consist of set-contained code points
 343      *       must be matched with a full overlap because the longest-match algorithm
 344      *       would hide set string matches that end earlier.
 345      *       Such set strings need not be matched earlier inside the code point span
 346      *       because the standard algorithm would then have
 347      *       continued after the set string match anyway.
 348      *     ~ Remember the longest set string match (farthest end point)
 349      *       from the earliest starting point.
 350      *   + If no set string matched after spanSet.span(),
 351      *     then return with where the spanSet.span() ended.
 352      *   + If at least one set string matched,
 353      *     then continue the loop after the longest match from the earliest position.
 354      *   + If no more set string matched after a previous string match,
 355      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
 356      *     Stop if spanLength==0, otherwise continue the loop.
 357      */
 358     /**
 359      * Spans a string.
 360      *
 361      * @param s The string to be spanned
 362      * @param start The start index that the span begins
 363      * @param spanCondition The span condition
 364      * @return the limit (exclusive end) of the span
 365      */
 366     public int span(CharSequence s, int start, SpanCondition spanCondition) {
 367         if (spanCondition == SpanCondition.NOT_CONTAINED) {
 368             return spanNot(s, start, null);
 369         }
 370         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
 371         if (spanLimit == s.length()) {
 372             return spanLimit;
 373         }
 374         return spanWithStrings(s, start, spanLimit, spanCondition);
 375     }
 376 
 377     /**
 378      * Synchronized method for complicated spans using the offsets.
 379      * Avoids synchronization for simple cases.
 380      *
 381      * @param spanLimit = spanSet.span(s, start, CONTAINED)
 382      */
 383     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
 384             SpanCondition spanCondition) {
 385         // Consider strings; they may overlap with the span.
 386         int initSize = 0;
 387         if (spanCondition == SpanCondition.CONTAINED) {
 388             // Use offset list to try all possibilities.
 389             initSize = maxLength16;
 390         }
 391         offsets.setMaxLength(initSize);
 392         int length = s.length();
 393         int pos = spanLimit, rest = length - spanLimit;
 394         int spanLength = spanLimit - start;
 395         int i, stringsLength = strings.size();
 396         for (;;) {
 397             if (spanCondition == SpanCondition.CONTAINED) {
 398                 for (i = 0; i < stringsLength; ++i) {
 399                     int overlap = spanLengths[i];
 400                     if (overlap == ALL_CP_CONTAINED) {
 401                         continue; // Irrelevant string.
 402                     }
 403                     String string = strings.get(i);
 404 
 405                     int length16 = string.length();
 406 
 407                     // Try to match this string at pos-overlap..pos.
 408                     if (overlap >= LONG_SPAN) {
 409                         overlap = length16;
 410                         // While contained: No point matching fully inside the code point span.
 411                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
 412                                                                           // point.
 413                     }
 414                     if (overlap > spanLength) {
 415                         overlap = spanLength;
 416                     }
 417                     int inc = length16 - overlap; // Keep overlap+inc==length16.
 418                     for (;;) {
 419                         if (inc > rest) {
 420                             break;
 421                         }
 422                         // Try to match if the increment is not listed already.
 423                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
 424                             if (inc == rest) {
 425                                 return length; // Reached the end of the string.
 426                             }
 427                             offsets.addOffset(inc);
 428                         }
 429                         if (overlap == 0) {
 430                             break;
 431                         }
 432                         --overlap;
 433                         ++inc;
 434                     }
 435                 }
 436             } else /* SIMPLE */{
 437                 int maxInc = 0, maxOverlap = 0;
 438                 for (i = 0; i < stringsLength; ++i) {
 439                     int overlap = spanLengths[i];
 440                     // For longest match, we do need to try to match even an all-contained string
 441                     // to find the match from the earliest start.
 442 
 443                     String string = strings.get(i);
 444 
 445                     int length16 = string.length();
 446 
 447                     // Try to match this string at pos-overlap..pos.
 448                     if (overlap >= LONG_SPAN) {
 449                         overlap = length16;
 450                         // Longest match: Need to match fully inside the code point span
 451                         // to find the match from the earliest start.
 452                     }
 453                     if (overlap > spanLength) {
 454                         overlap = spanLength;
 455                     }
 456                     int inc = length16 - overlap; // Keep overlap+inc==length16.
 457                     for (;;) {
 458                         if (inc > rest || overlap < maxOverlap) {
 459                             break;
 460                         }
 461                         // Try to match if the string is longer or starts earlier.
 462                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
 463                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
 464                             maxInc = inc; // Longest match from earliest start.
 465                             maxOverlap = overlap;
 466                             break;
 467                         }
 468                         --overlap;
 469                         ++inc;
 470                     }
 471                 }
 472 
 473                 if (maxInc != 0 || maxOverlap != 0) {
 474                     // Longest-match algorithm, and there was a string match.
 475                     // Simply continue after it.
 476                     pos += maxInc;
 477                     rest -= maxInc;
 478                     if (rest == 0) {
 479                         return length; // Reached the end of the string.
 480                     }
 481                     spanLength = 0; // Match strings from after a string match.
 482                     continue;
 483                 }
 484             }
 485             // Finished trying to match all strings at pos.
 486 
 487             if (spanLength != 0 || pos == 0) {
 488                 // The position is after an unlimited code point span (spanLength!=0),
 489                 // not after a string match.
 490                 // The only position where spanLength==0 after a span is pos==0.
 491                 // Otherwise, an unlimited code point span is only tried again when no
 492                 // strings match, and if such a non-initial span fails we stop.
 493                 if (offsets.isEmpty()) {
 494                     return pos; // No strings matched after a span.
 495                 }
 496                 // Match strings from after the next string match.
 497             } else {
 498                 // The position is after a string match (or a single code point).
 499                 if (offsets.isEmpty()) {
 500                     // No more strings matched after a previous string match.
 501                     // Try another code point span from after the last string match.
 502                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
 503                     spanLength = spanLimit - pos;
 504                     if (spanLength == rest || // Reached the end of the string, or
 505                             spanLength == 0 // neither strings nor span progressed.
 506                     ) {
 507                         return spanLimit;
 508                     }
 509                     pos += spanLength;
 510                     rest -= spanLength;
 511                     continue; // spanLength>0: Match strings from after a span.
 512                 } else {
 513                     // Try to match only one code point from after a string match if some
 514                     // string matched beyond it, so that we try all possible positions
 515                     // and don't overshoot.
 516                     spanLength = spanOne(spanSet, s, pos, rest);
 517                     if (spanLength > 0) {
 518                         if (spanLength == rest) {
 519                             return length; // Reached the end of the string.
 520                         }
 521                         // Match strings after this code point.
 522                         // There cannot be any increments below it because UnicodeSet strings
 523                         // contain multiple code points.
 524                         pos += spanLength;
 525                         rest -= spanLength;
 526                         offsets.shift(spanLength);
 527                         spanLength = 0;
 528                         continue; // Match strings from after a single code point.
 529                     }
 530                     // Match strings from after the next string match.
 531                 }
 532             }
 533             int minOffset = offsets.popMinimum(null);
 534             pos += minOffset;
 535             rest -= minOffset;
 536             spanLength = 0; // Match strings from after a string match.
 537         }
 538     }
 539 
 540     /**
 541      * Spans a string and counts the smallest number of set elements on any path across the span.
 542      *
 543      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
 544      *
 545      * <p>If the set does not have any fully-contained strings, then we could optimize this
 546      * like span(), but such sets are likely rare, and this is at least still linear.
 547      *
 548      * @param s The string to be spanned
 549      * @param start The start index that the span begins
 550      * @param spanCondition The span condition
 551      * @param outCount The count
 552      * @return the limit (exclusive end) of the span
 553      */
 554     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
 555             OutputInt outCount) {
 556         if (spanCondition == SpanCondition.NOT_CONTAINED) {
 557             return spanNot(s, start, outCount);
 558         }
 559         // Consider strings; they may overlap with the span,
 560         // and they may result in a smaller count that with just code points.
 561         if (spanCondition == SpanCondition.CONTAINED) {
 562             return spanContainedAndCount(s, start, outCount);
 563         }
 564         // SIMPLE (not synchronized, does not use offsets)
 565         int stringsLength = strings.size();
 566         int length = s.length();
 567         int pos = start;
 568         int rest = length - start;
 569         int count = 0;
 570         while (rest != 0) {
 571             // Try to match the next code point.
 572             int cpLength = spanOne(spanSet, s, pos, rest);
 573             int maxInc = (cpLength > 0) ? cpLength : 0;
 574             // Try to match all of the strings.
 575             for (int i = 0; i < stringsLength; ++i) {
 576                 String string = strings.get(i);
 577                 int length16 = string.length();
 578                 if (maxInc < length16 && length16 <= rest &&
 579                         matches16CPB(s, pos, length, string, length16)) {
 580                     maxInc = length16;
 581                 }
 582             }
 583             // We are done if there is no match beyond pos.
 584             if (maxInc == 0) {
 585                 outCount.value = count;
 586                 return pos;
 587             }
 588             // Continue from the longest match.
 589             ++count;
 590             pos += maxInc;
 591             rest -= maxInc;
 592         }
 593         outCount.value = count;
 594         return pos;
 595     }
 596 
 597     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
 598         // Use offset list to try all possibilities.
 599         offsets.setMaxLength(maxLength16);
 600         int stringsLength = strings.size();
 601         int length = s.length();
 602         int pos = start;
 603         int rest = length - start;
 604         int count = 0;
 605         while (rest != 0) {
 606             // Try to match the next code point.
 607             int cpLength = spanOne(spanSet, s, pos, rest);
 608             if (cpLength > 0) {
 609                 offsets.addOffsetAndCount(cpLength, count + 1);
 610             }
 611             // Try to match all of the strings.
 612             for (int i = 0; i < stringsLength; ++i) {
 613                 String string = strings.get(i);
 614                 int length16 = string.length();
 615                 // Note: If the strings were sorted by length, then we could also
 616                 // avoid trying to match if there is already a match of the same length.
 617                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
 618                         matches16CPB(s, pos, length, string, length16)) {
 619                     offsets.addOffsetAndCount(length16, count + 1);
 620                 }
 621             }
 622             // We are done if there is no match beyond pos.
 623             if (offsets.isEmpty()) {
 624                 outCount.value = count;
 625                 return pos;
 626             }
 627             // Continue from the nearest match.
 628             int minOffset = offsets.popMinimum(outCount);
 629             count = outCount.value;
 630             pos += minOffset;
 631             rest -= minOffset;
 632         }
 633         outCount.value = count;
 634         return pos;
 635     }
 636 
 637     /**
 638      * Span a string backwards.
 639      *
 640      * @param s The string to be spanned
 641      * @param spanCondition The span condition
 642      * @return The string index which starts the span (i.e. inclusive).
 643      */
 644     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
 645         if (spanCondition == SpanCondition.NOT_CONTAINED) {
 646             return spanNotBack(s, length);
 647         }
 648         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
 649         if (pos == 0) {
 650             return 0;
 651         }
 652         int spanLength = length - pos;
 653 
 654         // Consider strings; they may overlap with the span.
 655         int initSize = 0;
 656         if (spanCondition == SpanCondition.CONTAINED) {
 657             // Use offset list to try all possibilities.
 658             initSize = maxLength16;
 659         }
 660         offsets.setMaxLength(initSize);
 661         int i, stringsLength = strings.size();
 662         int spanBackLengthsOffset = 0;
 663         if (all) {
 664             spanBackLengthsOffset = stringsLength;
 665         }
 666         for (;;) {
 667             if (spanCondition == SpanCondition.CONTAINED) {
 668                 for (i = 0; i < stringsLength; ++i) {
 669                     int overlap = spanLengths[spanBackLengthsOffset + i];
 670                     if (overlap == ALL_CP_CONTAINED) {
 671                         continue; // Irrelevant string.
 672                     }
 673                     String string = strings.get(i);
 674 
 675                     int length16 = string.length();
 676 
 677                     // Try to match this string at pos-(length16-overlap)..pos-length16.
 678                     if (overlap >= LONG_SPAN) {
 679                         overlap = length16;
 680                         // While contained: No point matching fully inside the code point span.
 681                         int len1 = 0;
 682                         len1 = string.offsetByCodePoints(0, 1);
 683                         overlap -= len1; // Length of the string minus the first code point.
 684                     }
 685                     if (overlap > spanLength) {
 686                         overlap = spanLength;
 687                     }
 688                     int dec = length16 - overlap; // Keep dec+overlap==length16.
 689                     for (;;) {
 690                         if (dec > pos) {
 691                             break;
 692                         }
 693                         // Try to match if the decrement is not listed already.
 694                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
 695                             if (dec == pos) {
 696                                 return 0; // Reached the start of the string.
 697                             }
 698                             offsets.addOffset(dec);
 699                         }
 700                         if (overlap == 0) {
 701                             break;
 702                         }
 703                         --overlap;
 704                         ++dec;
 705                     }
 706                 }
 707             } else /* SIMPLE */{
 708                 int maxDec = 0, maxOverlap = 0;
 709                 for (i = 0; i < stringsLength; ++i) {
 710                     int overlap = spanLengths[spanBackLengthsOffset + i];
 711                     // For longest match, we do need to try to match even an all-contained string
 712                     // to find the match from the latest end.
 713 
 714                     String string = strings.get(i);
 715 
 716                     int length16 = string.length();
 717 
 718                     // Try to match this string at pos-(length16-overlap)..pos-length16.
 719                     if (overlap >= LONG_SPAN) {
 720                         overlap = length16;
 721                         // Longest match: Need to match fully inside the code point span
 722                         // to find the match from the latest end.
 723                     }
 724                     if (overlap > spanLength) {
 725                       overlap = spanLength;
 726                     }
 727                     int dec = length16 - overlap; // Keep dec+overlap==length16.
 728                     for (;;) {
 729                         if (dec > pos || overlap < maxOverlap) {
 730                             break;
 731                         }
 732                         // Try to match if the string is longer or ends later.
 733                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
 734                                 && matches16CPB(s, pos - dec, length, string, length16)) {
 735                             maxDec = dec; // Longest match from latest end.
 736                             maxOverlap = overlap;
 737                             break;
 738                         }
 739                         --overlap;
 740                         ++dec;
 741                     }
 742                 }
 743 
 744                 if (maxDec != 0 || maxOverlap != 0) {
 745                     // Longest-match algorithm, and there was a string match.
 746                     // Simply continue before it.
 747                     pos -= maxDec;
 748                     if (pos == 0) {
 749                         return 0; // Reached the start of the string.
 750                     }
 751                     spanLength = 0; // Match strings from before a string match.
 752                     continue;
 753                 }
 754             }
 755             // Finished trying to match all strings at pos.
 756 
 757             if (spanLength != 0 || pos == length) {
 758                 // The position is before an unlimited code point span (spanLength!=0),
 759                 // not before a string match.
 760                 // The only position where spanLength==0 before a span is pos==length.
 761                 // Otherwise, an unlimited code point span is only tried again when no
 762                 // strings match, and if such a non-initial span fails we stop.
 763                 if (offsets.isEmpty()) {
 764                     return pos; // No strings matched before a span.
 765                 }
 766                 // Match strings from before the next string match.
 767             } else {
 768                 // The position is before a string match (or a single code point).
 769                 if (offsets.isEmpty()) {
 770                     // No more strings matched before a previous string match.
 771                     // Try another code point span from before the last string match.
 772                     int oldPos = pos;
 773                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
 774                     spanLength = oldPos - pos;
 775                     if (pos == 0 || // Reached the start of the string, or
 776                             spanLength == 0 // neither strings nor span progressed.
 777                     ) {
 778                         return pos;
 779                     }
 780                     continue; // spanLength>0: Match strings from before a span.
 781                 } else {
 782                     // Try to match only one code point from before a string match if some
 783                     // string matched beyond it, so that we try all possible positions
 784                     // and don't overshoot.
 785                     spanLength = spanOneBack(spanSet, s, pos);
 786                     if (spanLength > 0) {
 787                         if (spanLength == pos) {
 788                             return 0; // Reached the start of the string.
 789                         }
 790                         // Match strings before this code point.
 791                         // There cannot be any decrements below it because UnicodeSet strings
 792                         // contain multiple code points.
 793                         pos -= spanLength;
 794                         offsets.shift(spanLength);
 795                         spanLength = 0;
 796                         continue; // Match strings from before a single code point.
 797                     }
 798                     // Match strings from before the next string match.
 799                 }
 800             }
 801             pos -= offsets.popMinimum(null);
 802             spanLength = 0; // Match strings from before a string match.
 803         }
 804     }
 805 
 806     /**
 807      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
 808      *
 809      * Theoretical algorithm:
 810      * - Iterate through the string, and at each code point boundary:
 811      *   + If the code point there is in the set, then return with the current position.
 812      *   + If a set string matches at the current position, then return with the current position.
 813      *
 814      * Optimized implementation:
 815      *
 816      * (Same assumption as for span() above.)
 817      *
 818      * Create and cache a spanNotSet which contains
 819      * all of the single code points of the original set but none of its strings.
 820      * For each set string add its initial code point to the spanNotSet.
 821      * (Also add its final code point for spanNotBack().)
 822      *
 823      * - Loop:
 824      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
 825      *   + If the current code point is in the original set, then return the current position.
 826      *   + If any set string matches at the current position, then return the current position.
 827      *   + If there is no match at the current position, neither for the code point
 828      *     there nor for any set string, then skip this code point and continue the loop.
 829      *     This happens for set-string-initial code points that were added to spanNotSet
 830      *     when there is not actually a match for such a set string.
 831      *
 832      * @param s The string to be spanned
 833      * @param start The start index that the span begins
 834      * @param outCount If not null: Receives the number of code points across the span.
 835      * @return the limit (exclusive end) of the span
 836      */
 837     private int spanNot(CharSequence s, int start, OutputInt outCount) {
 838         int length = s.length();
 839         int pos = start, rest = length - start;
 840         int stringsLength = strings.size();
 841         int count = 0;
 842         do {
 843             // Span until we find a code point from the set,
 844             // or a code point that starts or ends some string.
 845             int spanLimit;
 846             if (outCount == null) {
 847                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
 848             } else {
 849                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
 850                 outCount.value = count = count + outCount.value;
 851             }
 852             if (spanLimit == length) {
 853                 return length; // Reached the end of the string.
 854             }
 855             pos = spanLimit;
 856             rest = length - spanLimit;
 857 
 858             // Check whether the current code point is in the original set,
 859             // without the string starts and ends.
 860             int cpLength = spanOne(spanSet, s, pos, rest);
 861             if (cpLength > 0) {
 862                 return pos; // There is a set element at pos.
 863             }
 864 
 865             // Try to match the strings at pos.
 866             for (int i = 0; i < stringsLength; ++i) {
 867                 if (spanLengths[i] == ALL_CP_CONTAINED) {
 868                     continue; // Irrelevant string.
 869                 }
 870                 String string = strings.get(i);
 871 
 872                 int length16 = string.length();
 873                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
 874                     return pos; // There is a set element at pos.
 875                 }
 876             }
 877 
 878             // The span(while not contained) ended on a string start/end which is
 879             // not in the original set. Skip this code point and continue.
 880             // cpLength<0
 881             pos -= cpLength;
 882             rest += cpLength;
 883             ++count;
 884         } while (rest != 0);
 885         if (outCount != null) {
 886             outCount.value = count;
 887         }
 888         return length; // Reached the end of the string.
 889     }
 890 
 891     private int spanNotBack(CharSequence s, int length) {
 892         int pos = length;
 893         int i, stringsLength = strings.size();
 894         do {
 895             // Span until we find a code point from the set,
 896             // or a code point that starts or ends some string.
 897             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
 898             if (pos == 0) {
 899                 return 0; // Reached the start of the string.
 900             }
 901 
 902             // Check whether the current code point is in the original set,
 903             // without the string starts and ends.
 904             int cpLength = spanOneBack(spanSet, s, pos);
 905             if (cpLength > 0) {
 906                 return pos; // There is a set element at pos.
 907             }
 908 
 909             // Try to match the strings at pos.
 910             for (i = 0; i < stringsLength; ++i) {
 911                 // Use spanLengths rather than a spanLengths pointer because
 912                 // it is easier and we only need to know whether the string is irrelevant
 913                 // which is the same in either array.
 914                 if (spanLengths[i] == ALL_CP_CONTAINED) {
 915                     continue; // Irrelevant string.
 916                 }
 917                 String string = strings.get(i);
 918 
 919                 int length16 = string.length();
 920                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
 921                     return pos; // There is a set element at pos.
 922                 }
 923             }
 924 
 925             // The span(while not contained) ended on a string start/end which is
 926             // not in the original set. Skip this code point and continue.
 927             // cpLength<0
 928             pos += cpLength;
 929         } while (pos != 0);
 930         return 0; // Reached the start of the string.
 931     }
 932 
 933     static short makeSpanLengthByte(int spanLength) {
 934         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
 935         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
 936     }
 937 
 938     // Compare strings without any argument checks. Requires length>0.
 939     private static boolean matches16(CharSequence s, int start, final String t, int length) {
 940         int end = start + length;
 941         while (length-- > 0) {
 942             if (s.charAt(--end) != t.charAt(length)) {
 943                 return false;
 944             }
 945         }
 946         return true;
 947     }
 948 
 949     /**
 950      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
 951      * at code point boundaries.
 952      * That is, each edge of a match must not be in the middle of a surrogate pair.
 953      * @param s       The string to match in.
 954      * @param start   The start index of s.
 955      * @param limit   The limit of the subsequence of s being spanned.
 956      * @param t       The substring to be matched in s.
 957      * @param tlength The length of t.
 958      */
 959     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
 960         return matches16(s, start, t, tlength)
 961                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
 962                         Character.isLowSurrogate(s.charAt(start)))
 963                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
 964                         Character.isLowSurrogate(s.charAt(start + tlength)));
 965     }
 966 
 967     /**
 968      * Does the set contain the next code point?
 969      * If so, return its length; otherwise return its negative length.
 970      */
 971     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
 972         char c = s.charAt(start);
 973         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
 974             char c2 = s.charAt(start + 1);
 975             if (UTF16.isTrailSurrogate(c2)) {
 976                 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
 977                 return set.contains(supplementary) ? 2 : -2;
 978             }
 979         }
 980         return set.contains(c) ? 1 : -1;
 981     }
 982 
 983     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
 984         char c = s.charAt(length - 1);
 985         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
 986             char c2 = s.charAt(length - 2);
 987             if (UTF16.isLeadSurrogate(c2)) {
 988                 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
 989                 return set.contains(supplementary) ? 2 : -2;
 990             }
 991         }
 992         return set.contains(c) ? 1 : -1;
 993     }
 994 
 995     /**
 996      * Helper class for UnicodeSetStringSpan.
 997      *
 998      * <p>List of offsets from the current position from where to try matching
 999      * a code point or a string.
1000      * Stores offsets rather than indexes to simplify the code and use the same list
1001      * for both increments (in span()) and decrements (in spanBack()).
1002      *
1003      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1004      * are relatively dense, that is,
1005      * there are normally no gaps of hundreds or thousands of offset values.
1006      *
1007      * <p>This class optionally also tracks the minimum non-negative count for each position,
1008      * intended to count the smallest number of elements of any path leading to that position.
1009      *
1010      * <p>The implementation uses a circular buffer of count integers,
1011      * each indicating whether the corresponding offset is in the list,
1012      * and its path element count.
1013      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1014      * and physically moving part of the list.
1015      *
1016      * <p>Note: In principle, the caller should setMaxLength() to
1017      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1018      * to account for "long" single code points.
1019      *
1020      * <p>Note: An earlier version did not track counts and stored only byte flags.
1021      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1022      * the list could be stored as bit flags in a single integer.
1023      * Rather than handling a circular buffer with a start list index,
1024      * the integer would simply be shifted when lower offsets are removed.
1025      * UnicodeSet does not have a limit on the lengths of strings.
1026      */
1027     private static final class OffsetList {
1028         private int[] list;
1029         private int length;
1030         private int start;
1031 
1032         public OffsetList() {
1033             list = new int[16];  // default size
1034         }
1035 
1036         public void setMaxLength(int maxLength) {
1037             if (maxLength > list.length) {
1038                 list = new int[maxLength];
1039             }
1040             clear();
1041         }
1042 
1043         public void clear() {
1044             for (int i = list.length; i-- > 0;) {
1045                 list[i] = 0;
1046             }
1047             start = length = 0;
1048         }
1049 
1050         public boolean isEmpty() {
1051             return (length == 0);
1052         }
1053 
1054         /**
1055          * Reduces all stored offsets by delta, used when the current position moves by delta.
1056          * There must not be any offsets lower than delta.
1057          * If there is an offset equal to delta, it is removed.
1058          *
1059          * @param delta [1..maxLength]
1060          */
1061         public void shift(int delta) {
1062             int i = start + delta;
1063             if (i >= list.length) {
1064                 i -= list.length;
1065             }
1066             if (list[i] != 0) {
1067                 list[i] = 0;
1068                 --length;
1069             }
1070             start = i;
1071         }
1072 
1073         /**
1074          * Adds an offset. The list must not contain it yet.
1075          * @param offset [1..maxLength]
1076          */
1077         public void addOffset(int offset) {
1078             int i = start + offset;
1079             if (i >= list.length) {
1080                 i -= list.length;
1081             }
1082             assert list[i] == 0;
1083             list[i] = 1;
1084             ++length;
1085         }
1086 
1087         /**
1088          * Adds an offset and updates its count.
1089          * The list may already contain the offset.
1090          * @param offset [1..maxLength]
1091          */
1092         public void addOffsetAndCount(int offset, int count) {
1093             assert count > 0;
1094             int i = start + offset;
1095             if (i >= list.length) {
1096                 i -= list.length;
1097             }
1098             if (list[i] == 0) {
1099                 list[i] = count;
1100                 ++length;
1101             } else if (count < list[i]) {
1102                 list[i] = count;
1103             }
1104         }
1105 
1106         /**
1107          * @param offset [1..maxLength]
1108          */
1109         public boolean containsOffset(int offset) {
1110             int i = start + offset;
1111             if (i >= list.length) {
1112                 i -= list.length;
1113             }
1114             return list[i] != 0;
1115         }
1116 
1117         /**
1118          * @param offset [1..maxLength]
1119          */
1120         public boolean hasCountAtOffset(int offset, int count) {
1121             int i = start + offset;
1122             if (i >= list.length) {
1123                 i -= list.length;
1124             }
1125             int oldCount = list[i];
1126             return oldCount != 0 && oldCount <= count;
1127         }
1128 
1129         /**
1130          * Finds the lowest stored offset from a non-empty list, removes it,
1131          * and reduces all other offsets by this minimum.
1132          * @return min=[1..maxLength]
1133          */
1134         public int popMinimum(OutputInt outCount) {
1135             // Look for the next offset in list[start+1..list.length-1].
1136             int i = start, result;
1137             while (++i < list.length) {
1138                 int count = list[i];
1139                 if (count != 0) {
1140                     list[i] = 0;
1141                     --length;
1142                     result = i - start;
1143                     start = i;
1144                     if (outCount != null) { outCount.value = count; }
1145                     return result;
1146                 }
1147             }
1148             // i==list.length
1149 
1150             // Wrap around and look for the next offset in list[0..start].
1151             // Since the list is not empty, there will be one.
1152             result = list.length - start;
1153             i = 0;
1154             int count;
1155             while ((count = list[i]) == 0) {
1156                 ++i;
1157             }
1158             list[i] = 0;
1159             --length;
1160             start = i;
1161             if (outCount != null) { outCount.value = count; }
1162             return result + i;
1163         }
1164     }
1165 }