1 /* 2 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ****************************************************************************** 28 * 29 * Copyright (C) 2009-2014, International Business Machines 30 * Corporation and others. All Rights Reserved. 31 * 32 ****************************************************************************** 33 */ 34 35 package jdk.internal.icu.impl; 36 37 import java.util.ArrayList; 38 39 import jdk.internal.icu.text.UTF16; 40 import jdk.internal.icu.text.UnicodeSet; 41 import jdk.internal.icu.text.UnicodeSet.SpanCondition; 42 import jdk.internal.icu.util.OutputInt; 43 44 /* 45 * Implement span() etc. for a set with strings. 46 * Avoid recursion because of its exponential complexity. 47 * Instead, try multiple paths at once and track them with an IndexList. 48 */ 49 public class UnicodeSetStringSpan { 50 51 /* 52 * Which span() variant will be used? The object is either built for one variant and used once, 53 * or built for all and may be used many times. 54 */ 55 public static final int WITH_COUNT = 0x40; // spanAndCount() may be called 56 public static final int FWD = 0x20; 57 public static final int BACK = 0x10; 58 // public static final int UTF16 = 8; 59 public static final int CONTAINED = 2; 60 public static final int NOT_CONTAINED = 1; 61 62 public static final int ALL = 0x7f; 63 64 public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; 65 public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; 66 public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; 67 public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; 68 69 /** 70 * Special spanLength short values. (since Java has not unsigned byte type) 71 * All code points in the string are contained in the parent set. 72 */ 73 static final short ALL_CP_CONTAINED = 0xff; 74 75 /** The spanLength is >=0xfe. */ 76 static final short LONG_SPAN = ALL_CP_CONTAINED - 1; 77 78 /** Set for span(). Same as parent but without strings. */ 79 private UnicodeSet spanSet; 80 81 /** 82 * Set for span(not contained). 83 * Same as spanSet, plus characters that start or end strings. 84 */ 85 private UnicodeSet spanNotSet; 86 87 /** The strings of the parent set. */ 88 private ArrayList<String> strings; 89 90 /** The lengths of span(), spanBack() etc. for each string. */ 91 private short[] spanLengths; 92 93 /** Maximum lengths of relevant strings. */ 94 private int maxLength16; 95 96 /** Are there strings that are not fully contained in the code point set? */ 97 private boolean someRelevant; 98 99 /** Set up for all variants of span()? */ 100 private boolean all; 101 102 /** Span helper */ 103 private OffsetList offsets; 104 105 /** 106 * Constructs for all variants of span(), or only for any one variant. 107 * Initializes as little as possible, for single use. 108 */ 109 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { 110 spanSet = new UnicodeSet(0, 0x10ffff); 111 // TODO: With Java 6, just take the parent set's strings as is, 112 // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 113 // Then iterate via the first() and higher() methods. 114 // (We do not want to create multiple Iterator objects in each span().) 115 // See ICU ticket #7454. 116 strings = setStrings; 117 all = (which == ALL); 118 spanSet.retainAll(set); 119 if (0 != (which & NOT_CONTAINED)) { 120 // Default to the same sets. 121 // addToSpanNotSet() will create a separate set if necessary. 122 spanNotSet = spanSet; 123 } 124 offsets = new OffsetList(); 125 126 // Determine if the strings even need to be taken into account at all for span() etc. 127 // If any string is relevant, then all strings need to be used for 128 // span(longest match) but only the relevant ones for span(while contained). 129 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 130 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 131 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 132 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 133 int stringsLength = strings.size(); 134 135 int i, spanLength; 136 someRelevant = false; 137 for (i = 0; i < stringsLength; ++i) { 138 String string = strings.get(i); 139 int length16 = string.length(); 140 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 141 if (spanLength < length16) { // Relevant string. 142 someRelevant = true; 143 } 144 if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { 145 maxLength16 = length16; 146 } 147 } 148 if (!someRelevant && (which & WITH_COUNT) == 0) { 149 return; 150 } 151 152 // Freeze after checking for the need to use strings at all because freezing 153 // a set takes some time and memory which are wasted if there are no relevant strings. 154 if (all) { 155 spanSet.freeze(); 156 } 157 158 int spanBackLengthsOffset; 159 160 // Allocate a block of meta data. 161 int allocSize; 162 if (all) { 163 // 2 sets of span lengths 164 allocSize = stringsLength * (2); 165 } else { 166 allocSize = stringsLength; // One set of span lengths. 167 } 168 spanLengths = new short[allocSize]; 169 170 if (all) { 171 // Store span lengths for all span() variants. 172 spanBackLengthsOffset = stringsLength; 173 } else { 174 // Store span lengths for only one span() variant. 175 spanBackLengthsOffset = 0; 176 } 177 178 // Set the meta data and spanNotSet and write the UTF-8 strings. 179 180 for (i = 0; i < stringsLength; ++i) { 181 String string = strings.get(i); 182 int length16 = string.length(); 183 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 184 if (spanLength < length16) { // Relevant string. 185 if (true /* 0 != (which & UTF16) */) { 186 if (0 != (which & CONTAINED)) { 187 if (0 != (which & FWD)) { 188 spanLengths[i] = makeSpanLengthByte(spanLength); 189 } 190 if (0 != (which & BACK)) { 191 spanLength = length16 192 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); 193 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); 194 } 195 } else /* not CONTAINED, not all, but NOT_CONTAINED */{ 196 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant 197 // flag. 198 } 199 } 200 if (0 != (which & NOT_CONTAINED)) { 201 // Add string start and end code points to the spanNotSet so that 202 // a span(while not contained) stops before any string. 203 int c; 204 if (0 != (which & FWD)) { 205 c = string.codePointAt(0); 206 addToSpanNotSet(c); 207 } 208 if (0 != (which & BACK)) { 209 c = string.codePointBefore(length16); 210 addToSpanNotSet(c); 211 } 212 } 213 } else { // Irrelevant string. 214 if (all) { 215 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; 216 } else { 217 // All spanXYZLengths pointers contain the same address. 218 spanLengths[i] = ALL_CP_CONTAINED; 219 } 220 } 221 } 222 223 // Finish. 224 if (all) { 225 spanNotSet.freeze(); 226 } 227 } 228 229 /** 230 * Do the strings need to be checked in span() etc.? 231 * 232 * @return true if strings need to be checked (call span() here), 233 * false if not (use a BMPSet for best performance). 234 */ 235 public boolean needsStringSpanUTF16() { 236 return someRelevant; 237 } 238 239 /** For fast UnicodeSet::contains(c). */ 240 public boolean contains(int c) { 241 return spanSet.contains(c); 242 } 243 244 /** 245 * Adds a starting or ending string character to the spanNotSet 246 * so that a character span ends before any string. 247 */ 248 private void addToSpanNotSet(int c) { 249 if (spanNotSet == null || spanNotSet == spanSet) { 250 if (spanSet.contains(c)) { 251 return; // Nothing to do. 252 } 253 spanNotSet = spanSet.cloneAsThawed(); 254 } 255 spanNotSet.add(c); 256 } 257 258 /* 259 * Note: In span() when spanLength==0 260 * (after a string match, or at the beginning after an empty code point span) 261 * and in spanNot() and spanNotUTF8(), 262 * string matching could use a binary search because all string matches are done 263 * from the same start index. 264 * 265 * For UTF-8, this would require a comparison function that returns UTF-16 order. 266 * 267 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets 268 * with strings have very few very short strings. For cases with many strings, it might be better to use a different 269 * API and implementation with a DFA (state machine). 270 */ 271 272 /* 273 * Algorithm for span(SpanCondition.CONTAINED) 274 * 275 * Theoretical algorithm: 276 * - Iterate through the string, and at each code point boundary: 277 * + If the code point there is in the set, then remember to continue after it. 278 * + If a set string matches at the current position, then remember to continue after it. 279 * + Either recursively span for each code point or string match, or recursively span 280 * for all but the shortest one and iteratively continue the span with the shortest local match. 281 * + Remember the longest recursive span (the farthest end point). 282 * + If there is no match at the current position, 283 * neither for the code point there nor for any set string, 284 * then stop and return the longest recursive span length. 285 * 286 * Optimized implementation: 287 * 288 * (We assume that most sets will have very few very short strings. 289 * A span using a string-less set is extremely fast.) 290 * 291 * Create and cache a spanSet which contains all of the single code points of the original set 292 * but none of its strings. 293 * 294 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 295 * - Loop: 296 * + Try to match each set string at the end of the spanLength. 297 * ~ Set strings that start with set-contained code points 298 * must be matched with a partial overlap 299 * because the recursive algorithm would have tried to match them at every position. 300 * ~ Set strings that entirely consist of set-contained code points 301 * are irrelevant for span(SpanCondition.CONTAINED) 302 * because the recursive algorithm would continue after them anyway and 303 * find the longest recursive match from their end. 304 * ~ Rather than recursing, note each end point of a set string match. 305 * + If no set string matched after spanSet.span(), 306 * then return with where the spanSet.span() ended. 307 * + If at least one set string matched after spanSet.span(), 308 * then pop the shortest string match end point and continue the loop, 309 * trying to match all set strings from there. 310 * + If at least one more set string matched after a previous string match, then test if the 311 * code point after the previous string match is also contained in the set. 312 * Continue the loop with the shortest end point of 313 * either this code point or a matching set string. 314 * + If no more set string matched after a previous string match, 315 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 316 * Stop if spanLength==0, otherwise continue the loop. 317 * 318 * By noting each end point of a set string match, the function visits each string position at most once and 319 * finishes in linear time. 320 * 321 * The recursive algorithm may visit the same string position many times 322 * if multiple paths lead to it and finishes in exponential time. 323 */ 324 325 /* 326 * Algorithm for span(SIMPLE) 327 * 328 * Theoretical algorithm: 329 * - Iterate through the string, and at each code point boundary: 330 * + If the code point there is in the set, then remember to continue after it. 331 * + If a set string matches at the current position, then remember to continue after it. 332 * + Continue from the farthest match position and ignore all others. 333 * + If there is no match at the current position, then stop and return the current position. 334 * 335 * Optimized implementation: 336 * 337 * (Same assumption and spanSet as above.) 338 * 339 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 340 * - Loop: 341 * + Try to match each set string at the end of the spanLength. 342 * ~ Set strings that start with set-contained code points 343 * must be matched with a partial overlap 344 * because the standard algorithm would have tried to match them earlier. 345 * ~ Set strings that entirely consist of set-contained code points 346 * must be matched with a full overlap because the longest-match algorithm 347 * would hide set string matches that end earlier. 348 * Such set strings need not be matched earlier inside the code point span 349 * because the standard algorithm would then have 350 * continued after the set string match anyway. 351 * ~ Remember the longest set string match (farthest end point) 352 * from the earliest starting point. 353 * + If no set string matched after spanSet.span(), 354 * then return with where the spanSet.span() ended. 355 * + If at least one set string matched, 356 * then continue the loop after the longest match from the earliest position. 357 * + If no more set string matched after a previous string match, 358 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 359 * Stop if spanLength==0, otherwise continue the loop. 360 */ 361 /** 362 * Spans a string. 363 * 364 * @param s The string to be spanned 365 * @param start The start index that the span begins 366 * @param spanCondition The span condition 367 * @return the limit (exclusive end) of the span 368 */ 369 public int span(CharSequence s, int start, SpanCondition spanCondition) { 370 if (spanCondition == SpanCondition.NOT_CONTAINED) { 371 return spanNot(s, start, null); 372 } 373 int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); 374 if (spanLimit == s.length()) { 375 return spanLimit; 376 } 377 return spanWithStrings(s, start, spanLimit, spanCondition); 378 } 379 380 /** 381 * Synchronized method for complicated spans using the offsets. 382 * Avoids synchronization for simple cases. 383 * 384 * @param spanLimit = spanSet.span(s, start, CONTAINED) 385 */ 386 private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, 387 SpanCondition spanCondition) { 388 // Consider strings; they may overlap with the span. 389 int initSize = 0; 390 if (spanCondition == SpanCondition.CONTAINED) { 391 // Use offset list to try all possibilities. 392 initSize = maxLength16; 393 } 394 offsets.setMaxLength(initSize); 395 int length = s.length(); 396 int pos = spanLimit, rest = length - spanLimit; 397 int spanLength = spanLimit - start; 398 int i, stringsLength = strings.size(); 399 for (;;) { 400 if (spanCondition == SpanCondition.CONTAINED) { 401 for (i = 0; i < stringsLength; ++i) { 402 int overlap = spanLengths[i]; 403 if (overlap == ALL_CP_CONTAINED) { 404 continue; // Irrelevant string. 405 } 406 String string = strings.get(i); 407 408 int length16 = string.length(); 409 410 // Try to match this string at pos-overlap..pos. 411 if (overlap >= LONG_SPAN) { 412 overlap = length16; 413 // While contained: No point matching fully inside the code point span. 414 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code 415 // point. 416 } 417 if (overlap > spanLength) { 418 overlap = spanLength; 419 } 420 int inc = length16 - overlap; // Keep overlap+inc==length16. 421 for (;;) { 422 if (inc > rest) { 423 break; 424 } 425 // Try to match if the increment is not listed already. 426 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { 427 if (inc == rest) { 428 return length; // Reached the end of the string. 429 } 430 offsets.addOffset(inc); 431 } 432 if (overlap == 0) { 433 break; 434 } 435 --overlap; 436 ++inc; 437 } 438 } 439 } else /* SIMPLE */{ 440 int maxInc = 0, maxOverlap = 0; 441 for (i = 0; i < stringsLength; ++i) { 442 int overlap = spanLengths[i]; 443 // For longest match, we do need to try to match even an all-contained string 444 // to find the match from the earliest start. 445 446 String string = strings.get(i); 447 448 int length16 = string.length(); 449 450 // Try to match this string at pos-overlap..pos. 451 if (overlap >= LONG_SPAN) { 452 overlap = length16; 453 // Longest match: Need to match fully inside the code point span 454 // to find the match from the earliest start. 455 } 456 if (overlap > spanLength) { 457 overlap = spanLength; 458 } 459 int inc = length16 - overlap; // Keep overlap+inc==length16. 460 for (;;) { 461 if (inc > rest || overlap < maxOverlap) { 462 break; 463 } 464 // Try to match if the string is longer or starts earlier. 465 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) 466 && matches16CPB(s, pos - overlap, length, string, length16)) { 467 maxInc = inc; // Longest match from earliest start. 468 maxOverlap = overlap; 469 break; 470 } 471 --overlap; 472 ++inc; 473 } 474 } 475 476 if (maxInc != 0 || maxOverlap != 0) { 477 // Longest-match algorithm, and there was a string match. 478 // Simply continue after it. 479 pos += maxInc; 480 rest -= maxInc; 481 if (rest == 0) { 482 return length; // Reached the end of the string. 483 } 484 spanLength = 0; // Match strings from after a string match. 485 continue; 486 } 487 } 488 // Finished trying to match all strings at pos. 489 490 if (spanLength != 0 || pos == 0) { 491 // The position is after an unlimited code point span (spanLength!=0), 492 // not after a string match. 493 // The only position where spanLength==0 after a span is pos==0. 494 // Otherwise, an unlimited code point span is only tried again when no 495 // strings match, and if such a non-initial span fails we stop. 496 if (offsets.isEmpty()) { 497 return pos; // No strings matched after a span. 498 } 499 // Match strings from after the next string match. 500 } else { 501 // The position is after a string match (or a single code point). 502 if (offsets.isEmpty()) { 503 // No more strings matched after a previous string match. 504 // Try another code point span from after the last string match. 505 spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); 506 spanLength = spanLimit - pos; 507 if (spanLength == rest || // Reached the end of the string, or 508 spanLength == 0 // neither strings nor span progressed. 509 ) { 510 return spanLimit; 511 } 512 pos += spanLength; 513 rest -= spanLength; 514 continue; // spanLength>0: Match strings from after a span. 515 } else { 516 // Try to match only one code point from after a string match if some 517 // string matched beyond it, so that we try all possible positions 518 // and don't overshoot. 519 spanLength = spanOne(spanSet, s, pos, rest); 520 if (spanLength > 0) { 521 if (spanLength == rest) { 522 return length; // Reached the end of the string. 523 } 524 // Match strings after this code point. 525 // There cannot be any increments below it because UnicodeSet strings 526 // contain multiple code points. 527 pos += spanLength; 528 rest -= spanLength; 529 offsets.shift(spanLength); 530 spanLength = 0; 531 continue; // Match strings from after a single code point. 532 } 533 // Match strings from after the next string match. 534 } 535 } 536 int minOffset = offsets.popMinimum(null); 537 pos += minOffset; 538 rest -= minOffset; 539 spanLength = 0; // Match strings from after a string match. 540 } 541 } 542 543 /** 544 * Spans a string and counts the smallest number of set elements on any path across the span. 545 * 546 * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. 547 * 548 * <p>If the set does not have any fully-contained strings, then we could optimize this 549 * like span(), but such sets are likely rare, and this is at least still linear. 550 * 551 * @param s The string to be spanned 552 * @param start The start index that the span begins 553 * @param spanCondition The span condition 554 * @param outCount The count 555 * @return the limit (exclusive end) of the span 556 */ 557 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, 558 OutputInt outCount) { 559 if (spanCondition == SpanCondition.NOT_CONTAINED) { 560 return spanNot(s, start, outCount); 561 } 562 // Consider strings; they may overlap with the span, 563 // and they may result in a smaller count that with just code points. 564 if (spanCondition == SpanCondition.CONTAINED) { 565 return spanContainedAndCount(s, start, outCount); 566 } 567 // SIMPLE (not synchronized, does not use offsets) 568 int stringsLength = strings.size(); 569 int length = s.length(); 570 int pos = start; 571 int rest = length - start; 572 int count = 0; 573 while (rest != 0) { 574 // Try to match the next code point. 575 int cpLength = spanOne(spanSet, s, pos, rest); 576 int maxInc = (cpLength > 0) ? cpLength : 0; 577 // Try to match all of the strings. 578 for (int i = 0; i < stringsLength; ++i) { 579 String string = strings.get(i); 580 int length16 = string.length(); 581 if (maxInc < length16 && length16 <= rest && 582 matches16CPB(s, pos, length, string, length16)) { 583 maxInc = length16; 584 } 585 } 586 // We are done if there is no match beyond pos. 587 if (maxInc == 0) { 588 outCount.value = count; 589 return pos; 590 } 591 // Continue from the longest match. 592 ++count; 593 pos += maxInc; 594 rest -= maxInc; 595 } 596 outCount.value = count; 597 return pos; 598 } 599 600 private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { 601 // Use offset list to try all possibilities. 602 offsets.setMaxLength(maxLength16); 603 int stringsLength = strings.size(); 604 int length = s.length(); 605 int pos = start; 606 int rest = length - start; 607 int count = 0; 608 while (rest != 0) { 609 // Try to match the next code point. 610 int cpLength = spanOne(spanSet, s, pos, rest); 611 if (cpLength > 0) { 612 offsets.addOffsetAndCount(cpLength, count + 1); 613 } 614 // Try to match all of the strings. 615 for (int i = 0; i < stringsLength; ++i) { 616 String string = strings.get(i); 617 int length16 = string.length(); 618 // Note: If the strings were sorted by length, then we could also 619 // avoid trying to match if there is already a match of the same length. 620 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && 621 matches16CPB(s, pos, length, string, length16)) { 622 offsets.addOffsetAndCount(length16, count + 1); 623 } 624 } 625 // We are done if there is no match beyond pos. 626 if (offsets.isEmpty()) { 627 outCount.value = count; 628 return pos; 629 } 630 // Continue from the nearest match. 631 int minOffset = offsets.popMinimum(outCount); 632 count = outCount.value; 633 pos += minOffset; 634 rest -= minOffset; 635 } 636 outCount.value = count; 637 return pos; 638 } 639 640 /** 641 * Span a string backwards. 642 * 643 * @param s The string to be spanned 644 * @param spanCondition The span condition 645 * @return The string index which starts the span (i.e. inclusive). 646 */ 647 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { 648 if (spanCondition == SpanCondition.NOT_CONTAINED) { 649 return spanNotBack(s, length); 650 } 651 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); 652 if (pos == 0) { 653 return 0; 654 } 655 int spanLength = length - pos; 656 657 // Consider strings; they may overlap with the span. 658 int initSize = 0; 659 if (spanCondition == SpanCondition.CONTAINED) { 660 // Use offset list to try all possibilities. 661 initSize = maxLength16; 662 } 663 offsets.setMaxLength(initSize); 664 int i, stringsLength = strings.size(); 665 int spanBackLengthsOffset = 0; 666 if (all) { 667 spanBackLengthsOffset = stringsLength; 668 } 669 for (;;) { 670 if (spanCondition == SpanCondition.CONTAINED) { 671 for (i = 0; i < stringsLength; ++i) { 672 int overlap = spanLengths[spanBackLengthsOffset + i]; 673 if (overlap == ALL_CP_CONTAINED) { 674 continue; // Irrelevant string. 675 } 676 String string = strings.get(i); 677 678 int length16 = string.length(); 679 680 // Try to match this string at pos-(length16-overlap)..pos-length16. 681 if (overlap >= LONG_SPAN) { 682 overlap = length16; 683 // While contained: No point matching fully inside the code point span. 684 int len1 = 0; 685 len1 = string.offsetByCodePoints(0, 1); 686 overlap -= len1; // Length of the string minus the first code point. 687 } 688 if (overlap > spanLength) { 689 overlap = spanLength; 690 } 691 int dec = length16 - overlap; // Keep dec+overlap==length16. 692 for (;;) { 693 if (dec > pos) { 694 break; 695 } 696 // Try to match if the decrement is not listed already. 697 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { 698 if (dec == pos) { 699 return 0; // Reached the start of the string. 700 } 701 offsets.addOffset(dec); 702 } 703 if (overlap == 0) { 704 break; 705 } 706 --overlap; 707 ++dec; 708 } 709 } 710 } else /* SIMPLE */{ 711 int maxDec = 0, maxOverlap = 0; 712 for (i = 0; i < stringsLength; ++i) { 713 int overlap = spanLengths[spanBackLengthsOffset + i]; 714 // For longest match, we do need to try to match even an all-contained string 715 // to find the match from the latest end. 716 717 String string = strings.get(i); 718 719 int length16 = string.length(); 720 721 // Try to match this string at pos-(length16-overlap)..pos-length16. 722 if (overlap >= LONG_SPAN) { 723 overlap = length16; 724 // Longest match: Need to match fully inside the code point span 725 // to find the match from the latest end. 726 } 727 if (overlap > spanLength) { 728 overlap = spanLength; 729 } 730 int dec = length16 - overlap; // Keep dec+overlap==length16. 731 for (;;) { 732 if (dec > pos || overlap < maxOverlap) { 733 break; 734 } 735 // Try to match if the string is longer or ends later. 736 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) 737 && matches16CPB(s, pos - dec, length, string, length16)) { 738 maxDec = dec; // Longest match from latest end. 739 maxOverlap = overlap; 740 break; 741 } 742 --overlap; 743 ++dec; 744 } 745 } 746 747 if (maxDec != 0 || maxOverlap != 0) { 748 // Longest-match algorithm, and there was a string match. 749 // Simply continue before it. 750 pos -= maxDec; 751 if (pos == 0) { 752 return 0; // Reached the start of the string. 753 } 754 spanLength = 0; // Match strings from before a string match. 755 continue; 756 } 757 } 758 // Finished trying to match all strings at pos. 759 760 if (spanLength != 0 || pos == length) { 761 // The position is before an unlimited code point span (spanLength!=0), 762 // not before a string match. 763 // The only position where spanLength==0 before a span is pos==length. 764 // Otherwise, an unlimited code point span is only tried again when no 765 // strings match, and if such a non-initial span fails we stop. 766 if (offsets.isEmpty()) { 767 return pos; // No strings matched before a span. 768 } 769 // Match strings from before the next string match. 770 } else { 771 // The position is before a string match (or a single code point). 772 if (offsets.isEmpty()) { 773 // No more strings matched before a previous string match. 774 // Try another code point span from before the last string match. 775 int oldPos = pos; 776 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); 777 spanLength = oldPos - pos; 778 if (pos == 0 || // Reached the start of the string, or 779 spanLength == 0 // neither strings nor span progressed. 780 ) { 781 return pos; 782 } 783 continue; // spanLength>0: Match strings from before a span. 784 } else { 785 // Try to match only one code point from before a string match if some 786 // string matched beyond it, so that we try all possible positions 787 // and don't overshoot. 788 spanLength = spanOneBack(spanSet, s, pos); 789 if (spanLength > 0) { 790 if (spanLength == pos) { 791 return 0; // Reached the start of the string. 792 } 793 // Match strings before this code point. 794 // There cannot be any decrements below it because UnicodeSet strings 795 // contain multiple code points. 796 pos -= spanLength; 797 offsets.shift(spanLength); 798 spanLength = 0; 799 continue; // Match strings from before a single code point. 800 } 801 // Match strings from before the next string match. 802 } 803 } 804 pos -= offsets.popMinimum(null); 805 spanLength = 0; // Match strings from before a string match. 806 } 807 } 808 809 /** 810 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) 811 * 812 * Theoretical algorithm: 813 * - Iterate through the string, and at each code point boundary: 814 * + If the code point there is in the set, then return with the current position. 815 * + If a set string matches at the current position, then return with the current position. 816 * 817 * Optimized implementation: 818 * 819 * (Same assumption as for span() above.) 820 * 821 * Create and cache a spanNotSet which contains 822 * all of the single code points of the original set but none of its strings. 823 * For each set string add its initial code point to the spanNotSet. 824 * (Also add its final code point for spanNotBack().) 825 * 826 * - Loop: 827 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). 828 * + If the current code point is in the original set, then return the current position. 829 * + If any set string matches at the current position, then return the current position. 830 * + If there is no match at the current position, neither for the code point 831 * there nor for any set string, then skip this code point and continue the loop. 832 * This happens for set-string-initial code points that were added to spanNotSet 833 * when there is not actually a match for such a set string. 834 * 835 * @param s The string to be spanned 836 * @param start The start index that the span begins 837 * @param outCount If not null: Receives the number of code points across the span. 838 * @return the limit (exclusive end) of the span 839 */ 840 private int spanNot(CharSequence s, int start, OutputInt outCount) { 841 int length = s.length(); 842 int pos = start, rest = length - start; 843 int stringsLength = strings.size(); 844 int count = 0; 845 do { 846 // Span until we find a code point from the set, 847 // or a code point that starts or ends some string. 848 int spanLimit; 849 if (outCount == null) { 850 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); 851 } else { 852 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); 853 outCount.value = count = count + outCount.value; 854 } 855 if (spanLimit == length) { 856 return length; // Reached the end of the string. 857 } 858 pos = spanLimit; 859 rest = length - spanLimit; 860 861 // Check whether the current code point is in the original set, 862 // without the string starts and ends. 863 int cpLength = spanOne(spanSet, s, pos, rest); 864 if (cpLength > 0) { 865 return pos; // There is a set element at pos. 866 } 867 868 // Try to match the strings at pos. 869 for (int i = 0; i < stringsLength; ++i) { 870 if (spanLengths[i] == ALL_CP_CONTAINED) { 871 continue; // Irrelevant string. 872 } 873 String string = strings.get(i); 874 875 int length16 = string.length(); 876 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { 877 return pos; // There is a set element at pos. 878 } 879 } 880 881 // The span(while not contained) ended on a string start/end which is 882 // not in the original set. Skip this code point and continue. 883 // cpLength<0 884 pos -= cpLength; 885 rest += cpLength; 886 ++count; 887 } while (rest != 0); 888 if (outCount != null) { 889 outCount.value = count; 890 } 891 return length; // Reached the end of the string. 892 } 893 894 private int spanNotBack(CharSequence s, int length) { 895 int pos = length; 896 int i, stringsLength = strings.size(); 897 do { 898 // Span until we find a code point from the set, 899 // or a code point that starts or ends some string. 900 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); 901 if (pos == 0) { 902 return 0; // Reached the start of the string. 903 } 904 905 // Check whether the current code point is in the original set, 906 // without the string starts and ends. 907 int cpLength = spanOneBack(spanSet, s, pos); 908 if (cpLength > 0) { 909 return pos; // There is a set element at pos. 910 } 911 912 // Try to match the strings at pos. 913 for (i = 0; i < stringsLength; ++i) { 914 // Use spanLengths rather than a spanLengths pointer because 915 // it is easier and we only need to know whether the string is irrelevant 916 // which is the same in either array. 917 if (spanLengths[i] == ALL_CP_CONTAINED) { 918 continue; // Irrelevant string. 919 } 920 String string = strings.get(i); 921 922 int length16 = string.length(); 923 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { 924 return pos; // There is a set element at pos. 925 } 926 } 927 928 // The span(while not contained) ended on a string start/end which is 929 // not in the original set. Skip this code point and continue. 930 // cpLength<0 931 pos += cpLength; 932 } while (pos != 0); 933 return 0; // Reached the start of the string. 934 } 935 936 static short makeSpanLengthByte(int spanLength) { 937 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 938 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; 939 } 940 941 // Compare strings without any argument checks. Requires length>0. 942 private static boolean matches16(CharSequence s, int start, final String t, int length) { 943 int end = start + length; 944 while (length-- > 0) { 945 if (s.charAt(--end) != t.charAt(length)) { 946 return false; 947 } 948 } 949 return true; 950 } 951 952 /** 953 * Compare 16-bit Unicode strings (which may be malformed UTF-16) 954 * at code point boundaries. 955 * That is, each edge of a match must not be in the middle of a surrogate pair. 956 * @param s The string to match in. 957 * @param start The start index of s. 958 * @param limit The limit of the subsequence of s being spanned. 959 * @param t The substring to be matched in s. 960 * @param tlength The length of t. 961 */ 962 static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { 963 return matches16(s, start, t, tlength) 964 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && 965 Character.isLowSurrogate(s.charAt(start))) 966 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && 967 Character.isLowSurrogate(s.charAt(start + tlength))); 968 } 969 970 /** 971 * Does the set contain the next code point? 972 * If so, return its length; otherwise return its negative length. 973 */ 974 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { 975 char c = s.charAt(start); 976 if (c >= 0xd800 && c <= 0xdbff && length >= 2) { 977 char c2 = s.charAt(start + 1); 978 if (UTF16.isTrailSurrogate(c2)) { 979 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 980 return set.contains(supplementary) ? 2 : -2; 981 } 982 } 983 return set.contains(c) ? 1 : -1; 984 } 985 986 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { 987 char c = s.charAt(length - 1); 988 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { 989 char c2 = s.charAt(length - 2); 990 if (UTF16.isLeadSurrogate(c2)) { 991 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 992 return set.contains(supplementary) ? 2 : -2; 993 } 994 } 995 return set.contains(c) ? 1 : -1; 996 } 997 998 /** 999 * Helper class for UnicodeSetStringSpan. 1000 * 1001 * <p>List of offsets from the current position from where to try matching 1002 * a code point or a string. 1003 * Stores offsets rather than indexes to simplify the code and use the same list 1004 * for both increments (in span()) and decrements (in spanBack()). 1005 * 1006 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time 1007 * are relatively dense, that is, 1008 * there are normally no gaps of hundreds or thousands of offset values. 1009 * 1010 * <p>This class optionally also tracks the minimum non-negative count for each position, 1011 * intended to count the smallest number of elements of any path leading to that position. 1012 * 1013 * <p>The implementation uses a circular buffer of count integers, 1014 * each indicating whether the corresponding offset is in the list, 1015 * and its path element count. 1016 * This avoids inserting into a sorted list of offsets (or absolute indexes) 1017 * and physically moving part of the list. 1018 * 1019 * <p>Note: In principle, the caller should setMaxLength() to 1020 * the maximum of the max string length and U16_LENGTH/U8_LENGTH 1021 * to account for "long" single code points. 1022 * 1023 * <p>Note: An earlier version did not track counts and stored only byte flags. 1024 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, 1025 * the list could be stored as bit flags in a single integer. 1026 * Rather than handling a circular buffer with a start list index, 1027 * the integer would simply be shifted when lower offsets are removed. 1028 * UnicodeSet does not have a limit on the lengths of strings. 1029 */ 1030 private static final class OffsetList { 1031 private int[] list; 1032 private int length; 1033 private int start; 1034 1035 public OffsetList() { 1036 list = new int[16]; // default size 1037 } 1038 1039 public void setMaxLength(int maxLength) { 1040 if (maxLength > list.length) { 1041 list = new int[maxLength]; 1042 } 1043 clear(); 1044 } 1045 1046 public void clear() { 1047 for (int i = list.length; i-- > 0;) { 1048 list[i] = 0; 1049 } 1050 start = length = 0; 1051 } 1052 1053 public boolean isEmpty() { 1054 return (length == 0); 1055 } 1056 1057 /** 1058 * Reduces all stored offsets by delta, used when the current position moves by delta. 1059 * There must not be any offsets lower than delta. 1060 * If there is an offset equal to delta, it is removed. 1061 * 1062 * @param delta [1..maxLength] 1063 */ 1064 public void shift(int delta) { 1065 int i = start + delta; 1066 if (i >= list.length) { 1067 i -= list.length; 1068 } 1069 if (list[i] != 0) { 1070 list[i] = 0; 1071 --length; 1072 } 1073 start = i; 1074 } 1075 1076 /** 1077 * Adds an offset. The list must not contain it yet. 1078 * @param offset [1..maxLength] 1079 */ 1080 public void addOffset(int offset) { 1081 int i = start + offset; 1082 if (i >= list.length) { 1083 i -= list.length; 1084 } 1085 assert list[i] == 0; 1086 list[i] = 1; 1087 ++length; 1088 } 1089 1090 /** 1091 * Adds an offset and updates its count. 1092 * The list may already contain the offset. 1093 * @param offset [1..maxLength] 1094 */ 1095 public void addOffsetAndCount(int offset, int count) { 1096 assert count > 0; 1097 int i = start + offset; 1098 if (i >= list.length) { 1099 i -= list.length; 1100 } 1101 if (list[i] == 0) { 1102 list[i] = count; 1103 ++length; 1104 } else if (count < list[i]) { 1105 list[i] = count; 1106 } 1107 } 1108 1109 /** 1110 * @param offset [1..maxLength] 1111 */ 1112 public boolean containsOffset(int offset) { 1113 int i = start + offset; 1114 if (i >= list.length) { 1115 i -= list.length; 1116 } 1117 return list[i] != 0; 1118 } 1119 1120 /** 1121 * @param offset [1..maxLength] 1122 */ 1123 public boolean hasCountAtOffset(int offset, int count) { 1124 int i = start + offset; 1125 if (i >= list.length) { 1126 i -= list.length; 1127 } 1128 int oldCount = list[i]; 1129 return oldCount != 0 && oldCount <= count; 1130 } 1131 1132 /** 1133 * Finds the lowest stored offset from a non-empty list, removes it, 1134 * and reduces all other offsets by this minimum. 1135 * @return min=[1..maxLength] 1136 */ 1137 public int popMinimum(OutputInt outCount) { 1138 // Look for the next offset in list[start+1..list.length-1]. 1139 int i = start, result; 1140 while (++i < list.length) { 1141 int count = list[i]; 1142 if (count != 0) { 1143 list[i] = 0; 1144 --length; 1145 result = i - start; 1146 start = i; 1147 if (outCount != null) { outCount.value = count; } 1148 return result; 1149 } 1150 } 1151 // i==list.length 1152 1153 // Wrap around and look for the next offset in list[0..start]. 1154 // Since the list is not empty, there will be one. 1155 result = list.length - start; 1156 i = 0; 1157 int count; 1158 while ((count = list[i]) == 0) { 1159 ++i; 1160 } 1161 list[i] = 0; 1162 --length; 1163 start = i; 1164 if (outCount != null) { outCount.value = count; } 1165 return result + i; 1166 } 1167 } 1168 }