--- old/src/java.base/share/classes/sun/text/normalizer/UnicodeSetStringSpan.java 2020-01-10 13:50:20.000000000 -0800 +++ /dev/null 2020-01-10 13:50:20.000000000 -0800 @@ -1,1165 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ****************************************************************************** - * - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. - * - ****************************************************************************** - */ - -package sun.text.normalizer; - -import java.util.ArrayList; - -import sun.text.normalizer.UnicodeSet.SpanCondition; - -/* - * Implement span() etc. for a set with strings. - * Avoid recursion because of its exponential complexity. - * Instead, try multiple paths at once and track them with an IndexList. - */ -class UnicodeSetStringSpan { - - /* - * Which span() variant will be used? The object is either built for one variant and used once, - * or built for all and may be used many times. - */ - public static final int WITH_COUNT = 0x40; // spanAndCount() may be called - public static final int FWD = 0x20; - public static final int BACK = 0x10; - // public static final int UTF16 = 8; - public static final int CONTAINED = 2; - public static final int NOT_CONTAINED = 1; - - public static final int ALL = 0x7f; - - public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; - public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; - public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; - public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; - - /** - * Special spanLength short values. (since Java has not unsigned byte type) - * All code points in the string are contained in the parent set. - */ - static final short ALL_CP_CONTAINED = 0xff; - - /** The spanLength is >=0xfe. */ - static final short LONG_SPAN = ALL_CP_CONTAINED - 1; - - /** Set for span(). Same as parent but without strings. */ - private UnicodeSet spanSet; - - /** - * Set for span(not contained). - * Same as spanSet, plus characters that start or end strings. - */ - private UnicodeSet spanNotSet; - - /** The strings of the parent set. */ - private ArrayList strings; - - /** The lengths of span(), spanBack() etc. for each string. */ - private short[] spanLengths; - - /** Maximum lengths of relevant strings. */ - private int maxLength16; - - /** Are there strings that are not fully contained in the code point set? */ - private boolean someRelevant; - - /** Set up for all variants of span()? */ - private boolean all; - - /** Span helper */ - private OffsetList offsets; - - /** - * Constructs for all variants of span(), or only for any one variant. - * Initializes as little as possible, for single use. - */ - public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList setStrings, int which) { - spanSet = new UnicodeSet(0, 0x10ffff); - // TODO: With Java 6, just take the parent set's strings as is, - // as a NavigableSet, rather than as an ArrayList copy of the set of strings. - // Then iterate via the first() and higher() methods. - // (We do not want to create multiple Iterator objects in each span().) - // See ICU ticket #7454. - strings = setStrings; - all = (which == ALL); - spanSet.retainAll(set); - if (0 != (which & NOT_CONTAINED)) { - // Default to the same sets. - // addToSpanNotSet() will create a separate set if necessary. - spanNotSet = spanSet; - } - offsets = new OffsetList(); - - // Determine if the strings even need to be taken into account at all for span() etc. - // If any string is relevant, then all strings need to be used for - // span(longest match) but only the relevant ones for span(while contained). - // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH - // and do not store UTF-8 strings if !thisRelevant and CONTAINED. - // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) - // Also count the lengths of the UTF-8 versions of the strings for memory allocation. - int stringsLength = strings.size(); - - int i, spanLength; - someRelevant = false; - for (i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - spanLength = spanSet.span(string, SpanCondition.CONTAINED); - if (spanLength < length16) { // Relevant string. - someRelevant = true; - } - if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { - maxLength16 = length16; - } - } - if (!someRelevant && (which & WITH_COUNT) == 0) { - return; - } - - // Freeze after checking for the need to use strings at all because freezing - // a set takes some time and memory which are wasted if there are no relevant strings. - if (all) { - spanSet.freeze(); - } - - int spanBackLengthsOffset; - - // Allocate a block of meta data. - int allocSize; - if (all) { - // 2 sets of span lengths - allocSize = stringsLength * (2); - } else { - allocSize = stringsLength; // One set of span lengths. - } - spanLengths = new short[allocSize]; - - if (all) { - // Store span lengths for all span() variants. - spanBackLengthsOffset = stringsLength; - } else { - // Store span lengths for only one span() variant. - spanBackLengthsOffset = 0; - } - - // Set the meta data and spanNotSet and write the UTF-8 strings. - - for (i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - spanLength = spanSet.span(string, SpanCondition.CONTAINED); - if (spanLength < length16) { // Relevant string. - if (true /* 0 != (which & UTF16) */) { - if (0 != (which & CONTAINED)) { - if (0 != (which & FWD)) { - spanLengths[i] = makeSpanLengthByte(spanLength); - } - if (0 != (which & BACK)) { - spanLength = length16 - - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); - spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); - } - } else /* not CONTAINED, not all, but NOT_CONTAINED */{ - spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant - // flag. - } - } - if (0 != (which & NOT_CONTAINED)) { - // Add string start and end code points to the spanNotSet so that - // a span(while not contained) stops before any string. - int c; - if (0 != (which & FWD)) { - c = string.codePointAt(0); - addToSpanNotSet(c); - } - if (0 != (which & BACK)) { - c = string.codePointBefore(length16); - addToSpanNotSet(c); - } - } - } else { // Irrelevant string. - if (all) { - spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; - } else { - // All spanXYZLengths pointers contain the same address. - spanLengths[i] = ALL_CP_CONTAINED; - } - } - } - - // Finish. - if (all) { - spanNotSet.freeze(); - } - } - - /** - * Do the strings need to be checked in span() etc.? - * - * @return true if strings need to be checked (call span() here), - * false if not (use a BMPSet for best performance). - */ - public boolean needsStringSpanUTF16() { - return someRelevant; - } - - /** For fast UnicodeSet::contains(c). */ - public boolean contains(int c) { - return spanSet.contains(c); - } - - /** - * Adds a starting or ending string character to the spanNotSet - * so that a character span ends before any string. - */ - private void addToSpanNotSet(int c) { - if (spanNotSet == null || spanNotSet == spanSet) { - if (spanSet.contains(c)) { - return; // Nothing to do. - } - spanNotSet = spanSet.cloneAsThawed(); - } - spanNotSet.add(c); - } - - /* - * Note: In span() when spanLength==0 - * (after a string match, or at the beginning after an empty code point span) - * and in spanNot() and spanNotUTF8(), - * string matching could use a binary search because all string matches are done - * from the same start index. - * - * For UTF-8, this would require a comparison function that returns UTF-16 order. - * - * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets - * with strings have very few very short strings. For cases with many strings, it might be better to use a different - * API and implementation with a DFA (state machine). - */ - - /* - * Algorithm for span(SpanCondition.CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Either recursively span for each code point or string match, or recursively span - * for all but the shortest one and iteratively continue the span with the shortest local match. - * + Remember the longest recursive span (the farthest end point). - * + If there is no match at the current position, - * neither for the code point there nor for any set string, - * then stop and return the longest recursive span length. - * - * Optimized implementation: - * - * (We assume that most sets will have very few very short strings. - * A span using a string-less set is extremely fast.) - * - * Create and cache a spanSet which contains all of the single code points of the original set - * but none of its strings. - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. - * ~ Set strings that start with set-contained code points - * must be matched with a partial overlap - * because the recursive algorithm would have tried to match them at every position. - * ~ Set strings that entirely consist of set-contained code points - * are irrelevant for span(SpanCondition.CONTAINED) - * because the recursive algorithm would continue after them anyway and - * find the longest recursive match from their end. - * ~ Rather than recursing, note each end point of a set string match. - * + If no set string matched after spanSet.span(), - * then return with where the spanSet.span() ended. - * + If at least one set string matched after spanSet.span(), - * then pop the shortest string match end point and continue the loop, - * trying to match all set strings from there. - * + If at least one more set string matched after a previous string match, then test if the - * code point after the previous string match is also contained in the set. - * Continue the loop with the shortest end point of - * either this code point or a matching set string. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - * - * By noting each end point of a set string match, the function visits each string position at most once and - * finishes in linear time. - * - * The recursive algorithm may visit the same string position many times - * if multiple paths lead to it and finishes in exponential time. - */ - - /* - * Algorithm for span(SIMPLE) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Continue from the farthest match position and ignore all others. - * + If there is no match at the current position, then stop and return the current position. - * - * Optimized implementation: - * - * (Same assumption and spanSet as above.) - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. - * ~ Set strings that start with set-contained code points - * must be matched with a partial overlap - * because the standard algorithm would have tried to match them earlier. - * ~ Set strings that entirely consist of set-contained code points - * must be matched with a full overlap because the longest-match algorithm - * would hide set string matches that end earlier. - * Such set strings need not be matched earlier inside the code point span - * because the standard algorithm would then have - * continued after the set string match anyway. - * ~ Remember the longest set string match (farthest end point) - * from the earliest starting point. - * + If no set string matched after spanSet.span(), - * then return with where the spanSet.span() ended. - * + If at least one set string matched, - * then continue the loop after the longest match from the earliest position. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - */ - /** - * Spans a string. - * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param spanCondition The span condition - * @return the limit (exclusive end) of the span - */ - public int span(CharSequence s, int start, SpanCondition spanCondition) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNot(s, start, null); - } - int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); - if (spanLimit == s.length()) { - return spanLimit; - } - return spanWithStrings(s, start, spanLimit, spanCondition); - } - - /** - * Synchronized method for complicated spans using the offsets. - * Avoids synchronization for simple cases. - * - * @param spanLimit = spanSet.span(s, start, CONTAINED) - */ - private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, - SpanCondition spanCondition) { - // Consider strings; they may overlap with the span. - int initSize = 0; - if (spanCondition == SpanCondition.CONTAINED) { - // Use offset list to try all possibilities. - initSize = maxLength16; - } - offsets.setMaxLength(initSize); - int length = s.length(); - int pos = spanLimit, rest = length - spanLimit; - int spanLength = spanLimit - start; - int i, stringsLength = strings.size(); - for (;;) { - if (spanCondition == SpanCondition.CONTAINED) { - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[i]; - if (overlap == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-overlap..pos. - if (overlap >= LONG_SPAN) { - overlap = length16; - // While contained: No point matching fully inside the code point span. - overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code - // point. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int inc = length16 - overlap; // Keep overlap+inc==length16. - for (;;) { - if (inc > rest) { - break; - } - // Try to match if the increment is not listed already. - if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { - if (inc == rest) { - return length; // Reached the end of the string. - } - offsets.addOffset(inc); - } - if (overlap == 0) { - break; - } - --overlap; - ++inc; - } - } - } else /* SIMPLE */{ - int maxInc = 0, maxOverlap = 0; - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the earliest start. - - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-overlap..pos. - if (overlap >= LONG_SPAN) { - overlap = length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the earliest start. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int inc = length16 - overlap; // Keep overlap+inc==length16. - for (;;) { - if (inc > rest || overlap < maxOverlap) { - break; - } - // Try to match if the string is longer or starts earlier. - if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) - && matches16CPB(s, pos - overlap, length, string, length16)) { - maxInc = inc; // Longest match from earliest start. - maxOverlap = overlap; - break; - } - --overlap; - ++inc; - } - } - - if (maxInc != 0 || maxOverlap != 0) { - // Longest-match algorithm, and there was a string match. - // Simply continue after it. - pos += maxInc; - rest -= maxInc; - if (rest == 0) { - return length; // Reached the end of the string. - } - spanLength = 0; // Match strings from after a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if (spanLength != 0 || pos == 0) { - // The position is after an unlimited code point span (spanLength!=0), - // not after a string match. - // The only position where spanLength==0 after a span is pos==0. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if (offsets.isEmpty()) { - return pos; // No strings matched after a span. - } - // Match strings from after the next string match. - } else { - // The position is after a string match (or a single code point). - if (offsets.isEmpty()) { - // No more strings matched after a previous string match. - // Try another code point span from after the last string match. - spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); - spanLength = spanLimit - pos; - if (spanLength == rest || // Reached the end of the string, or - spanLength == 0 // neither strings nor span progressed. - ) { - return spanLimit; - } - pos += spanLength; - rest -= spanLength; - continue; // spanLength>0: Match strings from after a span. - } else { - // Try to match only one code point from after a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength = spanOne(spanSet, s, pos, rest); - if (spanLength > 0) { - if (spanLength == rest) { - return length; // Reached the end of the string. - } - // Match strings after this code point. - // There cannot be any increments below it because UnicodeSet strings - // contain multiple code points. - pos += spanLength; - rest -= spanLength; - offsets.shift(spanLength); - spanLength = 0; - continue; // Match strings from after a single code point. - } - // Match strings from after the next string match. - } - } - int minOffset = offsets.popMinimum(null); - pos += minOffset; - rest -= minOffset; - spanLength = 0; // Match strings from after a string match. - } - } - - /** - * Spans a string and counts the smallest number of set elements on any path across the span. - * - *

For proper counting, we cannot ignore strings that are fully contained in code point spans. - * - *

If the set does not have any fully-contained strings, then we could optimize this - * like span(), but such sets are likely rare, and this is at least still linear. - * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param spanCondition The span condition - * @param outCount The count - * @return the limit (exclusive end) of the span - */ - public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, - OutputInt outCount) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNot(s, start, outCount); - } - // Consider strings; they may overlap with the span, - // and they may result in a smaller count that with just code points. - if (spanCondition == SpanCondition.CONTAINED) { - return spanContainedAndCount(s, start, outCount); - } - // SIMPLE (not synchronized, does not use offsets) - int stringsLength = strings.size(); - int length = s.length(); - int pos = start; - int rest = length - start; - int count = 0; - while (rest != 0) { - // Try to match the next code point. - int cpLength = spanOne(spanSet, s, pos, rest); - int maxInc = (cpLength > 0) ? cpLength : 0; - // Try to match all of the strings. - for (int i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - if (maxInc < length16 && length16 <= rest && - matches16CPB(s, pos, length, string, length16)) { - maxInc = length16; - } - } - // We are done if there is no match beyond pos. - if (maxInc == 0) { - outCount.value = count; - return pos; - } - // Continue from the longest match. - ++count; - pos += maxInc; - rest -= maxInc; - } - outCount.value = count; - return pos; - } - - private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { - // Use offset list to try all possibilities. - offsets.setMaxLength(maxLength16); - int stringsLength = strings.size(); - int length = s.length(); - int pos = start; - int rest = length - start; - int count = 0; - while (rest != 0) { - // Try to match the next code point. - int cpLength = spanOne(spanSet, s, pos, rest); - if (cpLength > 0) { - offsets.addOffsetAndCount(cpLength, count + 1); - } - // Try to match all of the strings. - for (int i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - // Note: If the strings were sorted by length, then we could also - // avoid trying to match if there is already a match of the same length. - if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && - matches16CPB(s, pos, length, string, length16)) { - offsets.addOffsetAndCount(length16, count + 1); - } - } - // We are done if there is no match beyond pos. - if (offsets.isEmpty()) { - outCount.value = count; - return pos; - } - // Continue from the nearest match. - int minOffset = offsets.popMinimum(outCount); - count = outCount.value; - pos += minOffset; - rest -= minOffset; - } - outCount.value = count; - return pos; - } - - /** - * Span a string backwards. - * - * @param s The string to be spanned - * @param spanCondition The span condition - * @return The string index which starts the span (i.e. inclusive). - */ - public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNotBack(s, length); - } - int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); - if (pos == 0) { - return 0; - } - int spanLength = length - pos; - - // Consider strings; they may overlap with the span. - int initSize = 0; - if (spanCondition == SpanCondition.CONTAINED) { - // Use offset list to try all possibilities. - initSize = maxLength16; - } - offsets.setMaxLength(initSize); - int i, stringsLength = strings.size(); - int spanBackLengthsOffset = 0; - if (all) { - spanBackLengthsOffset = stringsLength; - } - for (;;) { - if (spanCondition == SpanCondition.CONTAINED) { - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[spanBackLengthsOffset + i]; - if (overlap == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if (overlap >= LONG_SPAN) { - overlap = length16; - // While contained: No point matching fully inside the code point span. - int len1 = 0; - len1 = string.offsetByCodePoints(0, 1); - overlap -= len1; // Length of the string minus the first code point. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int dec = length16 - overlap; // Keep dec+overlap==length16. - for (;;) { - if (dec > pos) { - break; - } - // Try to match if the decrement is not listed already. - if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { - if (dec == pos) { - return 0; // Reached the start of the string. - } - offsets.addOffset(dec); - } - if (overlap == 0) { - break; - } - --overlap; - ++dec; - } - } - } else /* SIMPLE */{ - int maxDec = 0, maxOverlap = 0; - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[spanBackLengthsOffset + i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the latest end. - - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if (overlap >= LONG_SPAN) { - overlap = length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the latest end. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int dec = length16 - overlap; // Keep dec+overlap==length16. - for (;;) { - if (dec > pos || overlap < maxOverlap) { - break; - } - // Try to match if the string is longer or ends later. - if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) - && matches16CPB(s, pos - dec, length, string, length16)) { - maxDec = dec; // Longest match from latest end. - maxOverlap = overlap; - break; - } - --overlap; - ++dec; - } - } - - if (maxDec != 0 || maxOverlap != 0) { - // Longest-match algorithm, and there was a string match. - // Simply continue before it. - pos -= maxDec; - if (pos == 0) { - return 0; // Reached the start of the string. - } - spanLength = 0; // Match strings from before a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if (spanLength != 0 || pos == length) { - // The position is before an unlimited code point span (spanLength!=0), - // not before a string match. - // The only position where spanLength==0 before a span is pos==length. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if (offsets.isEmpty()) { - return pos; // No strings matched before a span. - } - // Match strings from before the next string match. - } else { - // The position is before a string match (or a single code point). - if (offsets.isEmpty()) { - // No more strings matched before a previous string match. - // Try another code point span from before the last string match. - int oldPos = pos; - pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); - spanLength = oldPos - pos; - if (pos == 0 || // Reached the start of the string, or - spanLength == 0 // neither strings nor span progressed. - ) { - return pos; - } - continue; // spanLength>0: Match strings from before a span. - } else { - // Try to match only one code point from before a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength = spanOneBack(spanSet, s, pos); - if (spanLength > 0) { - if (spanLength == pos) { - return 0; // Reached the start of the string. - } - // Match strings before this code point. - // There cannot be any decrements below it because UnicodeSet strings - // contain multiple code points. - pos -= spanLength; - offsets.shift(spanLength); - spanLength = 0; - continue; // Match strings from before a single code point. - } - // Match strings from before the next string match. - } - } - pos -= offsets.popMinimum(null); - spanLength = 0; // Match strings from before a string match. - } - } - - /** - * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then return with the current position. - * + If a set string matches at the current position, then return with the current position. - * - * Optimized implementation: - * - * (Same assumption as for span() above.) - * - * Create and cache a spanNotSet which contains - * all of the single code points of the original set but none of its strings. - * For each set string add its initial code point to the spanNotSet. - * (Also add its final code point for spanNotBack().) - * - * - Loop: - * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). - * + If the current code point is in the original set, then return the current position. - * + If any set string matches at the current position, then return the current position. - * + If there is no match at the current position, neither for the code point - * there nor for any set string, then skip this code point and continue the loop. - * This happens for set-string-initial code points that were added to spanNotSet - * when there is not actually a match for such a set string. - * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param outCount If not null: Receives the number of code points across the span. - * @return the limit (exclusive end) of the span - */ - private int spanNot(CharSequence s, int start, OutputInt outCount) { - int length = s.length(); - int pos = start, rest = length - start; - int stringsLength = strings.size(); - int count = 0; - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - int spanLimit; - if (outCount == null) { - spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); - } else { - spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); - outCount.value = count = count + outCount.value; - } - if (spanLimit == length) { - return length; // Reached the end of the string. - } - pos = spanLimit; - rest = length - spanLimit; - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int cpLength = spanOne(spanSet, s, pos, rest); - if (cpLength > 0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for (int i = 0; i < stringsLength; ++i) { - if (spanLengths[i] == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos -= cpLength; - rest += cpLength; - ++count; - } while (rest != 0); - if (outCount != null) { - outCount.value = count; - } - return length; // Reached the end of the string. - } - - private int spanNotBack(CharSequence s, int length) { - int pos = length; - int i, stringsLength = strings.size(); - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); - if (pos == 0) { - return 0; // Reached the start of the string. - } - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int cpLength = spanOneBack(spanSet, s, pos); - if (cpLength > 0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for (i = 0; i < stringsLength; ++i) { - // Use spanLengths rather than a spanLengths pointer because - // it is easier and we only need to know whether the string is irrelevant - // which is the same in either array. - if (spanLengths[i] == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos += cpLength; - } while (pos != 0); - return 0; // Reached the start of the string. - } - - static short makeSpanLengthByte(int spanLength) { - // 0xfe==UnicodeSetStringSpan::LONG_SPAN - return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; - } - - // Compare strings without any argument checks. Requires length>0. - private static boolean matches16(CharSequence s, int start, final String t, int length) { - int end = start + length; - while (length-- > 0) { - if (s.charAt(--end) != t.charAt(length)) { - return false; - } - } - return true; - } - - /** - * Compare 16-bit Unicode strings (which may be malformed UTF-16) - * at code point boundaries. - * That is, each edge of a match must not be in the middle of a surrogate pair. - * @param s The string to match in. - * @param start The start index of s. - * @param limit The limit of the subsequence of s being spanned. - * @param t The substring to be matched in s. - * @param tlength The length of t. - */ - static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { - return matches16(s, start, t, tlength) - && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && - Character.isLowSurrogate(s.charAt(start))) - && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && - Character.isLowSurrogate(s.charAt(start + tlength))); - } - - /** - * Does the set contain the next code point? - * If so, return its length; otherwise return its negative length. - */ - static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { - char c = s.charAt(start); - if (c >= 0xd800 && c <= 0xdbff && length >= 2) { - char c2 = s.charAt(start + 1); - if (UTF16.isTrailSurrogate(c2)) { - int supplementary = UCharacterProperty.getRawSupplementary(c, c2); - return set.contains(supplementary) ? 2 : -2; - } - } - return set.contains(c) ? 1 : -1; - } - - static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { - char c = s.charAt(length - 1); - if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { - char c2 = s.charAt(length - 2); - if (UTF16.isLeadSurrogate(c2)) { - int supplementary = UCharacterProperty.getRawSupplementary(c2, c); - return set.contains(supplementary) ? 2 : -2; - } - } - return set.contains(c) ? 1 : -1; - } - - /** - * Helper class for UnicodeSetStringSpan. - * - *

List of offsets from the current position from where to try matching - * a code point or a string. - * Stores offsets rather than indexes to simplify the code and use the same list - * for both increments (in span()) and decrements (in spanBack()). - * - *

Assumption: The maximum offset is limited, and the offsets that are stored at any one time - * are relatively dense, that is, - * there are normally no gaps of hundreds or thousands of offset values. - * - *

This class optionally also tracks the minimum non-negative count for each position, - * intended to count the smallest number of elements of any path leading to that position. - * - *

The implementation uses a circular buffer of count integers, - * each indicating whether the corresponding offset is in the list, - * and its path element count. - * This avoids inserting into a sorted list of offsets (or absolute indexes) - * and physically moving part of the list. - * - *

Note: In principle, the caller should setMaxLength() to - * the maximum of the max string length and U16_LENGTH/U8_LENGTH - * to account for "long" single code points. - * - *

Note: An earlier version did not track counts and stored only byte flags. - * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, - * the list could be stored as bit flags in a single integer. - * Rather than handling a circular buffer with a start list index, - * the integer would simply be shifted when lower offsets are removed. - * UnicodeSet does not have a limit on the lengths of strings. - */ - private static final class OffsetList { - private int[] list; - private int length; - private int start; - - public OffsetList() { - list = new int[16]; // default size - } - - public void setMaxLength(int maxLength) { - if (maxLength > list.length) { - list = new int[maxLength]; - } - clear(); - } - - public void clear() { - for (int i = list.length; i-- > 0;) { - list[i] = 0; - } - start = length = 0; - } - - public boolean isEmpty() { - return (length == 0); - } - - /** - * Reduces all stored offsets by delta, used when the current position moves by delta. - * There must not be any offsets lower than delta. - * If there is an offset equal to delta, it is removed. - * - * @param delta [1..maxLength] - */ - public void shift(int delta) { - int i = start + delta; - if (i >= list.length) { - i -= list.length; - } - if (list[i] != 0) { - list[i] = 0; - --length; - } - start = i; - } - - /** - * Adds an offset. The list must not contain it yet. - * @param offset [1..maxLength] - */ - public void addOffset(int offset) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - assert list[i] == 0; - list[i] = 1; - ++length; - } - - /** - * Adds an offset and updates its count. - * The list may already contain the offset. - * @param offset [1..maxLength] - */ - public void addOffsetAndCount(int offset, int count) { - assert count > 0; - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - if (list[i] == 0) { - list[i] = count; - ++length; - } else if (count < list[i]) { - list[i] = count; - } - } - - /** - * @param offset [1..maxLength] - */ - public boolean containsOffset(int offset) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - return list[i] != 0; - } - - /** - * @param offset [1..maxLength] - */ - public boolean hasCountAtOffset(int offset, int count) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - int oldCount = list[i]; - return oldCount != 0 && oldCount <= count; - } - - /** - * Finds the lowest stored offset from a non-empty list, removes it, - * and reduces all other offsets by this minimum. - * @return min=[1..maxLength] - */ - public int popMinimum(OutputInt outCount) { - // Look for the next offset in list[start+1..list.length-1]. - int i = start, result; - while (++i < list.length) { - int count = list[i]; - if (count != 0) { - list[i] = 0; - --length; - result = i - start; - start = i; - if (outCount != null) { outCount.value = count; } - return result; - } - } - // i==list.length - - // Wrap around and look for the next offset in list[0..start]. - // Since the list is not empty, there will be one. - result = list.length - start; - i = 0; - int count; - while ((count = list[i]) == 0) { - ++i; - } - list[i] = 0; - --length; - start = i; - if (outCount != null) { outCount.value = count; } - return result + i; - } - } -} --- /dev/null 2020-01-10 13:50:20.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java 2020-01-10 13:50:20.000000000 -0800 @@ -0,0 +1,1168 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import java.util.ArrayList; + +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.text.UnicodeSet; +import jdk.internal.icu.text.UnicodeSet.SpanCondition; +import jdk.internal.icu.util.OutputInt; + +/* + * Implement span() etc. for a set with strings. + * Avoid recursion because of its exponential complexity. + * Instead, try multiple paths at once and track them with an IndexList. + */ +public class UnicodeSetStringSpan { + + /* + * Which span() variant will be used? The object is either built for one variant and used once, + * or built for all and may be used many times. + */ + public static final int WITH_COUNT = 0x40; // spanAndCount() may be called + public static final int FWD = 0x20; + public static final int BACK = 0x10; + // public static final int UTF16 = 8; + public static final int CONTAINED = 2; + public static final int NOT_CONTAINED = 1; + + public static final int ALL = 0x7f; + + public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; + public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; + public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; + public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; + + /** + * Special spanLength short values. (since Java has not unsigned byte type) + * All code points in the string are contained in the parent set. + */ + static final short ALL_CP_CONTAINED = 0xff; + + /** The spanLength is >=0xfe. */ + static final short LONG_SPAN = ALL_CP_CONTAINED - 1; + + /** Set for span(). Same as parent but without strings. */ + private UnicodeSet spanSet; + + /** + * Set for span(not contained). + * Same as spanSet, plus characters that start or end strings. + */ + private UnicodeSet spanNotSet; + + /** The strings of the parent set. */ + private ArrayList strings; + + /** The lengths of span(), spanBack() etc. for each string. */ + private short[] spanLengths; + + /** Maximum lengths of relevant strings. */ + private int maxLength16; + + /** Are there strings that are not fully contained in the code point set? */ + private boolean someRelevant; + + /** Set up for all variants of span()? */ + private boolean all; + + /** Span helper */ + private OffsetList offsets; + + /** + * Constructs for all variants of span(), or only for any one variant. + * Initializes as little as possible, for single use. + */ + public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList setStrings, int which) { + spanSet = new UnicodeSet(0, 0x10ffff); + // TODO: With Java 6, just take the parent set's strings as is, + // as a NavigableSet, rather than as an ArrayList copy of the set of strings. + // Then iterate via the first() and higher() methods. + // (We do not want to create multiple Iterator objects in each span().) + // See ICU ticket #7454. + strings = setStrings; + all = (which == ALL); + spanSet.retainAll(set); + if (0 != (which & NOT_CONTAINED)) { + // Default to the same sets. + // addToSpanNotSet() will create a separate set if necessary. + spanNotSet = spanSet; + } + offsets = new OffsetList(); + + // Determine if the strings even need to be taken into account at all for span() etc. + // If any string is relevant, then all strings need to be used for + // span(longest match) but only the relevant ones for span(while contained). + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. + int stringsLength = strings.size(); + + int i, spanLength; + someRelevant = false; + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. + someRelevant = true; + } + if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { + maxLength16 = length16; + } + } + if (!someRelevant && (which & WITH_COUNT) == 0) { + return; + } + + // Freeze after checking for the need to use strings at all because freezing + // a set takes some time and memory which are wasted if there are no relevant strings. + if (all) { + spanSet.freeze(); + } + + int spanBackLengthsOffset; + + // Allocate a block of meta data. + int allocSize; + if (all) { + // 2 sets of span lengths + allocSize = stringsLength * (2); + } else { + allocSize = stringsLength; // One set of span lengths. + } + spanLengths = new short[allocSize]; + + if (all) { + // Store span lengths for all span() variants. + spanBackLengthsOffset = stringsLength; + } else { + // Store span lengths for only one span() variant. + spanBackLengthsOffset = 0; + } + + // Set the meta data and spanNotSet and write the UTF-8 strings. + + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. + if (true /* 0 != (which & UTF16) */) { + if (0 != (which & CONTAINED)) { + if (0 != (which & FWD)) { + spanLengths[i] = makeSpanLengthByte(spanLength); + } + if (0 != (which & BACK)) { + spanLength = length16 + - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); + spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */{ + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant + // flag. + } + } + if (0 != (which & NOT_CONTAINED)) { + // Add string start and end code points to the spanNotSet so that + // a span(while not contained) stops before any string. + int c; + if (0 != (which & FWD)) { + c = string.codePointAt(0); + addToSpanNotSet(c); + } + if (0 != (which & BACK)) { + c = string.codePointBefore(length16); + addToSpanNotSet(c); + } + } + } else { // Irrelevant string. + if (all) { + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; + } else { + // All spanXYZLengths pointers contain the same address. + spanLengths[i] = ALL_CP_CONTAINED; + } + } + } + + // Finish. + if (all) { + spanNotSet.freeze(); + } + } + + /** + * Do the strings need to be checked in span() etc.? + * + * @return true if strings need to be checked (call span() here), + * false if not (use a BMPSet for best performance). + */ + public boolean needsStringSpanUTF16() { + return someRelevant; + } + + /** For fast UnicodeSet::contains(c). */ + public boolean contains(int c) { + return spanSet.contains(c); + } + + /** + * Adds a starting or ending string character to the spanNotSet + * so that a character span ends before any string. + */ + private void addToSpanNotSet(int c) { + if (spanNotSet == null || spanNotSet == spanSet) { + if (spanSet.contains(c)) { + return; // Nothing to do. + } + spanNotSet = spanSet.cloneAsThawed(); + } + spanNotSet.add(c); + } + + /* + * Note: In span() when spanLength==0 + * (after a string match, or at the beginning after an empty code point span) + * and in spanNot() and spanNotUTF8(), + * string matching could use a binary search because all string matches are done + * from the same start index. + * + * For UTF-8, this would require a comparison function that returns UTF-16 order. + * + * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets + * with strings have very few very short strings. For cases with many strings, it might be better to use a different + * API and implementation with a DFA (state machine). + */ + + /* + * Algorithm for span(SpanCondition.CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, or recursively span + * for all but the shortest one and iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, + * neither for the code point there nor for any set string, + * then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points of the original set + * but none of its strings. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the recursive algorithm would have tried to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(SpanCondition.CONTAINED) + * because the recursive algorithm would continue after them anyway and + * find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), + * then pop the shortest string match end point and continue the loop, + * trying to match all set strings from there. + * + If at least one more set string matched after a previous string match, then test if the + * code point after the previous string match is also contained in the set. + * Continue the loop with the shortest end point of + * either this code point or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * + * By noting each end point of a set string match, the function visits each string position at most once and + * finishes in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + + /* + * Algorithm for span(SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the standard algorithm would have tried to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. + * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have + * continued after the set string match anyway. + * ~ Remember the longest set string match (farthest end point) + * from the earliest starting point. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched, + * then continue the loop after the longest match from the earliest position. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + */ + /** + * Spans a string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @return the limit (exclusive end) of the span + */ + public int span(CharSequence s, int start, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, null); + } + int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); + if (spanLimit == s.length()) { + return spanLimit; + } + return spanWithStrings(s, start, spanLimit, spanCondition); + } + + /** + * Synchronized method for complicated spans using the offsets. + * Avoids synchronization for simple cases. + * + * @param spanLimit = spanSet.span(s, start, CONTAINED) + */ + private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, + SpanCondition spanCondition) { + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. + initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int length = s.length(); + int pos = spanLimit, rest = length - spanLimit; + int spanLength = spanLimit - start; + int i, stringsLength = strings.size(); + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. + overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code + // point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. + for (;;) { + if (inc > rest) { + break; + } + // Try to match if the increment is not listed already. + if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { + if (inc == rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if (overlap == 0) { + break; + } + --overlap; + ++inc; + } + } + } else /* SIMPLE */{ + int maxInc = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. + for (;;) { + if (inc > rest || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) + && matches16CPB(s, pos - overlap, length, string, length16)) { + maxInc = inc; // Longest match from earliest start. + maxOverlap = overlap; + break; + } + --overlap; + ++inc; + } + } + + if (maxInc != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos += maxInc; + rest -= maxInc; + if (rest == 0) { + return length; // Reached the end of the string. + } + spanLength = 0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == 0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); + spanLength = spanLimit - pos; + if (spanLength == rest || // Reached the end of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return spanLimit; + } + pos += spanLength; + rest -= spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOne(spanSet, s, pos, rest); + if (spanLength > 0) { + if (spanLength == rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos += spanLength; + rest -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int minOffset = offsets.popMinimum(null); + pos += minOffset; + rest -= minOffset; + spanLength = 0; // Match strings from after a string match. + } + } + + /** + * Spans a string and counts the smallest number of set elements on any path across the span. + * + *

For proper counting, we cannot ignore strings that are fully contained in code point spans. + * + *

If the set does not have any fully-contained strings, then we could optimize this + * like span(), but such sets are likely rare, and this is at least still linear. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @param outCount The count + * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, outCount); + } + // Consider strings; they may overlap with the span, + // and they may result in a smaller count that with just code points. + if (spanCondition == SpanCondition.CONTAINED) { + return spanContainedAndCount(s, start, outCount); + } + // SIMPLE (not synchronized, does not use offsets) + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + int maxInc = (cpLength > 0) ? cpLength : 0; + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + if (maxInc < length16 && length16 <= rest && + matches16CPB(s, pos, length, string, length16)) { + maxInc = length16; + } + } + // We are done if there is no match beyond pos. + if (maxInc == 0) { + outCount.value = count; + return pos; + } + // Continue from the longest match. + ++count; + pos += maxInc; + rest -= maxInc; + } + outCount.value = count; + return pos; + } + + private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + offsets.addOffsetAndCount(cpLength, count + 1); + } + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + // Note: If the strings were sorted by length, then we could also + // avoid trying to match if there is already a match of the same length. + if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && + matches16CPB(s, pos, length, string, length16)) { + offsets.addOffsetAndCount(length16, count + 1); + } + } + // We are done if there is no match beyond pos. + if (offsets.isEmpty()) { + outCount.value = count; + return pos; + } + // Continue from the nearest match. + int minOffset = offsets.popMinimum(outCount); + count = outCount.value; + pos += minOffset; + rest -= minOffset; + } + outCount.value = count; + return pos; + } + + /** + * Span a string backwards. + * + * @param s The string to be spanned + * @param spanCondition The span condition + * @return The string index which starts the span (i.e. inclusive). + */ + public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNotBack(s, length); + } + int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); + if (pos == 0) { + return 0; + } + int spanLength = length - pos; + + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. + initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int i, stringsLength = strings.size(); + int spanBackLengthsOffset = 0; + if (all) { + spanBackLengthsOffset = stringsLength; + } + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. + int len1 = 0; + len1 = string.offsetByCodePoints(0, 1); + overlap -= len1; // Length of the string minus the first code point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. + for (;;) { + if (dec > pos) { + break; + } + // Try to match if the decrement is not listed already. + if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { + if (dec == pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if (overlap == 0) { + break; + } + --overlap; + ++dec; + } + } + } else /* SIMPLE */{ + int maxDec = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. + for (;;) { + if (dec > pos || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) + && matches16CPB(s, pos - dec, length, string, length16)) { + maxDec = dec; // Longest match from latest end. + maxOverlap = overlap; + break; + } + --overlap; + ++dec; + } + } + + if (maxDec != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos -= maxDec; + if (pos == 0) { + return 0; // Reached the start of the string. + } + spanLength = 0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int oldPos = pos; + pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); + spanLength = oldPos - pos; + if (pos == 0 || // Reached the start of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOneBack(spanSet, s, pos); + if (spanLength > 0) { + if (spanLength == pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos -= offsets.popMinimum(null); + spanLength = 0; // Match strings from before a string match. + } + } + + /** + * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. + * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains + * all of the single code points of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). + * + If the current code point is in the original set, then return the current position. + * + If any set string matches at the current position, then return the current position. + * + If there is no match at the current position, neither for the code point + * there nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param outCount If not null: Receives the number of code points across the span. + * @return the limit (exclusive end) of the span + */ + private int spanNot(CharSequence s, int start, OutputInt outCount) { + int length = s.length(); + int pos = start, rest = length - start; + int stringsLength = strings.size(); + int count = 0; + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + int spanLimit; + if (outCount == null) { + spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); + } else { + spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); + outCount.value = count = count + outCount.value; + } + if (spanLimit == length) { + return length; // Reached the end of the string. + } + pos = spanLimit; + rest = length - spanLimit; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (int i = 0; i < stringsLength; ++i) { + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos -= cpLength; + rest += cpLength; + ++count; + } while (rest != 0); + if (outCount != null) { + outCount.value = count; + } + return length; // Reached the end of the string. + } + + private int spanNotBack(CharSequence s, int length) { + int pos = length; + int i, stringsLength = strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); + if (pos == 0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOneBack(spanSet, s, pos); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (i = 0; i < stringsLength; ++i) { + // Use spanLengths rather than a spanLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos += cpLength; + } while (pos != 0); + return 0; // Reached the start of the string. + } + + static short makeSpanLengthByte(int spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; + } + + // Compare strings without any argument checks. Requires length>0. + private static boolean matches16(CharSequence s, int start, final String t, int length) { + int end = start + length; + while (length-- > 0) { + if (s.charAt(--end) != t.charAt(length)) { + return false; + } + } + return true; + } + + /** + * Compare 16-bit Unicode strings (which may be malformed UTF-16) + * at code point boundaries. + * That is, each edge of a match must not be in the middle of a surrogate pair. + * @param s The string to match in. + * @param start The start index of s. + * @param limit The limit of the subsequence of s being spanned. + * @param t The substring to be matched in s. + * @param tlength The length of t. + */ + static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { + return matches16(s, start, t, tlength) + && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && + Character.isLowSurrogate(s.charAt(start))) + && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && + Character.isLowSurrogate(s.charAt(start + tlength))); + } + + /** + * Does the set contain the next code point? + * If so, return its length; otherwise return its negative length. + */ + static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { + char c = s.charAt(start); + if (c >= 0xd800 && c <= 0xdbff && length >= 2) { + char c2 = s.charAt(start + 1); + if (UTF16.isTrailSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 1 : -1; + } + + static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { + char c = s.charAt(length - 1); + if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { + char c2 = s.charAt(length - 2); + if (UTF16.isLeadSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 1 : -1; + } + + /** + * Helper class for UnicodeSetStringSpan. + * + *

List of offsets from the current position from where to try matching + * a code point or a string. + * Stores offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + *

Assumption: The maximum offset is limited, and the offsets that are stored at any one time + * are relatively dense, that is, + * there are normally no gaps of hundreds or thousands of offset values. + * + *

This class optionally also tracks the minimum non-negative count for each position, + * intended to count the smallest number of elements of any path leading to that position. + * + *

The implementation uses a circular buffer of count integers, + * each indicating whether the corresponding offset is in the list, + * and its path element count. + * This avoids inserting into a sorted list of offsets (or absolute indexes) + * and physically moving part of the list. + * + *

Note: In principle, the caller should setMaxLength() to + * the maximum of the max string length and U16_LENGTH/U8_LENGTH + * to account for "long" single code points. + * + *

Note: An earlier version did not track counts and stored only byte flags. + * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. + */ + private static final class OffsetList { + private int[] list; + private int length; + private int start; + + public OffsetList() { + list = new int[16]; // default size + } + + public void setMaxLength(int maxLength) { + if (maxLength > list.length) { + list = new int[maxLength]; + } + clear(); + } + + public void clear() { + for (int i = list.length; i-- > 0;) { + list[i] = 0; + } + start = length = 0; + } + + public boolean isEmpty() { + return (length == 0); + } + + /** + * Reduces all stored offsets by delta, used when the current position moves by delta. + * There must not be any offsets lower than delta. + * If there is an offset equal to delta, it is removed. + * + * @param delta [1..maxLength] + */ + public void shift(int delta) { + int i = start + delta; + if (i >= list.length) { + i -= list.length; + } + if (list[i] != 0) { + list[i] = 0; + --length; + } + start = i; + } + + /** + * Adds an offset. The list must not contain it yet. + * @param offset [1..maxLength] + */ + public void addOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + assert list[i] == 0; + list[i] = 1; + ++length; + } + + /** + * Adds an offset and updates its count. + * The list may already contain the offset. + * @param offset [1..maxLength] + */ + public void addOffsetAndCount(int offset, int count) { + assert count > 0; + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + if (list[i] == 0) { + list[i] = count; + ++length; + } else if (count < list[i]) { + list[i] = count; + } + } + + /** + * @param offset [1..maxLength] + */ + public boolean containsOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + return list[i] != 0; + } + + /** + * @param offset [1..maxLength] + */ + public boolean hasCountAtOffset(int offset, int count) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + int oldCount = list[i]; + return oldCount != 0 && oldCount <= count; + } + + /** + * Finds the lowest stored offset from a non-empty list, removes it, + * and reduces all other offsets by this minimum. + * @return min=[1..maxLength] + */ + public int popMinimum(OutputInt outCount) { + // Look for the next offset in list[start+1..list.length-1]. + int i = start, result; + while (++i < list.length) { + int count = list[i]; + if (count != 0) { + list[i] = 0; + --length; + result = i - start; + start = i; + if (outCount != null) { outCount.value = count; } + return result; + } + } + // i==list.length + + // Wrap around and look for the next offset in list[0..start]. + // Since the list is not empty, there will be one. + result = list.length - start; + i = 0; + int count; + while ((count = list[i]) == 0) { + ++i; + } + list[i] = 0; + --length; + start = i; + if (outCount != null) { outCount.value = count; } + return result + i; + } + } +}