1 /*
   2  * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 // (c) 2018 and later: Unicode, Inc. and others.
  26 // License & terms of use: http://www.unicode.org/copyright.html#License
  27 
  28 // created: 2018may10 Markus W. Scherer
  29 
  30 package jdk.internal.icu.util;
  31 
  32 import java.util.Iterator;
  33 import java.util.NoSuchElementException;
  34 
  35 /**
  36  * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
  37  * This does not implement java.util.Map.
  38  *
  39  * @draft ICU 63
  40  * @provisional This API might change or be removed in a future release.
  41  */
  42 public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
  43     /**
  44      * Selectors for how getRange() should report value ranges overlapping with surrogates.
  45      * Most users should use NORMAL.
  46      *
  47      * @see #getRange
  48      * @draft ICU 63
  49      * @provisional This API might change or be removed in a future release.
  50      */
  51     public enum RangeOption {
  52         /**
  53          * getRange() enumerates all same-value ranges as stored in the map.
  54          * Most users should use this option.
  55          *
  56          * @draft ICU 63
  57          * @provisional This API might change or be removed in a future release.
  58          */
  59         NORMAL,
  60         /**
  61          * getRange() enumerates all same-value ranges as stored in the map,
  62          * except that lead surrogates (U+D800..U+DBFF) are treated as having the
  63          * surrogateValue, which is passed to getRange() as a separate parameter.
  64          * The surrogateValue is not transformed via filter().
  65          * See {@link Character#isHighSurrogate}.
  66          *
  67          * <p>Most users should use NORMAL instead.
  68          *
  69          * <p>This option is useful for maps that map surrogate code *units* to
  70          * special values optimized for UTF-16 string processing
  71          * or for special error behavior for unpaired surrogates,
  72          * but those values are not to be associated with the lead surrogate code *points*.
  73          *
  74          * @draft ICU 63
  75          * @provisional This API might change or be removed in a future release.
  76          */
  77         FIXED_LEAD_SURROGATES,
  78         /**
  79          * getRange() enumerates all same-value ranges as stored in the map,
  80          * except that all surrogates (U+D800..U+DFFF) are treated as having the
  81          * surrogateValue, which is passed to getRange() as a separate parameter.
  82          * The surrogateValue is not transformed via filter().
  83          * See {@link Character#isSurrogate}.
  84          *
  85          * <p>Most users should use NORMAL instead.
  86          *
  87          * <p>This option is useful for maps that map surrogate code *units* to
  88          * special values optimized for UTF-16 string processing
  89          * or for special error behavior for unpaired surrogates,
  90          * but those values are not to be associated with the lead surrogate code *points*.
  91          *
  92          * @draft ICU 63
  93          * @provisional This API might change or be removed in a future release.
  94          */
  95         FIXED_ALL_SURROGATES
  96     }
  97 
  98     /**
  99      * Callback function interface: Modifies a map value.
 100      * Optionally called by getRange().
 101      * The modified value will be returned by the getRange() function.
 102      *
 103      * <p>Can be used to ignore some of the value bits,
 104      * make a filter for one of several values,
 105      * return a value index computed from the map value, etc.
 106      *
 107      * @see #getRange
 108      * @see #iterator
 109      * @draft ICU 63
 110      * @provisional This API might change or be removed in a future release.
 111      */
 112     public interface ValueFilter {
 113         /**
 114          * Modifies the map value.
 115          *
 116          * @param value map value
 117          * @return modified value
 118          * @draft ICU 63
 119          * @provisional This API might change or be removed in a future release.
 120          */
 121         public int apply(int value);
 122     }
 123 
 124     /**
 125      * Range iteration result data.
 126      * Code points from start to end map to the same value.
 127      * The value may have been modified by {@link ValueFilter#apply(int)},
 128      * or it may be the surrogateValue if a RangeOption other than "normal" was used.
 129      *
 130      * @see #getRange
 131      * @see #iterator
 132      * @draft ICU 63
 133      * @provisional This API might change or be removed in a future release.
 134      */
 135     public static final class Range {
 136         private int start;
 137         private int end;
 138         private int value;
 139 
 140         /**
 141          * Constructor. Sets start and end to -1 and value to 0.
 142          *
 143          * @draft ICU 63
 144          * @provisional This API might change or be removed in a future release.
 145          */
 146         public Range() {
 147             start = end = -1;
 148             value = 0;
 149         }
 150 
 151         /**
 152          * @return the start code point
 153          * @draft ICU 63
 154          * @provisional This API might change or be removed in a future release.
 155          */
 156         public int getStart() { return start; }
 157         /**
 158          * @return the (inclusive) end code point
 159          * @draft ICU 63
 160          * @provisional This API might change or be removed in a future release.
 161          */
 162         public int getEnd() { return end; }
 163         /**
 164          * @return the range value
 165          * @draft ICU 63
 166          * @provisional This API might change or be removed in a future release.
 167          */
 168         public int getValue() { return value; }
 169         /**
 170          * Sets the range. When using {@link #iterator()},
 171          * iteration will resume after the newly set end.
 172          *
 173          * @param start new start code point
 174          * @param end new end code point
 175          * @param value new value
 176          * @draft ICU 63
 177          * @provisional This API might change or be removed in a future release.
 178          */
 179         public void set(int start, int end, int value) {
 180             this.start = start;
 181             this.end = end;
 182             this.value = value;
 183         }
 184     }
 185 
 186     private final class RangeIterator implements Iterator<Range> {
 187         private Range range = new Range();
 188 
 189         @Override
 190         public boolean hasNext() {
 191             return -1 <= range.end && range.end < 0x10ffff;
 192         }
 193 
 194         @Override
 195         public Range next() {
 196             if (getRange(range.end + 1, null, range)) {
 197                 return range;
 198             } else {
 199                 throw new NoSuchElementException();
 200             }
 201         }
 202 
 203         @Override
 204         public final void remove() {
 205             throw new UnsupportedOperationException();
 206         }
 207     }
 208 
 209     /**
 210      * Iterates over code points of a string and fetches map values.
 211      * This does not implement java.util.Iterator.
 212      *
 213      * <pre>
 214      * void onString(CodePointMap map, CharSequence s, int start) {
 215      *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
 216      *     while (iter.next()) {
 217      *         int end = iter.getIndex();  // code point from between start and end
 218      *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
 219      *         start = end;
 220      *     }
 221      * }
 222      * </pre>
 223      *
 224      * <p>This class is not intended for public subclassing.
 225      *
 226      * @draft ICU 63
 227      * @provisional This API might change or be removed in a future release.
 228      */
 229     public class StringIterator {
 230         /**
 231          * @internal
 232          * @deprecated This API is ICU internal only.
 233          */
 234         @Deprecated
 235         protected CharSequence s;
 236         /**
 237          * @internal
 238          * @deprecated This API is ICU internal only.
 239          */
 240         @Deprecated
 241         protected int sIndex;
 242         /**
 243          * @internal
 244          * @deprecated This API is ICU internal only.
 245          */
 246         @Deprecated
 247         protected int c;
 248         /**
 249          * @internal
 250          * @deprecated This API is ICU internal only.
 251          */
 252         @Deprecated
 253         protected int value;
 254 
 255         /**
 256          * @internal
 257          * @deprecated This API is ICU internal only.
 258          */
 259         @Deprecated
 260         protected StringIterator(CharSequence s, int sIndex) {
 261             this.s = s;
 262             this.sIndex = sIndex;
 263             c = -1;
 264             value = 0;
 265         }
 266 
 267         /**
 268          * Resets the iterator to a new string and/or a new string index.
 269          *
 270          * @param s string to iterate over
 271          * @param sIndex string index where the iteration will start
 272          * @draft ICU 63
 273          * @provisional This API might change or be removed in a future release.
 274          */
 275         public void reset(CharSequence s, int sIndex) {
 276             this.s = s;
 277             this.sIndex = sIndex;
 278             c = -1;
 279             value = 0;
 280         }
 281 
 282         /**
 283          * Reads the next code point, post-increments the string index,
 284          * and gets a value from the map.
 285          * Sets an implementation-defined error value if the code point is an unpaired surrogate.
 286          *
 287          * @return true if the string index was not yet at the end of the string;
 288          *         otherwise the iterator did not advance
 289          * @draft ICU 63
 290          * @provisional This API might change or be removed in a future release.
 291          */
 292         public boolean next() {
 293             if (sIndex >= s.length()) {
 294                 return false;
 295             }
 296             c = Character.codePointAt(s, sIndex);
 297             sIndex += Character.charCount(c);
 298             value = get(c);
 299             return true;
 300         }
 301 
 302         /**
 303          * Reads the previous code point, pre-decrements the string index,
 304          * and gets a value from the map.
 305          * Sets an implementation-defined error value if the code point is an unpaired surrogate.
 306          *
 307          * @return true if the string index was not yet at the start of the string;
 308          *         otherwise the iterator did not advance
 309          * @draft ICU 63
 310          * @provisional This API might change or be removed in a future release.
 311          */
 312         public boolean previous() {
 313             if (sIndex <= 0) {
 314                 return false;
 315             }
 316             c = Character.codePointBefore(s, sIndex);
 317             sIndex -= Character.charCount(c);
 318             value = get(c);
 319             return true;
 320         }
 321         /**
 322          * @return the string index
 323          * @draft ICU 63
 324          * @provisional This API might change or be removed in a future release.
 325          */
 326         public final int getIndex() { return sIndex; }
 327         /**
 328          * @return the code point
 329          * @draft ICU 63
 330          * @provisional This API might change or be removed in a future release.
 331          */
 332         public final int getCodePoint() { return c; }
 333         /**
 334          * @return the map value,
 335          *         or an implementation-defined error value if
 336          *         the code point is an unpaired surrogate
 337          * @draft ICU 63
 338          * @provisional This API might change or be removed in a future release.
 339          */
 340         public final int getValue() { return value; }
 341     }
 342 
 343     /**
 344      * Protected no-args constructor.
 345      *
 346      * @draft ICU 63
 347      * @provisional This API might change or be removed in a future release.
 348      */
 349     protected CodePointMap() {
 350     }
 351 
 352     /**
 353      * Returns the value for a code point as stored in the map, with range checking.
 354      * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
 355      *
 356      * @param c the code point
 357      * @return the map value,
 358      *         or an implementation-defined error value if
 359      *         the code point is not in the range 0..U+10FFFF
 360      * @draft ICU 63
 361      * @provisional This API might change or be removed in a future release.
 362      */
 363     public abstract int get(int c);
 364 
 365     /**
 366      * Sets the range object to a range of code points beginning with the start parameter.
 367      * The range start is the same as the start input parameter
 368      * (even if there are preceding code points that have the same value).
 369      * The range end is the last code point such that
 370      * all those from start to there have the same value.
 371      * Returns false if start is not 0..U+10FFFF.
 372      * Can be used to efficiently iterate over all same-value ranges in a map.
 373      * (This is normally faster than iterating over code points and get()ting each value,
 374      * but may be much slower than a data structure that stores ranges directly.)
 375      *
 376      * <p>If the {@link ValueFilter} parameter is not null, then
 377      * the value to be delivered is passed through that filter, and the return value is the end
 378      * of the range where all values are modified to the same actual value.
 379      * The value is unchanged if that parameter is null.
 380      *
 381      * <p>Example:
 382      * <pre>
 383      * int start = 0;
 384      * CodePointMap.Range range = new CodePointMap.Range();
 385      * while (map.getRange(start, null, range)) {
 386      *     int end = range.getEnd();
 387      *     int value = range.getValue();
 388      *     // Work with the range start..end and its value.
 389      *     start = end + 1;
 390      * }
 391      * </pre>
 392      *
 393      * @param start range start
 394      * @param filter an object that may modify the map data value,
 395      *     or null if the values from the map are to be used unmodified
 396      * @param range the range object that will be set to the code point range and value
 397      * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
 398      * @draft ICU 63
 399      * @provisional This API might change or be removed in a future release.
 400      */
 401     public abstract boolean getRange(int start, ValueFilter filter, Range range);
 402 
 403     /**
 404      * Sets the range object to a range of code points beginning with the start parameter.
 405      * The range start is the same as the start input parameter
 406      * (even if there are preceding code points that have the same value).
 407      * The range end is the last code point such that
 408      * all those from start to there have the same value.
 409      * Returns false if start is not 0..U+10FFFF.
 410      *
 411      * <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
 412      * modifies the range if it overlaps with surrogate code points.
 413      *
 414      * @param start range start
 415      * @param option defines whether surrogates are treated normally,
 416      *               or as having the surrogateValue; usually {@link RangeOption#NORMAL}
 417      * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
 418      * @param filter an object that may modify the map data value,
 419      *     or null if the values from the map are to be used unmodified
 420      * @param range the range object that will be set to the code point range and value
 421      * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
 422      * @draft ICU 63
 423      * @provisional This API might change or be removed in a future release.
 424      */
 425     public boolean getRange(int start, RangeOption option, int surrogateValue,
 426             ValueFilter filter, Range range) {
 427         assert option != null;
 428         if (!getRange(start, filter, range)) {
 429             return false;
 430         }
 431         if (option == RangeOption.NORMAL) {
 432             return true;
 433         }
 434         int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
 435         int end = range.end;
 436         if (end < 0xd7ff || start > surrEnd) {
 437             return true;
 438         }
 439         // The range overlaps with surrogates, or ends just before the first one.
 440         if (range.value == surrogateValue) {
 441             if (end >= surrEnd) {
 442                 // Surrogates followed by a non-surrValue range,
 443                 // or surrogates are part of a larger surrValue range.
 444                 return true;
 445             }
 446         } else {
 447             if (start <= 0xd7ff) {
 448                 range.end = 0xd7ff;  // Non-surrValue range ends before surrValue surrogates.
 449                 return true;
 450             }
 451             // Start is a surrogate with a non-surrValue code *unit* value.
 452             // Return a surrValue code *point* range.
 453             range.value = surrogateValue;
 454             if (end > surrEnd) {
 455                 range.end = surrEnd;  // Surrogate range ends before non-surrValue rest of range.
 456                 return true;
 457             }
 458         }
 459         // See if the surrValue surrogate range can be merged with
 460         // an immediately following range.
 461         if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
 462             range.start = start;
 463             return true;
 464         }
 465         range.start = start;
 466         range.end = surrEnd;
 467         range.value = surrogateValue;
 468         return true;
 469     }
 470 
 471     /**
 472      * Convenience iterator over same-map-value code point ranges.
 473      * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
 474      * without filtering.
 475      * Adjacent ranges have different map values.
 476      *
 477      * <p>The iterator always returns the same Range object.
 478      *
 479      * @return a Range iterator
 480      * @draft ICU 63
 481      * @provisional This API might change or be removed in a future release.
 482      */
 483     @Override
 484     public Iterator<Range> iterator() {
 485         return new RangeIterator();
 486     }
 487 
 488     /**
 489      * Returns an iterator (not a java.util.Iterator) over code points of a string
 490      * for fetching map values.
 491      *
 492      * @param s string to iterate over
 493      * @param sIndex string index where the iteration will start
 494      * @return the iterator
 495      * @draft ICU 63
 496      * @provisional This API might change or be removed in a future release.
 497      */
 498     public StringIterator stringIterator(CharSequence s, int sIndex) {
 499         return new StringIterator(s, sIndex);
 500     }
 501 }