1 /* 2 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 // (c) 2018 and later: Unicode, Inc. and others. 26 // License & terms of use: http://www.unicode.org/copyright.html#License 27 28 // created: 2018may10 Markus W. Scherer 29 30 package jdk.internal.icu.util; 31 32 import java.util.Iterator; 33 import java.util.NoSuchElementException; 34 35 /** 36 * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values. 37 * This does not implement java.util.Map. 38 * 39 * @draft ICU 63 40 * @provisional This API might change or be removed in a future release. 41 */ 42 public abstract class CodePointMap implements Iterable<CodePointMap.Range> { 43 /** 44 * Selectors for how getRange() should report value ranges overlapping with surrogates. 45 * Most users should use NORMAL. 46 * 47 * @see #getRange 48 * @draft ICU 63 49 * @provisional This API might change or be removed in a future release. 50 */ 51 public enum RangeOption { 52 /** 53 * getRange() enumerates all same-value ranges as stored in the map. 54 * Most users should use this option. 55 * 56 * @draft ICU 63 57 * @provisional This API might change or be removed in a future release. 58 */ 59 NORMAL, 60 /** 61 * getRange() enumerates all same-value ranges as stored in the map, 62 * except that lead surrogates (U+D800..U+DBFF) are treated as having the 63 * surrogateValue, which is passed to getRange() as a separate parameter. 64 * The surrogateValue is not transformed via filter(). 65 * See {@link Character#isHighSurrogate}. 66 * 67 * <p>Most users should use NORMAL instead. 68 * 69 * <p>This option is useful for maps that map surrogate code *units* to 70 * special values optimized for UTF-16 string processing 71 * or for special error behavior for unpaired surrogates, 72 * but those values are not to be associated with the lead surrogate code *points*. 73 * 74 * @draft ICU 63 75 * @provisional This API might change or be removed in a future release. 76 */ 77 FIXED_LEAD_SURROGATES, 78 /** 79 * getRange() enumerates all same-value ranges as stored in the map, 80 * except that all surrogates (U+D800..U+DFFF) are treated as having the 81 * surrogateValue, which is passed to getRange() as a separate parameter. 82 * The surrogateValue is not transformed via filter(). 83 * See {@link Character#isSurrogate}. 84 * 85 * <p>Most users should use NORMAL instead. 86 * 87 * <p>This option is useful for maps that map surrogate code *units* to 88 * special values optimized for UTF-16 string processing 89 * or for special error behavior for unpaired surrogates, 90 * but those values are not to be associated with the lead surrogate code *points*. 91 * 92 * @draft ICU 63 93 * @provisional This API might change or be removed in a future release. 94 */ 95 FIXED_ALL_SURROGATES 96 } 97 98 /** 99 * Callback function interface: Modifies a map value. 100 * Optionally called by getRange(). 101 * The modified value will be returned by the getRange() function. 102 * 103 * <p>Can be used to ignore some of the value bits, 104 * make a filter for one of several values, 105 * return a value index computed from the map value, etc. 106 * 107 * @see #getRange 108 * @see #iterator 109 * @draft ICU 63 110 * @provisional This API might change or be removed in a future release. 111 */ 112 public interface ValueFilter { 113 /** 114 * Modifies the map value. 115 * 116 * @param value map value 117 * @return modified value 118 * @draft ICU 63 119 * @provisional This API might change or be removed in a future release. 120 */ 121 public int apply(int value); 122 } 123 124 /** 125 * Range iteration result data. 126 * Code points from start to end map to the same value. 127 * The value may have been modified by {@link ValueFilter#apply(int)}, 128 * or it may be the surrogateValue if a RangeOption other than "normal" was used. 129 * 130 * @see #getRange 131 * @see #iterator 132 * @draft ICU 63 133 * @provisional This API might change or be removed in a future release. 134 */ 135 public static final class Range { 136 private int start; 137 private int end; 138 private int value; 139 140 /** 141 * Constructor. Sets start and end to -1 and value to 0. 142 * 143 * @draft ICU 63 144 * @provisional This API might change or be removed in a future release. 145 */ 146 public Range() { 147 start = end = -1; 148 value = 0; 149 } 150 151 /** 152 * @return the start code point 153 * @draft ICU 63 154 * @provisional This API might change or be removed in a future release. 155 */ 156 public int getStart() { return start; } 157 /** 158 * @return the (inclusive) end code point 159 * @draft ICU 63 160 * @provisional This API might change or be removed in a future release. 161 */ 162 public int getEnd() { return end; } 163 /** 164 * @return the range value 165 * @draft ICU 63 166 * @provisional This API might change or be removed in a future release. 167 */ 168 public int getValue() { return value; } 169 /** 170 * Sets the range. When using {@link #iterator()}, 171 * iteration will resume after the newly set end. 172 * 173 * @param start new start code point 174 * @param end new end code point 175 * @param value new value 176 * @draft ICU 63 177 * @provisional This API might change or be removed in a future release. 178 */ 179 public void set(int start, int end, int value) { 180 this.start = start; 181 this.end = end; 182 this.value = value; 183 } 184 } 185 186 private final class RangeIterator implements Iterator<Range> { 187 private Range range = new Range(); 188 189 @Override 190 public boolean hasNext() { 191 return -1 <= range.end && range.end < 0x10ffff; 192 } 193 194 @Override 195 public Range next() { 196 if (getRange(range.end + 1, null, range)) { 197 return range; 198 } else { 199 throw new NoSuchElementException(); 200 } 201 } 202 203 @Override 204 public final void remove() { 205 throw new UnsupportedOperationException(); 206 } 207 } 208 209 /** 210 * Iterates over code points of a string and fetches map values. 211 * This does not implement java.util.Iterator. 212 * 213 * <pre> 214 * void onString(CodePointMap map, CharSequence s, int start) { 215 * CodePointMap.StringIterator iter = map.stringIterator(s, start); 216 * while (iter.next()) { 217 * int end = iter.getIndex(); // code point from between start and end 218 * useValue(s, start, end, iter.getCodePoint(), iter.getValue()); 219 * start = end; 220 * } 221 * } 222 * </pre> 223 * 224 * <p>This class is not intended for public subclassing. 225 * 226 * @draft ICU 63 227 * @provisional This API might change or be removed in a future release. 228 */ 229 public class StringIterator { 230 /** 231 * @internal 232 * @deprecated This API is ICU internal only. 233 */ 234 @Deprecated 235 protected CharSequence s; 236 /** 237 * @internal 238 * @deprecated This API is ICU internal only. 239 */ 240 @Deprecated 241 protected int sIndex; 242 /** 243 * @internal 244 * @deprecated This API is ICU internal only. 245 */ 246 @Deprecated 247 protected int c; 248 /** 249 * @internal 250 * @deprecated This API is ICU internal only. 251 */ 252 @Deprecated 253 protected int value; 254 255 /** 256 * @internal 257 * @deprecated This API is ICU internal only. 258 */ 259 @Deprecated 260 protected StringIterator(CharSequence s, int sIndex) { 261 this.s = s; 262 this.sIndex = sIndex; 263 c = -1; 264 value = 0; 265 } 266 267 /** 268 * Resets the iterator to a new string and/or a new string index. 269 * 270 * @param s string to iterate over 271 * @param sIndex string index where the iteration will start 272 * @draft ICU 63 273 * @provisional This API might change or be removed in a future release. 274 */ 275 public void reset(CharSequence s, int sIndex) { 276 this.s = s; 277 this.sIndex = sIndex; 278 c = -1; 279 value = 0; 280 } 281 282 /** 283 * Reads the next code point, post-increments the string index, 284 * and gets a value from the map. 285 * Sets an implementation-defined error value if the code point is an unpaired surrogate. 286 * 287 * @return true if the string index was not yet at the end of the string; 288 * otherwise the iterator did not advance 289 * @draft ICU 63 290 * @provisional This API might change or be removed in a future release. 291 */ 292 public boolean next() { 293 if (sIndex >= s.length()) { 294 return false; 295 } 296 c = Character.codePointAt(s, sIndex); 297 sIndex += Character.charCount(c); 298 value = get(c); 299 return true; 300 } 301 302 /** 303 * Reads the previous code point, pre-decrements the string index, 304 * and gets a value from the map. 305 * Sets an implementation-defined error value if the code point is an unpaired surrogate. 306 * 307 * @return true if the string index was not yet at the start of the string; 308 * otherwise the iterator did not advance 309 * @draft ICU 63 310 * @provisional This API might change or be removed in a future release. 311 */ 312 public boolean previous() { 313 if (sIndex <= 0) { 314 return false; 315 } 316 c = Character.codePointBefore(s, sIndex); 317 sIndex -= Character.charCount(c); 318 value = get(c); 319 return true; 320 } 321 /** 322 * @return the string index 323 * @draft ICU 63 324 * @provisional This API might change or be removed in a future release. 325 */ 326 public final int getIndex() { return sIndex; } 327 /** 328 * @return the code point 329 * @draft ICU 63 330 * @provisional This API might change or be removed in a future release. 331 */ 332 public final int getCodePoint() { return c; } 333 /** 334 * @return the map value, 335 * or an implementation-defined error value if 336 * the code point is an unpaired surrogate 337 * @draft ICU 63 338 * @provisional This API might change or be removed in a future release. 339 */ 340 public final int getValue() { return value; } 341 } 342 343 /** 344 * Protected no-args constructor. 345 * 346 * @draft ICU 63 347 * @provisional This API might change or be removed in a future release. 348 */ 349 protected CodePointMap() { 350 } 351 352 /** 353 * Returns the value for a code point as stored in the map, with range checking. 354 * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF. 355 * 356 * @param c the code point 357 * @return the map value, 358 * or an implementation-defined error value if 359 * the code point is not in the range 0..U+10FFFF 360 * @draft ICU 63 361 * @provisional This API might change or be removed in a future release. 362 */ 363 public abstract int get(int c); 364 365 /** 366 * Sets the range object to a range of code points beginning with the start parameter. 367 * The range start is the same as the start input parameter 368 * (even if there are preceding code points that have the same value). 369 * The range end is the last code point such that 370 * all those from start to there have the same value. 371 * Returns false if start is not 0..U+10FFFF. 372 * Can be used to efficiently iterate over all same-value ranges in a map. 373 * (This is normally faster than iterating over code points and get()ting each value, 374 * but may be much slower than a data structure that stores ranges directly.) 375 * 376 * <p>If the {@link ValueFilter} parameter is not null, then 377 * the value to be delivered is passed through that filter, and the return value is the end 378 * of the range where all values are modified to the same actual value. 379 * The value is unchanged if that parameter is null. 380 * 381 * <p>Example: 382 * <pre> 383 * int start = 0; 384 * CodePointMap.Range range = new CodePointMap.Range(); 385 * while (map.getRange(start, null, range)) { 386 * int end = range.getEnd(); 387 * int value = range.getValue(); 388 * // Work with the range start..end and its value. 389 * start = end + 1; 390 * } 391 * </pre> 392 * 393 * @param start range start 394 * @param filter an object that may modify the map data value, 395 * or null if the values from the map are to be used unmodified 396 * @param range the range object that will be set to the code point range and value 397 * @return true if start is 0..U+10FFFF; otherwise no new range is fetched 398 * @draft ICU 63 399 * @provisional This API might change or be removed in a future release. 400 */ 401 public abstract boolean getRange(int start, ValueFilter filter, Range range); 402 403 /** 404 * Sets the range object to a range of code points beginning with the start parameter. 405 * The range start is the same as the start input parameter 406 * (even if there are preceding code points that have the same value). 407 * The range end is the last code point such that 408 * all those from start to there have the same value. 409 * Returns false if start is not 0..U+10FFFF. 410 * 411 * <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally 412 * modifies the range if it overlaps with surrogate code points. 413 * 414 * @param start range start 415 * @param option defines whether surrogates are treated normally, 416 * or as having the surrogateValue; usually {@link RangeOption#NORMAL} 417 * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL} 418 * @param filter an object that may modify the map data value, 419 * or null if the values from the map are to be used unmodified 420 * @param range the range object that will be set to the code point range and value 421 * @return true if start is 0..U+10FFFF; otherwise no new range is fetched 422 * @draft ICU 63 423 * @provisional This API might change or be removed in a future release. 424 */ 425 public boolean getRange(int start, RangeOption option, int surrogateValue, 426 ValueFilter filter, Range range) { 427 assert option != null; 428 if (!getRange(start, filter, range)) { 429 return false; 430 } 431 if (option == RangeOption.NORMAL) { 432 return true; 433 } 434 int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff; 435 int end = range.end; 436 if (end < 0xd7ff || start > surrEnd) { 437 return true; 438 } 439 // The range overlaps with surrogates, or ends just before the first one. 440 if (range.value == surrogateValue) { 441 if (end >= surrEnd) { 442 // Surrogates followed by a non-surrValue range, 443 // or surrogates are part of a larger surrValue range. 444 return true; 445 } 446 } else { 447 if (start <= 0xd7ff) { 448 range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates. 449 return true; 450 } 451 // Start is a surrogate with a non-surrValue code *unit* value. 452 // Return a surrValue code *point* range. 453 range.value = surrogateValue; 454 if (end > surrEnd) { 455 range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range. 456 return true; 457 } 458 } 459 // See if the surrValue surrogate range can be merged with 460 // an immediately following range. 461 if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) { 462 range.start = start; 463 return true; 464 } 465 range.start = start; 466 range.end = surrEnd; 467 range.value = surrogateValue; 468 return true; 469 } 470 471 /** 472 * Convenience iterator over same-map-value code point ranges. 473 * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)} 474 * without filtering. 475 * Adjacent ranges have different map values. 476 * 477 * <p>The iterator always returns the same Range object. 478 * 479 * @return a Range iterator 480 * @draft ICU 63 481 * @provisional This API might change or be removed in a future release. 482 */ 483 @Override 484 public Iterator<Range> iterator() { 485 return new RangeIterator(); 486 } 487 488 /** 489 * Returns an iterator (not a java.util.Iterator) over code points of a string 490 * for fetching map values. 491 * 492 * @param s string to iterate over 493 * @param sIndex string index where the iteration will start 494 * @return the iterator 495 * @draft ICU 63 496 * @provisional This API might change or be removed in a future release. 497 */ 498 public StringIterator stringIterator(CharSequence s, int sIndex) { 499 return new StringIterator(s, sIndex); 500 } 501 }