New src/java.base/share/classes/java/text/BreakIterator.java

   1 /*
   2  * Copyright (c) 1996, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  28  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  29  *
  30  * The original version of this source code and documentation
  31  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  32  * subsidiary of IBM. These materials are provided under terms
  33  * of a License Agreement between Taligent and Sun. This technology
  34  * is protected by multiple US and International patents.
  35  *
  36  * This notice and attribution to Taligent may not be removed.
  37  * Taligent is a registered trademark of Taligent, Inc.
  38  *
  39  */
  40 
  41 package java.text;
  42 
  43 import java.lang.ref.SoftReference;
  44 import java.text.spi.BreakIteratorProvider;
  45 import java.util.Locale;
  46 import sun.util.locale.provider.LocaleProviderAdapter;
  47 import sun.util.locale.provider.LocaleServiceProviderPool;
  48 
  49 
  50 /**
  51  * The {@code BreakIterator} class implements methods for finding
  52  * the location of boundaries in text. Instances of {@code BreakIterator}
  53  * maintain a current position and scan over text
  54  * returning the index of characters where boundaries occur.
  55  * Internally, {@code BreakIterator} scans text using a
  56  * {@code CharacterIterator}, and is thus able to scan text held
  57  * by any object implementing that protocol. A {@code StringCharacterIterator}
  58  * is used to scan {@code String} objects passed to {@code setText}.
  59  *
  60  * <p>
  61  * You use the factory methods provided by this class to create
  62  * instances of various types of break iterators. In particular,
  63  * use {@code getWordInstance}, {@code getLineInstance},
  64  * {@code getSentenceInstance}, and {@code getCharacterInstance}
  65  * to create {@code BreakIterator}s that perform
  66  * word, line, sentence, and character boundary analysis respectively.
  67  * A single {@code BreakIterator} can work only on one unit
  68  * (word, line, sentence, and so on). You must use a different iterator
  69  * for each unit boundary analysis you wish to perform.
  70  *
  71  * <p><a id="line"></a>
  72  * Line boundary analysis determines where a text string can be
  73  * broken when line-wrapping. The mechanism correctly handles
  74  * punctuation and hyphenated words. Actual line breaking needs
  75  * to also consider the available line width and is handled by
  76  * higher-level software.
  77  *
  78  * <p><a id="sentence"></a>
  79  * Sentence boundary analysis allows selection with correct interpretation
  80  * of periods within numbers and abbreviations, and trailing punctuation
  81  * marks such as quotation marks and parentheses.
  82  *
  83  * <p><a id="word"></a>
  84  * Word boundary analysis is used by search and replace functions, as
  85  * well as within text editing applications that allow the user to
  86  * select words with a double click. Word selection provides correct
  87  * interpretation of punctuation marks within and following
  88  * words. Characters that are not part of a word, such as symbols
  89  * or punctuation marks, have word-breaks on both sides.
  90  *
  91  * <p><a id="character"></a>
  92  * Character boundary analysis allows users to interact with characters
  93  * as they expect to, for example, when moving the cursor through a text
  94  * string. Character boundary analysis provides correct navigation
  95  * through character strings, regardless of how the character is stored.
  96  * The boundaries returned may be those of supplementary characters,
  97  * combining character sequences, or ligature clusters.
  98  * For example, an accented character might be stored as a base character
  99  * and a diacritical mark. What users consider to be a character can
 100  * differ between languages.
 101  *
 102  * <p>
 103  * The {@code BreakIterator} instances returned by the factory methods
 104  * of this class are intended for use with natural languages only, not for
 105  * programming language text. It is however possible to define subclasses
 106  * that tokenize a programming language.
 107  *
 108  * <P>
 109  * <strong>Examples</strong>:<P>
 110  * Creating and using text boundaries:
 111  * <blockquote>
 112  * <pre>
 113  * public static void main(String args[]) {
 114  *      if (args.length == 1) {
 115  *          String stringToExamine = args[0];
 116  *          //print each word in order
 117  *          BreakIterator boundary = BreakIterator.getWordInstance();
 118  *          boundary.setText(stringToExamine);
 119  *          printEachForward(boundary, stringToExamine);
 120  *          //print each sentence in reverse order
 121  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
 122  *          boundary.setText(stringToExamine);
 123  *          printEachBackward(boundary, stringToExamine);
 124  *          printFirst(boundary, stringToExamine);
 125  *          printLast(boundary, stringToExamine);
 126  *      }
 127  * }
 128  * </pre>
 129  * </blockquote>
 130  *
 131  * Print each element in order:
 132  * <blockquote>
 133  * <pre>
 134  * public static void printEachForward(BreakIterator boundary, String source) {
 135  *     int start = boundary.first();
 136  *     for (int end = boundary.next();
 137  *          end != BreakIterator.DONE;
 138  *          start = end, end = boundary.next()) {
 139  *         System.out.println(source.substring(start,end));
 140  *     }
 141  * }
 142  * </pre>
 143  * </blockquote>
 144  *
 145  * Print each element in reverse order:
 146  * <blockquote>
 147  * <pre>
 148  * public static void printEachBackward(BreakIterator boundary, String source) {
 149  *     int end = boundary.last();
 150  *     for (int start = boundary.previous();
 151  *          start != BreakIterator.DONE;
 152  *          end = start, start = boundary.previous()) {
 153  *         System.out.println(source.substring(start,end));
 154  *     }
 155  * }
 156  * </pre>
 157  * </blockquote>
 158  *
 159  * Print first element:
 160  * <blockquote>
 161  * <pre>
 162  * public static void printFirst(BreakIterator boundary, String source) {
 163  *     int start = boundary.first();
 164  *     int end = boundary.next();
 165  *     System.out.println(source.substring(start,end));
 166  * }
 167  * </pre>
 168  * </blockquote>
 169  *
 170  * Print last element:
 171  * <blockquote>
 172  * <pre>
 173  * public static void printLast(BreakIterator boundary, String source) {
 174  *     int end = boundary.last();
 175  *     int start = boundary.previous();
 176  *     System.out.println(source.substring(start,end));
 177  * }
 178  * </pre>
 179  * </blockquote>
 180  *
 181  * Print the element at a specified position:
 182  * <blockquote>
 183  * <pre>
 184  * public static void printAt(BreakIterator boundary, int pos, String source) {
 185  *     int end = boundary.following(pos);
 186  *     int start = boundary.previous();
 187  *     System.out.println(source.substring(start,end));
 188  * }
 189  * </pre>
 190  * </blockquote>
 191  *
 192  * Find the next word:
 193  * <blockquote>
 194  * <pre>{@code
 195  * public static int nextWordStartAfter(int pos, String text) {
 196  *     BreakIterator wb = BreakIterator.getWordInstance();
 197  *     wb.setText(text);
 198  *     int last = wb.following(pos);
 199  *     int current = wb.next();
 200  *     while (current != BreakIterator.DONE) {
 201  *         for (int p = last; p < current; p++) {
 202  *             if (Character.isLetter(text.codePointAt(p)))
 203  *                 return last;
 204  *         }
 205  *         last = current;
 206  *         current = wb.next();
 207  *     }
 208  *     return BreakIterator.DONE;
 209  * }
 210  * }</pre>
 211  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
 212  * the break positions it returns don't represent both the start and end of the
 213  * thing being iterated over.  That is, a sentence-break iterator returns breaks
 214  * that each represent the end of one sentence and the beginning of the next.
 215  * With the word-break iterator, the characters between two boundaries might be a
 216  * word, or they might be the punctuation or whitespace between two words.  The
 217  * above code uses a simple heuristic to determine which boundary is the beginning
 218  * of a word: If the characters between this boundary and the next boundary
 219  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
 220  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
 221  * and the next is a word; otherwise, it's the material between words.)
 222  * </blockquote>
 223  *
 224  * @since 1.1
 225  * @see CharacterIterator
 226  *
 227  */
 228 
 229 public abstract class BreakIterator implements Cloneable
 230 {
 231     /**
 232      * Constructor. BreakIterator is stateless and has no default behavior.
 233      */
 234     protected BreakIterator()
 235     {
 236     }
 237 
 238     /**
 239      * Create a copy of this iterator
 240      * @return A copy of this
 241      */
 242     @Override
 243     public Object clone()
 244     {
 245         try {
 246             return super.clone();
 247         }
 248         catch (CloneNotSupportedException e) {
 249             throw new InternalError(e);
 250         }
 251     }
 252 
 253     /**
 254      * DONE is returned by previous(), next(), next(int), preceding(int)
 255      * and following(int) when either the first or last text boundary has been
 256      * reached.
 257      */
 258     public static final int DONE = -1;
 259 
 260     /**
 261      * Returns the first boundary. The iterator's current position is set
 262      * to the first text boundary.
 263      * @return The character index of the first text boundary.
 264      */
 265     public abstract int first();
 266 
 267     /**
 268      * Returns the last boundary. The iterator's current position is set
 269      * to the last text boundary.
 270      * @return The character index of the last text boundary.
 271      */
 272     public abstract int last();
 273 
 274     /**
 275      * Returns the nth boundary from the current boundary. If either
 276      * the first or last text boundary has been reached, it returns
 277      * {@code BreakIterator.DONE} and the current position is set to either
 278      * the first or last text boundary depending on which one is reached. Otherwise,
 279      * the iterator's current position is set to the new boundary.
 280      * For example, if the iterator's current position is the mth text boundary
 281      * and three more boundaries exist from the current boundary to the last text
 282      * boundary, the next(2) call will return m + 2. The new text position is set
 283      * to the (m + 2)th text boundary. A next(4) call would return
 284      * {@code BreakIterator.DONE} and the last text boundary would become the
 285      * new text position.
 286      * @param n which boundary to return.  A value of 0
 287      * does nothing.  Negative values move to previous boundaries
 288      * and positive values move to later boundaries.
 289      * @return The character index of the nth boundary from the current position
 290      * or {@code BreakIterator.DONE} if either first or last text boundary
 291      * has been reached.
 292      */
 293     public abstract int next(int n);
 294 
 295     /**
 296      * Returns the boundary following the current boundary. If the current boundary
 297      * is the last text boundary, it returns {@code BreakIterator.DONE} and
 298      * the iterator's current position is unchanged. Otherwise, the iterator's
 299      * current position is set to the boundary following the current boundary.
 300      * @return The character index of the next text boundary or
 301      * {@code BreakIterator.DONE} if the current boundary is the last text
 302      * boundary.
 303      * Equivalent to next(1).
 304      * @see #next(int)
 305      */
 306     public abstract int next();
 307 
 308     /**
 309      * Returns the boundary preceding the current boundary. If the current boundary
 310      * is the first text boundary, it returns {@code BreakIterator.DONE} and
 311      * the iterator's current position is unchanged. Otherwise, the iterator's
 312      * current position is set to the boundary preceding the current boundary.
 313      * @return The character index of the previous text boundary or
 314      * {@code BreakIterator.DONE} if the current boundary is the first text
 315      * boundary.
 316      */
 317     public abstract int previous();
 318 
 319     /**
 320      * Returns the first boundary following the specified character offset. If the
 321      * specified offset is equal to the last text boundary, it returns
 322      * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
 323      * Otherwise, the iterator's current position is set to the returned boundary.
 324      * The value returned is always greater than the offset or the value
 325      * {@code BreakIterator.DONE}.
 326      * @param offset the character offset to begin scanning.
 327      * @return The first boundary after the specified offset or
 328      * {@code BreakIterator.DONE} if the last text boundary is passed in
 329      * as the offset.
 330      * @throws     IllegalArgumentException if the specified offset is less than
 331      * the first text boundary or greater than the last text boundary.
 332      */
 333     public abstract int following(int offset);
 334 
 335     /**
 336      * Returns the last boundary preceding the specified character offset. If the
 337      * specified offset is equal to the first text boundary, it returns
 338      * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
 339      * Otherwise, the iterator's current position is set to the returned boundary.
 340      * The value returned is always less than the offset or the value
 341      * {@code BreakIterator.DONE}.
 342      * @param offset the character offset to begin scanning.
 343      * @return The last boundary before the specified offset or
 344      * {@code BreakIterator.DONE} if the first text boundary is passed in
 345      * as the offset.
 346      * @throws      IllegalArgumentException if the specified offset is less than
 347      * the first text boundary or greater than the last text boundary.
 348      * @since 1.2
 349      */
 350     public int preceding(int offset) {
 351         // NOTE:  This implementation is here solely because we can't add new
 352         // abstract methods to an existing class.  There is almost ALWAYS a
 353         // better, faster way to do this.
 354         int pos = following(offset);
 355         while (pos >= offset && pos != DONE) {
 356             pos = previous();
 357         }
 358         return pos;
 359     }
 360 
 361     /**
 362      * Returns true if the specified character offset is a text boundary.
 363      * @param offset the character offset to check.
 364      * @return {@code true} if "offset" is a boundary position,
 365      * {@code false} otherwise.
 366      * @throws      IllegalArgumentException if the specified offset is less than
 367      * the first text boundary or greater than the last text boundary.
 368      * @since 1.2
 369      */
 370     public boolean isBoundary(int offset) {
 371         // NOTE: This implementation probably is wrong for most situations
 372         // because it fails to take into account the possibility that a
 373         // CharacterIterator passed to setText() may not have a begin offset
 374         // of 0.  But since the abstract BreakIterator doesn't have that
 375         // knowledge, it assumes the begin offset is 0.  If you subclass
 376         // BreakIterator, copy the SimpleTextBoundary implementation of this
 377         // function into your subclass.  [This should have been abstract at
 378         // this level, but it's too late to fix that now.]
 379         if (offset == 0) {
 380             return true;
 381         }
 382         int boundary = following(offset - 1);
 383         if (boundary == DONE) {
 384             throw new IllegalArgumentException();
 385         }
 386         return boundary == offset;
 387     }
 388 
 389     /**
 390      * Returns character index of the text boundary that was most
 391      * recently returned by next(), next(int), previous(), first(), last(),
 392      * following(int) or preceding(int). If any of these methods returns
 393      * {@code BreakIterator.DONE} because either first or last text boundary
 394      * has been reached, it returns the first or last text boundary depending on
 395      * which one is reached.
 396      * @return The text boundary returned from the above methods, first or last
 397      * text boundary.
 398      * @see #next()
 399      * @see #next(int)
 400      * @see #previous()
 401      * @see #first()
 402      * @see #last()
 403      * @see #following(int)
 404      * @see #preceding(int)
 405      */
 406     public abstract int current();
 407 
 408     /**
 409      * Get the text being scanned
 410      * @return the text being scanned
 411      */
 412     public abstract CharacterIterator getText();
 413 
 414     /**
 415      * Set a new text string to be scanned.  The current scan
 416      * position is reset to first().
 417      * @param newText new text to scan.
 418      */
 419     public void setText(String newText)
 420     {
 421         setText(new StringCharacterIterator(newText));
 422     }
 423 
 424     /**
 425      * Set a new text for scanning.  The current scan
 426      * position is reset to first().
 427      * @param newText new text to scan.
 428      */
 429     public abstract void setText(CharacterIterator newText);
 430 
 431     private static final int CHARACTER_INDEX = 0;
 432     private static final int WORD_INDEX = 1;
 433     private static final int LINE_INDEX = 2;
 434     private static final int SENTENCE_INDEX = 3;
 435 
 436     @SuppressWarnings("unchecked")
 437     private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
 438 
 439     /**
 440      * Returns a new {@code BreakIterator} instance
 441      * for <a href="BreakIterator.html#word">word breaks</a>
 442      * for the {@linkplain Locale#getDefault() default locale}.
 443      * @return A break iterator for word breaks
 444      */
 445     public static BreakIterator getWordInstance()
 446     {
 447         return getWordInstance(Locale.getDefault());
 448     }
 449 
 450     /**
 451      * Returns a new {@code BreakIterator} instance
 452      * for <a href="BreakIterator.html#word">word breaks</a>
 453      * for the given locale.
 454      * @param locale the desired locale
 455      * @return A break iterator for word breaks
 456      * @throws    NullPointerException if {@code locale} is null
 457      */
 458     public static BreakIterator getWordInstance(Locale locale)
 459     {
 460         return getBreakInstance(locale, WORD_INDEX);
 461     }
 462 
 463     /**
 464      * Returns a new {@code BreakIterator} instance
 465      * for <a href="BreakIterator.html#line">line breaks</a>
 466      * for the {@linkplain Locale#getDefault() default locale}.
 467      * @return A break iterator for line breaks
 468      */
 469     public static BreakIterator getLineInstance()
 470     {
 471         return getLineInstance(Locale.getDefault());
 472     }
 473 
 474     /**
 475      * Returns a new {@code BreakIterator} instance
 476      * for <a href="BreakIterator.html#line">line breaks</a>
 477      * for the given locale.
 478      * @param locale the desired locale
 479      * @return A break iterator for line breaks
 480      * @throws    NullPointerException if {@code locale} is null
 481      */
 482     public static BreakIterator getLineInstance(Locale locale)
 483     {
 484         return getBreakInstance(locale, LINE_INDEX);
 485     }
 486 
 487     /**
 488      * Returns a new {@code BreakIterator} instance
 489      * for <a href="BreakIterator.html#character">character breaks</a>
 490      * for the {@linkplain Locale#getDefault() default locale}.
 491      * @return A break iterator for character breaks
 492      */
 493     public static BreakIterator getCharacterInstance()
 494     {
 495         return getCharacterInstance(Locale.getDefault());
 496     }
 497 
 498     /**
 499      * Returns a new {@code BreakIterator} instance
 500      * for <a href="BreakIterator.html#character">character breaks</a>
 501      * for the given locale.
 502      * @param locale the desired locale
 503      * @return A break iterator for character breaks
 504      * @throws    NullPointerException if {@code locale} is null
 505      */
 506     public static BreakIterator getCharacterInstance(Locale locale)
 507     {
 508         return getBreakInstance(locale, CHARACTER_INDEX);
 509     }
 510 
 511     /**
 512      * Returns a new {@code BreakIterator} instance
 513      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
 514      * for the {@linkplain Locale#getDefault() default locale}.
 515      * @return A break iterator for sentence breaks
 516      */
 517     public static BreakIterator getSentenceInstance()
 518     {
 519         return getSentenceInstance(Locale.getDefault());
 520     }
 521 
 522     /**
 523      * Returns a new {@code BreakIterator} instance
 524      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
 525      * for the given locale.
 526      * @param locale the desired locale
 527      * @return A break iterator for sentence breaks
 528      * @throws    NullPointerException if {@code locale} is null
 529      */
 530     public static BreakIterator getSentenceInstance(Locale locale)
 531     {
 532         return getBreakInstance(locale, SENTENCE_INDEX);
 533     }
 534 
 535     private static BreakIterator getBreakInstance(Locale locale, int type) {
 536         if (iterCache[type] != null) {
 537             BreakIteratorCache cache = iterCache[type].get();
 538             if (cache != null) {
 539                 if (cache.getLocale().equals(locale)) {
 540                     return cache.createBreakInstance();
 541                 }
 542             }
 543         }
 544 
 545         BreakIterator result = createBreakInstance(locale, type);
 546         BreakIteratorCache cache = new BreakIteratorCache(locale, result);
 547         iterCache[type] = new SoftReference<>(cache);
 548         return result;
 549     }
 550 
 551     private static BreakIterator createBreakInstance(Locale locale,
 552                                                      int type) {
 553         LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
 554         BreakIterator iterator = createBreakInstance(adapter, locale, type);
 555         if (iterator == null) {
 556             iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
 557         }
 558         return iterator;
 559     }
 560 
 561     private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
 562         BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
 563         BreakIterator iterator = null;
 564         switch (type) {
 565         case CHARACTER_INDEX:
 566             iterator = breakIteratorProvider.getCharacterInstance(locale);
 567             break;
 568         case WORD_INDEX:
 569             iterator = breakIteratorProvider.getWordInstance(locale);
 570             break;
 571         case LINE_INDEX:
 572             iterator = breakIteratorProvider.getLineInstance(locale);
 573             break;
 574         case SENTENCE_INDEX:
 575             iterator = breakIteratorProvider.getSentenceInstance(locale);
 576             break;
 577         }
 578         return iterator;
 579     }
 580 
 581     /**
 582      * Returns an array of all locales for which the
 583      * {@code get*Instance} methods of this class can return
 584      * localized instances.
 585      * The returned array represents the union of locales supported by the Java
 586      * runtime and by installed
 587      * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
 588      * It must contain at least a {@code Locale}
 589      * instance equal to {@link java.util.Locale#US Locale.US}.
 590      *
 591      * @return An array of locales for which localized
 592      *         {@code BreakIterator} instances are available.
 593      */
 594     public static synchronized Locale[] getAvailableLocales()
 595     {
 596         LocaleServiceProviderPool pool =
 597             LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
 598         return pool.getAvailableLocales();
 599     }
 600 
 601     private static final class BreakIteratorCache {
 602 
 603         private BreakIterator iter;
 604         private Locale locale;
 605 
 606         BreakIteratorCache(Locale locale, BreakIterator iter) {
 607             this.locale = locale;
 608             this.iter = (BreakIterator) iter.clone();
 609         }
 610 
 611         Locale getLocale() {
 612             return locale;
 613         }
 614 
 615         BreakIterator createBreakInstance() {
 616             return (BreakIterator) iter.clone();
 617         }
 618     }
 619 }