1 /*
   2  * Copyright (c) 1996, 2006, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  28  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  29  *
  30  * The original version of this source code and documentation
  31  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  32  * subsidiary of IBM. These materials are provided under terms
  33  * of a License Agreement between Taligent and Sun. This technology
  34  * is protected by multiple US and International patents.
  35  *
  36  * This notice and attribution to Taligent may not be removed.
  37  * Taligent is a registered trademark of Taligent, Inc.
  38  *
  39  */
  40 
  41 package java.text;
  42 
  43 import java.lang.ref.SoftReference;
  44 import java.text.spi.BreakIteratorProvider;
  45 import java.util.Locale;
  46 import sun.util.locale.provider.LocaleProviderAdapter;
  47 import sun.util.locale.provider.LocaleServiceProviderPool;
  48 
  49 
  50 /**
  51  * The <code>BreakIterator</code> class implements methods for finding
  52  * the location of boundaries in text. Instances of <code>BreakIterator</code>
  53  * maintain a current position and scan over text
  54  * returning the index of characters where boundaries occur.
  55  * Internally, <code>BreakIterator</code> scans text using a
  56  * <code>CharacterIterator</code>, and is thus able to scan text held
  57  * by any object implementing that protocol. A <code>StringCharacterIterator</code>
  58  * is used to scan <code>String</code> objects passed to <code>setText</code>.
  59  *
  60  * <p>
  61  * You use the factory methods provided by this class to create
  62  * instances of various types of break iterators. In particular,
  63  * use <code>getWordInstance</code>, <code>getLineInstance</code>,
  64  * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
  65  * to create <code>BreakIterator</code>s that perform
  66  * word, line, sentence, and character boundary analysis respectively.
  67  * A single <code>BreakIterator</code> can work only on one unit
  68  * (word, line, sentence, and so on). You must use a different iterator
  69  * for each unit boundary analysis you wish to perform.
  70  *
  71  * <p><a name="line"></a>
  72  * Line boundary analysis determines where a text string can be
  73  * broken when line-wrapping. The mechanism correctly handles
  74  * punctuation and hyphenated words. Actual line breaking needs
  75  * to also consider the available line width and is handled by
  76  * higher-level software.
  77  *
  78  * <p><a name="sentence"></a>
  79  * Sentence boundary analysis allows selection with correct interpretation
  80  * of periods within numbers and abbreviations, and trailing punctuation
  81  * marks such as quotation marks and parentheses.
  82  *
  83  * <p><a name="word"></a>
  84  * Word boundary analysis is used by search and replace functions, as
  85  * well as within text editing applications that allow the user to
  86  * select words with a double click. Word selection provides correct
  87  * interpretation of punctuation marks within and following
  88  * words. Characters that are not part of a word, such as symbols
  89  * or punctuation marks, have word-breaks on both sides.
  90  *
  91  * <p><a name="character"></a>
  92  * Character boundary analysis allows users to interact with characters
  93  * as they expect to, for example, when moving the cursor through a text
  94  * string. Character boundary analysis provides correct navigation
  95  * through character strings, regardless of how the character is stored.
  96  * The boundaries returned may be those of supplementary characters,
  97  * combining character sequences, or ligature clusters.
  98  * For example, an accented character might be stored as a base character
  99  * and a diacritical mark. What users consider to be a character can
 100  * differ between languages.
 101  *
 102  * <p>
 103  * The <code>BreakIterator</code> instances returned by the factory methods
 104  * of this class are intended for use with natural languages only, not for
 105  * programming language text. It is however possible to define subclasses
 106  * that tokenize a programming language.
 107  *
 108  * <P>
 109  * <strong>Examples</strong>:<P>
 110  * Creating and using text boundaries:
 111  * <blockquote>
 112  * <pre>
 113  * public static void main(String args[]) {
 114  *      if (args.length == 1) {
 115  *          String stringToExamine = args[0];
 116  *          //print each word in order
 117  *          BreakIterator boundary = BreakIterator.getWordInstance();
 118  *          boundary.setText(stringToExamine);
 119  *          printEachForward(boundary, stringToExamine);
 120  *          //print each sentence in reverse order
 121  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
 122  *          boundary.setText(stringToExamine);
 123  *          printEachBackward(boundary, stringToExamine);
 124  *          printFirst(boundary, stringToExamine);
 125  *          printLast(boundary, stringToExamine);
 126  *      }
 127  * }
 128  * </pre>
 129  * </blockquote>
 130  *
 131  * Print each element in order:
 132  * <blockquote>
 133  * <pre>
 134  * public static void printEachForward(BreakIterator boundary, String source) {
 135  *     int start = boundary.first();
 136  *     for (int end = boundary.next();
 137  *          end != BreakIterator.DONE;
 138  *          start = end, end = boundary.next()) {
 139  *          System.out.println(source.substring(start,end));
 140  *     }
 141  * }
 142  * </pre>
 143  * </blockquote>
 144  *
 145  * Print each element in reverse order:
 146  * <blockquote>
 147  * <pre>
 148  * public static void printEachBackward(BreakIterator boundary, String source) {
 149  *     int end = boundary.last();
 150  *     for (int start = boundary.previous();
 151  *          start != BreakIterator.DONE;
 152  *          end = start, start = boundary.previous()) {
 153  *         System.out.println(source.substring(start,end));
 154  *     }
 155  * }
 156  * </pre>
 157  * </blockquote>
 158  *
 159  * Print first element:
 160  * <blockquote>
 161  * <pre>
 162  * public static void printFirst(BreakIterator boundary, String source) {
 163  *     int start = boundary.first();
 164  *     int end = boundary.next();
 165  *     System.out.println(source.substring(start,end));
 166  * }
 167  * </pre>
 168  * </blockquote>
 169  *
 170  * Print last element:
 171  * <blockquote>
 172  * <pre>
 173  * public static void printLast(BreakIterator boundary, String source) {
 174  *     int end = boundary.last();
 175  *     int start = boundary.previous();
 176  *     System.out.println(source.substring(start,end));
 177  * }
 178  * </pre>
 179  * </blockquote>
 180  *
 181  * Print the element at a specified position:
 182  * <blockquote>
 183  * <pre>
 184  * public static void printAt(BreakIterator boundary, int pos, String source) {
 185  *     int end = boundary.following(pos);
 186  *     int start = boundary.previous();
 187  *     System.out.println(source.substring(start,end));
 188  * }
 189  * </pre>
 190  * </blockquote>
 191  *
 192  * Find the next word:
 193  * <blockquote>
 194  * <pre>
 195  * public static int nextWordStartAfter(int pos, String text) {
 196  *     BreakIterator wb = BreakIterator.getWordInstance();
 197  *     wb.setText(text);
 198  *     int last = wb.following(pos);
 199  *     int current = wb.next();
 200  *     while (current != BreakIterator.DONE) {
 201  *         for (int p = last; p < current; p++) {
 202  *             if (Character.isLetter(text.codePointAt(p)))
 203  *                 return last;
 204  *         }
 205  *         last = current;
 206  *         current = wb.next();
 207  *     }
 208  *     return BreakIterator.DONE;
 209  * }
 210  * </pre>
 211  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
 212  * the break positions it returns don't represent both the start and end of the
 213  * thing being iterated over.  That is, a sentence-break iterator returns breaks
 214  * that each represent the end of one sentence and the beginning of the next.
 215  * With the word-break iterator, the characters between two boundaries might be a
 216  * word, or they might be the punctuation or whitespace between two words.  The
 217  * above code uses a simple heuristic to determine which boundary is the beginning
 218  * of a word: If the characters between this boundary and the next boundary
 219  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
 220  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
 221  * and the next is a word; otherwise, it's the material between words.)
 222  * </blockquote>
 223  *
 224  * @see CharacterIterator
 225  *
 226  */
 227 
 228 public abstract class BreakIterator implements Cloneable
 229 {
 230     /**
 231      * Constructor. BreakIterator is stateless and has no default behavior.
 232      */
 233     protected BreakIterator()
 234     {
 235     }
 236 
 237     /**
 238      * Create a copy of this iterator
 239      * @return A copy of this
 240      */
 241     @Override
 242     public Object clone()
 243     {
 244         try {
 245             return super.clone();
 246         }
 247         catch (CloneNotSupportedException e) {
 248             throw new InternalError(e);
 249         }
 250     }
 251 
 252     /**
 253      * DONE is returned by previous(), next(), next(int), preceding(int)
 254      * and following(int) when either the first or last text boundary has been
 255      * reached.
 256      */
 257     public static final int DONE = -1;
 258 
 259     /**
 260      * Returns the first boundary. The iterator's current position is set
 261      * to the first text boundary.
 262      * @return The character index of the first text boundary.
 263      */
 264     public abstract int first();
 265 
 266     /**
 267      * Returns the last boundary. The iterator's current position is set
 268      * to the last text boundary.
 269      * @return The character index of the last text boundary.
 270      */
 271     public abstract int last();
 272 
 273     /**
 274      * Returns the nth boundary from the current boundary. If either
 275      * the first or last text boundary has been reached, it returns
 276      * <code>BreakIterator.DONE</code> and the current position is set to either
 277      * the first or last text boundary depending on which one is reached. Otherwise,
 278      * the iterator's current position is set to the new boundary.
 279      * For example, if the iterator's current position is the mth text boundary
 280      * and three more boundaries exist from the current boundary to the last text
 281      * boundary, the next(2) call will return m + 2. The new text position is set
 282      * to the (m + 2)th text boundary. A next(4) call would return
 283      * <code>BreakIterator.DONE</code> and the last text boundary would become the
 284      * new text position.
 285      * @param n which boundary to return.  A value of 0
 286      * does nothing.  Negative values move to previous boundaries
 287      * and positive values move to later boundaries.
 288      * @return The character index of the nth boundary from the current position
 289      * or <code>BreakIterator.DONE</code> if either first or last text boundary
 290      * has been reached.
 291      */
 292     public abstract int next(int n);
 293 
 294     /**
 295      * Returns the boundary following the current boundary. If the current boundary
 296      * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
 297      * the iterator's current position is unchanged. Otherwise, the iterator's
 298      * current position is set to the boundary following the current boundary.
 299      * @return The character index of the next text boundary or
 300      * <code>BreakIterator.DONE</code> if the current boundary is the last text
 301      * boundary.
 302      * Equivalent to next(1).
 303      * @see #next(int)
 304      */
 305     public abstract int next();
 306 
 307     /**
 308      * Returns the boundary preceding the current boundary. If the current boundary
 309      * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
 310      * the iterator's current position is unchanged. Otherwise, the iterator's
 311      * current position is set to the boundary preceding the current boundary.
 312      * @return The character index of the previous text boundary or
 313      * <code>BreakIterator.DONE</code> if the current boundary is the first text
 314      * boundary.
 315      */
 316     public abstract int previous();
 317 
 318     /**
 319      * Returns the first boundary following the specified character offset. If the
 320      * specified offset equals to the last text boundary, it returns
 321      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
 322      * Otherwise, the iterator's current position is set to the returned boundary.
 323      * The value returned is always greater than the offset or the value
 324      * <code>BreakIterator.DONE</code>.
 325      * @param offset the character offset to begin scanning.
 326      * @return The first boundary after the specified offset or
 327      * <code>BreakIterator.DONE</code> if the last text boundary is passed in
 328      * as the offset.
 329      * @exception  IllegalArgumentException if the specified offset is less than
 330      * the first text boundary or greater than the last text boundary.
 331      */
 332     public abstract int following(int offset);
 333 
 334     /**
 335      * Returns the last boundary preceding the specified character offset. If the
 336      * specified offset equals to the first text boundary, it returns
 337      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
 338      * Otherwise, the iterator's current position is set to the returned boundary.
 339      * The value returned is always less than the offset or the value
 340      * <code>BreakIterator.DONE</code>.
 341      * @param offset the characater offset to begin scanning.
 342      * @return The last boundary before the specified offset or
 343      * <code>BreakIterator.DONE</code> if the first text boundary is passed in
 344      * as the offset.
 345      * @exception   IllegalArgumentException if the specified offset is less than
 346      * the first text boundary or greater than the last text boundary.
 347      * @since 1.2
 348      */
 349     public int preceding(int offset) {
 350         // NOTE:  This implementation is here solely because we can't add new
 351         // abstract methods to an existing class.  There is almost ALWAYS a
 352         // better, faster way to do this.
 353         int pos = following(offset);
 354         while (pos >= offset && pos != DONE) {
 355             pos = previous();
 356         }
 357         return pos;
 358     }
 359 
 360     /**
 361      * Returns true if the specified character offset is a text boundary.
 362      * @param offset the character offset to check.
 363      * @return <code>true</code> if "offset" is a boundary position,
 364      * <code>false</code> otherwise.
 365      * @exception   IllegalArgumentException if the specified offset is less than
 366      * the first text boundary or greater than the last text boundary.
 367      * @since 1.2
 368      */
 369     public boolean isBoundary(int offset) {
 370         // NOTE: This implementation probably is wrong for most situations
 371         // because it fails to take into account the possibility that a
 372         // CharacterIterator passed to setText() may not have a begin offset
 373         // of 0.  But since the abstract BreakIterator doesn't have that
 374         // knowledge, it assumes the begin offset is 0.  If you subclass
 375         // BreakIterator, copy the SimpleTextBoundary implementation of this
 376         // function into your subclass.  [This should have been abstract at
 377         // this level, but it's too late to fix that now.]
 378         if (offset == 0) {
 379             return true;
 380         }
 381         int boundary = following(offset - 1);
 382         if (boundary == DONE) {
 383             throw new IllegalArgumentException();
 384         }
 385         return boundary == offset;
 386     }
 387 
 388     /**
 389      * Returns character index of the text boundary that was most
 390      * recently returned by next(), next(int), previous(), first(), last(),
 391      * following(int) or preceding(int). If any of these methods returns
 392      * <code>BreakIterator.DONE</code> because either first or last text boundary
 393      * has been reached, it returns the first or last text boundary depending on
 394      * which one is reached.
 395      * @return The text boundary returned from the above methods, first or last
 396      * text boundary.
 397      * @see #next()
 398      * @see #next(int)
 399      * @see #previous()
 400      * @see #first()
 401      * @see #last()
 402      * @see #following(int)
 403      * @see #preceding(int)
 404      */
 405     public abstract int current();
 406 
 407     /**
 408      * Get the text being scanned
 409      * @return the text being scanned
 410      */
 411     public abstract CharacterIterator getText();
 412 
 413     /**
 414      * Set a new text string to be scanned.  The current scan
 415      * position is reset to first().
 416      * @param newText new text to scan.
 417      */
 418     public void setText(String newText)
 419     {
 420         setText(new StringCharacterIterator(newText));
 421     }
 422 
 423     /**
 424      * Set a new text for scanning.  The current scan
 425      * position is reset to first().
 426      * @param newText new text to scan.
 427      */
 428     public abstract void setText(CharacterIterator newText);
 429 
 430     private static final int CHARACTER_INDEX = 0;
 431     private static final int WORD_INDEX = 1;
 432     private static final int LINE_INDEX = 2;
 433     private static final int SENTENCE_INDEX = 3;
 434 
 435     @SuppressWarnings("unchecked")
 436     private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
 437 
 438     /**
 439      * Returns a new <code>BreakIterator</code> instance
 440      * for <a href="BreakIterator.html#word">word breaks</a>
 441      * for the {@linkplain Locale#getDefault() default locale}.
 442      * @return A break iterator for word breaks
 443      */
 444     public static BreakIterator getWordInstance()
 445     {
 446         return getWordInstance(Locale.getDefault());
 447     }
 448 
 449     /**
 450      * Returns a new <code>BreakIterator</code> instance
 451      * for <a href="BreakIterator.html#word">word breaks</a>
 452      * for the given locale.
 453      * @param locale the desired locale
 454      * @return A break iterator for word breaks
 455      * @exception NullPointerException if <code>locale</code> is null
 456      */
 457     public static BreakIterator getWordInstance(Locale locale)
 458     {
 459         return getBreakInstance(locale, WORD_INDEX);
 460     }
 461 
 462     /**
 463      * Returns a new <code>BreakIterator</code> instance
 464      * for <a href="BreakIterator.html#line">line breaks</a>
 465      * for the {@linkplain Locale#getDefault() default locale}.
 466      * @return A break iterator for line breaks
 467      */
 468     public static BreakIterator getLineInstance()
 469     {
 470         return getLineInstance(Locale.getDefault());
 471     }
 472 
 473     /**
 474      * Returns a new <code>BreakIterator</code> instance
 475      * for <a href="BreakIterator.html#line">line breaks</a>
 476      * for the given locale.
 477      * @param locale the desired locale
 478      * @return A break iterator for line breaks
 479      * @exception NullPointerException if <code>locale</code> is null
 480      */
 481     public static BreakIterator getLineInstance(Locale locale)
 482     {
 483         return getBreakInstance(locale, LINE_INDEX);
 484     }
 485 
 486     /**
 487      * Returns a new <code>BreakIterator</code> instance
 488      * for <a href="BreakIterator.html#character">character breaks</a>
 489      * for the {@linkplain Locale#getDefault() default locale}.
 490      * @return A break iterator for character breaks
 491      */
 492     public static BreakIterator getCharacterInstance()
 493     {
 494         return getCharacterInstance(Locale.getDefault());
 495     }
 496 
 497     /**
 498      * Returns a new <code>BreakIterator</code> instance
 499      * for <a href="BreakIterator.html#character">character breaks</a>
 500      * for the given locale.
 501      * @param locale the desired locale
 502      * @return A break iterator for character breaks
 503      * @exception NullPointerException if <code>locale</code> is null
 504      */
 505     public static BreakIterator getCharacterInstance(Locale locale)
 506     {
 507         return getBreakInstance(locale, CHARACTER_INDEX);
 508     }
 509 
 510     /**
 511      * Returns a new <code>BreakIterator</code> instance
 512      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
 513      * for the {@linkplain Locale#getDefault() default locale}.
 514      * @return A break iterator for sentence breaks
 515      */
 516     public static BreakIterator getSentenceInstance()
 517     {
 518         return getSentenceInstance(Locale.getDefault());
 519     }
 520 
 521     /**
 522      * Returns a new <code>BreakIterator</code> instance
 523      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
 524      * for the given locale.
 525      * @param locale the desired locale
 526      * @return A break iterator for sentence breaks
 527      * @exception NullPointerException if <code>locale</code> is null
 528      */
 529     public static BreakIterator getSentenceInstance(Locale locale)
 530     {
 531         return getBreakInstance(locale, SENTENCE_INDEX);
 532     }
 533 
 534     private static BreakIterator getBreakInstance(Locale locale, int type) {
 535         if (iterCache[type] != null) {
 536             BreakIteratorCache cache = iterCache[type].get();
 537             if (cache != null) {
 538                 if (cache.getLocale().equals(locale)) {
 539                     return cache.createBreakInstance();
 540                 }
 541             }
 542         }
 543 
 544         BreakIterator result = createBreakInstance(locale, type);
 545         BreakIteratorCache cache = new BreakIteratorCache(locale, result);
 546         iterCache[type] = new SoftReference<>(cache);
 547         return result;
 548     }
 549 
 550     private static BreakIterator createBreakInstance(Locale locale,
 551                                                      int type) {
 552         LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
 553         BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
 554         BreakIterator iterator = null;
 555         switch (type) {
 556         case CHARACTER_INDEX:
 557             iterator = breakIteratorProvider.getCharacterInstance(locale);
 558             break;
 559         case WORD_INDEX:
 560             iterator = breakIteratorProvider.getWordInstance(locale);
 561             break;
 562         case LINE_INDEX:
 563             iterator = breakIteratorProvider.getLineInstance(locale);
 564             break;
 565         case SENTENCE_INDEX:
 566             iterator = breakIteratorProvider.getSentenceInstance(locale);
 567             break;
 568         }
 569         if (iterator == null) {
 570             throw new RuntimeException("BreakIterator instance creation failed. (provider="
 571                                        + breakIteratorProvider + ")");
 572         }
 573         return iterator;
 574     }
 575 
 576     /**
 577      * Returns an array of all locales for which the
 578      * <code>get*Instance</code> methods of this class can return
 579      * localized instances.
 580      * The returned array represents the union of locales supported by the Java
 581      * runtime and by installed
 582      * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
 583      * It must contain at least a <code>Locale</code>
 584      * instance equal to {@link java.util.Locale#US Locale.US}.
 585      *
 586      * @return An array of locales for which localized
 587      *         <code>BreakIterator</code> instances are available.
 588      */
 589     public static synchronized Locale[] getAvailableLocales()
 590     {
 591         LocaleServiceProviderPool pool =
 592             LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
 593         return pool.getAvailableLocales();
 594     }
 595 
 596     private static final class BreakIteratorCache {
 597 
 598         private BreakIterator iter;
 599         private Locale locale;
 600 
 601         BreakIteratorCache(Locale locale, BreakIterator iter) {
 602             this.locale = locale;
 603             this.iter = (BreakIterator) iter.clone();
 604         }
 605 
 606         Locale getLocale() {
 607             return locale;
 608         }
 609 
 610         BreakIterator createBreakInstance() {
 611             return (BreakIterator) iter.clone();
 612         }
 613     }
 614 }