1 /* 2 * Copyright (c) 1996, 2006, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved 29 * 30 * The original version of this source code and documentation 31 * is copyrighted and owned by Taligent, Inc., a wholly-owned 32 * subsidiary of IBM. These materials are provided under terms 33 * of a License Agreement between Taligent and Sun. This technology 34 * is protected by multiple US and International patents. 35 * 36 * This notice and attribution to Taligent may not be removed. 37 * Taligent is a registered trademark of Taligent, Inc. 38 * 39 */ 40 41 package java.text; 42 43 import java.lang.ref.SoftReference; 44 import java.text.spi.BreakIteratorProvider; 45 import java.util.Locale; 46 import sun.util.locale.provider.LocaleProviderAdapter; 47 import sun.util.locale.provider.LocaleServiceProviderPool; 48 49 50 /** 51 * The <code>BreakIterator</code> class implements methods for finding 52 * the location of boundaries in text. Instances of <code>BreakIterator</code> 53 * maintain a current position and scan over text 54 * returning the index of characters where boundaries occur. 55 * Internally, <code>BreakIterator</code> scans text using a 56 * <code>CharacterIterator</code>, and is thus able to scan text held 57 * by any object implementing that protocol. A <code>StringCharacterIterator</code> 58 * is used to scan <code>String</code> objects passed to <code>setText</code>. 59 * 60 * <p> 61 * You use the factory methods provided by this class to create 62 * instances of various types of break iterators. In particular, 63 * use <code>getWordInstance</code>, <code>getLineInstance</code>, 64 * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code> 65 * to create <code>BreakIterator</code>s that perform 66 * word, line, sentence, and character boundary analysis respectively. 67 * A single <code>BreakIterator</code> can work only on one unit 68 * (word, line, sentence, and so on). You must use a different iterator 69 * for each unit boundary analysis you wish to perform. 70 * 71 * <p><a name="line"></a> 72 * Line boundary analysis determines where a text string can be 73 * broken when line-wrapping. The mechanism correctly handles 74 * punctuation and hyphenated words. Actual line breaking needs 75 * to also consider the available line width and is handled by 76 * higher-level software. 77 * 78 * <p><a name="sentence"></a> 79 * Sentence boundary analysis allows selection with correct interpretation 80 * of periods within numbers and abbreviations, and trailing punctuation 81 * marks such as quotation marks and parentheses. 82 * 83 * <p><a name="word"></a> 84 * Word boundary analysis is used by search and replace functions, as 85 * well as within text editing applications that allow the user to 86 * select words with a double click. Word selection provides correct 87 * interpretation of punctuation marks within and following 88 * words. Characters that are not part of a word, such as symbols 89 * or punctuation marks, have word-breaks on both sides. 90 * 91 * <p><a name="character"></a> 92 * Character boundary analysis allows users to interact with characters 93 * as they expect to, for example, when moving the cursor through a text 94 * string. Character boundary analysis provides correct navigation 95 * through character strings, regardless of how the character is stored. 96 * The boundaries returned may be those of supplementary characters, 97 * combining character sequences, or ligature clusters. 98 * For example, an accented character might be stored as a base character 99 * and a diacritical mark. What users consider to be a character can 100 * differ between languages. 101 * 102 * <p> 103 * The <code>BreakIterator</code> instances returned by the factory methods 104 * of this class are intended for use with natural languages only, not for 105 * programming language text. It is however possible to define subclasses 106 * that tokenize a programming language. 107 * 108 * <P> 109 * <strong>Examples</strong>:<P> 110 * Creating and using text boundaries: 111 * <blockquote> 112 * <pre> 113 * public static void main(String args[]) { 114 * if (args.length == 1) { 115 * String stringToExamine = args[0]; 116 * //print each word in order 117 * BreakIterator boundary = BreakIterator.getWordInstance(); 118 * boundary.setText(stringToExamine); 119 * printEachForward(boundary, stringToExamine); 120 * //print each sentence in reverse order 121 * boundary = BreakIterator.getSentenceInstance(Locale.US); 122 * boundary.setText(stringToExamine); 123 * printEachBackward(boundary, stringToExamine); 124 * printFirst(boundary, stringToExamine); 125 * printLast(boundary, stringToExamine); 126 * } 127 * } 128 * </pre> 129 * </blockquote> 130 * 131 * Print each element in order: 132 * <blockquote> 133 * <pre> 134 * public static void printEachForward(BreakIterator boundary, String source) { 135 * int start = boundary.first(); 136 * for (int end = boundary.next(); 137 * end != BreakIterator.DONE; 138 * start = end, end = boundary.next()) { 139 * System.out.println(source.substring(start,end)); 140 * } 141 * } 142 * </pre> 143 * </blockquote> 144 * 145 * Print each element in reverse order: 146 * <blockquote> 147 * <pre> 148 * public static void printEachBackward(BreakIterator boundary, String source) { 149 * int end = boundary.last(); 150 * for (int start = boundary.previous(); 151 * start != BreakIterator.DONE; 152 * end = start, start = boundary.previous()) { 153 * System.out.println(source.substring(start,end)); 154 * } 155 * } 156 * </pre> 157 * </blockquote> 158 * 159 * Print first element: 160 * <blockquote> 161 * <pre> 162 * public static void printFirst(BreakIterator boundary, String source) { 163 * int start = boundary.first(); 164 * int end = boundary.next(); 165 * System.out.println(source.substring(start,end)); 166 * } 167 * </pre> 168 * </blockquote> 169 * 170 * Print last element: 171 * <blockquote> 172 * <pre> 173 * public static void printLast(BreakIterator boundary, String source) { 174 * int end = boundary.last(); 175 * int start = boundary.previous(); 176 * System.out.println(source.substring(start,end)); 177 * } 178 * </pre> 179 * </blockquote> 180 * 181 * Print the element at a specified position: 182 * <blockquote> 183 * <pre> 184 * public static void printAt(BreakIterator boundary, int pos, String source) { 185 * int end = boundary.following(pos); 186 * int start = boundary.previous(); 187 * System.out.println(source.substring(start,end)); 188 * } 189 * </pre> 190 * </blockquote> 191 * 192 * Find the next word: 193 * <blockquote> 194 * <pre> 195 * public static int nextWordStartAfter(int pos, String text) { 196 * BreakIterator wb = BreakIterator.getWordInstance(); 197 * wb.setText(text); 198 * int last = wb.following(pos); 199 * int current = wb.next(); 200 * while (current != BreakIterator.DONE) { 201 * for (int p = last; p < current; p++) { 202 * if (Character.isLetter(text.codePointAt(p))) 203 * return last; 204 * } 205 * last = current; 206 * current = wb.next(); 207 * } 208 * return BreakIterator.DONE; 209 * } 210 * </pre> 211 * (The iterator returned by BreakIterator.getWordInstance() is unique in that 212 * the break positions it returns don't represent both the start and end of the 213 * thing being iterated over. That is, a sentence-break iterator returns breaks 214 * that each represent the end of one sentence and the beginning of the next. 215 * With the word-break iterator, the characters between two boundaries might be a 216 * word, or they might be the punctuation or whitespace between two words. The 217 * above code uses a simple heuristic to determine which boundary is the beginning 218 * of a word: If the characters between this boundary and the next boundary 219 * include at least one letter (this can be an alphabetical letter, a CJK ideograph, 220 * a Hangul syllable, a Kana character, etc.), then the text between this boundary 221 * and the next is a word; otherwise, it's the material between words.) 222 * </blockquote> 223 * 224 * @see CharacterIterator 225 * 226 */ 227 228 public abstract class BreakIterator implements Cloneable 229 { 230 /** 231 * Constructor. BreakIterator is stateless and has no default behavior. 232 */ 233 protected BreakIterator() 234 { 235 } 236 237 /** 238 * Create a copy of this iterator 239 * @return A copy of this 240 */ 241 @Override 242 public Object clone() 243 { 244 try { 245 return super.clone(); 246 } 247 catch (CloneNotSupportedException e) { 248 throw new InternalError(e); 249 } 250 } 251 252 /** 253 * DONE is returned by previous(), next(), next(int), preceding(int) 254 * and following(int) when either the first or last text boundary has been 255 * reached. 256 */ 257 public static final int DONE = -1; 258 259 /** 260 * Returns the first boundary. The iterator's current position is set 261 * to the first text boundary. 262 * @return The character index of the first text boundary. 263 */ 264 public abstract int first(); 265 266 /** 267 * Returns the last boundary. The iterator's current position is set 268 * to the last text boundary. 269 * @return The character index of the last text boundary. 270 */ 271 public abstract int last(); 272 273 /** 274 * Returns the nth boundary from the current boundary. If either 275 * the first or last text boundary has been reached, it returns 276 * <code>BreakIterator.DONE</code> and the current position is set to either 277 * the first or last text boundary depending on which one is reached. Otherwise, 278 * the iterator's current position is set to the new boundary. 279 * For example, if the iterator's current position is the mth text boundary 280 * and three more boundaries exist from the current boundary to the last text 281 * boundary, the next(2) call will return m + 2. The new text position is set 282 * to the (m + 2)th text boundary. A next(4) call would return 283 * <code>BreakIterator.DONE</code> and the last text boundary would become the 284 * new text position. 285 * @param n which boundary to return. A value of 0 286 * does nothing. Negative values move to previous boundaries 287 * and positive values move to later boundaries. 288 * @return The character index of the nth boundary from the current position 289 * or <code>BreakIterator.DONE</code> if either first or last text boundary 290 * has been reached. 291 */ 292 public abstract int next(int n); 293 294 /** 295 * Returns the boundary following the current boundary. If the current boundary 296 * is the last text boundary, it returns <code>BreakIterator.DONE</code> and 297 * the iterator's current position is unchanged. Otherwise, the iterator's 298 * current position is set to the boundary following the current boundary. 299 * @return The character index of the next text boundary or 300 * <code>BreakIterator.DONE</code> if the current boundary is the last text 301 * boundary. 302 * Equivalent to next(1). 303 * @see #next(int) 304 */ 305 public abstract int next(); 306 307 /** 308 * Returns the boundary preceding the current boundary. If the current boundary 309 * is the first text boundary, it returns <code>BreakIterator.DONE</code> and 310 * the iterator's current position is unchanged. Otherwise, the iterator's 311 * current position is set to the boundary preceding the current boundary. 312 * @return The character index of the previous text boundary or 313 * <code>BreakIterator.DONE</code> if the current boundary is the first text 314 * boundary. 315 */ 316 public abstract int previous(); 317 318 /** 319 * Returns the first boundary following the specified character offset. If the 320 * specified offset equals to the last text boundary, it returns 321 * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged. 322 * Otherwise, the iterator's current position is set to the returned boundary. 323 * The value returned is always greater than the offset or the value 324 * <code>BreakIterator.DONE</code>. 325 * @param offset the character offset to begin scanning. 326 * @return The first boundary after the specified offset or 327 * <code>BreakIterator.DONE</code> if the last text boundary is passed in 328 * as the offset. 329 * @exception IllegalArgumentException if the specified offset is less than 330 * the first text boundary or greater than the last text boundary. 331 */ 332 public abstract int following(int offset); 333 334 /** 335 * Returns the last boundary preceding the specified character offset. If the 336 * specified offset equals to the first text boundary, it returns 337 * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged. 338 * Otherwise, the iterator's current position is set to the returned boundary. 339 * The value returned is always less than the offset or the value 340 * <code>BreakIterator.DONE</code>. 341 * @param offset the characater offset to begin scanning. 342 * @return The last boundary before the specified offset or 343 * <code>BreakIterator.DONE</code> if the first text boundary is passed in 344 * as the offset. 345 * @exception IllegalArgumentException if the specified offset is less than 346 * the first text boundary or greater than the last text boundary. 347 * @since 1.2 348 */ 349 public int preceding(int offset) { 350 // NOTE: This implementation is here solely because we can't add new 351 // abstract methods to an existing class. There is almost ALWAYS a 352 // better, faster way to do this. 353 int pos = following(offset); 354 while (pos >= offset && pos != DONE) { 355 pos = previous(); 356 } 357 return pos; 358 } 359 360 /** 361 * Returns true if the specified character offset is a text boundary. 362 * @param offset the character offset to check. 363 * @return <code>true</code> if "offset" is a boundary position, 364 * <code>false</code> otherwise. 365 * @exception IllegalArgumentException if the specified offset is less than 366 * the first text boundary or greater than the last text boundary. 367 * @since 1.2 368 */ 369 public boolean isBoundary(int offset) { 370 // NOTE: This implementation probably is wrong for most situations 371 // because it fails to take into account the possibility that a 372 // CharacterIterator passed to setText() may not have a begin offset 373 // of 0. But since the abstract BreakIterator doesn't have that 374 // knowledge, it assumes the begin offset is 0. If you subclass 375 // BreakIterator, copy the SimpleTextBoundary implementation of this 376 // function into your subclass. [This should have been abstract at 377 // this level, but it's too late to fix that now.] 378 if (offset == 0) { 379 return true; 380 } 381 int boundary = following(offset - 1); 382 if (boundary == DONE) { 383 throw new IllegalArgumentException(); 384 } 385 return boundary == offset; 386 } 387 388 /** 389 * Returns character index of the text boundary that was most 390 * recently returned by next(), next(int), previous(), first(), last(), 391 * following(int) or preceding(int). If any of these methods returns 392 * <code>BreakIterator.DONE</code> because either first or last text boundary 393 * has been reached, it returns the first or last text boundary depending on 394 * which one is reached. 395 * @return The text boundary returned from the above methods, first or last 396 * text boundary. 397 * @see #next() 398 * @see #next(int) 399 * @see #previous() 400 * @see #first() 401 * @see #last() 402 * @see #following(int) 403 * @see #preceding(int) 404 */ 405 public abstract int current(); 406 407 /** 408 * Get the text being scanned 409 * @return the text being scanned 410 */ 411 public abstract CharacterIterator getText(); 412 413 /** 414 * Set a new text string to be scanned. The current scan 415 * position is reset to first(). 416 * @param newText new text to scan. 417 */ 418 public void setText(String newText) 419 { 420 setText(new StringCharacterIterator(newText)); 421 } 422 423 /** 424 * Set a new text for scanning. The current scan 425 * position is reset to first(). 426 * @param newText new text to scan. 427 */ 428 public abstract void setText(CharacterIterator newText); 429 430 private static final int CHARACTER_INDEX = 0; 431 private static final int WORD_INDEX = 1; 432 private static final int LINE_INDEX = 2; 433 private static final int SENTENCE_INDEX = 3; 434 435 @SuppressWarnings("unchecked") 436 private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4]; 437 438 /** 439 * Returns a new <code>BreakIterator</code> instance 440 * for <a href="BreakIterator.html#word">word breaks</a> 441 * for the {@linkplain Locale#getDefault() default locale}. 442 * @return A break iterator for word breaks 443 */ 444 public static BreakIterator getWordInstance() 445 { 446 return getWordInstance(Locale.getDefault()); 447 } 448 449 /** 450 * Returns a new <code>BreakIterator</code> instance 451 * for <a href="BreakIterator.html#word">word breaks</a> 452 * for the given locale. 453 * @param locale the desired locale 454 * @return A break iterator for word breaks 455 * @exception NullPointerException if <code>locale</code> is null 456 */ 457 public static BreakIterator getWordInstance(Locale locale) 458 { 459 return getBreakInstance(locale, WORD_INDEX); 460 } 461 462 /** 463 * Returns a new <code>BreakIterator</code> instance 464 * for <a href="BreakIterator.html#line">line breaks</a> 465 * for the {@linkplain Locale#getDefault() default locale}. 466 * @return A break iterator for line breaks 467 */ 468 public static BreakIterator getLineInstance() 469 { 470 return getLineInstance(Locale.getDefault()); 471 } 472 473 /** 474 * Returns a new <code>BreakIterator</code> instance 475 * for <a href="BreakIterator.html#line">line breaks</a> 476 * for the given locale. 477 * @param locale the desired locale 478 * @return A break iterator for line breaks 479 * @exception NullPointerException if <code>locale</code> is null 480 */ 481 public static BreakIterator getLineInstance(Locale locale) 482 { 483 return getBreakInstance(locale, LINE_INDEX); 484 } 485 486 /** 487 * Returns a new <code>BreakIterator</code> instance 488 * for <a href="BreakIterator.html#character">character breaks</a> 489 * for the {@linkplain Locale#getDefault() default locale}. 490 * @return A break iterator for character breaks 491 */ 492 public static BreakIterator getCharacterInstance() 493 { 494 return getCharacterInstance(Locale.getDefault()); 495 } 496 497 /** 498 * Returns a new <code>BreakIterator</code> instance 499 * for <a href="BreakIterator.html#character">character breaks</a> 500 * for the given locale. 501 * @param locale the desired locale 502 * @return A break iterator for character breaks 503 * @exception NullPointerException if <code>locale</code> is null 504 */ 505 public static BreakIterator getCharacterInstance(Locale locale) 506 { 507 return getBreakInstance(locale, CHARACTER_INDEX); 508 } 509 510 /** 511 * Returns a new <code>BreakIterator</code> instance 512 * for <a href="BreakIterator.html#sentence">sentence breaks</a> 513 * for the {@linkplain Locale#getDefault() default locale}. 514 * @return A break iterator for sentence breaks 515 */ 516 public static BreakIterator getSentenceInstance() 517 { 518 return getSentenceInstance(Locale.getDefault()); 519 } 520 521 /** 522 * Returns a new <code>BreakIterator</code> instance 523 * for <a href="BreakIterator.html#sentence">sentence breaks</a> 524 * for the given locale. 525 * @param locale the desired locale 526 * @return A break iterator for sentence breaks 527 * @exception NullPointerException if <code>locale</code> is null 528 */ 529 public static BreakIterator getSentenceInstance(Locale locale) 530 { 531 return getBreakInstance(locale, SENTENCE_INDEX); 532 } 533 534 private static BreakIterator getBreakInstance(Locale locale, int type) { 535 if (iterCache[type] != null) { 536 BreakIteratorCache cache = iterCache[type].get(); 537 if (cache != null) { 538 if (cache.getLocale().equals(locale)) { 539 return cache.createBreakInstance(); 540 } 541 } 542 } 543 544 BreakIterator result = createBreakInstance(locale, type); 545 BreakIteratorCache cache = new BreakIteratorCache(locale, result); 546 iterCache[type] = new SoftReference<>(cache); 547 return result; 548 } 549 550 private static BreakIterator createBreakInstance(Locale locale, 551 int type) { 552 LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale); 553 BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider(); 554 BreakIterator iterator = null; 555 switch (type) { 556 case CHARACTER_INDEX: 557 iterator = breakIteratorProvider.getCharacterInstance(locale); 558 break; 559 case WORD_INDEX: 560 iterator = breakIteratorProvider.getWordInstance(locale); 561 break; 562 case LINE_INDEX: 563 iterator = breakIteratorProvider.getLineInstance(locale); 564 break; 565 case SENTENCE_INDEX: 566 iterator = breakIteratorProvider.getSentenceInstance(locale); 567 break; 568 } 569 if (iterator == null) { 570 throw new RuntimeException("BreakIterator instance creation failed. (provider=" 571 + breakIteratorProvider + ")"); 572 } 573 return iterator; 574 } 575 576 /** 577 * Returns an array of all locales for which the 578 * <code>get*Instance</code> methods of this class can return 579 * localized instances. 580 * The returned array represents the union of locales supported by the Java 581 * runtime and by installed 582 * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations. 583 * It must contain at least a <code>Locale</code> 584 * instance equal to {@link java.util.Locale#US Locale.US}. 585 * 586 * @return An array of locales for which localized 587 * <code>BreakIterator</code> instances are available. 588 */ 589 public static synchronized Locale[] getAvailableLocales() 590 { 591 LocaleServiceProviderPool pool = 592 LocaleServiceProviderPool.getPool(BreakIteratorProvider.class); 593 return pool.getAvailableLocales(); 594 } 595 596 private static final class BreakIteratorCache { 597 598 private BreakIterator iter; 599 private Locale locale; 600 601 BreakIteratorCache(Locale locale, BreakIterator iter) { 602 this.locale = locale; 603 this.iter = (BreakIterator) iter.clone(); 604 } 605 606 Locale getLocale() { 607 return locale; 608 } 609 610 BreakIterator createBreakInstance() { 611 return (BreakIterator) iter.clone(); 612 } 613 } 614 }