1 /* 2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 41 package sun.text; 42 43 import java.nio.BufferUnderflowException; 44 import java.nio.ByteBuffer; 45 import java.text.BreakIterator; 46 import java.text.CharacterIterator; 47 import java.text.StringCharacterIterator; 48 import java.util.MissingResourceException; 49 import sun.text.CompactByteArray; 50 import sun.text.SupplementaryCharacterData; 51 52 /** 53 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p> 54 * 55 * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i> 56 * and <i>regular expressions.</i></p> 57 * 58 * <p>A substitution rule defines a name that can be used in place of an expression. It 59 * consists of a name, which is a string of characters contained in angle brackets, an equals 60 * sign, and an expression. (There can be no whitespace on either side of the equals sign.) 61 * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or 62 * square brackets. A substitution is visible after its definition, and is filled in using 63 * simple textual substitution. Substitution definitions can contain other substitutions, as 64 * long as those substitutions have been defined first. Substitutions are generally used to 65 * make the regular expressions (which can get quite complex) shorted and easier to read. 66 * They typically define either character categories or commonly-used subexpressions.</p> 67 * 68 * <p>There is one special substitution. If the description defines a substitution 69 * called "<ignore>", the expression must be a [] expression, and the 70 * expression defines a set of characters (the "<em>ignore characters</em>") that 71 * will be transparent to the BreakIterator. A sequence of characters will break the 72 * same way it would if any ignore characters it contains are taken out. Break 73 * positions never occur befoer ignore characters.</p> 74 * 75 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and 76 * defines a sequence of characters to be kept together. With one significant exception, the 77 * iterator uses a longest-possible-match algorithm when matching text to regular 78 * expressions. The iterator also treats descriptions containing multiple regular expressions 79 * as if they were ORed together (i.e., as if they were separated by |).</p> 80 * 81 * <p>The special characters recognized by the regular-expression parser are as follows:</p> 82 * 83 * <blockquote> 84 * <table border="1" width="100%"> 85 * <tr> 86 * <td width="6%">*</td> 87 * <td width="94%">Specifies that the expression preceding the asterisk may occur any number 88 * of times (including not at all).</td> 89 * </tr> 90 * <tr> 91 * <td width="6%">{}</td> 92 * <td width="94%">Encloses a sequence of characters that is optional.</td> 93 * </tr> 94 * <tr> 95 * <td width="6%">()</td> 96 * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence 97 * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit 98 * the ends of expressions containing |.</td> 99 * </tr> 100 * <tr> 101 * <td width="6%">|</td> 102 * <td width="94%">Separates two alternative sequences of characters. Either one 103 * sequence or the other, but not both, matches this expression. The | character can 104 * only occur inside ().</td> 105 * </tr> 106 * <tr> 107 * <td width="6%">.</td> 108 * <td width="94%">Matches any character.</td> 109 * </tr> 110 * <tr> 111 * <td width="6%">*?</td> 112 * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except 113 * when there is overlap between the last group of characters in the expression preceding the 114 * * and the first group of characters following the *. When there is this kind of 115 * overlap, * will match the longest sequence of characters that match the expression before 116 * the *, and *? will match the shortest sequence of characters matching the expression 117 * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text, 118 * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy", 119 * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td> 120 * </tr> 121 * <tr> 122 * <td width="6%">[]</td> 123 * <td width="94%">Specifies a group of alternative characters. A [] expression will 124 * match any single character that is specified in the [] expression. For more on the 125 * syntax of [] expressions, see below.</td> 126 * </tr> 127 * <tr> 128 * <td width="6%">/</td> 129 * <td width="94%">Specifies where the break position should go if text matches this 130 * expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run 131 * of letters, followed by a run of whitespace, followed by a digit, but the break position 132 * will actually go before the whitespace). Expressions that don't contain / put the 133 * break position at the end of the matching text.</td> 134 * </tr> 135 * <tr> 136 * <td width="6%">\</td> 137 * <td width="94%">Escape character. The \ itself is ignored, but causes the next 138 * character to be treated as literal character. This has no effect for many 139 * characters, but for the characters listed above, this deprives them of their special 140 * meaning. (There are no special escape sequences for Unicode characters, or tabs and 141 * newlines; these are all handled by a higher-level protocol. In a Java string, 142 * "\n" will be converted to a literal newline character by the time the 143 * regular-expression parser sees it. Of course, this means that \ sequences that are 144 * visible to the regexp parser must be written as \\ when inside a Java string.) All 145 * characters in the ASCII range except for letters, digits, and control characters are 146 * reserved characters to the parser and must be preceded by \ even if they currently don't 147 * mean anything.</td> 148 * </tr> 149 * <tr> 150 * <td width="6%">!</td> 151 * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp 152 * parser that this expression specifies the backwards-iteration behavior of the iterator, 153 * and not its normal iteration behavior. This is generally only used in situations 154 * where the automatically-generated backwards-iteration brhavior doesn't produce 155 * satisfactory results and must be supplemented with extra client-specified rules.</td> 156 * </tr> 157 * <tr> 158 * <td width="6%"><em>(all others)</em></td> 159 * <td width="94%">All other characters are treated as literal characters, which must match 160 * the corresponding character(s) in the text exactly.</td> 161 * </tr> 162 * </table> 163 * </blockquote> 164 * 165 * <p>Within a [] expression, a number of other special characters can be used to specify 166 * groups of characters:</p> 167 * 168 * <blockquote> 169 * <table border="1" width="100%"> 170 * <tr> 171 * <td width="6%">-</td> 172 * <td width="94%">Specifies a range of matching characters. For example 173 * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The - 174 * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a 175 * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does 176 * it include accented letters such as a-umlaut.</td> 177 * </tr> 178 * <tr> 179 * <td width="6%">::</td> 180 * <td width="94%">A pair of colons containing a one- or two-letter code matches all 181 * characters in the corresponding Unicode category. The two-letter codes are the same 182 * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]" 183 * matches all currency symbols and all math symbols). Specifying a one-letter code is 184 * the same as specifying all two-letter codes that begin with that letter (for example, 185 * "[:L:]" matches all letters, and is equivalent to 186 * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode 187 * category code or a single letter that begins a Unicode category code is illegal within 188 * colons.</td> 189 * </tr> 190 * <tr> 191 * <td width="6%">[]</td> 192 * <td width="94%">[] expressions can nest. This has no effect, except when used in 193 * conjunction with the ^ token.</td> 194 * </tr> 195 * <tr> 196 * <td width="6%">^</td> 197 * <td width="94%">Excludes the character (or the characters in the [] expression) following 198 * it from the group of characters. For example, "[a-z^p]" matches all Latin 199 * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters 200 * except the Han ideographs.</td> 201 * </tr> 202 * <tr> 203 * <td width="6%"><em>(all others)</em></td> 204 * <td width="94%">All other characters are treated as literal characters. (For 205 * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td> 206 * </tr> 207 * </table> 208 * </blockquote> 209 * 210 * <p>For a more complete explanation, see <a 211 * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>. 212 * For examples, see the resource data (which is annotated).</p> 213 * 214 * @author Richard Gillam 215 */ 216 public class RuleBasedBreakIterator extends BreakIterator { 217 218 /** 219 * A token used as a character-category value to identify ignore characters 220 */ 221 protected static final byte IGNORE = -1; 222 223 /** 224 * The state number of the starting state 225 */ 226 private static final short START_STATE = 1; 227 228 /** 229 * The state-transition value indicating "stop" 230 */ 231 private static final short STOP_STATE = 0; 232 233 /** 234 * Magic number for the BreakIterator data file format. 235 */ 236 static final byte[] LABEL = { 237 (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', 238 (byte)'\0' 239 }; 240 static final int LABEL_LENGTH = LABEL.length; 241 242 /** 243 * Version number of the dictionary that was read in. 244 */ 245 static final byte supportedVersion = 1; 246 247 /** 248 * An array length of indices for BMP characters 249 */ 250 private static final int BMP_INDICES_LENGTH = 512; 251 252 /** 253 * Tables that indexes from character values to character category numbers 254 */ 255 private CompactByteArray charCategoryTable = null; 256 private SupplementaryCharacterData supplementaryCharCategoryTable = null; 257 258 /** 259 * The table of state transitions used for forward iteration 260 */ 261 private short[] stateTable = null; 262 263 /** 264 * The table of state transitions used to sync up the iterator with the 265 * text in backwards and random-access iteration 266 */ 267 private short[] backwardsStateTable = null; 268 269 /** 270 * A list of flags indicating which states in the state table are accepting 271 * ("end") states 272 */ 273 private boolean[] endStates = null; 274 275 /** 276 * A list of flags indicating which states in the state table are 277 * lookahead states (states which turn lookahead on and off) 278 */ 279 private boolean[] lookaheadStates = null; 280 281 /** 282 * A table for additional data. May be used by a subclass of 283 * RuleBasedBreakIterator. 284 */ 285 private byte[] additionalData = null; 286 287 /** 288 * The number of character categories (and, thus, the number of columns in 289 * the state tables) 290 */ 291 private int numCategories; 292 293 /** 294 * The character iterator through which this BreakIterator accesses the text 295 */ 296 private CharacterIterator text = null; 297 298 /** 299 * A CRC32 value of all data in datafile 300 */ 301 private long checksum; 302 303 //======================================================================= 304 // constructors 305 //======================================================================= 306 307 /** 308 * Constructs a RuleBasedBreakIterator using the given rule data. 309 * 310 * @throws MissingResourceException if the rule data is invalid or corrupted 311 */ 312 public RuleBasedBreakIterator(String ruleFile, byte[] ruleData) { 313 ByteBuffer bb = ByteBuffer.wrap(ruleData); 314 try { 315 validateRuleData(ruleFile, bb); 316 setupTables(ruleFile, bb); 317 } catch (BufferUnderflowException bue) { 318 MissingResourceException e; 319 e = new MissingResourceException("Corrupted rule data file", ruleFile, ""); 320 e.initCause(bue); 321 throw e; 322 } 323 } 324 325 /** 326 * Initializes the fields with the given rule data. 327 * The data format is as follows: 328 * <pre> 329 * BreakIteratorData { 330 * u1 magic[7]; 331 * u1 version; 332 * u4 totalDataSize; 333 * header_info header; 334 * body value; 335 * } 336 * </pre> 337 * <code>totalDataSize</code> is the summation of the size of 338 * <code>header_info</code> and <code>body</code> in byte count. 339 * <p> 340 * In <code>header</code>, each field except for checksum implies the 341 * length of each field. Since <code>BMPdataLength</code> is a fixed-length 342 * data(512 entries), its length isn't included in <code>header</code>. 343 * <code>checksum</code> is a CRC32 value of all in <code>body</code>. 344 * <pre> 345 * header_info { 346 * u4 stateTableLength; 347 * u4 backwardsStateTableLength; 348 * u4 endStatesLength; 349 * u4 lookaheadStatesLength; 350 * u4 BMPdataLength; 351 * u4 nonBMPdataLength; 352 * u4 additionalDataLength; 353 * u8 checksum; 354 * } 355 * </pre> 356 * <p> 357 * 358 * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to 359 * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to 360 * <code>supplementaryCharCategoryTable</code>. 361 * <pre> 362 * body { 363 * u2 stateTable[stateTableLength]; 364 * u2 backwardsStateTable[backwardsStateTableLength]; 365 * u1 endStates[endStatesLength]; 366 * u1 lookaheadStates[lookaheadStatesLength]; 367 * u2 BMPindices[512]; 368 * u1 BMPdata[BMPdataLength]; 369 * u4 nonBMPdata[numNonBMPdataLength]; 370 * u1 additionalData[additionalDataLength]; 371 * } 372 * </pre> 373 * 374 * @throws BufferUnderflowException if the end-of-data is reached before 375 * setting up all the tables 376 */ 377 private void setupTables(String ruleFile, ByteBuffer bb) { 378 /* Read header_info. */ 379 int stateTableLength = bb.getInt(); 380 int backwardsStateTableLength = bb.getInt(); 381 int endStatesLength = bb.getInt(); 382 int lookaheadStatesLength = bb.getInt(); 383 int BMPdataLength = bb.getInt(); 384 int nonBMPdataLength = bb.getInt(); 385 int additionalDataLength = bb.getInt(); 386 checksum = bb.getLong(); 387 388 /* Read stateTable[numCategories * numRows] */ 389 stateTable = new short[stateTableLength]; 390 for (int i = 0; i < stateTableLength; i++) { 391 stateTable[i] = bb.getShort(); 392 } 393 394 /* Read backwardsStateTable[numCategories * numRows] */ 395 backwardsStateTable = new short[backwardsStateTableLength]; 396 for (int i = 0; i < backwardsStateTableLength; i++) { 397 backwardsStateTable[i] = bb.getShort(); 398 } 399 400 /* Read endStates[numRows] */ 401 endStates = new boolean[endStatesLength]; 402 for (int i = 0; i < endStatesLength; i++) { 403 endStates[i] = bb.get() == 1; 404 } 405 406 /* Read lookaheadStates[numRows] */ 407 lookaheadStates = new boolean[lookaheadStatesLength]; 408 for (int i = 0; i < lookaheadStatesLength; i++) { 409 lookaheadStates[i] = bb.get() == 1; 410 } 411 412 /* Read a category table and indices for BMP characters. */ 413 short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices 414 for (int i = 0; i < BMP_INDICES_LENGTH; i++) { 415 temp1[i] = bb.getShort(); 416 } 417 byte[] temp2 = new byte[BMPdataLength]; // BMPdata 418 bb.get(temp2); 419 charCategoryTable = new CompactByteArray(temp1, temp2); 420 421 /* Read a category table for non-BMP characters. */ 422 int[] temp3 = new int[nonBMPdataLength]; 423 for (int i = 0; i < nonBMPdataLength; i++) { 424 temp3[i] = bb.getInt(); 425 } 426 supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3); 427 428 /* Read additional data */ 429 if (additionalDataLength > 0) { 430 additionalData = new byte[additionalDataLength]; 431 bb.get(additionalData); 432 } 433 assert bb.position() == bb.limit(); 434 435 /* Set numCategories */ 436 numCategories = stateTable.length / endStates.length; 437 } 438 439 /** 440 * Validates the magic number, version, and the length of the given data. 441 * 442 * @throws BufferUnderflowException if the end-of-data is reached while 443 * validating data 444 * @throws MissingResourceException if valification failed 445 */ 446 void validateRuleData(String ruleFile, ByteBuffer bb) { 447 /* Verify the magic number. */ 448 for (int i = 0; i < LABEL_LENGTH; i++) { 449 if (bb.get() != LABEL[i]) { 450 throw new MissingResourceException("Wrong magic number", 451 ruleFile, ""); 452 } 453 } 454 455 /* Verify the version number. */ 456 byte version = bb.get(); 457 if (version != supportedVersion) { 458 throw new MissingResourceException("Unsupported version(" + version + ")", 459 ruleFile, ""); 460 } 461 462 // Check the length of the rest of data 463 int len = bb.getInt(); 464 if (bb.position() + len != bb.limit()) { 465 throw new MissingResourceException("Wrong data length", 466 ruleFile, ""); 467 } 468 } 469 470 byte[] getAdditionalData() { 471 return additionalData; 472 } 473 474 void setAdditionalData(byte[] b) { 475 additionalData = b; 476 } 477 478 //======================================================================= 479 // boilerplate 480 //======================================================================= 481 /** 482 * Clones this iterator. 483 * @return A newly-constructed RuleBasedBreakIterator with the same 484 * behavior as this one. 485 */ 486 @Override 487 public Object clone() { 488 RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone(); 489 if (text != null) { 490 result.text = (CharacterIterator) text.clone(); 491 } 492 return result; 493 } 494 495 /** 496 * Returns true if both BreakIterators are of the same class, have the same 497 * rules, and iterate over the same text. 498 */ 499 @Override 500 public boolean equals(Object that) { 501 try { 502 if (that == null) { 503 return false; 504 } 505 506 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 507 if (checksum != other.checksum) { 508 return false; 509 } 510 if (text == null) { 511 return other.text == null; 512 } else { 513 return text.equals(other.text); 514 } 515 } 516 catch(ClassCastException e) { 517 return false; 518 } 519 } 520 521 /** 522 * Returns text 523 */ 524 @Override 525 public String toString() { 526 return "[checksum=0x" + Long.toHexString(checksum) + ']'; 527 } 528 529 /** 530 * Compute a hashcode for this BreakIterator 531 * @return A hash code 532 */ 533 @Override 534 public int hashCode() { 535 return (int)checksum; 536 } 537 538 //======================================================================= 539 // BreakIterator overrides 540 //======================================================================= 541 542 /** 543 * Sets the current iteration position to the beginning of the text. 544 * (i.e., the CharacterIterator's starting offset). 545 * @return The offset of the beginning of the text. 546 */ 547 @Override 548 public int first() { 549 CharacterIterator t = getText(); 550 551 t.first(); 552 return t.getIndex(); 553 } 554 555 /** 556 * Sets the current iteration position to the end of the text. 557 * (i.e., the CharacterIterator's ending offset). 558 * @return The text's past-the-end offset. 559 */ 560 @Override 561 public int last() { 562 CharacterIterator t = getText(); 563 564 // I'm not sure why, but t.last() returns the offset of the last character, 565 // rather than the past-the-end offset 566 t.setIndex(t.getEndIndex()); 567 return t.getIndex(); 568 } 569 570 /** 571 * Advances the iterator either forward or backward the specified number of steps. 572 * Negative values move backward, and positive values move forward. This is 573 * equivalent to repeatedly calling next() or previous(). 574 * @param n The number of steps to move. The sign indicates the direction 575 * (negative is backwards, and positive is forwards). 576 * @return The character offset of the boundary position n boundaries away from 577 * the current one. 578 */ 579 @Override 580 public int next(int n) { 581 int result = current(); 582 while (n > 0) { 583 result = handleNext(); 584 --n; 585 } 586 while (n < 0) { 587 result = previous(); 588 ++n; 589 } 590 return result; 591 } 592 593 /** 594 * Advances the iterator to the next boundary position. 595 * @return The position of the first boundary after this one. 596 */ 597 @Override 598 public int next() { 599 return handleNext(); 600 } 601 602 private int cachedLastKnownBreak = BreakIterator.DONE; 603 604 /** 605 * Advances the iterator backwards, to the last boundary preceding this one. 606 * @return The position of the last boundary position preceding this one. 607 */ 608 @Override 609 public int previous() { 610 // if we're already sitting at the beginning of the text, return DONE 611 CharacterIterator text = getText(); 612 if (current() == text.getBeginIndex()) { 613 return BreakIterator.DONE; 614 } 615 616 // set things up. handlePrevious() will back us up to some valid 617 // break position before the current position (we back our internal 618 // iterator up one step to prevent handlePrevious() from returning 619 // the current position), but not necessarily the last one before 620 // where we started 621 int start = current(); 622 int lastResult = cachedLastKnownBreak; 623 if (lastResult >= start || lastResult <= BreakIterator.DONE) { 624 getPrevious(); 625 lastResult = handlePrevious(); 626 } else { 627 //it might be better to check if handlePrevious() give us closer 628 //safe value but handlePrevious() is slow too 629 //So, this has to be done carefully 630 text.setIndex(lastResult); 631 } 632 int result = lastResult; 633 634 // iterate forward from the known break position until we pass our 635 // starting point. The last break position before the starting 636 // point is our return value 637 while (result != BreakIterator.DONE && result < start) { 638 lastResult = result; 639 result = handleNext(); 640 } 641 642 // set the current iteration position to be the last break position 643 // before where we started, and then return that value 644 text.setIndex(lastResult); 645 cachedLastKnownBreak = lastResult; 646 return lastResult; 647 } 648 649 /** 650 * Returns previous character 651 */ 652 private int getPrevious() { 653 char c2 = text.previous(); 654 if (Character.isLowSurrogate(c2) && 655 text.getIndex() > text.getBeginIndex()) { 656 char c1 = text.previous(); 657 if (Character.isHighSurrogate(c1)) { 658 return Character.toCodePoint(c1, c2); 659 } else { 660 text.next(); 661 } 662 } 663 return (int)c2; 664 } 665 666 /** 667 * Returns current character 668 */ 669 int getCurrent() { 670 char c1 = text.current(); 671 if (Character.isHighSurrogate(c1) && 672 text.getIndex() < text.getEndIndex()) { 673 char c2 = text.next(); 674 text.previous(); 675 if (Character.isLowSurrogate(c2)) { 676 return Character.toCodePoint(c1, c2); 677 } 678 } 679 return (int)c1; 680 } 681 682 /** 683 * Returns the count of next character. 684 */ 685 private int getCurrentCodePointCount() { 686 char c1 = text.current(); 687 if (Character.isHighSurrogate(c1) && 688 text.getIndex() < text.getEndIndex()) { 689 char c2 = text.next(); 690 text.previous(); 691 if (Character.isLowSurrogate(c2)) { 692 return 2; 693 } 694 } 695 return 1; 696 } 697 698 /** 699 * Returns next character 700 */ 701 int getNext() { 702 int index = text.getIndex(); 703 int endIndex = text.getEndIndex(); 704 if (index == endIndex || 705 (index += getCurrentCodePointCount()) >= endIndex) { 706 return CharacterIterator.DONE; 707 } 708 text.setIndex(index); 709 return getCurrent(); 710 } 711 712 /** 713 * Returns the position of next character. 714 */ 715 private int getNextIndex() { 716 int index = text.getIndex() + getCurrentCodePointCount(); 717 int endIndex = text.getEndIndex(); 718 if (index > endIndex) { 719 return endIndex; 720 } else { 721 return index; 722 } 723 } 724 725 /** 726 * Throw IllegalArgumentException unless begin <= offset < end. 727 */ 728 protected static final void checkOffset(int offset, CharacterIterator text) { 729 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 730 throw new IllegalArgumentException("offset out of bounds"); 731 } 732 } 733 734 /** 735 * Sets the iterator to refer to the first boundary position following 736 * the specified position. 737 * @offset The position from which to begin searching for a break position. 738 * @return The position of the first break after the current position. 739 */ 740 @Override 741 public int following(int offset) { 742 743 CharacterIterator text = getText(); 744 checkOffset(offset, text); 745 746 // Set our internal iteration position (temporarily) 747 // to the position passed in. If this is the _beginning_ position, 748 // then we can just use next() to get our return value 749 text.setIndex(offset); 750 if (offset == text.getBeginIndex()) { 751 cachedLastKnownBreak = handleNext(); 752 return cachedLastKnownBreak; 753 } 754 755 // otherwise, we have to sync up first. Use handlePrevious() to back 756 // us up to a known break position before the specified position (if 757 // we can determine that the specified position is a break position, 758 // we don't back up at all). This may or may not be the last break 759 // position at or before our starting position. Advance forward 760 // from here until we've passed the starting position. The position 761 // we stop on will be the first break position after the specified one. 762 int result = cachedLastKnownBreak; 763 if (result >= offset || result <= BreakIterator.DONE) { 764 result = handlePrevious(); 765 } else { 766 //it might be better to check if handlePrevious() give us closer 767 //safe value but handlePrevious() is slow too 768 //So, this has to be done carefully 769 text.setIndex(result); 770 } 771 while (result != BreakIterator.DONE && result <= offset) { 772 result = handleNext(); 773 } 774 cachedLastKnownBreak = result; 775 return result; 776 } 777 778 /** 779 * Sets the iterator to refer to the last boundary position before the 780 * specified position. 781 * @offset The position to begin searching for a break from. 782 * @return The position of the last boundary before the starting position. 783 */ 784 @Override 785 public int preceding(int offset) { 786 // if we start by updating the current iteration position to the 787 // position specified by the caller, we can just use previous() 788 // to carry out this operation 789 CharacterIterator text = getText(); 790 checkOffset(offset, text); 791 text.setIndex(offset); 792 return previous(); 793 } 794 795 /** 796 * Returns true if the specified position is a boundary position. As a side 797 * effect, leaves the iterator pointing to the first boundary position at 798 * or after "offset". 799 * @param offset the offset to check. 800 * @return True if "offset" is a boundary position. 801 */ 802 @Override 803 public boolean isBoundary(int offset) { 804 CharacterIterator text = getText(); 805 checkOffset(offset, text); 806 if (offset == text.getBeginIndex()) { 807 return true; 808 } 809 810 // to check whether this is a boundary, we can use following() on the 811 // position before the specified one and return true if the position we 812 // get back is the one the user specified 813 else { 814 return following(offset - 1) == offset; 815 } 816 } 817 818 /** 819 * Returns the current iteration position. 820 * @return The current iteration position. 821 */ 822 @Override 823 public int current() { 824 return getText().getIndex(); 825 } 826 827 /** 828 * Return a CharacterIterator over the text being analyzed. This version 829 * of this method returns the actual CharacterIterator we're using internally. 830 * Changing the state of this iterator can have undefined consequences. If 831 * you need to change it, clone it first. 832 * @return An iterator over the text being analyzed. 833 */ 834 @Override 835 public CharacterIterator getText() { 836 // The iterator is initialized pointing to no text at all, so if this 837 // function is called while we're in that state, we have to fudge an 838 // iterator to return. 839 if (text == null) { 840 text = new StringCharacterIterator(""); 841 } 842 return text; 843 } 844 845 /** 846 * Set the iterator to analyze a new piece of text. This function resets 847 * the current iteration position to the beginning of the text. 848 * @param newText An iterator over the text to analyze. 849 */ 850 @Override 851 public void setText(CharacterIterator newText) { 852 // Test iterator to see if we need to wrap it in a SafeCharIterator. 853 // The correct behavior for CharacterIterators is to allow the 854 // position to be set to the endpoint of the iterator. Many 855 // CharacterIterators do not uphold this, so this is a workaround 856 // to permit them to use this class. 857 int end = newText.getEndIndex(); 858 boolean goodIterator; 859 try { 860 newText.setIndex(end); // some buggy iterators throw an exception here 861 goodIterator = newText.getIndex() == end; 862 } 863 catch(IllegalArgumentException e) { 864 goodIterator = false; 865 } 866 867 if (goodIterator) { 868 text = newText; 869 } 870 else { 871 text = new SafeCharIterator(newText); 872 } 873 text.first(); 874 875 cachedLastKnownBreak = BreakIterator.DONE; 876 } 877 878 879 //======================================================================= 880 // implementation 881 //======================================================================= 882 883 /** 884 * This method is the actual implementation of the next() method. All iteration 885 * vectors through here. This method initializes the state machine to state 1 886 * and advances through the text character by character until we reach the end 887 * of the text or the state machine transitions to state 0. We update our return 888 * value every time the state machine passes through a possible end state. 889 */ 890 protected int handleNext() { 891 // if we're already at the end of the text, return DONE. 892 CharacterIterator text = getText(); 893 if (text.getIndex() == text.getEndIndex()) { 894 return BreakIterator.DONE; 895 } 896 897 // no matter what, we always advance at least one character forward 898 int result = getNextIndex(); 899 int lookaheadResult = 0; 900 901 // begin in state 1 902 int state = START_STATE; 903 int category; 904 int c = getCurrent(); 905 906 // loop until we reach the end of the text or transition to state 0 907 while (c != CharacterIterator.DONE && state != STOP_STATE) { 908 909 // look up the current character's character category (which tells us 910 // which column in the state table to look at) 911 category = lookupCategory(c); 912 913 // if the character isn't an ignore character, look up a state 914 // transition in the state table 915 if (category != IGNORE) { 916 state = lookupState(state, category); 917 } 918 919 // if the state we've just transitioned to is a lookahead state, 920 // (but not also an end state), save its position. If it's 921 // both a lookahead state and an end state, update the break position 922 // to the last saved lookup-state position 923 if (lookaheadStates[state]) { 924 if (endStates[state]) { 925 result = lookaheadResult; 926 } 927 else { 928 lookaheadResult = getNextIndex(); 929 } 930 } 931 932 // otherwise, if the state we've just transitioned to is an accepting 933 // state, update the break position to be the current iteration position 934 else { 935 if (endStates[state]) { 936 result = getNextIndex(); 937 } 938 } 939 940 c = getNext(); 941 } 942 943 // if we've run off the end of the text, and the very last character took us into 944 // a lookahead state, advance the break position to the lookahead position 945 // (the theory here is that if there are no characters at all after the lookahead 946 // position, that always matches the lookahead criteria) 947 if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) { 948 result = lookaheadResult; 949 } 950 951 text.setIndex(result); 952 return result; 953 } 954 955 /** 956 * This method backs the iterator back up to a "safe position" in the text. 957 * This is a position that we know, without any context, must be a break position. 958 * The various calling methods then iterate forward from this safe position to 959 * the appropriate position to return. (For more information, see the description 960 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) 961 */ 962 protected int handlePrevious() { 963 CharacterIterator text = getText(); 964 int state = START_STATE; 965 int category = 0; 966 int lastCategory = 0; 967 int c = getCurrent(); 968 969 // loop until we reach the beginning of the text or transition to state 0 970 while (c != CharacterIterator.DONE && state != STOP_STATE) { 971 972 // save the last character's category and look up the current 973 // character's category 974 lastCategory = category; 975 category = lookupCategory(c); 976 977 // if the current character isn't an ignore character, look up a 978 // state transition in the backwards state table 979 if (category != IGNORE) { 980 state = lookupBackwardState(state, category); 981 } 982 983 // then advance one character backwards 984 c = getPrevious(); 985 } 986 987 // if we didn't march off the beginning of the text, we're either one or two 988 // positions away from the real break position. (One because of the call to 989 // previous() at the end of the loop above, and another because the character 990 // that takes us into the stop state will always be the character BEFORE 991 // the break position.) 992 if (c != CharacterIterator.DONE) { 993 if (lastCategory != IGNORE) { 994 getNext(); 995 getNext(); 996 } 997 else { 998 getNext(); 999 } 1000 } 1001 return text.getIndex(); 1002 } 1003 1004 /** 1005 * Looks up a character's category (i.e., its category for breaking purposes, 1006 * not its Unicode category) 1007 */ 1008 protected int lookupCategory(int c) { 1009 if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1010 return charCategoryTable.elementAt((char)c); 1011 } else { 1012 return supplementaryCharCategoryTable.getValue(c); 1013 } 1014 } 1015 1016 /** 1017 * Given a current state and a character category, looks up the 1018 * next state to transition to in the state table. 1019 */ 1020 protected int lookupState(int state, int category) { 1021 return stateTable[state * numCategories + category]; 1022 } 1023 1024 /** 1025 * Given a current state and a character category, looks up the 1026 * next state to transition to in the backwards state table. 1027 */ 1028 protected int lookupBackwardState(int state, int category) { 1029 return backwardsStateTable[state * numCategories + category]; 1030 } 1031 1032 /* 1033 * This class exists to work around a bug in incorrect implementations 1034 * of CharacterIterator, which incorrectly handle setIndex(endIndex). 1035 * This iterator relies only on base.setIndex(n) where n is less than 1036 * endIndex. 1037 * 1038 * One caveat: if the base iterator's begin and end indices change 1039 * the change will not be reflected by this wrapper. Does that matter? 1040 */ 1041 // TODO: Review this class to see if it's still required. 1042 private static final class SafeCharIterator implements CharacterIterator, 1043 Cloneable { 1044 1045 private CharacterIterator base; 1046 private int rangeStart; 1047 private int rangeLimit; 1048 private int currentIndex; 1049 1050 SafeCharIterator(CharacterIterator base) { 1051 this.base = base; 1052 this.rangeStart = base.getBeginIndex(); 1053 this.rangeLimit = base.getEndIndex(); 1054 this.currentIndex = base.getIndex(); 1055 } 1056 1057 @Override 1058 public char first() { 1059 return setIndex(rangeStart); 1060 } 1061 1062 @Override 1063 public char last() { 1064 return setIndex(rangeLimit - 1); 1065 } 1066 1067 @Override 1068 public char current() { 1069 if (currentIndex < rangeStart || currentIndex >= rangeLimit) { 1070 return DONE; 1071 } 1072 else { 1073 return base.setIndex(currentIndex); 1074 } 1075 } 1076 1077 @Override 1078 public char next() { 1079 1080 currentIndex++; 1081 if (currentIndex >= rangeLimit) { 1082 currentIndex = rangeLimit; 1083 return DONE; 1084 } 1085 else { 1086 return base.setIndex(currentIndex); 1087 } 1088 } 1089 1090 @Override 1091 public char previous() { 1092 1093 currentIndex--; 1094 if (currentIndex < rangeStart) { 1095 currentIndex = rangeStart; 1096 return DONE; 1097 } 1098 else { 1099 return base.setIndex(currentIndex); 1100 } 1101 } 1102 1103 @Override 1104 public char setIndex(int i) { 1105 1106 if (i < rangeStart || i > rangeLimit) { 1107 throw new IllegalArgumentException("Invalid position"); 1108 } 1109 currentIndex = i; 1110 return current(); 1111 } 1112 1113 @Override 1114 public int getBeginIndex() { 1115 return rangeStart; 1116 } 1117 1118 @Override 1119 public int getEndIndex() { 1120 return rangeLimit; 1121 } 1122 1123 @Override 1124 public int getIndex() { 1125 return currentIndex; 1126 } 1127 1128 @Override 1129 public Object clone() { 1130 1131 SafeCharIterator copy = null; 1132 try { 1133 copy = (SafeCharIterator) super.clone(); 1134 } 1135 catch(CloneNotSupportedException e) { 1136 throw new Error("Clone not supported: " + e); 1137 } 1138 1139 CharacterIterator copyOfBase = (CharacterIterator) base.clone(); 1140 copy.base = copyOfBase; 1141 return copy; 1142 } 1143 } 1144 }