1 /* 2 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 41 package sun.util.locale.provider; 42 43 import java.io.BufferedInputStream; 44 import java.io.IOException; 45 import java.security.AccessController; 46 import java.security.PrivilegedActionException; 47 import java.security.PrivilegedExceptionAction; 48 import java.text.BreakIterator; 49 import java.text.CharacterIterator; 50 import java.text.StringCharacterIterator; 51 import java.util.MissingResourceException; 52 import sun.text.CompactByteArray; 53 import sun.text.SupplementaryCharacterData; 54 55 /** 56 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p> 57 * 58 * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i> 59 * and <i>regular expressions.</i></p> 60 * 61 * <p>A substitution rule defines a name that can be used in place of an expression. It 62 * consists of a name, which is a string of characters contained in angle brackets, an equals 63 * sign, and an expression. (There can be no whitespace on either side of the equals sign.) 64 * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or 65 * square brackets. A substitution is visible after its definition, and is filled in using 66 * simple textual substitution. Substitution definitions can contain other substitutions, as 67 * long as those substitutions have been defined first. Substitutions are generally used to 68 * make the regular expressions (which can get quite complex) shorted and easier to read. 69 * They typically define either character categories or commonly-used subexpressions.</p> 70 * 71 * <p>There is one special substitution. If the description defines a substitution 72 * called "<ignore>", the expression must be a [] expression, and the 73 * expression defines a set of characters (the "<em>ignore characters</em>") that 74 * will be transparent to the BreakIterator. A sequence of characters will break the 75 * same way it would if any ignore characters it contains are taken out. Break 76 * positions never occur befoer ignore characters.</p> 77 * 78 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and 79 * defines a sequence of characters to be kept together. With one significant exception, the 80 * iterator uses a longest-possible-match algorithm when matching text to regular 81 * expressions. The iterator also treats descriptions containing multiple regular expressions 82 * as if they were ORed together (i.e., as if they were separated by |).</p> 83 * 84 * <p>The special characters recognized by the regular-expression parser are as follows:</p> 85 * 86 * <blockquote> 87 * <table border="1" width="100%"> 88 * <tr> 89 * <td width="6%">*</td> 90 * <td width="94%">Specifies that the expression preceding the asterisk may occur any number 91 * of times (including not at all).</td> 92 * </tr> 93 * <tr> 94 * <td width="6%">{}</td> 95 * <td width="94%">Encloses a sequence of characters that is optional.</td> 96 * </tr> 97 * <tr> 98 * <td width="6%">()</td> 99 * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence 100 * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit 101 * the ends of expressions containing |.</td> 102 * </tr> 103 * <tr> 104 * <td width="6%">|</td> 105 * <td width="94%">Separates two alternative sequences of characters. Either one 106 * sequence or the other, but not both, matches this expression. The | character can 107 * only occur inside ().</td> 108 * </tr> 109 * <tr> 110 * <td width="6%">.</td> 111 * <td width="94%">Matches any character.</td> 112 * </tr> 113 * <tr> 114 * <td width="6%">*?</td> 115 * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except 116 * when there is overlap between the last group of characters in the expression preceding the 117 * * and the first group of characters following the *. When there is this kind of 118 * overlap, * will match the longest sequence of characters that match the expression before 119 * the *, and *? will match the shortest sequence of characters matching the expression 120 * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text, 121 * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy", 122 * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td> 123 * </tr> 124 * <tr> 125 * <td width="6%">[]</td> 126 * <td width="94%">Specifies a group of alternative characters. A [] expression will 127 * match any single character that is specified in the [] expression. For more on the 128 * syntax of [] expressions, see below.</td> 129 * </tr> 130 * <tr> 131 * <td width="6%">/</td> 132 * <td width="94%">Specifies where the break position should go if text matches this 133 * expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run 134 * of letters, followed by a run of whitespace, followed by a digit, but the break position 135 * will actually go before the whitespace). Expressions that don't contain / put the 136 * break position at the end of the matching text.</td> 137 * </tr> 138 * <tr> 139 * <td width="6%">\</td> 140 * <td width="94%">Escape character. The \ itself is ignored, but causes the next 141 * character to be treated as literal character. This has no effect for many 142 * characters, but for the characters listed above, this deprives them of their special 143 * meaning. (There are no special escape sequences for Unicode characters, or tabs and 144 * newlines; these are all handled by a higher-level protocol. In a Java string, 145 * "\n" will be converted to a literal newline character by the time the 146 * regular-expression parser sees it. Of course, this means that \ sequences that are 147 * visible to the regexp parser must be written as \\ when inside a Java string.) All 148 * characters in the ASCII range except for letters, digits, and control characters are 149 * reserved characters to the parser and must be preceded by \ even if they currently don't 150 * mean anything.</td> 151 * </tr> 152 * <tr> 153 * <td width="6%">!</td> 154 * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp 155 * parser that this expression specifies the backwards-iteration behavior of the iterator, 156 * and not its normal iteration behavior. This is generally only used in situations 157 * where the automatically-generated backwards-iteration brhavior doesn't produce 158 * satisfactory results and must be supplemented with extra client-specified rules.</td> 159 * </tr> 160 * <tr> 161 * <td width="6%"><em>(all others)</em></td> 162 * <td width="94%">All other characters are treated as literal characters, which must match 163 * the corresponding character(s) in the text exactly.</td> 164 * </tr> 165 * </table> 166 * </blockquote> 167 * 168 * <p>Within a [] expression, a number of other special characters can be used to specify 169 * groups of characters:</p> 170 * 171 * <blockquote> 172 * <table border="1" width="100%"> 173 * <tr> 174 * <td width="6%">-</td> 175 * <td width="94%">Specifies a range of matching characters. For example 176 * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The - 177 * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a 178 * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does 179 * it include accented letters such as a-umlaut.</td> 180 * </tr> 181 * <tr> 182 * <td width="6%">::</td> 183 * <td width="94%">A pair of colons containing a one- or two-letter code matches all 184 * characters in the corresponding Unicode category. The two-letter codes are the same 185 * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]" 186 * matches all currency symbols and all math symbols). Specifying a one-letter code is 187 * the same as specifying all two-letter codes that begin with that letter (for example, 188 * "[:L:]" matches all letters, and is equivalent to 189 * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode 190 * category code or a single letter that begins a Unicode category code is illegal within 191 * colons.</td> 192 * </tr> 193 * <tr> 194 * <td width="6%">[]</td> 195 * <td width="94%">[] expressions can nest. This has no effect, except when used in 196 * conjunction with the ^ token.</td> 197 * </tr> 198 * <tr> 199 * <td width="6%">^</td> 200 * <td width="94%">Excludes the character (or the characters in the [] expression) following 201 * it from the group of characters. For example, "[a-z^p]" matches all Latin 202 * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters 203 * except the Han ideographs.</td> 204 * </tr> 205 * <tr> 206 * <td width="6%"><em>(all others)</em></td> 207 * <td width="94%">All other characters are treated as literal characters. (For 208 * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td> 209 * </tr> 210 * </table> 211 * </blockquote> 212 * 213 * <p>For a more complete explanation, see <a 214 * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>. 215 * For examples, see the resource data (which is annotated).</p> 216 * 217 * @author Richard Gillam 218 */ 219 class RuleBasedBreakIterator extends BreakIterator { 220 221 /** 222 * A token used as a character-category value to identify ignore characters 223 */ 224 protected static final byte IGNORE = -1; 225 226 /** 227 * The state number of the starting state 228 */ 229 private static final short START_STATE = 1; 230 231 /** 232 * The state-transition value indicating "stop" 233 */ 234 private static final short STOP_STATE = 0; 235 236 /** 237 * Magic number for the BreakIterator data file format. 238 */ 239 static final byte[] LABEL = { 240 (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', 241 (byte)'\0' 242 }; 243 static final int LABEL_LENGTH = LABEL.length; 244 245 /** 246 * Version number of the dictionary that was read in. 247 */ 248 static final byte supportedVersion = 1; 249 250 /** 251 * Header size in byte count 252 */ 253 private static final int HEADER_LENGTH = 36; 254 255 /** 256 * An array length of indices for BMP characters 257 */ 258 private static final int BMP_INDICES_LENGTH = 512; 259 260 /** 261 * Tables that indexes from character values to character category numbers 262 */ 263 private CompactByteArray charCategoryTable = null; 264 private SupplementaryCharacterData supplementaryCharCategoryTable = null; 265 266 /** 267 * The table of state transitions used for forward iteration 268 */ 269 private short[] stateTable = null; 270 271 /** 272 * The table of state transitions used to sync up the iterator with the 273 * text in backwards and random-access iteration 274 */ 275 private short[] backwardsStateTable = null; 276 277 /** 278 * A list of flags indicating which states in the state table are accepting 279 * ("end") states 280 */ 281 private boolean[] endStates = null; 282 283 /** 284 * A list of flags indicating which states in the state table are 285 * lookahead states (states which turn lookahead on and off) 286 */ 287 private boolean[] lookaheadStates = null; 288 289 /** 290 * A table for additional data. May be used by a subclass of 291 * RuleBasedBreakIterator. 292 */ 293 private byte[] additionalData = null; 294 295 /** 296 * The number of character categories (and, thus, the number of columns in 297 * the state tables) 298 */ 299 private int numCategories; 300 301 /** 302 * The character iterator through which this BreakIterator accesses the text 303 */ 304 private CharacterIterator text = null; 305 306 /** 307 * A CRC32 value of all data in datafile 308 */ 309 private long checksum; 310 311 //======================================================================= 312 // constructors 313 //======================================================================= 314 315 /** 316 * Constructs a RuleBasedBreakIterator according to the datafile 317 * provided. 318 */ 319 RuleBasedBreakIterator(String datafile) 320 throws IOException, MissingResourceException { 321 readTables(datafile); 322 } 323 324 /** 325 * Read datafile. The datafile's format is as follows: 326 * <pre> 327 * BreakIteratorData { 328 * u1 magic[7]; 329 * u1 version; 330 * u4 totalDataSize; 331 * header_info header; 332 * body value; 333 * } 334 * </pre> 335 * <code>totalDataSize</code> is the summation of the size of 336 * <code>header_info</code> and <code>body</code> in byte count. 337 * <p> 338 * In <code>header</code>, each field except for checksum implies the 339 * length of each field. Since <code>BMPdataLength</code> is a fixed-length 340 * data(512 entries), its length isn't included in <code>header</code>. 341 * <code>checksum</code> is a CRC32 value of all in <code>body</code>. 342 * <pre> 343 * header_info { 344 * u4 stateTableLength; 345 * u4 backwardsStateTableLength; 346 * u4 endStatesLength; 347 * u4 lookaheadStatesLength; 348 * u4 BMPdataLength; 349 * u4 nonBMPdataLength; 350 * u4 additionalDataLength; 351 * u8 checksum; 352 * } 353 * </pre> 354 * <p> 355 * 356 * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to 357 * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to 358 * <code>supplementaryCharCategoryTable</code>. 359 * <pre> 360 * body { 361 * u2 stateTable[stateTableLength]; 362 * u2 backwardsStateTable[backwardsStateTableLength]; 363 * u1 endStates[endStatesLength]; 364 * u1 lookaheadStates[lookaheadStatesLength]; 365 * u2 BMPindices[512]; 366 * u1 BMPdata[BMPdataLength]; 367 * u4 nonBMPdata[numNonBMPdataLength]; 368 * u1 additionalData[additionalDataLength]; 369 * } 370 * </pre> 371 */ 372 protected final void readTables(String datafile) 373 throws IOException, MissingResourceException { 374 375 byte[] buffer = readFile(datafile); 376 377 /* Read header_info. */ 378 int stateTableLength = getInt(buffer, 0); 379 int backwardsStateTableLength = getInt(buffer, 4); 380 int endStatesLength = getInt(buffer, 8); 381 int lookaheadStatesLength = getInt(buffer, 12); 382 int BMPdataLength = getInt(buffer, 16); 383 int nonBMPdataLength = getInt(buffer, 20); 384 int additionalDataLength = getInt(buffer, 24); 385 checksum = getLong(buffer, 28); 386 387 /* Read stateTable[numCategories * numRows] */ 388 stateTable = new short[stateTableLength]; 389 int offset = HEADER_LENGTH; 390 for (int i = 0; i < stateTableLength; i++, offset+=2) { 391 stateTable[i] = getShort(buffer, offset); 392 } 393 394 /* Read backwardsStateTable[numCategories * numRows] */ 395 backwardsStateTable = new short[backwardsStateTableLength]; 396 for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) { 397 backwardsStateTable[i] = getShort(buffer, offset); 398 } 399 400 /* Read endStates[numRows] */ 401 endStates = new boolean[endStatesLength]; 402 for (int i = 0; i < endStatesLength; i++, offset++) { 403 endStates[i] = buffer[offset] == 1; 404 } 405 406 /* Read lookaheadStates[numRows] */ 407 lookaheadStates = new boolean[lookaheadStatesLength]; 408 for (int i = 0; i < lookaheadStatesLength; i++, offset++) { 409 lookaheadStates[i] = buffer[offset] == 1; 410 } 411 412 /* Read a category table and indices for BMP characters. */ 413 short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices 414 for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) { 415 temp1[i] = getShort(buffer, offset); 416 } 417 byte[] temp2 = new byte[BMPdataLength]; // BMPdata 418 System.arraycopy(buffer, offset, temp2, 0, BMPdataLength); 419 offset += BMPdataLength; 420 charCategoryTable = new CompactByteArray(temp1, temp2); 421 422 /* Read a category table for non-BMP characters. */ 423 int[] temp3 = new int[nonBMPdataLength]; 424 for (int i = 0; i < nonBMPdataLength; i++, offset+=4) { 425 temp3[i] = getInt(buffer, offset); 426 } 427 supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3); 428 429 /* Read additional data */ 430 if (additionalDataLength > 0) { 431 additionalData = new byte[additionalDataLength]; 432 System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength); 433 } 434 435 /* Set numCategories */ 436 numCategories = stateTable.length / endStates.length; 437 } 438 439 protected byte[] readFile(final String datafile) 440 throws IOException, MissingResourceException { 441 442 BufferedInputStream is; 443 try { 444 is = AccessController.doPrivileged( 445 new PrivilegedExceptionAction<BufferedInputStream>() { 446 @Override 447 public BufferedInputStream run() throws Exception { 448 return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile)); 449 } 450 } 451 ); 452 } 453 catch (PrivilegedActionException e) { 454 throw new InternalError(e.toString(), e); 455 } 456 457 int offset = 0; 458 459 /* First, read magic, version, and header_info. */ 460 int len = LABEL_LENGTH + 5; 461 byte[] buf = new byte[len]; 462 if (is.read(buf) != len) { 463 throw new MissingResourceException("Wrong header length", 464 datafile, ""); 465 } 466 467 /* Validate the magic number. */ 468 for (int i = 0; i < LABEL_LENGTH; i++, offset++) { 469 if (buf[offset] != LABEL[offset]) { 470 throw new MissingResourceException("Wrong magic number", 471 datafile, ""); 472 } 473 } 474 475 /* Validate the version number. */ 476 if (buf[offset] != supportedVersion) { 477 throw new MissingResourceException("Unsupported version(" + buf[offset] + ")", 478 datafile, ""); 479 } 480 481 /* Read data: totalDataSize + 8(for checksum) */ 482 len = getInt(buf, ++offset); 483 buf = new byte[len]; 484 if (is.read(buf) != len) { 485 throw new MissingResourceException("Wrong data length", 486 datafile, ""); 487 } 488 489 is.close(); 490 491 return buf; 492 } 493 494 byte[] getAdditionalData() { 495 return additionalData; 496 } 497 498 void setAdditionalData(byte[] b) { 499 additionalData = b; 500 } 501 502 //======================================================================= 503 // boilerplate 504 //======================================================================= 505 /** 506 * Clones this iterator. 507 * @return A newly-constructed RuleBasedBreakIterator with the same 508 * behavior as this one. 509 */ 510 @Override 511 public Object clone() { 512 RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone(); 513 if (text != null) { 514 result.text = (CharacterIterator) text.clone(); 515 } 516 return result; 517 } 518 519 /** 520 * Returns true if both BreakIterators are of the same class, have the same 521 * rules, and iterate over the same text. 522 */ 523 @Override 524 public boolean equals(Object that) { 525 try { 526 if (that == null) { 527 return false; 528 } 529 530 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 531 if (checksum != other.checksum) { 532 return false; 533 } 534 if (text == null) { 535 return other.text == null; 536 } else { 537 return text.equals(other.text); 538 } 539 } 540 catch(ClassCastException e) { 541 return false; 542 } 543 } 544 545 /** 546 * Returns text 547 */ 548 @Override 549 public String toString() { 550 StringBuilder sb = new StringBuilder(); 551 sb.append('['); 552 sb.append("checksum=0x"); 553 sb.append(Long.toHexString(checksum)); 554 sb.append(']'); 555 return sb.toString(); 556 } 557 558 /** 559 * Compute a hashcode for this BreakIterator 560 * @return A hash code 561 */ 562 @Override 563 public int hashCode() { 564 return (int)checksum; 565 } 566 567 //======================================================================= 568 // BreakIterator overrides 569 //======================================================================= 570 571 /** 572 * Sets the current iteration position to the beginning of the text. 573 * (i.e., the CharacterIterator's starting offset). 574 * @return The offset of the beginning of the text. 575 */ 576 @Override 577 public int first() { 578 CharacterIterator t = getText(); 579 580 t.first(); 581 return t.getIndex(); 582 } 583 584 /** 585 * Sets the current iteration position to the end of the text. 586 * (i.e., the CharacterIterator's ending offset). 587 * @return The text's past-the-end offset. 588 */ 589 @Override 590 public int last() { 591 CharacterIterator t = getText(); 592 593 // I'm not sure why, but t.last() returns the offset of the last character, 594 // rather than the past-the-end offset 595 t.setIndex(t.getEndIndex()); 596 return t.getIndex(); 597 } 598 599 /** 600 * Advances the iterator either forward or backward the specified number of steps. 601 * Negative values move backward, and positive values move forward. This is 602 * equivalent to repeatedly calling next() or previous(). 603 * @param n The number of steps to move. The sign indicates the direction 604 * (negative is backwards, and positive is forwards). 605 * @return The character offset of the boundary position n boundaries away from 606 * the current one. 607 */ 608 @Override 609 public int next(int n) { 610 int result = current(); 611 while (n > 0) { 612 result = handleNext(); 613 --n; 614 } 615 while (n < 0) { 616 result = previous(); 617 ++n; 618 } 619 return result; 620 } 621 622 /** 623 * Advances the iterator to the next boundary position. 624 * @return The position of the first boundary after this one. 625 */ 626 @Override 627 public int next() { 628 return handleNext(); 629 } 630 631 private int cachedLastKnownBreak = BreakIterator.DONE; 632 633 /** 634 * Advances the iterator backwards, to the last boundary preceding this one. 635 * @return The position of the last boundary position preceding this one. 636 */ 637 @Override 638 public int previous() { 639 // if we're already sitting at the beginning of the text, return DONE 640 CharacterIterator text = getText(); 641 if (current() == text.getBeginIndex()) { 642 return BreakIterator.DONE; 643 } 644 645 // set things up. handlePrevious() will back us up to some valid 646 // break position before the current position (we back our internal 647 // iterator up one step to prevent handlePrevious() from returning 648 // the current position), but not necessarily the last one before 649 // where we started 650 int start = current(); 651 int lastResult = cachedLastKnownBreak; 652 if (lastResult >= start || lastResult <= BreakIterator.DONE) { 653 getPrevious(); 654 lastResult = handlePrevious(); 655 } else { 656 //it might be better to check if handlePrevious() give us closer 657 //safe value but handlePrevious() is slow too 658 //So, this has to be done carefully 659 text.setIndex(lastResult); 660 } 661 int result = lastResult; 662 663 // iterate forward from the known break position until we pass our 664 // starting point. The last break position before the starting 665 // point is our return value 666 while (result != BreakIterator.DONE && result < start) { 667 lastResult = result; 668 result = handleNext(); 669 } 670 671 // set the current iteration position to be the last break position 672 // before where we started, and then return that value 673 text.setIndex(lastResult); 674 cachedLastKnownBreak = lastResult; 675 return lastResult; 676 } 677 678 /** 679 * Returns previous character 680 */ 681 private int getPrevious() { 682 char c2 = text.previous(); 683 if (Character.isLowSurrogate(c2) && 684 text.getIndex() > text.getBeginIndex()) { 685 char c1 = text.previous(); 686 if (Character.isHighSurrogate(c1)) { 687 return Character.toCodePoint(c1, c2); 688 } else { 689 text.next(); 690 } 691 } 692 return (int)c2; 693 } 694 695 /** 696 * Returns current character 697 */ 698 int getCurrent() { 699 char c1 = text.current(); 700 if (Character.isHighSurrogate(c1) && 701 text.getIndex() < text.getEndIndex()) { 702 char c2 = text.next(); 703 text.previous(); 704 if (Character.isLowSurrogate(c2)) { 705 return Character.toCodePoint(c1, c2); 706 } 707 } 708 return (int)c1; 709 } 710 711 /** 712 * Returns the count of next character. 713 */ 714 private int getCurrentCodePointCount() { 715 char c1 = text.current(); 716 if (Character.isHighSurrogate(c1) && 717 text.getIndex() < text.getEndIndex()) { 718 char c2 = text.next(); 719 text.previous(); 720 if (Character.isLowSurrogate(c2)) { 721 return 2; 722 } 723 } 724 return 1; 725 } 726 727 /** 728 * Returns next character 729 */ 730 int getNext() { 731 int index = text.getIndex(); 732 int endIndex = text.getEndIndex(); 733 if (index == endIndex || 734 (index += getCurrentCodePointCount()) >= endIndex) { 735 return CharacterIterator.DONE; 736 } 737 text.setIndex(index); 738 return getCurrent(); 739 } 740 741 /** 742 * Returns the position of next character. 743 */ 744 private int getNextIndex() { 745 int index = text.getIndex() + getCurrentCodePointCount(); 746 int endIndex = text.getEndIndex(); 747 if (index > endIndex) { 748 return endIndex; 749 } else { 750 return index; 751 } 752 } 753 754 /** 755 * Throw IllegalArgumentException unless begin <= offset < end. 756 */ 757 protected static final void checkOffset(int offset, CharacterIterator text) { 758 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 759 throw new IllegalArgumentException("offset out of bounds"); 760 } 761 } 762 763 /** 764 * Sets the iterator to refer to the first boundary position following 765 * the specified position. 766 * @offset The position from which to begin searching for a break position. 767 * @return The position of the first break after the current position. 768 */ 769 @Override 770 public int following(int offset) { 771 772 CharacterIterator text = getText(); 773 checkOffset(offset, text); 774 775 // Set our internal iteration position (temporarily) 776 // to the position passed in. If this is the _beginning_ position, 777 // then we can just use next() to get our return value 778 text.setIndex(offset); 779 if (offset == text.getBeginIndex()) { 780 cachedLastKnownBreak = handleNext(); 781 return cachedLastKnownBreak; 782 } 783 784 // otherwise, we have to sync up first. Use handlePrevious() to back 785 // us up to a known break position before the specified position (if 786 // we can determine that the specified position is a break position, 787 // we don't back up at all). This may or may not be the last break 788 // position at or before our starting position. Advance forward 789 // from here until we've passed the starting position. The position 790 // we stop on will be the first break position after the specified one. 791 int result = cachedLastKnownBreak; 792 if (result >= offset || result <= BreakIterator.DONE) { 793 result = handlePrevious(); 794 } else { 795 //it might be better to check if handlePrevious() give us closer 796 //safe value but handlePrevious() is slow too 797 //So, this has to be done carefully 798 text.setIndex(result); 799 } 800 while (result != BreakIterator.DONE && result <= offset) { 801 result = handleNext(); 802 } 803 cachedLastKnownBreak = result; 804 return result; 805 } 806 807 /** 808 * Sets the iterator to refer to the last boundary position before the 809 * specified position. 810 * @offset The position to begin searching for a break from. 811 * @return The position of the last boundary before the starting position. 812 */ 813 @Override 814 public int preceding(int offset) { 815 // if we start by updating the current iteration position to the 816 // position specified by the caller, we can just use previous() 817 // to carry out this operation 818 CharacterIterator text = getText(); 819 checkOffset(offset, text); 820 text.setIndex(offset); 821 return previous(); 822 } 823 824 /** 825 * Returns true if the specfied position is a boundary position. As a side 826 * effect, leaves the iterator pointing to the first boundary position at 827 * or after "offset". 828 * @param offset the offset to check. 829 * @return True if "offset" is a boundary position. 830 */ 831 @Override 832 public boolean isBoundary(int offset) { 833 CharacterIterator text = getText(); 834 checkOffset(offset, text); 835 if (offset == text.getBeginIndex()) { 836 return true; 837 } 838 839 // to check whether this is a boundary, we can use following() on the 840 // position before the specified one and return true if the position we 841 // get back is the one the user specified 842 else { 843 return following(offset - 1) == offset; 844 } 845 } 846 847 /** 848 * Returns the current iteration position. 849 * @return The current iteration position. 850 */ 851 @Override 852 public int current() { 853 return getText().getIndex(); 854 } 855 856 /** 857 * Return a CharacterIterator over the text being analyzed. This version 858 * of this method returns the actual CharacterIterator we're using internally. 859 * Changing the state of this iterator can have undefined consequences. If 860 * you need to change it, clone it first. 861 * @return An iterator over the text being analyzed. 862 */ 863 @Override 864 public CharacterIterator getText() { 865 // The iterator is initialized pointing to no text at all, so if this 866 // function is called while we're in that state, we have to fudge an 867 // iterator to return. 868 if (text == null) { 869 text = new StringCharacterIterator(""); 870 } 871 return text; 872 } 873 874 /** 875 * Set the iterator to analyze a new piece of text. This function resets 876 * the current iteration position to the beginning of the text. 877 * @param newText An iterator over the text to analyze. 878 */ 879 @Override 880 public void setText(CharacterIterator newText) { 881 // Test iterator to see if we need to wrap it in a SafeCharIterator. 882 // The correct behavior for CharacterIterators is to allow the 883 // position to be set to the endpoint of the iterator. Many 884 // CharacterIterators do not uphold this, so this is a workaround 885 // to permit them to use this class. 886 int end = newText.getEndIndex(); 887 boolean goodIterator; 888 try { 889 newText.setIndex(end); // some buggy iterators throw an exception here 890 goodIterator = newText.getIndex() == end; 891 } 892 catch(IllegalArgumentException e) { 893 goodIterator = false; 894 } 895 896 if (goodIterator) { 897 text = newText; 898 } 899 else { 900 text = new SafeCharIterator(newText); 901 } 902 text.first(); 903 904 cachedLastKnownBreak = BreakIterator.DONE; 905 } 906 907 908 //======================================================================= 909 // implementation 910 //======================================================================= 911 912 /** 913 * This method is the actual implementation of the next() method. All iteration 914 * vectors through here. This method initializes the state machine to state 1 915 * and advances through the text character by character until we reach the end 916 * of the text or the state machine transitions to state 0. We update our return 917 * value every time the state machine passes through a possible end state. 918 */ 919 protected int handleNext() { 920 // if we're already at the end of the text, return DONE. 921 CharacterIterator text = getText(); 922 if (text.getIndex() == text.getEndIndex()) { 923 return BreakIterator.DONE; 924 } 925 926 // no matter what, we always advance at least one character forward 927 int result = getNextIndex(); 928 int lookaheadResult = 0; 929 930 // begin in state 1 931 int state = START_STATE; 932 int category; 933 int c = getCurrent(); 934 935 // loop until we reach the end of the text or transition to state 0 936 while (c != CharacterIterator.DONE && state != STOP_STATE) { 937 938 // look up the current character's character category (which tells us 939 // which column in the state table to look at) 940 category = lookupCategory(c); 941 942 // if the character isn't an ignore character, look up a state 943 // transition in the state table 944 if (category != IGNORE) { 945 state = lookupState(state, category); 946 } 947 948 // if the state we've just transitioned to is a lookahead state, 949 // (but not also an end state), save its position. If it's 950 // both a lookahead state and an end state, update the break position 951 // to the last saved lookup-state position 952 if (lookaheadStates[state]) { 953 if (endStates[state]) { 954 result = lookaheadResult; 955 } 956 else { 957 lookaheadResult = getNextIndex(); 958 } 959 } 960 961 // otherwise, if the state we've just transitioned to is an accepting 962 // state, update the break position to be the current iteration position 963 else { 964 if (endStates[state]) { 965 result = getNextIndex(); 966 } 967 } 968 969 c = getNext(); 970 } 971 972 // if we've run off the end of the text, and the very last character took us into 973 // a lookahead state, advance the break position to the lookahead position 974 // (the theory here is that if there are no characters at all after the lookahead 975 // position, that always matches the lookahead criteria) 976 if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) { 977 result = lookaheadResult; 978 } 979 980 text.setIndex(result); 981 return result; 982 } 983 984 /** 985 * This method backs the iterator back up to a "safe position" in the text. 986 * This is a position that we know, without any context, must be a break position. 987 * The various calling methods then iterate forward from this safe position to 988 * the appropriate position to return. (For more information, see the description 989 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) 990 */ 991 protected int handlePrevious() { 992 CharacterIterator text = getText(); 993 int state = START_STATE; 994 int category = 0; 995 int lastCategory = 0; 996 int c = getCurrent(); 997 998 // loop until we reach the beginning of the text or transition to state 0 999 while (c != CharacterIterator.DONE && state != STOP_STATE) { 1000 1001 // save the last character's category and look up the current 1002 // character's category 1003 lastCategory = category; 1004 category = lookupCategory(c); 1005 1006 // if the current character isn't an ignore character, look up a 1007 // state transition in the backwards state table 1008 if (category != IGNORE) { 1009 state = lookupBackwardState(state, category); 1010 } 1011 1012 // then advance one character backwards 1013 c = getPrevious(); 1014 } 1015 1016 // if we didn't march off the beginning of the text, we're either one or two 1017 // positions away from the real break position. (One because of the call to 1018 // previous() at the end of the loop above, and another because the character 1019 // that takes us into the stop state will always be the character BEFORE 1020 // the break position.) 1021 if (c != CharacterIterator.DONE) { 1022 if (lastCategory != IGNORE) { 1023 getNext(); 1024 getNext(); 1025 } 1026 else { 1027 getNext(); 1028 } 1029 } 1030 return text.getIndex(); 1031 } 1032 1033 /** 1034 * Looks up a character's category (i.e., its category for breaking purposes, 1035 * not its Unicode category) 1036 */ 1037 protected int lookupCategory(int c) { 1038 if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1039 return charCategoryTable.elementAt((char)c); 1040 } else { 1041 return supplementaryCharCategoryTable.getValue(c); 1042 } 1043 } 1044 1045 /** 1046 * Given a current state and a character category, looks up the 1047 * next state to transition to in the state table. 1048 */ 1049 protected int lookupState(int state, int category) { 1050 return stateTable[state * numCategories + category]; 1051 } 1052 1053 /** 1054 * Given a current state and a character category, looks up the 1055 * next state to transition to in the backwards state table. 1056 */ 1057 protected int lookupBackwardState(int state, int category) { 1058 return backwardsStateTable[state * numCategories + category]; 1059 } 1060 1061 static long getLong(byte[] buf, int offset) { 1062 long num = buf[offset]&0xFF; 1063 for (int i = 1; i < 8; i++) { 1064 num = num<<8 | (buf[offset+i]&0xFF); 1065 } 1066 return num; 1067 } 1068 1069 static int getInt(byte[] buf, int offset) { 1070 int num = buf[offset]&0xFF; 1071 for (int i = 1; i < 4; i++) { 1072 num = num<<8 | (buf[offset+i]&0xFF); 1073 } 1074 return num; 1075 } 1076 1077 static short getShort(byte[] buf, int offset) { 1078 short num = (short)(buf[offset]&0xFF); 1079 num = (short)(num<<8 | (buf[offset+1]&0xFF)); 1080 return num; 1081 } 1082 1083 /* 1084 * This class exists to work around a bug in incorrect implementations 1085 * of CharacterIterator, which incorrectly handle setIndex(endIndex). 1086 * This iterator relies only on base.setIndex(n) where n is less than 1087 * endIndex. 1088 * 1089 * One caveat: if the base iterator's begin and end indices change 1090 * the change will not be reflected by this wrapper. Does that matter? 1091 */ 1092 // TODO: Review this class to see if it's still required. 1093 private static final class SafeCharIterator implements CharacterIterator, 1094 Cloneable { 1095 1096 private CharacterIterator base; 1097 private int rangeStart; 1098 private int rangeLimit; 1099 private int currentIndex; 1100 1101 SafeCharIterator(CharacterIterator base) { 1102 this.base = base; 1103 this.rangeStart = base.getBeginIndex(); 1104 this.rangeLimit = base.getEndIndex(); 1105 this.currentIndex = base.getIndex(); 1106 } 1107 1108 @Override 1109 public char first() { 1110 return setIndex(rangeStart); 1111 } 1112 1113 @Override 1114 public char last() { 1115 return setIndex(rangeLimit - 1); 1116 } 1117 1118 @Override 1119 public char current() { 1120 if (currentIndex < rangeStart || currentIndex >= rangeLimit) { 1121 return DONE; 1122 } 1123 else { 1124 return base.setIndex(currentIndex); 1125 } 1126 } 1127 1128 @Override 1129 public char next() { 1130 1131 currentIndex++; 1132 if (currentIndex >= rangeLimit) { 1133 currentIndex = rangeLimit; 1134 return DONE; 1135 } 1136 else { 1137 return base.setIndex(currentIndex); 1138 } 1139 } 1140 1141 @Override 1142 public char previous() { 1143 1144 currentIndex--; 1145 if (currentIndex < rangeStart) { 1146 currentIndex = rangeStart; 1147 return DONE; 1148 } 1149 else { 1150 return base.setIndex(currentIndex); 1151 } 1152 } 1153 1154 @Override 1155 public char setIndex(int i) { 1156 1157 if (i < rangeStart || i > rangeLimit) { 1158 throw new IllegalArgumentException("Invalid position"); 1159 } 1160 currentIndex = i; 1161 return current(); 1162 } 1163 1164 @Override 1165 public int getBeginIndex() { 1166 return rangeStart; 1167 } 1168 1169 @Override 1170 public int getEndIndex() { 1171 return rangeLimit; 1172 } 1173 1174 @Override 1175 public int getIndex() { 1176 return currentIndex; 1177 } 1178 1179 @Override 1180 public Object clone() { 1181 1182 SafeCharIterator copy = null; 1183 try { 1184 copy = (SafeCharIterator) super.clone(); 1185 } 1186 catch(CloneNotSupportedException e) { 1187 throw new Error("Clone not supported: " + e); 1188 } 1189 1190 CharacterIterator copyOfBase = (CharacterIterator) base.clone(); 1191 copy.base = copyOfBase; 1192 return copy; 1193 } 1194 } 1195 }