1 /*
   2  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *
  28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  29  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  30  *
  31  * The original version of this source code and documentation
  32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  33  * subsidiary of IBM. These materials are provided under terms
  34  * of a License Agreement between Taligent and Sun. This technology
  35  * is protected by multiple US and International patents.
  36  *
  37  * This notice and attribution to Taligent may not be removed.
  38  * Taligent is a registered trademark of Taligent, Inc.
  39  */
  40 
  41 package java.text;
  42 
  43 import java.io.BufferedInputStream;
  44 import java.io.IOException;
  45 import java.security.AccessController;
  46 import java.security.PrivilegedActionException;
  47 import java.security.PrivilegedExceptionAction;
  48 import java.util.Vector;
  49 import java.util.Stack;
  50 import java.util.Hashtable;
  51 import java.util.Enumeration;
  52 import java.util.MissingResourceException;
  53 import java.text.CharacterIterator;
  54 import java.text.StringCharacterIterator;
  55 import sun.text.CompactByteArray;
  56 import sun.text.SupplementaryCharacterData;
  57 
  58 /**
  59  * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
  60  *
  61  * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
  62  * and <i>regular expressions.</i></p>
  63  *
  64  * <p>A substitution rule defines a name that can be used in place of an expression. It
  65  * consists of a name, which is a string of characters contained in angle brackets, an equals
  66  * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
  67  * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
  68  * square brackets. A substitution is visible after its definition, and is filled in using
  69  * simple textual substitution. Substitution definitions can contain other substitutions, as
  70  * long as those substitutions have been defined first. Substitutions are generally used to
  71  * make the regular expressions (which can get quite complex) shorted and easier to read.
  72  * They typically define either character categories or commonly-used subexpressions.</p>
  73  *
  74  * <p>There is one special substitution.&nbsp; If the description defines a substitution
  75  * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
  76  * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
  77  * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
  78  * same way it would if any ignore characters it contains are taken out.&nbsp; Break
  79  * positions never occur befoer ignore characters.</p>
  80  *
  81  * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
  82  * defines a sequence of characters to be kept together. With one significant exception, the
  83  * iterator uses a longest-possible-match algorithm when matching text to regular
  84  * expressions. The iterator also treats descriptions containing multiple regular expressions
  85  * as if they were ORed together (i.e., as if they were separated by |).</p>
  86  *
  87  * <p>The special characters recognized by the regular-expression parser are as follows:</p>
  88  *
  89  * <blockquote>
  90  *   <table border="1" width="100%">
  91  *     <tr>
  92  *       <td width="6%">*</td>
  93  *       <td width="94%">Specifies that the expression preceding the asterisk may occur any number
  94  *       of times (including not at all).</td>
  95  *     </tr>
  96  *     <tr>
  97  *       <td width="6%">{}</td>
  98  *       <td width="94%">Encloses a sequence of characters that is optional.</td>
  99  *     </tr>
 100  *     <tr>
 101  *       <td width="6%">()</td>
 102  *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
 103  *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
 104  *       the ends of expressions containing |.</td>
 105  *     </tr>
 106  *     <tr>
 107  *       <td width="6%">|</td>
 108  *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
 109  *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
 110  *       only occur inside ().</td>
 111  *     </tr>
 112  *     <tr>
 113  *       <td width="6%">.</td>
 114  *       <td width="94%">Matches any character.</td>
 115  *     </tr>
 116  *     <tr>
 117  *       <td width="6%">*?</td>
 118  *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
 119  *       when there is overlap between the last group of characters in the expression preceding the
 120  *       * and the first group of characters following the *.&nbsp; When there is this kind of
 121  *       overlap, * will match the longest sequence of characters that match the expression before
 122  *       the *, and *? will match the shortest sequence of characters matching the expression
 123  *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
 124  *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
 125  *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
 126  *     </tr>
 127  *     <tr>
 128  *       <td width="6%">[]</td>
 129  *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
 130  *       match any single character that is specified in the [] expression.&nbsp; For more on the
 131  *       syntax of [] expressions, see below.</td>
 132  *     </tr>
 133  *     <tr>
 134  *       <td width="6%">/</td>
 135  *       <td width="94%">Specifies where the break position should go if text matches this
 136  *       expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*[1-0]&quot; will match if the iterator sees a run
 137  *       of letters, followed by a run of whitespace, followed by a digit, but the break position
 138  *       will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
 139  *       break position at the end of the matching text.</td>
 140  *     </tr>
 141  *     <tr>
 142  *       <td width="6%">\</td>
 143  *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
 144  *       character to be treated as literal character.&nbsp; This has no effect for many
 145  *       characters, but for the characters listed above, this deprives them of their special
 146  *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
 147  *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
 148  *       &quot;\n&quot; will be converted to a literal newline character by the time the
 149  *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
 150  *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
 151  *       characters in the ASCII range except for letters, digits, and control characters are
 152  *       reserved characters to the parser and must be preceded by \ even if they currently don't
 153  *       mean anything.</td>
 154  *     </tr>
 155  *     <tr>
 156  *       <td width="6%">!</td>
 157  *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
 158  *       parser that this expression specifies the backwards-iteration behavior of the iterator,
 159  *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
 160  *       where the automatically-generated backwards-iteration brhavior doesn't produce
 161  *       satisfactory results and must be supplemented with extra client-specified rules.</td>
 162  *     </tr>
 163  *     <tr>
 164  *       <td width="6%"><em>(all others)</em></td>
 165  *       <td width="94%">All other characters are treated as literal characters, which must match
 166  *       the corresponding character(s) in the text exactly.</td>
 167  *     </tr>
 168  *   </table>
 169  * </blockquote>
 170  *
 171  * <p>Within a [] expression, a number of other special characters can be used to specify
 172  * groups of characters:</p>
 173  *
 174  * <blockquote>
 175  *   <table border="1" width="100%">
 176  *     <tr>
 177  *       <td width="6%">-</td>
 178  *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
 179  *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
 180  *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
 181  *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
 182  *       it include accented letters such as a-umlaut.</td>
 183  *     </tr>
 184  *     <tr>
 185  *       <td width="6%">::</td>
 186  *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
 187  *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
 188  *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
 189  *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
 190  *       the same as specifying all two-letter codes that begin with that letter (for example,
 191  *       &quot;[:L:]&quot; matches all letters, and is equivalent to
 192  *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
 193  *       category code or a single letter that begins a Unicode category code is illegal within
 194  *       colons.</td>
 195  *     </tr>
 196  *     <tr>
 197  *       <td width="6%">[]</td>
 198  *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
 199  *       conjunction with the ^ token.</td>
 200  *     </tr>
 201  *     <tr>
 202  *       <td width="6%">^</td>
 203  *       <td width="94%">Excludes the character (or the characters in the [] expression) following
 204  *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
 205  *       lowercase letters except p.&nbsp; &quot;[:L:^[&#92;u4e00-&#92;u9fff]]&quot; matches all letters
 206  *       except the Han ideographs.</td>
 207  *     </tr>
 208  *     <tr>
 209  *       <td width="6%"><em>(all others)</em></td>
 210  *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
 211  *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
 212  *     </tr>
 213  *   </table>
 214  * </blockquote>
 215  *
 216  * <p>For a more complete explanation, see <a
 217  * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
 218  * &nbsp; For examples, see the resource data (which is annotated).</p>
 219  *
 220  * @author Richard Gillam
 221  */
 222 class RuleBasedBreakIterator extends BreakIterator {
 223 
 224     /**
 225      * A token used as a character-category value to identify ignore characters
 226      */
 227     protected static final byte IGNORE = -1;
 228 
 229     /**
 230      * The state number of the starting state
 231      */
 232     private static final short START_STATE = 1;
 233 
 234     /**
 235      * The state-transition value indicating "stop"
 236      */
 237     private static final short STOP_STATE = 0;
 238 
 239     /**
 240      * Magic number for the BreakIterator data file format.
 241      */
 242     static final byte[] LABEL = {
 243         (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
 244         (byte)'\0'
 245     };
 246     static final int    LABEL_LENGTH = LABEL.length;
 247 
 248     /**
 249      * Version number of the dictionary that was read in.
 250      */
 251     static final byte supportedVersion = 1;
 252 
 253     /**
 254      * Header size in byte count
 255      */
 256     private static final int HEADER_LENGTH = 36;
 257 
 258     /**
 259      * An array length of indices for BMP characters
 260      */
 261     private static final int BMP_INDICES_LENGTH = 512;
 262 
 263     /**
 264      * Tables that indexes from character values to character category numbers
 265      */
 266     private CompactByteArray charCategoryTable = null;
 267     private SupplementaryCharacterData supplementaryCharCategoryTable = null;
 268 
 269     /**
 270      * The table of state transitions used for forward iteration
 271      */
 272     private short[] stateTable = null;
 273 
 274     /**
 275      * The table of state transitions used to sync up the iterator with the
 276      * text in backwards and random-access iteration
 277      */
 278     private short[] backwardsStateTable = null;
 279 
 280     /**
 281      * A list of flags indicating which states in the state table are accepting
 282      * ("end") states
 283      */
 284     private boolean[] endStates = null;
 285 
 286     /**
 287      * A list of flags indicating which states in the state table are
 288      * lookahead states (states which turn lookahead on and off)
 289      */
 290     private boolean[] lookaheadStates = null;
 291 
 292     /**
 293      * A table for additional data. May be used by a subclass of
 294      * RuleBasedBreakIterator.
 295      */
 296     private byte[] additionalData = null;
 297 
 298     /**
 299      * The number of character categories (and, thus, the number of columns in
 300      * the state tables)
 301      */
 302     private int numCategories;
 303 
 304     /**
 305      * The character iterator through which this BreakIterator accesses the text
 306      */
 307     private CharacterIterator text = null;
 308 
 309     /**
 310      * A CRC32 value of all data in datafile
 311      */
 312     private long checksum;
 313 
 314     //=======================================================================
 315     // constructors
 316     //=======================================================================
 317 
 318     /**
 319      * Constructs a RuleBasedBreakIterator according to the datafile
 320      * provided.
 321      */
 322     public RuleBasedBreakIterator(String datafile)
 323         throws IOException, MissingResourceException {
 324         readTables(datafile);
 325     }
 326 
 327     /**
 328      * Read datafile. The datafile's format is as follows:
 329      * <pre>
 330      *   BreakIteratorData {
 331      *       u1           magic[7];
 332      *       u1           version;
 333      *       u4           totalDataSize;
 334      *       header_info  header;
 335      *       body         value;
 336      *   }
 337      * </pre>
 338      * <code>totalDataSize</code> is the summation of the size of
 339      * <code>header_info</code> and <code>body</code> in byte count.
 340      * <p>
 341      * In <code>header</code>, each field except for checksum implies the
 342      * length of each field. Since <code>BMPdataLength</code> is a fixed-length
 343      *  data(512 entries), its length isn't included in <code>header</code>.
 344      * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
 345      * <pre>
 346      *   header_info {
 347      *       u4           stateTableLength;
 348      *       u4           backwardsStateTableLength;
 349      *       u4           endStatesLength;
 350      *       u4           lookaheadStatesLength;
 351      *       u4           BMPdataLength;
 352      *       u4           nonBMPdataLength;
 353      *       u4           additionalDataLength;
 354      *       u8           checksum;
 355      *   }
 356      * </pre>
 357      * <p>
 358      *
 359      * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
 360      * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
 361      * <code>supplementaryCharCategoryTable</code>.
 362      * <pre>
 363      *   body {
 364      *       u2           stateTable[stateTableLength];
 365      *       u2           backwardsStateTable[backwardsStateTableLength];
 366      *       u1           endStates[endStatesLength];
 367      *       u1           lookaheadStates[lookaheadStatesLength];
 368      *       u2           BMPindices[512];
 369      *       u1           BMPdata[BMPdataLength];
 370      *       u4           nonBMPdata[numNonBMPdataLength];
 371      *       u1           additionalData[additionalDataLength];
 372      *   }
 373      * </pre>
 374      */
 375     protected void readTables(String datafile)
 376         throws IOException, MissingResourceException {
 377 
 378         byte[] buffer = readFile(datafile);
 379 
 380         /* Read header_info. */
 381         int stateTableLength = BreakIterator.getInt(buffer, 0);
 382         int backwardsStateTableLength = BreakIterator.getInt(buffer, 4);
 383         int endStatesLength = BreakIterator.getInt(buffer, 8);
 384         int lookaheadStatesLength = BreakIterator.getInt(buffer, 12);
 385         int BMPdataLength = BreakIterator.getInt(buffer, 16);
 386         int nonBMPdataLength = BreakIterator.getInt(buffer, 20);
 387         int additionalDataLength = BreakIterator.getInt(buffer, 24);
 388         checksum = BreakIterator.getLong(buffer, 28);
 389 
 390         /* Read stateTable[numCategories * numRows] */
 391         stateTable = new short[stateTableLength];
 392         int offset = HEADER_LENGTH;
 393         for (int i = 0; i < stateTableLength; i++, offset+=2) {
 394            stateTable[i] = BreakIterator.getShort(buffer, offset);
 395         }
 396 
 397         /* Read backwardsStateTable[numCategories * numRows] */
 398         backwardsStateTable = new short[backwardsStateTableLength];
 399         for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
 400            backwardsStateTable[i] = BreakIterator.getShort(buffer, offset);
 401         }
 402 
 403         /* Read endStates[numRows] */
 404         endStates = new boolean[endStatesLength];
 405         for (int i = 0; i < endStatesLength; i++, offset++) {
 406            endStates[i] = buffer[offset] == 1;
 407         }
 408 
 409         /* Read lookaheadStates[numRows] */
 410         lookaheadStates = new boolean[lookaheadStatesLength];
 411         for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
 412            lookaheadStates[i] = buffer[offset] == 1;
 413         }
 414 
 415         /* Read a category table and indices for BMP characters. */
 416         short[] temp1 = new short[BMP_INDICES_LENGTH];  // BMPindices
 417         for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
 418             temp1[i] = BreakIterator.getShort(buffer, offset);
 419         }
 420         byte[] temp2 = new byte[BMPdataLength];  // BMPdata
 421         System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
 422         offset += BMPdataLength;
 423         charCategoryTable = new CompactByteArray(temp1, temp2);
 424 
 425         /* Read a category table for non-BMP characters. */
 426         int[] temp3 = new int[nonBMPdataLength];
 427         for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
 428             temp3[i] = BreakIterator.getInt(buffer, offset);
 429         }
 430         supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
 431 
 432         /* Read additional data */
 433         if (additionalDataLength > 0) {
 434             additionalData = new byte[additionalDataLength];
 435             System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
 436         }
 437 
 438         /* Set numCategories */
 439         numCategories = stateTable.length / endStates.length;
 440     }
 441 
 442     protected byte[] readFile(final String datafile)
 443         throws IOException, MissingResourceException {
 444 
 445         BufferedInputStream is;
 446         try {
 447             is = (BufferedInputStream)AccessController.doPrivileged(
 448                 new PrivilegedExceptionAction() {
 449                     public Object run() throws Exception {
 450                         return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile));
 451                     }
 452                 }
 453             );
 454         }
 455         catch (PrivilegedActionException e) {
 456             throw new InternalError(e.toString());
 457         }
 458 
 459         int offset = 0;
 460 
 461         /* First, read magic, version, and header_info. */
 462         int len = LABEL_LENGTH + 5;
 463         byte[] buf = new byte[len];
 464         if (is.read(buf) != len) {
 465             throw new MissingResourceException("Wrong header length",
 466                                                datafile, "");
 467         }
 468 
 469         /* Validate the magic number. */
 470         for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
 471             if (buf[offset] != LABEL[offset]) {
 472                 throw new MissingResourceException("Wrong magic number",
 473                                                    datafile, "");
 474             }
 475         }
 476 
 477         /* Validate the version number. */
 478         if (buf[offset] != supportedVersion) {
 479             throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
 480                                                datafile, "");
 481         }
 482 
 483         /* Read data: totalDataSize + 8(for checksum) */
 484         len = BreakIterator.getInt(buf, ++offset);
 485         buf = new byte[len];
 486         if (is.read(buf) != len) {
 487             throw new MissingResourceException("Wrong data length",
 488                                                datafile, "");
 489         }
 490 
 491         is.close();
 492 
 493         return buf;
 494     }
 495 
 496     byte[] getAdditionalData() {
 497         return additionalData;
 498     }
 499 
 500     void setAdditionalData(byte[] b) {
 501         additionalData = b;
 502     }
 503 
 504     //=======================================================================
 505     // boilerplate
 506     //=======================================================================
 507     /**
 508      * Clones this iterator.
 509      * @return A newly-constructed RuleBasedBreakIterator with the same
 510      * behavior as this one.
 511      */
 512     public Object clone() {
 513         RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
 514         if (text != null) {
 515             result.text = (CharacterIterator) text.clone();
 516         }
 517         return result;
 518     }
 519 
 520     /**
 521      * Returns true if both BreakIterators are of the same class, have the same
 522      * rules, and iterate over the same text.
 523      */
 524     public boolean equals(Object that) {
 525         try {
 526             if (that == null) {
 527                 return false;
 528             }
 529 
 530             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
 531             if (checksum != other.checksum) {
 532                 return false;
 533             }
 534             if (text == null) {
 535                 return other.text == null;
 536             } else {
 537                 return text.equals(other.text);
 538             }
 539         }
 540         catch(ClassCastException e) {
 541             return false;
 542         }
 543     }
 544 
 545     /**
 546      * Returns text
 547      */
 548     public String toString() {
 549         StringBuffer sb = new StringBuffer();
 550         sb.append('[');
 551         sb.append("checksum=0x" + Long.toHexString(checksum));
 552         sb.append(']');
 553         return sb.toString();
 554     }
 555 
 556     /**
 557      * Compute a hashcode for this BreakIterator
 558      * @return A hash code
 559      */
 560     public int hashCode() {
 561         return (int)checksum;
 562     }
 563 
 564     //=======================================================================
 565     // BreakIterator overrides
 566     //=======================================================================
 567 
 568     /**
 569      * Sets the current iteration position to the beginning of the text.
 570      * (i.e., the CharacterIterator's starting offset).
 571      * @return The offset of the beginning of the text.
 572      */
 573     public int first() {
 574         CharacterIterator t = getText();
 575 
 576         t.first();
 577         return t.getIndex();
 578     }
 579 
 580     /**
 581      * Sets the current iteration position to the end of the text.
 582      * (i.e., the CharacterIterator's ending offset).
 583      * @return The text's past-the-end offset.
 584      */
 585     public int last() {
 586         CharacterIterator t = getText();
 587 
 588         // I'm not sure why, but t.last() returns the offset of the last character,
 589         // rather than the past-the-end offset
 590         t.setIndex(t.getEndIndex());
 591         return t.getIndex();
 592     }
 593 
 594     /**
 595      * Advances the iterator either forward or backward the specified number of steps.
 596      * Negative values move backward, and positive values move forward.  This is
 597      * equivalent to repeatedly calling next() or previous().
 598      * @param n The number of steps to move.  The sign indicates the direction
 599      * (negative is backwards, and positive is forwards).
 600      * @return The character offset of the boundary position n boundaries away from
 601      * the current one.
 602      */
 603     public int next(int n) {
 604         int result = current();
 605         while (n > 0) {
 606             result = handleNext();
 607             --n;
 608         }
 609         while (n < 0) {
 610             result = previous();
 611             ++n;
 612         }
 613         return result;
 614     }
 615 
 616     /**
 617      * Advances the iterator to the next boundary position.
 618      * @return The position of the first boundary after this one.
 619      */
 620     public int next() {
 621         return handleNext();
 622     }
 623 
 624     private int cachedLastKnownBreak = BreakIterator.DONE;
 625 
 626     /**
 627      * Advances the iterator backwards, to the last boundary preceding this one.
 628      * @return The position of the last boundary position preceding this one.
 629      */
 630     public int previous() {
 631         // if we're already sitting at the beginning of the text, return DONE
 632         CharacterIterator text = getText();
 633         if (current() == text.getBeginIndex()) {
 634             return BreakIterator.DONE;
 635         }
 636 
 637         // set things up.  handlePrevious() will back us up to some valid
 638         // break position before the current position (we back our internal
 639         // iterator up one step to prevent handlePrevious() from returning
 640         // the current position), but not necessarily the last one before
 641         // where we started
 642         int start = current();
 643         int lastResult = cachedLastKnownBreak;
 644         if (lastResult >= start || lastResult <= BreakIterator.DONE) {
 645             getPrevious();
 646             lastResult = handlePrevious();
 647         } else {
 648             //it might be better to check if handlePrevious() give us closer
 649             //safe value but handlePrevious() is slow too
 650             //So, this has to be done carefully
 651             text.setIndex(lastResult);
 652         }
 653         int result = lastResult;
 654 
 655         // iterate forward from the known break position until we pass our
 656         // starting point.  The last break position before the starting
 657         // point is our return value
 658         while (result != BreakIterator.DONE && result < start) {
 659             lastResult = result;
 660             result = handleNext();
 661         }
 662 
 663         // set the current iteration position to be the last break position
 664         // before where we started, and then return that value
 665         text.setIndex(lastResult);
 666         cachedLastKnownBreak = lastResult;
 667         return lastResult;
 668     }
 669 
 670     /**
 671      * Returns previous character
 672      */
 673     private int getPrevious() {
 674         char c2 = text.previous();
 675         if (Character.isLowSurrogate(c2) &&
 676             text.getIndex() > text.getBeginIndex()) {
 677             char c1 = text.previous();
 678             if (Character.isHighSurrogate(c1)) {
 679                 return Character.toCodePoint(c1, c2);
 680             } else {
 681                 text.next();
 682             }
 683         }
 684         return (int)c2;
 685     }
 686 
 687     /**
 688      * Returns current character
 689      */
 690     int getCurrent() {
 691         char c1 = text.current();
 692         if (Character.isHighSurrogate(c1) &&
 693             text.getIndex() < text.getEndIndex()) {
 694             char c2 = text.next();
 695             text.previous();
 696             if (Character.isLowSurrogate(c2)) {
 697                 return Character.toCodePoint(c1, c2);
 698             }
 699         }
 700         return (int)c1;
 701     }
 702 
 703     /**
 704      * Returns the count of next character.
 705      */
 706     private int getCurrentCodePointCount() {
 707         char c1 = text.current();
 708         if (Character.isHighSurrogate(c1) &&
 709             text.getIndex() < text.getEndIndex()) {
 710             char c2 = text.next();
 711             text.previous();
 712             if (Character.isLowSurrogate(c2)) {
 713                 return 2;
 714             }
 715         }
 716         return 1;
 717     }
 718 
 719     /**
 720      * Returns next character
 721      */
 722     int getNext() {
 723         int index = text.getIndex();
 724         int endIndex = text.getEndIndex();
 725         if (index == endIndex ||
 726             (index = index + getCurrentCodePointCount()) >= endIndex) {
 727             return CharacterIterator.DONE;
 728         }
 729         text.setIndex(index);
 730         return getCurrent();
 731     }
 732 
 733     /**
 734      * Returns the position of next character.
 735      */
 736     private int getNextIndex() {
 737         int index = text.getIndex() + getCurrentCodePointCount();
 738         int endIndex = text.getEndIndex();
 739         if (index > endIndex) {
 740             return endIndex;
 741         } else {
 742             return index;
 743         }
 744     }
 745 
 746     /**
 747      * Throw IllegalArgumentException unless begin <= offset < end.
 748      */
 749     protected static final void checkOffset(int offset, CharacterIterator text) {
 750         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
 751             throw new IllegalArgumentException("offset out of bounds");
 752         }
 753     }
 754 
 755     /**
 756      * Sets the iterator to refer to the first boundary position following
 757      * the specified position.
 758      * @offset The position from which to begin searching for a break position.
 759      * @return The position of the first break after the current position.
 760      */
 761     public int following(int offset) {
 762 
 763         CharacterIterator text = getText();
 764         checkOffset(offset, text);
 765 
 766         // Set our internal iteration position (temporarily)
 767         // to the position passed in.  If this is the _beginning_ position,
 768         // then we can just use next() to get our return value
 769         text.setIndex(offset);
 770         if (offset == text.getBeginIndex()) {
 771             cachedLastKnownBreak = handleNext();
 772             return cachedLastKnownBreak;
 773         }
 774 
 775         // otherwise, we have to sync up first.  Use handlePrevious() to back
 776         // us up to a known break position before the specified position (if
 777         // we can determine that the specified position is a break position,
 778         // we don't back up at all).  This may or may not be the last break
 779         // position at or before our starting position.  Advance forward
 780         // from here until we've passed the starting position.  The position
 781         // we stop on will be the first break position after the specified one.
 782         int result = cachedLastKnownBreak;
 783         if (result >= offset || result <= BreakIterator.DONE) {
 784             result = handlePrevious();
 785         } else {
 786             //it might be better to check if handlePrevious() give us closer
 787             //safe value but handlePrevious() is slow too
 788             //So, this has to be done carefully
 789             text.setIndex(result);
 790         }
 791         while (result != BreakIterator.DONE && result <= offset) {
 792             result = handleNext();
 793         }
 794         cachedLastKnownBreak = result;
 795         return result;
 796     }
 797 
 798     /**
 799      * Sets the iterator to refer to the last boundary position before the
 800      * specified position.
 801      * @offset The position to begin searching for a break from.
 802      * @return The position of the last boundary before the starting position.
 803      */
 804     public int preceding(int offset) {
 805         // if we start by updating the current iteration position to the
 806         // position specified by the caller, we can just use previous()
 807         // to carry out this operation
 808         CharacterIterator text = getText();
 809         checkOffset(offset, text);
 810         text.setIndex(offset);
 811         return previous();
 812     }
 813 
 814     /**
 815      * Returns true if the specfied position is a boundary position.  As a side
 816      * effect, leaves the iterator pointing to the first boundary position at
 817      * or after "offset".
 818      * @param offset the offset to check.
 819      * @return True if "offset" is a boundary position.
 820      */
 821     public boolean isBoundary(int offset) {
 822         CharacterIterator text = getText();
 823         checkOffset(offset, text);
 824         if (offset == text.getBeginIndex()) {
 825             return true;
 826         }
 827 
 828         // to check whether this is a boundary, we can use following() on the
 829         // position before the specified one and return true if the position we
 830         // get back is the one the user specified
 831         else {
 832             return following(offset - 1) == offset;
 833         }
 834     }
 835 
 836     /**
 837      * Returns the current iteration position.
 838      * @return The current iteration position.
 839      */
 840     public int current() {
 841         return getText().getIndex();
 842     }
 843 
 844     /**
 845      * Return a CharacterIterator over the text being analyzed.  This version
 846      * of this method returns the actual CharacterIterator we're using internally.
 847      * Changing the state of this iterator can have undefined consequences.  If
 848      * you need to change it, clone it first.
 849      * @return An iterator over the text being analyzed.
 850      */
 851     public CharacterIterator getText() {
 852         // The iterator is initialized pointing to no text at all, so if this
 853         // function is called while we're in that state, we have to fudge an
 854         // iterator to return.
 855         if (text == null) {
 856             text = new StringCharacterIterator("");
 857         }
 858         return text;
 859     }
 860 
 861     /**
 862      * Set the iterator to analyze a new piece of text.  This function resets
 863      * the current iteration position to the beginning of the text.
 864      * @param newText An iterator over the text to analyze.
 865      */
 866     public void setText(CharacterIterator newText) {
 867         // Test iterator to see if we need to wrap it in a SafeCharIterator.
 868         // The correct behavior for CharacterIterators is to allow the
 869         // position to be set to the endpoint of the iterator.  Many
 870         // CharacterIterators do not uphold this, so this is a workaround
 871         // to permit them to use this class.
 872         int end = newText.getEndIndex();
 873         boolean goodIterator;
 874         try {
 875             newText.setIndex(end);  // some buggy iterators throw an exception here
 876             goodIterator = newText.getIndex() == end;
 877         }
 878         catch(IllegalArgumentException e) {
 879             goodIterator = false;
 880         }
 881 
 882         if (goodIterator) {
 883             text = newText;
 884         }
 885         else {
 886             text = new SafeCharIterator(newText);
 887         }
 888         text.first();
 889 
 890         cachedLastKnownBreak = BreakIterator.DONE;
 891     }
 892 
 893 
 894     //=======================================================================
 895     // implementation
 896     //=======================================================================
 897 
 898     /**
 899      * This method is the actual implementation of the next() method.  All iteration
 900      * vectors through here.  This method initializes the state machine to state 1
 901      * and advances through the text character by character until we reach the end
 902      * of the text or the state machine transitions to state 0.  We update our return
 903      * value every time the state machine passes through a possible end state.
 904      */
 905     protected int handleNext() {
 906         // if we're already at the end of the text, return DONE.
 907         CharacterIterator text = getText();
 908         if (text.getIndex() == text.getEndIndex()) {
 909             return BreakIterator.DONE;
 910         }
 911 
 912         // no matter what, we always advance at least one character forward
 913         int result = getNextIndex();
 914         int lookaheadResult = 0;
 915 
 916         // begin in state 1
 917         int state = START_STATE;
 918         int category;
 919         int c = getCurrent();
 920 
 921         // loop until we reach the end of the text or transition to state 0
 922         while (c != CharacterIterator.DONE && state != STOP_STATE) {
 923 
 924             // look up the current character's character category (which tells us
 925             // which column in the state table to look at)
 926             category = lookupCategory(c);
 927 
 928             // if the character isn't an ignore character, look up a state
 929             // transition in the state table
 930             if (category != IGNORE) {
 931                 state = lookupState(state, category);
 932             }
 933 
 934             // if the state we've just transitioned to is a lookahead state,
 935             // (but not also an end state), save its position.  If it's
 936             // both a lookahead state and an end state, update the break position
 937             // to the last saved lookup-state position
 938             if (lookaheadStates[state]) {
 939                 if (endStates[state]) {
 940                     result = lookaheadResult;
 941                 }
 942                 else {
 943                     lookaheadResult = getNextIndex();
 944                 }
 945             }
 946 
 947             // otherwise, if the state we've just transitioned to is an accepting
 948             // state, update the break position to be the current iteration position
 949             else {
 950                 if (endStates[state]) {
 951                     result = getNextIndex();
 952                 }
 953             }
 954 
 955             c = getNext();
 956         }
 957 
 958         // if we've run off the end of the text, and the very last character took us into
 959         // a lookahead state, advance the break position to the lookahead position
 960         // (the theory here is that if there are no characters at all after the lookahead
 961         // position, that always matches the lookahead criteria)
 962         if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
 963             result = lookaheadResult;
 964         }
 965 
 966         text.setIndex(result);
 967         return result;
 968     }
 969 
 970     /**
 971      * This method backs the iterator back up to a "safe position" in the text.
 972      * This is a position that we know, without any context, must be a break position.
 973      * The various calling methods then iterate forward from this safe position to
 974      * the appropriate position to return.  (For more information, see the description
 975      * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
 976      */
 977     protected int handlePrevious() {
 978         CharacterIterator text = getText();
 979         int state = START_STATE;
 980         int category = 0;
 981         int lastCategory = 0;
 982         int c = getCurrent();
 983 
 984         // loop until we reach the beginning of the text or transition to state 0
 985         while (c != CharacterIterator.DONE && state != STOP_STATE) {
 986 
 987             // save the last character's category and look up the current
 988             // character's category
 989             lastCategory = category;
 990             category = lookupCategory(c);
 991 
 992             // if the current character isn't an ignore character, look up a
 993             // state transition in the backwards state table
 994             if (category != IGNORE) {
 995                 state = lookupBackwardState(state, category);
 996             }
 997 
 998             // then advance one character backwards
 999             c = getPrevious();
1000         }
1001 
1002         // if we didn't march off the beginning of the text, we're either one or two
1003         // positions away from the real break position.  (One because of the call to
1004         // previous() at the end of the loop above, and another because the character
1005         // that takes us into the stop state will always be the character BEFORE
1006         // the break position.)
1007         if (c != CharacterIterator.DONE) {
1008             if (lastCategory != IGNORE) {
1009                 getNext();
1010                 getNext();
1011             }
1012             else {
1013                 getNext();
1014             }
1015         }
1016         return text.getIndex();
1017     }
1018 
1019     /**
1020      * Looks up a character's category (i.e., its category for breaking purposes,
1021      * not its Unicode category)
1022      */
1023     protected int lookupCategory(int c) {
1024         if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1025             return charCategoryTable.elementAt((char)c);
1026         } else {
1027             return supplementaryCharCategoryTable.getValue(c);
1028         }
1029     }
1030 
1031     /**
1032      * Given a current state and a character category, looks up the
1033      * next state to transition to in the state table.
1034      */
1035     protected int lookupState(int state, int category) {
1036         return stateTable[state * numCategories + category];
1037     }
1038 
1039     /**
1040      * Given a current state and a character category, looks up the
1041      * next state to transition to in the backwards state table.
1042      */
1043     protected int lookupBackwardState(int state, int category) {
1044         return backwardsStateTable[state * numCategories + category];
1045     }
1046 
1047     /*
1048      * This class exists to work around a bug in incorrect implementations
1049      * of CharacterIterator, which incorrectly handle setIndex(endIndex).
1050      * This iterator relies only on base.setIndex(n) where n is less than
1051      * endIndex.
1052      *
1053      * One caveat:  if the base iterator's begin and end indices change
1054      * the change will not be reflected by this wrapper.  Does that matter?
1055      */
1056     private static final class SafeCharIterator implements CharacterIterator,
1057                                                            Cloneable {
1058 
1059         private CharacterIterator base;
1060         private int rangeStart;
1061         private int rangeLimit;
1062         private int currentIndex;
1063 
1064         SafeCharIterator(CharacterIterator base) {
1065             this.base = base;
1066             this.rangeStart = base.getBeginIndex();
1067             this.rangeLimit = base.getEndIndex();
1068             this.currentIndex = base.getIndex();
1069         }
1070 
1071         public char first() {
1072             return setIndex(rangeStart);
1073         }
1074 
1075         public char last() {
1076             return setIndex(rangeLimit - 1);
1077         }
1078 
1079         public char current() {
1080             if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
1081                 return DONE;
1082             }
1083             else {
1084                 return base.setIndex(currentIndex);
1085             }
1086         }
1087 
1088         public char next() {
1089 
1090             currentIndex++;
1091             if (currentIndex >= rangeLimit) {
1092                 currentIndex = rangeLimit;
1093                 return DONE;
1094             }
1095             else {
1096                 return base.setIndex(currentIndex);
1097             }
1098         }
1099 
1100         public char previous() {
1101 
1102             currentIndex--;
1103             if (currentIndex < rangeStart) {
1104                 currentIndex = rangeStart;
1105                 return DONE;
1106             }
1107             else {
1108                 return base.setIndex(currentIndex);
1109             }
1110         }
1111 
1112         public char setIndex(int i) {
1113 
1114             if (i < rangeStart || i > rangeLimit) {
1115                 throw new IllegalArgumentException("Invalid position");
1116             }
1117             currentIndex = i;
1118             return current();
1119         }
1120 
1121         public int getBeginIndex() {
1122             return rangeStart;
1123         }
1124 
1125         public int getEndIndex() {
1126             return rangeLimit;
1127         }
1128 
1129         public int getIndex() {
1130             return currentIndex;
1131         }
1132 
1133         public Object clone() {
1134 
1135             SafeCharIterator copy = null;
1136             try {
1137                 copy = (SafeCharIterator) super.clone();
1138             }
1139             catch(CloneNotSupportedException e) {
1140                 throw new Error("Clone not supported: " + e);
1141             }
1142 
1143             CharacterIterator copyOfBase = (CharacterIterator) base.clone();
1144             copy.base = copyOfBase;
1145             return copy;
1146         }
1147     }
1148 }