1 /*
   2  * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
  29  *                                                                             *
  30  * The original version of this source code and documentation is copyrighted   *
  31  * and owned by IBM, These materials are provided under terms of a License     *
  32  * Agreement between IBM and Sun. This technology is protected by multiple     *
  33  * US and International patents. This notice and attribution to IBM may not    *
  34  * to removed.                                                                 *
  35  *******************************************************************************
  36  */
  37 
  38 /*
  39  **********************************************************************
  40  * Author: Alan Liu
  41  * Created: September 23 2003
  42  * Since: ICU 2.8
  43  **********************************************************************
  44  */
  45 
  46 package sun.text.normalizer;
  47 
  48 import java.text.ParsePosition;
  49 
  50 /**
  51  * An iterator that returns 32-bit code points.  This class is deliberately
  52  * <em>not</em> related to any of the JDK or ICU4J character iterator classes
  53  * in order to minimize complexity.
  54  * @author Alan Liu
  55  * @since ICU 2.8
  56  */
  57 @SuppressWarnings("deprecation")
  58 public class RuleCharacterIterator {
  59 
  60     // TODO: Ideas for later.  (Do not implement if not needed, lest the
  61     // code coverage numbers go down due to unused methods.)
  62     // 1. Add a copy constructor, equals() method, clone() method.
  63     // 2. Rather than return DONE, throw an exception if the end
  64     // is reached -- this is an alternate usage model, probably not useful.
  65     // 3. Return isEscaped from next().  If this happens,
  66     // don't keep an isEscaped member variable.
  67 
  68     /**
  69      * Text being iterated.
  70      */
  71     private String text;
  72 
  73     /**
  74      * Position of iterator.
  75      */
  76     private ParsePosition pos;
  77 
  78     /**
  79      * Symbol table used to parse and dereference variables.  May be null.
  80      */
  81     private SymbolTable sym;
  82 
  83     /**
  84      * Current variable expansion, or null if none.
  85      */
  86     private char[] buf;
  87 
  88     /**
  89      * Position within buf[].  Meaningless if buf == null.
  90      */
  91     private int bufPos;
  92 
  93     /**
  94      * Flag indicating whether the last character was parsed from an escape.
  95      */
  96     private boolean isEscaped;
  97 
  98     /**
  99      * Value returned when there are no more characters to iterate.
 100      */
 101     public static final int DONE = -1;
 102 
 103     /**
 104      * Bitmask option to enable parsing of variable names.  If (options &
 105      * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
 106      * its value.  Variables are parsed using the SymbolTable API.
 107      */
 108     public static final int PARSE_VARIABLES = 1;
 109 
 110     /**
 111      * Bitmask option to enable parsing of escape sequences.  If (options &
 112      * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
 113      * to its value.  Escapes are parsed using Utility.unescapeAt().
 114      */
 115     public static final int PARSE_ESCAPES   = 2;
 116 
 117     /**
 118      * Bitmask option to enable skipping of whitespace.  If (options &
 119      * SKIP_WHITESPACE) != 0, then whitespace characters will be silently
 120      * skipped, as if they were not present in the input.  Whitespace
 121      * characters are defined by UCharacterProperty.isRuleWhiteSpace().
 122      */
 123     public static final int SKIP_WHITESPACE = 4;
 124 
 125     /**
 126      * Constructs an iterator over the given text, starting at the given
 127      * position.
 128      * @param text the text to be iterated
 129      * @param sym the symbol table, or null if there is none.  If sym is null,
 130      * then variables will not be deferenced, even if the PARSE_VARIABLES
 131      * option is set.
 132      * @param pos upon input, the index of the next character to return.  If a
 133      * variable has been dereferenced, then pos will <em>not</em> increment as
 134      * characters of the variable value are iterated.
 135      */
 136     public RuleCharacterIterator(String text, SymbolTable sym,
 137                                  ParsePosition pos) {
 138         if (text == null || pos.getIndex() > text.length()) {
 139             throw new IllegalArgumentException();
 140         }
 141         this.text = text;
 142         this.sym = sym;
 143         this.pos = pos;
 144         buf = null;
 145     }
 146 
 147     /**
 148      * Returns true if this iterator has no more characters to return.
 149      */
 150     public boolean atEnd() {
 151         return buf == null && pos.getIndex() == text.length();
 152     }
 153 
 154     /**
 155      * Returns the next character using the given options, or DONE if there
 156      * are no more characters, and advance the position to the next
 157      * character.
 158      * @param options one or more of the following options, bitwise-OR-ed
 159      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
 160      * @return the current 32-bit code point, or DONE
 161      */
 162     public int next(int options) {
 163         int c = DONE;
 164         isEscaped = false;
 165 
 166         for (;;) {
 167             c = _current();
 168             _advance(UTF16.getCharCount(c));
 169 
 170             if (c == SymbolTable.SYMBOL_REF && buf == null &&
 171                 (options & PARSE_VARIABLES) != 0 && sym != null) {
 172                 String name = sym.parseReference(text, pos, text.length());
 173                 // If name == null there was an isolated SYMBOL_REF;
 174                 // return it.  Caller must be prepared for this.
 175                 if (name == null) {
 176                     break;
 177                 }
 178                 bufPos = 0;
 179                 buf = sym.lookup(name);
 180                 if (buf == null) {
 181                     throw new IllegalArgumentException(
 182                                 "Undefined variable: " + name);
 183                 }
 184                 // Handle empty variable value
 185                 if (buf.length == 0) {
 186                     buf = null;
 187                 }
 188                 continue;
 189             }
 190 
 191             if ((options & SKIP_WHITESPACE) != 0 &&
 192                 UCharacterProperty.isRuleWhiteSpace(c)) {
 193                 continue;
 194             }
 195 
 196             if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
 197                 int offset[] = new int[] { 0 };
 198                 c = Utility.unescapeAt(lookahead(), offset);
 199                 jumpahead(offset[0]);
 200                 isEscaped = true;
 201                 if (c < 0) {
 202                     throw new IllegalArgumentException("Invalid escape");
 203                 }
 204             }
 205 
 206             break;
 207         }
 208 
 209         return c;
 210     }
 211 
 212     /**
 213      * Returns true if the last character returned by next() was
 214      * escaped.  This will only be the case if the option passed in to
 215      * next() included PARSE_ESCAPED and the next character was an
 216      * escape sequence.
 217      */
 218     public boolean isEscaped() {
 219         return isEscaped;
 220     }
 221 
 222     /**
 223      * Returns true if this iterator is currently within a variable expansion.
 224      */
 225     public boolean inVariable() {
 226         return buf != null;
 227     }
 228 
 229     /**
 230      * Returns an object which, when later passed to setPos(), will
 231      * restore this iterator's position.  Usage idiom:
 232      *
 233      * RuleCharacterIterator iterator = ...;
 234      * Object pos = iterator.getPos(null); // allocate position object
 235      * for (;;) {
 236      *   pos = iterator.getPos(pos); // reuse position object
 237      *   int c = iterator.next(...);
 238      *   ...
 239      * }
 240      * iterator.setPos(pos);
 241      *
 242      * @param p a position object previously returned by getPos(),
 243      * or null.  If not null, it will be updated and returned.  If
 244      * null, a new position object will be allocated and returned.
 245      * @return a position object which may be passed to setPos(),
 246      * either `p,' or if `p' == null, a newly-allocated object
 247      */
 248     public Object getPos(Object p) {
 249         if (p == null) {
 250             return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
 251         }
 252         Object[] a = (Object[]) p;
 253         a[0] = buf;
 254         int[] v = (int[]) a[1];
 255         v[0] = pos.getIndex();
 256         v[1] = bufPos;
 257         return p;
 258     }
 259 
 260     /**
 261      * Restores this iterator to the position it had when getPos()
 262      * returned the given object.
 263      * @param p a position object previously returned by getPos()
 264      */
 265     public void setPos(Object p) {
 266         Object[] a = (Object[]) p;
 267         buf = (char[]) a[0];
 268         int[] v = (int[]) a[1];
 269         pos.setIndex(v[0]);
 270         bufPos = v[1];
 271     }
 272 
 273     /**
 274      * Skips ahead past any ignored characters, as indicated by the given
 275      * options.  This is useful in conjunction with the lookahead() method.
 276      *
 277      * Currently, this only has an effect for SKIP_WHITESPACE.
 278      * @param options one or more of the following options, bitwise-OR-ed
 279      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
 280      */
 281     public void skipIgnored(int options) {
 282         if ((options & SKIP_WHITESPACE) != 0) {
 283             for (;;) {
 284                 int a = _current();
 285                 if (!UCharacterProperty.isRuleWhiteSpace(a)) break;
 286                 _advance(UTF16.getCharCount(a));
 287             }
 288         }
 289     }
 290 
 291     /**
 292      * Returns a string containing the remainder of the characters to be
 293      * returned by this iterator, without any option processing.  If the
 294      * iterator is currently within a variable expansion, this will only
 295      * extend to the end of the variable expansion.  This method is provided
 296      * so that iterators may interoperate with string-based APIs.  The typical
 297      * sequence of calls is to call skipIgnored(), then call lookahead(), then
 298      * parse the string returned by lookahead(), then call jumpahead() to
 299      * resynchronize the iterator.
 300      * @return a string containing the characters to be returned by future
 301      * calls to next()
 302      */
 303     public String lookahead() {
 304         if (buf != null) {
 305             return new String(buf, bufPos, buf.length - bufPos);
 306         } else {
 307             return text.substring(pos.getIndex());
 308         }
 309     }
 310 
 311     /**
 312      * Advances the position by the given number of 16-bit code units.
 313      * This is useful in conjunction with the lookahead() method.
 314      * @param count the number of 16-bit code units to jump over
 315      */
 316     public void jumpahead(int count) {
 317         if (count < 0) {
 318             throw new IllegalArgumentException();
 319         }
 320         if (buf != null) {
 321             bufPos += count;
 322             if (bufPos > buf.length) {
 323                 throw new IllegalArgumentException();
 324             }
 325             if (bufPos == buf.length) {
 326                 buf = null;
 327             }
 328         } else {
 329             int i = pos.getIndex() + count;
 330             pos.setIndex(i);
 331             if (i > text.length()) {
 332                 throw new IllegalArgumentException();
 333             }
 334         }
 335     }
 336 
 337     /**
 338      * Returns the current 32-bit code point without parsing escapes, parsing
 339      * variables, or skipping whitespace.
 340      * @return the current 32-bit code point
 341      */
 342     private int _current() {
 343         if (buf != null) {
 344             return UTF16.charAt(buf, 0, buf.length, bufPos);
 345         } else {
 346             int i = pos.getIndex();
 347             return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
 348         }
 349     }
 350 
 351     /**
 352      * Advances the position by the given amount.
 353      * @param count the number of 16-bit code units to advance past
 354      */
 355     private void _advance(int count) {
 356         if (buf != null) {
 357             bufPos += count;
 358             if (bufPos == buf.length) {
 359                 buf = null;
 360             }
 361         } else {
 362             pos.setIndex(pos.getIndex() + count);
 363             if (pos.getIndex() > text.length()) {
 364                 pos.setIndex(text.length());
 365             }
 366         }
 367     }
 368 }