/*
 * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.
*******************************************************************************
 */

/*
 **********************************************************************
 * Author: Alan Liu
 * Created: September 23 2003
 * Since: ICU 2.8
 **********************************************************************
 */

package sun.text.normalizer;

import java.text.ParsePosition;

/**
 * An iterator that returns 32-bit code points.  This class is deliberately
 * <em>not</em> related to any of the JDK or ICU4J character iterator classes
 * in order to minimize complexity.
 *
 * <p>The iterator reads from one of two sources: the main text, whose
 * position is tracked in the caller-supplied {@link ParsePosition}, or the
 * value of a variable that is currently being expanded ({@code buf}).  While
 * a variable is being expanded the main text position does not move.
 *
 * @author Alan Liu
 * @since ICU 2.8
 */
@SuppressWarnings("deprecation")
public class RuleCharacterIterator {

    // TODO: Ideas for later.  (Do not implement if not needed, lest the
    // code coverage numbers go down due to unused methods.)
    // 1. Add a copy constructor, equals() method, clone() method.
    // 2. Rather than return DONE, throw an exception if the end
    // is reached -- this is an alternate usage model, probably not useful.
    // 3. Return isEscaped from next().  If this happens,
    // don't keep an isEscaped member variable.

    /**
     * Text being iterated.
     */
    private final String text;

    /**
     * Position of iterator.  This is the caller's object; it is updated in
     * place as the main text is consumed.
     */
    private final ParsePosition pos;

    /**
     * Symbol table used to parse and dereference variables.  May be null.
     */
    private final SymbolTable sym;

    /**
     * Current variable expansion, or null if none.
     */
    private char[] buf;

    /**
     * Position within buf[].  Meaningless if buf == null.
     */
    private int bufPos;

    /**
     * Flag indicating whether the last character was parsed from an escape.
     */
    private boolean isEscaped;

    /**
     * Value returned when there are no more characters to iterate.
     */
    public static final int DONE = -1;

    /**
     * Bitmask option to enable parsing of variable names.  If (options &amp;
     * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
     * its value.  Variables are parsed using the SymbolTable API.
     */
    public static final int PARSE_VARIABLES = 1;

    /**
     * Bitmask option to enable parsing of escape sequences.  If (options &amp;
     * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
     * to its value.  Escapes are parsed using Utility.unescapeAt().
     */
    public static final int PARSE_ESCAPES = 2;

    /**
     * Bitmask option to enable skipping of whitespace.  If (options &amp;
     * SKIP_WHITESPACE) != 0, then whitespace characters will be silently
     * skipped, as if they were not present in the input.  Whitespace
     * characters are defined by UCharacterProperty.isRuleWhiteSpace().
     */
    public static final int SKIP_WHITESPACE = 4;

    /**
     * Constructs an iterator over the given text, starting at the given
     * position.
     * @param text the text to be iterated
     * @param sym the symbol table, or null if there is none.  If sym is null,
     * then variables will not be dereferenced, even if the PARSE_VARIABLES
     * option is set.
     * @param pos upon input, the index of the next character to return.  If a
     * variable has been dereferenced, then pos will <em>not</em> increment as
     * characters of the variable value are iterated.
     * @throws IllegalArgumentException if text is null or the index of pos
     * lies beyond the end of text
     */
    public RuleCharacterIterator(String text, SymbolTable sym,
                                 ParsePosition pos) {
        if (text == null || pos.getIndex() > text.length()) {
            throw new IllegalArgumentException();
        }
        this.text = text;
        this.sym = sym;
        this.pos = pos;
        // buf starts out null: no variable expansion is in progress.
    }

    /**
     * Returns true if this iterator has no more characters to return.
     */
    public boolean atEnd() {
        return buf == null && pos.getIndex() == text.length();
    }

    /**
     * Returns the next character using the given options, or DONE if there
     * are no more characters, and advances the position to the next
     * character.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     * @return the current 32-bit code point, or DONE
     * @throws IllegalArgumentException if a referenced variable is undefined
     * or an escape sequence cannot be parsed
     */
    public int next(int options) {
        int c;
        isEscaped = false;

        for (;;) {
            c = _current();
            _advance(UTF16.getCharCount(c));

            // A variable reference is only recognized in the main text
            // (buf == null), i.e. variable values are not expanded again.
            if (c == SymbolTable.SYMBOL_REF && buf == null &&
                (options & PARSE_VARIABLES) != 0 && sym != null) {
                String name = sym.parseReference(text, pos, text.length());
                // If name == null there was an isolated SYMBOL_REF;
                // return it.  Caller must be prepared for this.
                if (name == null) {
                    break;
                }
                bufPos = 0;
                buf = sym.lookup(name);
                if (buf == null) {
                    throw new IllegalArgumentException(
                                "Undefined variable: " + name);
                }
                // Handle empty variable value
                if (buf.length == 0) {
                    buf = null;
                }
                // Loop again to fetch the first character of the expansion
                // (or the next character of the text if it was empty).
                continue;
            }

            if ((options & SKIP_WHITESPACE) != 0 &&
                UCharacterProperty.isRuleWhiteSpace(c)) {
                continue;
            }

            if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
                // unescapeAt() reports how many code units it consumed
                // through the single-element offset array.
                int[] offset = { 0 };
                c = Utility.unescapeAt(lookahead(), offset);
                jumpahead(offset[0]);
                isEscaped = true;
                if (c < 0) {
                    throw new IllegalArgumentException("Invalid escape");
                }
            }

            break;
        }

        return c;
    }

    /**
     * Returns true if the last character returned by next() was
     * escaped.  This will only be the case if the option passed in to
     * next() included PARSE_ESCAPES and the next character was an
     * escape sequence.
     */
    public boolean isEscaped() {
        return isEscaped;
    }

    /**
     * Returns true if this iterator is currently within a variable expansion.
     */
    public boolean inVariable() {
        return buf != null;
    }

    /**
     * Returns an object which, when later passed to setPos(), will
     * restore this iterator's position.  Usage idiom:
     *
     * <pre>
     * RuleCharacterIterator iterator = ...;
     * Object pos = iterator.getPos(null); // allocate position object
     * for (;;) {
     *   pos = iterator.getPos(pos); // reuse position object
     *   int c = iterator.next(...);
     *   ...
     * }
     * iterator.setPos(pos);
     * </pre>
     *
     * @param p a position object previously returned by getPos(),
     * or null.  If not null, it will be updated and returned.  If
     * null, a new position object will be allocated and returned.
     * @return a position object which may be passed to setPos(),
     * either `p,' or if `p' == null, a newly-allocated object
     */
    public Object getPos(Object p) {
        // Layout of a position object: { buf, { text index, bufPos } }.
        if (p == null) {
            return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
        }
        Object[] a = (Object[]) p;
        a[0] = buf;
        int[] v = (int[]) a[1];
        v[0] = pos.getIndex();
        v[1] = bufPos;
        return p;
    }

    /**
     * Restores this iterator to the position it had when getPos()
     * returned the given object.
     * @param p a position object previously returned by getPos()
     */
    public void setPos(Object p) {
        Object[] a = (Object[]) p;
        buf = (char[]) a[0];
        int[] v = (int[]) a[1];
        pos.setIndex(v[0]);
        bufPos = v[1];
    }

    /**
     * Skips ahead past any ignored characters, as indicated by the given
     * options.  This is useful in conjunction with the lookahead() method.
     *
     * Currently, this only has an effect for SKIP_WHITESPACE.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     */
    public void skipIgnored(int options) {
        if ((options & SKIP_WHITESPACE) == 0) {
            return;
        }
        int c = _current();
        while (UCharacterProperty.isRuleWhiteSpace(c)) {
            _advance(UTF16.getCharCount(c));
            c = _current();
        }
    }

    /**
     * Returns a string containing the remainder of the characters to be
     * returned by this iterator, without any option processing.  If the
     * iterator is currently within a variable expansion, this will only
     * extend to the end of the variable expansion.  This method is provided
     * so that iterators may interoperate with string-based APIs.  The typical
     * sequence of calls is to call skipIgnored(), then call lookahead(), then
     * parse the string returned by lookahead(), then call jumpahead() to
     * resynchronize the iterator.
     * @return a string containing the characters to be returned by future
     * calls to next()
     */
    public String lookahead() {
        if (buf != null) {
            return new String(buf, bufPos, buf.length - bufPos);
        } else {
            return text.substring(pos.getIndex());
        }
    }

    /**
     * Advances the position by the given number of 16-bit code units.
     * This is useful in conjunction with the lookahead() method.
     *
     * <p>The request is validated before anything is changed, so when this
     * method throws, the iterator (and the underlying ParsePosition) is left
     * exactly where it was.
     *
     * @param count the number of 16-bit code units to jump over
     * @throws IllegalArgumentException if count is negative or would move
     * past the end of the current source (variable value or text)
     */
    public void jumpahead(int count) {
        if (count < 0) {
            throw new IllegalArgumentException();
        }
        if (buf != null) {
            // Compare against the remaining length instead of adding first:
            // bufPos + count could overflow to a negative value and slip
            // past a "> length" check.
            if (count > buf.length - bufPos) {
                throw new IllegalArgumentException();
            }
            bufPos += count;
            if (bufPos == buf.length) {
                // Variable value exhausted; resume reading the main text.
                buf = null;
            }
        } else {
            int index = pos.getIndex();
            // Same overflow-safe form as above, and checked before the
            // caller's ParsePosition is touched.
            if (count > text.length() - index) {
                throw new IllegalArgumentException();
            }
            pos.setIndex(index + count);
        }
    }

    /**
     * Returns the current 32-bit code point without parsing escapes, parsing
     * variables, or skipping whitespace.
     * @return the current 32-bit code point, or DONE if the main text is
     * exhausted and no variable is being expanded
     */
    private int _current() {
        if (buf != null) {
            return UTF16.charAt(buf, 0, buf.length, bufPos);
        } else {
            int i = pos.getIndex();
            return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
        }
    }

    /**
     * Advances the position by the given amount.  Unlike jumpahead(), an
     * advance past the end of the main text is clamped to the end rather
     * than rejected.
     * @param count the number of 16-bit code units to advance past
     */
    private void _advance(int count) {
        if (buf != null) {
            bufPos += count;
            if (bufPos == buf.length) {
                buf = null;
            }
        } else {
            pos.setIndex(Math.min(pos.getIndex() + count, text.length()));
        }
    }
}