1 /*
   2  * Copyright (c) 1994, 2004, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util;
  27 
  28 import java.lang.*;
  29 
  30 /**
  31  * The string tokenizer class allows an application to break a
  32  * string into tokens. The tokenization method is much simpler than
  33  * the one used by the {@code StreamTokenizer} class. The
  34  * {@code StringTokenizer} methods do not distinguish among
  35  * identifiers, numbers, and quoted strings, nor do they recognize
  36  * and skip comments.
  37  * <p>
  38  * The set of delimiters (the characters that separate tokens) may
  39  * be specified either at creation time or on a per-token basis.
  40  * <p>
  41  * An instance of {@code StringTokenizer} behaves in one of two
  42  * ways, depending on whether it was created with the
  43  * {@code returnDelims} flag having the value {@code true}
  44  * or {@code false}:
  45  * <ul>
  46  * <li>If the flag is {@code false}, delimiter characters serve to
  47  *     separate tokens. A token is a maximal sequence of consecutive
  48  *     characters that are not delimiters.
  49  * <li>If the flag is {@code true}, delimiter characters are themselves
  50  *     considered to be tokens. A token is thus either one delimiter
  51  *     character, or a maximal sequence of consecutive characters that are
  52  *     not delimiters.
  53  * </ul><p>
  54  * A {@code StringTokenizer} object internally maintains a current
  55  * position within the string to be tokenized. Some operations advance this
  56  * current position past the characters processed.<p>
  57  * A token is returned by taking a substring of the string that was used to
  58  * create the {@code StringTokenizer} object.
  59  * <p>
  60  * The following is one example of the use of the tokenizer. The code:
  61  * <blockquote><pre>
  62  *     StringTokenizer st = new StringTokenizer("this is a test");
  63  *     while (st.hasMoreTokens()) {
  64  *         System.out.println(st.nextToken());
  65  *     }
  66  * </pre></blockquote>
  67  * <p>
  68  * prints the following output:
  69  * <blockquote><pre>
  70  *     this
  71  *     is
  72  *     a
  73  *     test
  74  * </pre></blockquote>
  75  *
  76  * <p>
  77  * {@code StringTokenizer} is a legacy class that is retained for
  78  * compatibility reasons although its use is discouraged in new code. It is
  79  * recommended that anyone seeking this functionality use the {@code split}
  80  * method of {@code String} or the java.util.regex package instead.
  81  * <p>
  82  * The following example illustrates how the {@code String.split}
  83  * method can be used to break up a string into its basic tokens:
  84  * <blockquote><pre>
  85  *     String[] result = "this is a test".split("\\s");
  86  *     for (int x=0; x&lt;result.length; x++)
  87  *         System.out.println(result[x]);
  88  * </pre></blockquote>
  89  * <p>
  90  * prints the following output:
  91  * <blockquote><pre>
  92  *     this
  93  *     is
  94  *     a
  95  *     test
  96  * </pre></blockquote>
  97  *
  98  * @author  unascribed
  99  * @see     java.io.StreamTokenizer
 100  * @since   1.0
 101  */
 102 public
 103 class StringTokenizer implements Enumeration<Object> {
 104     private int currentPosition;
 105     private int newPosition;
 106     private int maxPosition;
 107     private String str;
 108     private String delimiters;
 109     private boolean retDelims;
 110     private boolean delimsChanged;
 111 
 112     /**
 113      * maxDelimCodePoint stores the value of the delimiter character with the
 114      * highest value. It is used to optimize the detection of delimiter
 115      * characters.
 116      *
 117      * It is unlikely to provide any optimization benefit in the
 118      * hasSurrogates case because most string characters will be
 119      * smaller than the limit, but we keep it so that the two code
 120      * paths remain similar.
 121      */
 122     private int maxDelimCodePoint;
 123 
 124     /**
 125      * If delimiters include any surrogates (including surrogate
 126      * pairs), hasSurrogates is true and the tokenizer uses the
 127      * different code path. This is because String.indexOf(int)
 128      * doesn't handle unpaired surrogates as a single character.
 129      */
 130     private boolean hasSurrogates = false;
 131 
 132     /**
 133      * When hasSurrogates is true, delimiters are converted to code
 134      * points and isDelimiter(int) is used to determine if the given
 135      * codepoint is a delimiter.
 136      */
 137     private int[] delimiterCodePoints;
 138 
 139     /**
 140      * Set maxDelimCodePoint to the highest char in the delimiter set.
 141      */
 142     private void setMaxDelimCodePoint() {
 143         if (delimiters == null) {
 144             maxDelimCodePoint = 0;
 145             return;
 146         }
 147 
 148         int m = 0;
 149         int c;
 150         int count = 0;
 151         for (int i = 0; i < delimiters.length(); i += Character.charCount(c)) {
 152             c = delimiters.charAt(i);
 153             if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE) {
 154                 c = delimiters.codePointAt(i);
 155                 hasSurrogates = true;
 156             }
 157             if (m < c)
 158                 m = c;
 159             count++;
 160         }
 161         maxDelimCodePoint = m;
 162 
 163         if (hasSurrogates) {
 164             delimiterCodePoints = new int[count];
 165             for (int i = 0, j = 0; i < count; i++, j += Character.charCount(c)) {
 166                 c = delimiters.codePointAt(j);
 167                 delimiterCodePoints[i] = c;
 168             }
 169         }
 170     }
 171 
 172     /**
 173      * Constructs a string tokenizer for the specified string. All
 174      * characters in the {@code delim} argument are the delimiters
 175      * for separating tokens.
 176      * <p>
 177      * If the {@code returnDelims} flag is {@code true}, then
 178      * the delimiter characters are also returned as tokens. Each
 179      * delimiter is returned as a string of length one. If the flag is
 180      * {@code false}, the delimiter characters are skipped and only
 181      * serve as separators between tokens.
 182      * <p>
 183      * Note that if {@code delim} is {@code null}, this constructor does
 184      * not throw an exception. However, trying to invoke other methods on the
 185      * resulting {@code StringTokenizer} may result in a
 186      * {@code NullPointerException}.
 187      *
 188      * @param   str            a string to be parsed.
 189      * @param   delim          the delimiters.
 190      * @param   returnDelims   flag indicating whether to return the delimiters
 191      *                         as tokens.
 192      * @exception NullPointerException if str is {@code null}
 193      */
 194     public StringTokenizer(String str, String delim, boolean returnDelims) {
 195         currentPosition = 0;
 196         newPosition = -1;
 197         delimsChanged = false;
 198         this.str = str;
 199         maxPosition = str.length();
 200         delimiters = delim;
 201         retDelims = returnDelims;
 202         setMaxDelimCodePoint();
 203     }
 204 
 205     /**
 206      * Constructs a string tokenizer for the specified string. The
 207      * characters in the {@code delim} argument are the delimiters
 208      * for separating tokens. Delimiter characters themselves will not
 209      * be treated as tokens.
 210      * <p>
 211      * Note that if {@code delim} is {@code null}, this constructor does
 212      * not throw an exception. However, trying to invoke other methods on the
 213      * resulting {@code StringTokenizer} may result in a
 214      * {@code NullPointerException}.
 215      *
 216      * @param   str     a string to be parsed.
 217      * @param   delim   the delimiters.
 218      * @exception NullPointerException if str is {@code null}
 219      */
 220     public StringTokenizer(String str, String delim) {
 221         this(str, delim, false);
 222     }
 223 
 224     /**
 225      * Constructs a string tokenizer for the specified string. The
 226      * tokenizer uses the default delimiter set, which is
 227      * <code>"&nbsp;\t\n\r\f"</code>: the space character,
 228      * the tab character, the newline character, the carriage-return character,
 229      * and the form-feed character. Delimiter characters themselves will
 230      * not be treated as tokens.
 231      *
 232      * @param   str   a string to be parsed.
 233      * @exception NullPointerException if str is {@code null}
 234      */
 235     public StringTokenizer(String str) {
 236         this(str, " \t\n\r\f", false);
 237     }
 238 
 239     /**
 240      * Skips delimiters starting from the specified position. If retDelims
 241      * is false, returns the index of the first non-delimiter character at or
 242      * after startPos. If retDelims is true, startPos is returned.
 243      */
 244     private int skipDelimiters(int startPos) {
 245         if (delimiters == null)
 246             throw new NullPointerException();
 247 
 248         int position = startPos;
 249         while (!retDelims && position < maxPosition) {
 250             if (!hasSurrogates) {
 251                 char c = str.charAt(position);
 252                 if ((c > maxDelimCodePoint) || (delimiters.indexOf(c) < 0))
 253                     break;
 254                 position++;
 255             } else {
 256                 int c = str.codePointAt(position);
 257                 if ((c > maxDelimCodePoint) || !isDelimiter(c)) {
 258                     break;
 259                 }
 260                 position += Character.charCount(c);
 261             }
 262         }
 263         return position;
 264     }
 265 
 266     /**
 267      * Skips ahead from startPos and returns the index of the next delimiter
 268      * character encountered, or maxPosition if no such delimiter is found.
 269      */
 270     private int scanToken(int startPos) {
 271         int position = startPos;
 272         while (position < maxPosition) {
 273             if (!hasSurrogates) {
 274                 char c = str.charAt(position);
 275                 if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0))
 276                     break;
 277                 position++;
 278             } else {
 279                 int c = str.codePointAt(position);
 280                 if ((c <= maxDelimCodePoint) && isDelimiter(c))
 281                     break;
 282                 position += Character.charCount(c);
 283             }
 284         }
 285         if (retDelims && (startPos == position)) {
 286             if (!hasSurrogates) {
 287                 char c = str.charAt(position);
 288                 if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0))
 289                     position++;
 290             } else {
 291                 int c = str.codePointAt(position);
 292                 if ((c <= maxDelimCodePoint) && isDelimiter(c))
 293                     position += Character.charCount(c);
 294             }
 295         }
 296         return position;
 297     }
 298 
 299     private boolean isDelimiter(int codePoint) {
 300         for (int delimiterCodePoint : delimiterCodePoints) {
 301             if (delimiterCodePoint == codePoint) {
 302                 return true;
 303             }
 304         }
 305         return false;
 306     }
 307 
 308     /**
 309      * Tests if there are more tokens available from this tokenizer's string.
 310      * If this method returns {@code true}, then a subsequent call to
 311      * {@code nextToken} with no argument will successfully return a token.
 312      *
 313      * @return  {@code true} if and only if there is at least one token
 314      *          in the string after the current position; {@code false}
 315      *          otherwise.
 316      */
 317     public boolean hasMoreTokens() {
 318         /*
 319          * Temporarily store this position and use it in the following
 320          * nextToken() method only if the delimiters haven't been changed in
 321          * that nextToken() invocation.
 322          */
 323         newPosition = skipDelimiters(currentPosition);
 324         return (newPosition < maxPosition);
 325     }
 326 
 327     /**
 328      * Returns the next token from this string tokenizer.
 329      *
 330      * @return     the next token from this string tokenizer.
 331      * @exception  NoSuchElementException  if there are no more tokens in this
 332      *               tokenizer's string.
 333      */
 334     public String nextToken() {
 335         /*
 336          * If next position already computed in hasMoreElements() and
 337          * delimiters have changed between the computation and this invocation,
 338          * then use the computed value.
 339          */
 340 
 341         currentPosition = (newPosition >= 0 && !delimsChanged) ?
 342             newPosition : skipDelimiters(currentPosition);
 343 
 344         /* Reset these anyway */
 345         delimsChanged = false;
 346         newPosition = -1;
 347 
 348         if (currentPosition >= maxPosition)
 349             throw new NoSuchElementException();
 350         int start = currentPosition;
 351         currentPosition = scanToken(currentPosition);
 352         return str.substring(start, currentPosition);
 353     }
 354 
 355     /**
 356      * Returns the next token in this string tokenizer's string. First,
 357      * the set of characters considered to be delimiters by this
 358      * {@code StringTokenizer} object is changed to be the characters in
 359      * the string {@code delim}. Then the next token in the string
 360      * after the current position is returned. The current position is
 361      * advanced beyond the recognized token.  The new delimiter set
 362      * remains the default after this call.
 363      *
 364      * @param      delim   the new delimiters.
 365      * @return     the next token, after switching to the new delimiter set.
 366      * @exception  NoSuchElementException  if there are no more tokens in this
 367      *               tokenizer's string.
 368      * @exception NullPointerException if delim is {@code null}
 369      */
 370     public String nextToken(String delim) {
 371         delimiters = delim;
 372 
 373         /* delimiter string specified, so set the appropriate flag. */
 374         delimsChanged = true;
 375 
 376         setMaxDelimCodePoint();
 377         return nextToken();
 378     }
 379 
 380     /**
 381      * Returns the same value as the {@code hasMoreTokens}
 382      * method. It exists so that this class can implement the
 383      * {@code Enumeration} interface.
 384      *
 385      * @return  {@code true} if there are more tokens;
 386      *          {@code false} otherwise.
 387      * @see     java.util.Enumeration
 388      * @see     java.util.StringTokenizer#hasMoreTokens()
 389      */
 390     public boolean hasMoreElements() {
 391         return hasMoreTokens();
 392     }
 393 
 394     /**
 395      * Returns the same value as the {@code nextToken} method,
 396      * except that its declared return value is {@code Object} rather than
 397      * {@code String}. It exists so that this class can implement the
 398      * {@code Enumeration} interface.
 399      *
 400      * @return     the next token in the string.
 401      * @exception  NoSuchElementException  if there are no more tokens in this
 402      *               tokenizer's string.
 403      * @see        java.util.Enumeration
 404      * @see        java.util.StringTokenizer#nextToken()
 405      */
 406     public Object nextElement() {
 407         return nextToken();
 408     }
 409 
 410     /**
 411      * Calculates the number of times that this tokenizer's
 412      * {@code nextToken} method can be called before it generates an
 413      * exception. The current position is not advanced.
 414      *
 415      * @return  the number of tokens remaining in the string using the current
 416      *          delimiter set.
 417      * @see     java.util.StringTokenizer#nextToken()
 418      */
 419     public int countTokens() {
 420         int count = 0;
 421         int currpos = currentPosition;
 422         while (currpos < maxPosition) {
 423             currpos = skipDelimiters(currpos);
 424             if (currpos >= maxPosition)
 425                 break;
 426             currpos = scanToken(currpos);
 427             count++;
 428         }
 429         return count;
 430     }
 431 }