1 /*
   2  * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * @(#)HeaderTokenizer.java   1.9 02/03/27
  28  */
  29 
  30 
  31 
  32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
  33 
  34 
  35 /**
  36  * This class tokenizes RFC822 and MIME headers into the basic
  37  * symbols specified by RFC822 and MIME. <p>
  38  *
  39  * This class handles folded headers (i.e., headers with embedded
  40  * CRLF SPACE sequences). The folds are removed in the returned
  41  * tokens.
  42  *
  43  * @version 1.9, 02/03/27
  44  * @author  John Mani
  45  */
  46 
  47 public class HeaderTokenizer {
  48 
  49     /**
  50      * The Token class represents tokens returned by the
  51      * HeaderTokenizer.
  52      */
  53     public static class Token {
  54 
  55         private int type;
  56         private String value;
  57 
  58         /**
  59          * Token type indicating an ATOM.
  60          */
  61         public static final int ATOM            = -1;
  62 
  63         /**
  64          * Token type indicating a quoted string. The value
  65          * field contains the string without the quotes.
  66          */
  67         public static final int QUOTEDSTRING    = -2;
  68 
  69         /**
  70          * Token type indicating a comment. The value field
  71          * contains the comment string without the comment
  72          * start and end symbols.
  73          */
  74         public static final int COMMENT         = -3;
  75 
  76         /**
  77          * Token type indicating end of input.
  78          */
  79         public static final int  EOF            = -4;
  80 
  81         /**
  82          * Constructor.
  83          * @param       type    Token type
  84          * @param       value   Token value
  85          */
  86         public Token(int type, String value) {
  87              this.type = type;
  88              this.value = value;
  89         }
  90 
  91         /**
  92          * Return the type of the token. If the token represents a
  93          * delimiter or a control character, the type is that character
  94          * itself, converted to an integer. Otherwise, it's value is
  95          * one of the following:
  96          * <ul>
  97          * <li><code>ATOM</code> A sequence of ASCII characters
  98          *      delimited by either SPACE, CTL, "(", <"> or the
  99          *      specified SPECIALS
 100          * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
 101          *      within quotes
 102          * <li><code>COMMENT</code> A sequence of ASCII characters
 103          *      within "(" and ")".
 104          * <li><code>EOF</code> End of header
 105          * </ul>
 106          */
 107         public int getType() {
 108             return type;
 109         }
 110 
 111         /**
 112          * Returns the value of the token just read. When the current
 113          * token is a quoted string, this field contains the body of the
 114          * string, without the quotes. When the current token is a comment,
 115          * this field contains the body of the comment.
 116          *
 117          * @return      token value
 118          */
 119         public String getValue() {
 120             return value;
 121         }
 122     }
 123 
 124     private String string; // the string to be tokenized
 125     private boolean skipComments; // should comments be skipped ?
 126     private String delimiters; // delimiter string
 127     private int currentPos; // current parse position
 128     private int maxPos; // string length
 129     private int nextPos; // track start of next Token for next()
 130     private int peekPos; // track start of next Token for peek()
 131 
 132     /**
 133      * RFC822 specials
 134      */
 135     public final static String RFC822 = "()<>@,;:\\\"\t .[]";
 136 
 137     /**
 138      * MIME specials
 139      */
 140     public final static String MIME = "()<>@,;:\\\"\t []/?=";
 141 
 142     // The EOF Token
 143     private final static Token EOFToken = new Token(Token.EOF, null);
 144 
 145     /**
 146      * Constructor that takes a rfc822 style header.
 147      *
 148      * @param   header  The rfc822 header to be tokenized
 149      * @param   delimiters      Set of delimiter characters
 150      *                          to be used to delimit ATOMS. These
 151      *                          are usually <code>RFC822</code> or
 152      *                          <code>MIME</code>
 153      * @param   skipComments  If true, comments are skipped and
 154      *                          not returned as tokens
 155      */
 156     public HeaderTokenizer(String header, String delimiters,
 157                            boolean skipComments) {
 158         string = (header == null) ? "" : header; // paranoia ?!
 159         this.skipComments = skipComments;
 160         this.delimiters = delimiters;
 161         currentPos = nextPos = peekPos = 0;
 162         maxPos = string.length();
 163     }
 164 
 165     /**
 166      * Constructor. Comments are ignored and not returned as tokens
 167      *
 168      * @param   header  The header that is tokenized
 169      * @param   delimiters  The delimiters to be used
 170      */
 171     public HeaderTokenizer(String header, String delimiters) {
 172         this(header, delimiters, true);
 173     }
 174 
 175     /**
 176      * Constructor. The RFC822 defined delimiters - RFC822 - are
 177      * used to delimit ATOMS. Also comments are skipped and not
 178      * returned as tokens
 179      */
 180     public HeaderTokenizer(String header)  {
 181         this(header, RFC822);
 182     }
 183 
 184     /**
 185      * Parses the next token from this String. <p>
 186      *
 187      * Clients sit in a loop calling next() to parse successive
 188      * tokens until an EOF Token is returned.
 189      *
 190      * @return          the next Token
 191      * @exception       ParseException if the parse fails
 192      */
 193     public Token next() throws ParseException {
 194         Token tk;
 195 
 196         currentPos = nextPos; // setup currentPos
 197         tk = getNext();
 198         nextPos = peekPos = currentPos; // update currentPos and peekPos
 199         return tk;
 200     }
 201 
 202     /**
 203      * Peek at the next token, without actually removing the token
 204      * from the parse stream. Invoking this method multiple times
 205      * will return successive tokens, until <code>next()</code> is
 206      * called. <p>
 207      *
 208      * @return          the next Token
 209      * @exception       ParseException if the parse fails
 210      */
 211     public Token peek() throws ParseException {
 212         Token tk;
 213 
 214         currentPos = peekPos; // setup currentPos
 215         tk = getNext();
 216         peekPos = currentPos; // update peekPos
 217         return tk;
 218     }
 219 
 220     /**
 221      * Return the rest of the Header.
 222      *
 223      * @return String   rest of header. null is returned if we are
 224      *                  already at end of header
 225      */
 226     public String getRemainder() {
 227         return string.substring(nextPos);
 228     }
 229 
 230     /*
 231      * Return the next token starting from 'currentPos'. After the
 232      * parse, 'currentPos' is updated to point to the start of the
 233      * next token.
 234      */
 235     private Token getNext() throws ParseException {
 236         // If we're already at end of string, return EOF
 237         if (currentPos >= maxPos)
 238             return EOFToken;
 239 
 240         // Skip white-space, position currentPos beyond the space
 241         if (skipWhiteSpace() == Token.EOF)
 242             return EOFToken;
 243 
 244         char c;
 245         int start;
 246         boolean filter = false;
 247 
 248         c = string.charAt(currentPos);
 249 
 250         // Check or Skip comments and position currentPos
 251         // beyond the comment
 252         while (c == '(') {
 253             // Parsing comment ..
 254             int nesting;
 255             for (start = ++currentPos, nesting = 1;
 256                  nesting > 0 && currentPos < maxPos;
 257                  currentPos++) {
 258                 c = string.charAt(currentPos);
 259                 if (c == '\\') {  // Escape sequence
 260                     currentPos++; // skip the escaped character
 261                     filter = true;
 262                 } else if (c == '\r')
 263                     filter = true;
 264                 else if (c == '(')
 265                     nesting++;
 266                 else if (c == ')')
 267                     nesting--;
 268             }
 269             if (nesting != 0)
 270                 throw new ParseException("Unbalanced comments");
 271 
 272             if (!skipComments) {
 273                 // Return the comment, if we are asked to.
 274                 // Note that the comment start & end markers are ignored.
 275                 String s;
 276                 if (filter) // need to go thru the token again.
 277                     s = filterToken(string, start, currentPos-1);
 278                 else
 279                     s = string.substring(start,currentPos-1);
 280 
 281                 return new Token(Token.COMMENT, s);
 282             }
 283 
 284             // Skip any whitespace after the comment.
 285             if (skipWhiteSpace() == Token.EOF)
 286                 return EOFToken;
 287             c = string.charAt(currentPos);
 288         }
 289 
 290         // Check for quoted-string and position currentPos
 291         //  beyond the terminating quote
 292         if (c == '"') {
 293             for (start = ++currentPos; currentPos < maxPos; currentPos++) {
 294                 c = string.charAt(currentPos);
 295                 if (c == '\\') { // Escape sequence
 296                     currentPos++;
 297                     filter = true;
 298                 } else if (c == '\r')
 299                     filter = true;
 300                 else if (c == '"') {
 301                     currentPos++;
 302                     String s;
 303 
 304                     if (filter)
 305                         s = filterToken(string, start, currentPos-1);
 306                     else
 307                         s = string.substring(start,currentPos-1);
 308 
 309                     return new Token(Token.QUOTEDSTRING, s);
 310                 }
 311             }
 312             throw new ParseException("Unbalanced quoted string");
 313         }
 314 
 315         // Check for SPECIAL or CTL
 316         if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
 317             currentPos++; // re-position currentPos
 318             char ch[] = new char[1];
 319             ch[0] = c;
 320             return new Token((int)c, new String(ch));
 321         }
 322 
 323         // Check for ATOM
 324         for (start = currentPos; currentPos < maxPos; currentPos++) {
 325             c = string.charAt(currentPos);
 326             // ATOM is delimited by either SPACE, CTL, "(", <">
 327             // or the specified SPECIALS
 328             if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
 329                 c == '"' || delimiters.indexOf(c) >= 0)
 330                 break;
 331         }
 332         return new Token(Token.ATOM, string.substring(start, currentPos));
 333     }
 334 
 335     // Skip SPACE, HT, CR and NL
 336     private int skipWhiteSpace() {
 337         char c;
 338         for (; currentPos < maxPos; currentPos++)
 339             if (((c = string.charAt(currentPos)) != ' ') &&
 340                 (c != '\t') && (c != '\r') && (c != '\n'))
 341                 return currentPos;
 342         return Token.EOF;
 343     }
 344 
 345     /* Process escape sequences and embedded LWSPs from a comment or
 346      * quoted string.
 347      */
 348     private static String filterToken(String s, int start, int end) {
 349         StringBuffer sb = new StringBuffer();
 350         char c;
 351         boolean gotEscape = false;
 352         boolean gotCR = false;
 353 
 354         for (int i = start; i < end; i++) {
 355             c = s.charAt(i);
 356             if (c == '\n' && gotCR) {
 357                 // This LF is part of an unescaped
 358                 // CRLF sequence (i.e, LWSP). Skip it.
 359                 gotCR = false;
 360                 continue;
 361             }
 362 
 363             gotCR = false;
 364             if (!gotEscape) {
 365                 // Previous character was NOT '\'
 366                 if (c == '\\') // skip this character
 367                     gotEscape = true;
 368                 else if (c == '\r') // skip this character
 369                     gotCR = true;
 370                 else // append this character
 371                     sb.append(c);
 372             } else {
 373                 // Previous character was '\'. So no need to
 374                 // bother with any special processing, just
 375                 // append this character
 376                 sb.append(c);
 377                 gotEscape = false;
 378             }
 379         }
 380         return sb.toString();
 381     }
 382 }