1 /* 2 * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * @(#)HeaderTokenizer.java 1.9 02/03/27 28 */ 29 30 31 32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet; 33 34 35 /** 36 * This class tokenizes RFC822 and MIME headers into the basic 37 * symbols specified by RFC822 and MIME. <p> 38 * 39 * This class handles folded headers (ie headers with embedded 40 * CRLF SPACE sequences). The folds are removed in the returned 41 * tokens. 42 * 43 * @version 1.9, 02/03/27 44 * @author John Mani 45 */ 46 47 public class HeaderTokenizer { 48 49 /** 50 * The Token class represents tokens returned by the 51 * HeaderTokenizer. 52 */ 53 public static class Token { 54 55 private int type; 56 private String value; 57 58 /** 59 * Token type indicating an ATOM. 60 */ 61 public static final int ATOM = -1; 62 63 /** 64 * Token type indicating a quoted string. The value 65 * field contains the string without the quotes. 66 */ 67 public static final int QUOTEDSTRING = -2; 68 69 /** 70 * Token type indicating a comment. The value field 71 * contains the comment string without the comment 72 * start and end symbols. 73 */ 74 public static final int COMMENT = -3; 75 76 /** 77 * Token type indicating end of input. 78 */ 79 public static final int EOF = -4; 80 81 /** 82 * Constructor. 83 * @param type Token type 84 * @param value Token value 85 */ 86 public Token(int type, String value) { 87 this.type = type; 88 this.value = value; 89 } 90 91 /** 92 * Return the type of the token. If the token represents a 93 * delimiter or a control character, the type is that character 94 * itself, converted to an integer. Otherwise, it's value is 95 * one of the following: 96 * <ul> 97 * <li><code>ATOM</code> A sequence of ASCII characters 98 * delimited by either SPACE, CTL, "(", <"> or the 99 * specified SPECIALS 100 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters 101 * within quotes 102 * <li><code>COMMENT</code> A sequence of ASCII characters 103 * within "(" and ")". 104 * <li><code>EOF</code> End of header 105 * </ul> 106 */ 107 public int getType() { 108 return type; 109 } 110 111 /** 112 * Returns the value of the token just read. When the current 113 * token is a quoted string, this field contains the body of the 114 * string, without the quotes. When the current token is a comment, 115 * this field contains the body of the comment. 116 * 117 * @return token value 118 */ 119 public String getValue() { 120 return value; 121 } 122 } 123 124 private String string; // the string to be tokenized 125 private boolean skipComments; // should comments be skipped ? 126 private String delimiters; // delimiter string 127 private int currentPos; // current parse position 128 private int maxPos; // string length 129 private int nextPos; // track start of next Token for next() 130 private int peekPos; // track start of next Token for peek() 131 132 /** 133 * RFC822 specials 134 */ 135 public final static String RFC822 = "()<>@,;:\\\"\t .[]"; 136 137 /** 138 * MIME specials 139 */ 140 public final static String MIME = "()<>@,;:\\\"\t []/?="; 141 142 // The EOF Token 143 private final static Token EOFToken = new Token(Token.EOF, null); 144 145 /** 146 * Constructor that takes a rfc822 style header. 147 * 148 * @param header The rfc822 header to be tokenized 149 * @param delimiters Set of delimiter characters 150 * to be used to delimit ATOMS. These 151 * are usually <code>RFC822</code> or 152 * <code>MIME</code> 153 * @param skipComments If true, comments are skipped and 154 * not returned as tokens 155 */ 156 public HeaderTokenizer(String header, String delimiters, 157 boolean skipComments) { 158 string = (header == null) ? "" : header; // paranoia ?! 159 this.skipComments = skipComments; 160 this.delimiters = delimiters; 161 currentPos = nextPos = peekPos = 0; 162 maxPos = string.length(); 163 } 164 165 /** 166 * Constructor. Comments are ignored and not returned as tokens 167 * 168 * @param header The header that is tokenized 169 * @param delimiters The delimiters to be used 170 */ 171 public HeaderTokenizer(String header, String delimiters) { 172 this(header, delimiters, true); 173 } 174 175 /** 176 * Constructor. The RFC822 defined delimiters - RFC822 - are 177 * used to delimit ATOMS. Also comments are skipped and not 178 * returned as tokens 179 */ 180 public HeaderTokenizer(String header) { 181 this(header, RFC822); 182 } 183 184 /** 185 * Parses the next token from this String. <p> 186 * 187 * Clients sit in a loop calling next() to parse successive 188 * tokens until an EOF Token is returned. 189 * 190 * @return the next Token 191 * @exception ParseException if the parse fails 192 */ 193 public Token next() throws ParseException { 194 Token tk; 195 196 currentPos = nextPos; // setup currentPos 197 tk = getNext(); 198 nextPos = peekPos = currentPos; // update currentPos and peekPos 199 return tk; 200 } 201 202 /** 203 * Peek at the next token, without actually removing the token 204 * from the parse stream. Invoking this method multiple times 205 * will return successive tokens, until <code>next()</code> is 206 * called. <p> 207 * 208 * @return the next Token 209 * @exception ParseException if the parse fails 210 */ 211 public Token peek() throws ParseException { 212 Token tk; 213 214 currentPos = peekPos; // setup currentPos 215 tk = getNext(); 216 peekPos = currentPos; // update peekPos 217 return tk; 218 } 219 220 /** 221 * Return the rest of the Header. 222 * 223 * @return String rest of header. null is returned if we are 224 * already at end of header 225 */ 226 public String getRemainder() { 227 return string.substring(nextPos); 228 } 229 230 /* 231 * Return the next token starting from 'currentPos'. After the 232 * parse, 'currentPos' is updated to point to the start of the 233 * next token. 234 */ 235 private Token getNext() throws ParseException { 236 // If we're already at end of string, return EOF 237 if (currentPos >= maxPos) 238 return EOFToken; 239 240 // Skip white-space, position currentPos beyond the space 241 if (skipWhiteSpace() == Token.EOF) 242 return EOFToken; 243 244 char c; 245 int start; 246 boolean filter = false; 247 248 c = string.charAt(currentPos); 249 250 // Check or Skip comments and position currentPos 251 // beyond the comment 252 while (c == '(') { 253 // Parsing comment .. 254 int nesting; 255 for (start = ++currentPos, nesting = 1; 256 nesting > 0 && currentPos < maxPos; 257 currentPos++) { 258 c = string.charAt(currentPos); 259 if (c == '\\') { // Escape sequence 260 currentPos++; // skip the escaped character 261 filter = true; 262 } else if (c == '\r') 263 filter = true; 264 else if (c == '(') 265 nesting++; 266 else if (c == ')') 267 nesting--; 268 } 269 if (nesting != 0) 270 throw new ParseException("Unbalanced comments"); 271 272 if (!skipComments) { 273 // Return the comment, if we are asked to. 274 // Note that the comment start & end markers are ignored. 275 String s; 276 if (filter) // need to go thru the token again. 277 s = filterToken(string, start, currentPos-1); 278 else 279 s = string.substring(start,currentPos-1); 280 281 return new Token(Token.COMMENT, s); 282 } 283 284 // Skip any whitespace after the comment. 285 if (skipWhiteSpace() == Token.EOF) 286 return EOFToken; 287 c = string.charAt(currentPos); 288 } 289 290 // Check for quoted-string and position currentPos 291 // beyond the terminating quote 292 if (c == '"') { 293 for (start = ++currentPos; currentPos < maxPos; currentPos++) { 294 c = string.charAt(currentPos); 295 if (c == '\\') { // Escape sequence 296 currentPos++; 297 filter = true; 298 } else if (c == '\r') 299 filter = true; 300 else if (c == '"') { 301 currentPos++; 302 String s; 303 304 if (filter) 305 s = filterToken(string, start, currentPos-1); 306 else 307 s = string.substring(start,currentPos-1); 308 309 return new Token(Token.QUOTEDSTRING, s); 310 } 311 } 312 throw new ParseException("Unbalanced quoted string"); 313 } 314 315 // Check for SPECIAL or CTL 316 if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) { 317 currentPos++; // re-position currentPos 318 char ch[] = new char[1]; 319 ch[0] = c; 320 return new Token((int)c, new String(ch)); 321 } 322 323 // Check for ATOM 324 for (start = currentPos; currentPos < maxPos; currentPos++) { 325 c = string.charAt(currentPos); 326 // ATOM is delimited by either SPACE, CTL, "(", <"> 327 // or the specified SPECIALS 328 if (c < 040 || c >= 0177 || c == '(' || c == ' ' || 329 c == '"' || delimiters.indexOf(c) >= 0) 330 break; 331 } 332 return new Token(Token.ATOM, string.substring(start, currentPos)); 333 } 334 335 // Skip SPACE, HT, CR and NL 336 private int skipWhiteSpace() { 337 char c; 338 for (; currentPos < maxPos; currentPos++) 339 if (((c = string.charAt(currentPos)) != ' ') && 340 (c != '\t') && (c != '\r') && (c != '\n')) 341 return currentPos; 342 return Token.EOF; 343 } 344 345 /* Process escape sequences and embedded LWSPs from a comment or 346 * quoted string. 347 */ 348 private static String filterToken(String s, int start, int end) { 349 StringBuffer sb = new StringBuffer(); 350 char c; 351 boolean gotEscape = false; 352 boolean gotCR = false; 353 354 for (int i = start; i < end; i++) { 355 c = s.charAt(i); 356 if (c == '\n' && gotCR) { 357 // This LF is part of an unescaped 358 // CRLF sequence (i.e, LWSP). Skip it. 359 gotCR = false; 360 continue; 361 } 362 363 gotCR = false; 364 if (!gotEscape) { 365 // Previous character was NOT '\' 366 if (c == '\\') // skip this character 367 gotEscape = true; 368 else if (c == '\r') // skip this character 369 gotCR = true; 370 else // append this character 371 sb.append(c); 372 } else { 373 // Previous character was '\'. So no need to 374 // bother with any special processing, just 375 // append this character 376 sb.append(c); 377 gotEscape = false; 378 } 379 } 380 return sb.toString(); 381 } 382 }