1 /* 2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.xml.internal.org.jvnet.mimepull; 27 28 import java.io.InputStream; 29 import java.io.IOException; 30 import java.util.*; 31 import java.util.logging.Logger; 32 import java.nio.ByteBuffer; 33 import java.util.logging.Level; 34 35 /** 36 * Pull parser for the MIME messages. Applications can use pull API to continue 37 * the parsing MIME messages lazily. 38 * 39 * <pre> 40 * for e.g.: 41 * <p> 42 * 43 * MIMEParser parser = ... 44 * Iterator<MIMEEvent> it = parser.iterator(); 45 * while(it.hasNext()) { 46 * MIMEEvent event = it.next(); 47 * ... 48 * } 49 * </pre> 50 * 51 * @author Jitendra Kotamraju 52 */ 53 class MIMEParser implements Iterable<MIMEEvent> { 54 55 private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName()); 56 57 private static final String HEADER_ENCODING = "ISO8859-1"; 58 59 // Actually, the grammar doesn't support whitespace characters 60 // after boundary. But the mail implementation checks for it. 61 // We will only check for these many whitespace characters after boundary 62 private static final int NO_LWSP = 1000; 63 private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE} 64 private STATE state = STATE.START_MESSAGE; 65 66 private final InputStream in; 67 private final byte[] bndbytes; 68 private final int bl; 69 private final MIMEConfig config; 70 private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table 71 private final int[] gss; // BnM algo : Good Suffix Shift table 72 73 /** 74 * Have we parsed the data from our InputStream yet? 75 */ 76 private boolean parsed; 77 78 /* 79 * Read and process body partsList until we see the 80 * terminating boundary line (or EOF). 81 */ 82 private boolean done = false; 83 84 private boolean eof; 85 private final int capacity; 86 private byte[] buf; 87 private int len; 88 private boolean bol; // beginning of the line 89 90 /* 91 * Parses the MIME content. At the EOF, it also closes input stream 92 */ 93 MIMEParser(InputStream in, String boundary, MIMEConfig config) { 94 this.in = in; 95 this.bndbytes = getBytes("--"+boundary); 96 bl = bndbytes.length; 97 this.config = config; 98 gss = new int[bl]; 99 compileBoundaryPattern(); 100 101 // \r\n + boundary + "--\r\n" + lots of LWSP 102 capacity = config.chunkSize+2+bl+4+NO_LWSP; 103 createBuf(capacity); 104 } 105 106 /** 107 * Returns iterator for the parsing events. Use the iterator to advance 108 * the parsing. 109 * 110 * @return iterator for parsing events 111 */ 112 @Override 113 public Iterator<MIMEEvent> iterator() { 114 return new MIMEEventIterator(); 115 } 116 117 class MIMEEventIterator implements Iterator<MIMEEvent> { 118 119 @Override 120 public boolean hasNext() { 121 return !parsed; 122 } 123 124 @Override 125 public MIMEEvent next() { 126 127 if (parsed) { 128 throw new NoSuchElementException(); 129 } 130 131 switch(state) { 132 case START_MESSAGE : 133 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);} 134 state = STATE.SKIP_PREAMBLE; 135 return MIMEEvent.START_MESSAGE; 136 137 case SKIP_PREAMBLE : 138 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);} 139 skipPreamble(); 140 // fall through 141 case START_PART : 142 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);} 143 state = STATE.HEADERS; 144 return MIMEEvent.START_PART; 145 146 case HEADERS : 147 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);} 148 InternetHeaders ih = readHeaders(); 149 state = STATE.BODY; 150 bol = true; 151 return new MIMEEvent.Headers(ih); 152 153 case BODY : 154 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);} 155 ByteBuffer buf = readBody(); 156 bol = false; 157 return new MIMEEvent.Content(buf); 158 159 case END_PART : 160 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);} 161 if (done) { 162 state = STATE.END_MESSAGE; 163 } else { 164 state = STATE.START_PART; 165 } 166 return MIMEEvent.END_PART; 167 168 case END_MESSAGE : 169 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);} 170 parsed = true; 171 return MIMEEvent.END_MESSAGE; 172 173 default : 174 throw new MIMEParsingException("Unknown Parser state = "+state); 175 } 176 } 177 178 @Override 179 public void remove() { 180 throw new UnsupportedOperationException(); 181 } 182 } 183 184 /** 185 * Collects the headers for the current part by parsing mesage stream. 186 * 187 * @return headers for the current part 188 */ 189 private InternetHeaders readHeaders() { 190 if (!eof) { 191 fillBuf(); 192 } 193 return new InternetHeaders(new LineInputStream()); 194 } 195 196 /** 197 * Reads and saves the part of the current attachment part's content. 198 * At the end of this method, buf should have the remaining data 199 * at index 0. 200 * 201 * @return a chunk of the part's content 202 * 203 */ 204 private ByteBuffer readBody() { 205 if (!eof) { 206 fillBuf(); 207 } 208 int start = match(buf, 0, len); // matches boundary 209 if (start == -1) { 210 // No boundary is found 211 assert eof || len >= config.chunkSize; 212 int chunkSize = eof ? len : config.chunkSize; 213 if (eof) { 214 done = true; 215 throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary."); 216 } 217 return adjustBuf(chunkSize, len-chunkSize); 218 } 219 // Found boundary. 220 // Is it at the start of a line ? 221 int chunkLen = start; 222 if (bol && start == 0) { 223 // nothing to do 224 } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) { 225 --chunkLen; 226 if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') { 227 --chunkLen; 228 } 229 } else { 230 return adjustBuf(start+1, len-start-1); // boundary is not at beginning of a line 231 } 232 233 if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') { 234 state = STATE.END_PART; 235 done = true; 236 return adjustBuf(chunkLen, 0); 237 } 238 239 // Consider all the whitespace in boundary+whitespace+"\r\n" 240 int lwsp = 0; 241 for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) { 242 ++lwsp; 243 } 244 245 // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n" 246 if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') { 247 state = STATE.END_PART; 248 return adjustBuf(chunkLen, len-start-bl-lwsp-1); 249 } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') { 250 state = STATE.END_PART; 251 return adjustBuf(chunkLen, len-start-bl-lwsp-2); 252 } else if (start+bl+lwsp+1 < len) { 253 return adjustBuf(chunkLen+1, len-chunkLen-1); // boundary string in a part data 254 } else if (eof) { 255 done = true; 256 throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary."); 257 } 258 259 // Some more data needed to determine if it is indeed a proper boundary 260 return adjustBuf(chunkLen, len-chunkLen); 261 } 262 263 /** 264 * Returns a chunk from the original buffer. A new buffer is 265 * created with the remaining bytes. 266 * 267 * @param chunkSize create a chunk with these many bytes 268 * @param remaining bytes from the end of the buffer that need to be copied to 269 * the beginning of the new buffer 270 * @return chunk 271 */ 272 private ByteBuffer adjustBuf(int chunkSize, int remaining) { 273 assert buf != null; 274 assert chunkSize >= 0; 275 assert remaining >= 0; 276 277 byte[] temp = buf; 278 // create a new buf and adjust it without this chunk 279 createBuf(remaining); 280 System.arraycopy(temp, len-remaining, buf, 0, remaining); 281 len = remaining; 282 283 return ByteBuffer.wrap(temp, 0, chunkSize); 284 } 285 286 private void createBuf(int min) { 287 buf = new byte[min < capacity ? capacity : min]; 288 } 289 290 /** 291 * Skips the preamble to find the first attachment part 292 */ 293 private void skipPreamble() { 294 295 while(true) { 296 if (!eof) { 297 fillBuf(); 298 } 299 int start = match(buf, 0, len); // matches boundary 300 if (start == -1) { 301 // No boundary is found 302 if (eof) { 303 throw new MIMEParsingException("Missing start boundary"); 304 } else { 305 adjustBuf(len-bl+1, bl-1); 306 continue; 307 } 308 } 309 310 if (start > config.chunkSize) { 311 adjustBuf(start, len-start); 312 continue; 313 } 314 // Consider all the whitespace boundary+whitespace+"\r\n" 315 int lwsp = 0; 316 for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) { 317 ++lwsp; 318 } 319 // Check for \n or \r\n 320 if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) { 321 if (buf[start+bl+lwsp] == '\n') { 322 adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1); 323 break; 324 } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') { 325 adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2); 326 break; 327 } 328 } 329 adjustBuf(start+1, len-start-1); 330 } 331 if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);} 332 } 333 334 private static byte[] getBytes(String s) { 335 char [] chars= s.toCharArray(); 336 int size = chars.length; 337 byte[] bytes = new byte[size]; 338 339 for (int i = 0; i < size;) { 340 bytes[i] = (byte) chars[i++]; 341 } 342 return bytes; 343 } 344 345 /** 346 * Boyer-Moore search method. Copied from java.util.regex.Pattern.java 347 * 348 * Pre calculates arrays needed to generate the bad character 349 * shift and the good suffix shift. Only the last seven bits 350 * are used to see if chars match; This keeps the tables small 351 * and covers the heavily used ASCII range, but occasionally 352 * results in an aliased match for the bad character shift. 353 */ 354 private void compileBoundaryPattern() { 355 int i, j; 356 357 // Precalculate part of the bad character shift 358 // It is a table for where in the pattern each 359 // lower 7-bit value occurs 360 for (i = 0; i < bndbytes.length; i++) { 361 bcs[bndbytes[i]&0x7F] = i + 1; 362 } 363 364 // Precalculate the good suffix shift 365 // i is the shift amount being considered 366 NEXT: for (i = bndbytes.length; i > 0; i--) { 367 // j is the beginning index of suffix being considered 368 for (j = bndbytes.length - 1; j >= i; j--) { 369 // Testing for good suffix 370 if (bndbytes[j] == bndbytes[j-i]) { 371 // src[j..len] is a good suffix 372 gss[j-1] = i; 373 } else { 374 // No match. The array has already been 375 // filled up with correct values before. 376 continue NEXT; 377 } 378 } 379 // This fills up the remaining of optoSft 380 // any suffix can not have larger shift amount 381 // then its sub-suffix. Why??? 382 while (j > 0) { 383 gss[--j] = i; 384 } 385 } 386 // Set the guard value because of unicode compression 387 gss[bndbytes.length -1] = 1; 388 } 389 390 /** 391 * Finds the boundary in the given buffer using Boyer-Moore algo. 392 * Copied from java.util.regex.Pattern.java 393 * 394 * @param mybuf boundary to be searched in this mybuf 395 * @param off start index in mybuf 396 * @param len number of bytes in mybuf 397 * 398 * @return -1 if there is no match or index where the match starts 399 */ 400 private int match(byte[] mybuf, int off, int len) { 401 int last = len - bndbytes.length; 402 403 // Loop over all possible match positions in text 404 NEXT: while (off <= last) { 405 // Loop over pattern from right to left 406 for (int j = bndbytes.length - 1; j >= 0; j--) { 407 byte ch = mybuf[off+j]; 408 if (ch != bndbytes[j]) { 409 // Shift search to the right by the maximum of the 410 // bad character shift and the good suffix shift 411 off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]); 412 continue NEXT; 413 } 414 } 415 // Entire pattern matched starting at off 416 return off; 417 } 418 return -1; 419 } 420 421 /** 422 * Fills the remaining buf to the full capacity 423 */ 424 private void fillBuf() { 425 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);} 426 assert !eof; 427 while(len < buf.length) { 428 int read; 429 try { 430 read = in.read(buf, len, buf.length-len); 431 } catch(IOException ioe) { 432 throw new MIMEParsingException(ioe); 433 } 434 if (read == -1) { 435 eof = true; 436 try { 437 if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");} 438 in.close(); 439 } catch(IOException ioe) { 440 throw new MIMEParsingException(ioe); 441 } 442 break; 443 } else { 444 len += read; 445 } 446 } 447 if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);} 448 } 449 450 private void doubleBuf() { 451 byte[] temp = new byte[2*len]; 452 System.arraycopy(buf, 0, temp, 0, len); 453 buf = temp; 454 if (!eof) { 455 fillBuf(); 456 } 457 } 458 459 class LineInputStream { 460 private int offset; 461 462 /* 463 * Read a line containing only ASCII characters from the input 464 * stream. A line is terminated by a CR or NL or CR-NL sequence. 465 * A common error is a CR-CR-NL sequence, which will also terminate 466 * a line. 467 * The line terminator is not returned as part of the returned 468 * String. Returns null if no data is available. <p> 469 * 470 * This class is similar to the deprecated 471 * <code>DataInputStream.readLine()</code> 472 */ 473 public String readLine() throws IOException { 474 475 int hdrLen = 0; 476 int lwsp = 0; 477 while(offset+hdrLen < len) { 478 if (buf[offset+hdrLen] == '\n') { 479 lwsp = 1; 480 break; 481 } 482 if (offset+hdrLen+1 == len) { 483 doubleBuf(); 484 } 485 if (offset+hdrLen+1 >= len) { // No more data in the stream 486 assert eof; 487 return null; 488 } 489 if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') { 490 lwsp = 2; 491 break; 492 } 493 ++hdrLen; 494 } 495 if (hdrLen == 0) { 496 adjustBuf(offset+lwsp, len-offset-lwsp); 497 return null; 498 } 499 500 String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING); 501 offset += hdrLen+lwsp; 502 return hdr; 503 } 504 505 } 506 507 }