Old src/java.xml.bind/share/classes/com/sun/xml/internal/org/jvnet/mimepull/MIMEParser.java

   1 /*
   2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.org.jvnet.mimepull;
  27 
  28 import java.io.InputStream;
  29 import java.io.IOException;
  30 import java.util.*;
  31 import java.util.logging.Logger;
  32 import java.nio.ByteBuffer;
  33 import java.util.logging.Level;
  34 
  35 /**
  36  * Pull parser for the MIME messages. Applications can use pull API to continue
  37  * the parsing MIME messages lazily.
  38  *
  39  * <pre>
  40  * for e.g.:
  41  * <p>
  42  *
  43  * MIMEParser parser = ...
  44  * Iterator<MIMEEvent> it = parser.iterator();
  45  * while(it.hasNext()) {
  46  *   MIMEEvent event = it.next();
  47  *   ...
  48  * }
  49  * </pre>
  50  *
  51  * @author Jitendra Kotamraju
  52  */
  53 class MIMEParser implements Iterable<MIMEEvent> {
  54 
  55     private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());
  56 
  57     private static final String HEADER_ENCODING = "ISO8859-1";
  58 
  59     // Actually, the grammar doesn't support whitespace characters
  60     // after boundary. But the mail implementation checks for it.
  61     // We will only check for these many whitespace characters after boundary
  62     private static final int NO_LWSP = 1000;
  63     private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE}
  64     private STATE state = STATE.START_MESSAGE;
  65 
  66     private final InputStream in;
  67     private final byte[] bndbytes;
  68     private final int bl;
  69     private final MIMEConfig config;
  70     private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
  71     private final int[] gss;                // BnM algo : Good Suffix Shift table
  72 
  73     /**
  74      * Have we parsed the data from our InputStream yet?
  75      */
  76     private boolean parsed;
  77 
  78     /*
  79      * Read and process body partsList until we see the
  80      * terminating boundary line (or EOF).
  81      */
  82     private boolean done = false;
  83 
  84     private boolean eof;
  85     private final int capacity;
  86     private byte[] buf;
  87     private int len;
  88     private boolean bol;        // beginning of the line
  89 
  90     /*
  91      * Parses the MIME content. At the EOF, it also closes input stream
  92      */
  93     MIMEParser(InputStream in, String boundary, MIMEConfig config) {
  94         this.in = in;
  95         this.bndbytes = getBytes("--"+boundary);
  96         bl = bndbytes.length;
  97         this.config = config;
  98         gss = new int[bl];
  99         compileBoundaryPattern();
 100 
 101         // \r\n + boundary + "--\r\n" + lots of LWSP
 102         capacity = config.chunkSize+2+bl+4+NO_LWSP;
 103         createBuf(capacity);
 104     }
 105 
 106     /**
 107      * Returns iterator for the parsing events. Use the iterator to advance
 108      * the parsing.
 109      *
 110      * @return iterator for parsing events
 111      */
 112     @Override
 113     public Iterator<MIMEEvent> iterator() {
 114         return new MIMEEventIterator();
 115     }
 116 
 117     class MIMEEventIterator implements Iterator<MIMEEvent> {
 118 
 119         @Override
 120         public boolean hasNext() {
 121             return !parsed;
 122         }
 123 
 124         @Override
 125         public MIMEEvent next() {
 126             switch(state) {
 127                 case START_MESSAGE :
 128                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);}
 129                     state = STATE.SKIP_PREAMBLE;
 130                     return MIMEEvent.START_MESSAGE;
 131 
 132                 case SKIP_PREAMBLE :
 133                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);}
 134                     skipPreamble();
 135                     // fall through
 136                 case START_PART :
 137                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);}
 138                     state = STATE.HEADERS;
 139                     return MIMEEvent.START_PART;
 140 
 141                 case HEADERS :
 142                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);}
 143                     InternetHeaders ih = readHeaders();
 144                     state = STATE.BODY;
 145                     bol = true;
 146                     return new MIMEEvent.Headers(ih);
 147 
 148                 case BODY :
 149                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);}
 150                     ByteBuffer buf = readBody();
 151                     bol = false;
 152                     return new MIMEEvent.Content(buf);
 153 
 154                 case END_PART :
 155                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);}
 156                     if (done) {
 157                         state = STATE.END_MESSAGE;
 158                     } else {
 159                         state = STATE.START_PART;
 160                     }
 161                     return MIMEEvent.END_PART;
 162 
 163                 case END_MESSAGE :
 164                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);}
 165                     parsed = true;
 166                     return MIMEEvent.END_MESSAGE;
 167 
 168                 default :
 169                     throw new MIMEParsingException("Unknown Parser state = "+state);
 170             }
 171         }
 172 
 173         @Override
 174         public void remove() {
 175             throw new UnsupportedOperationException();
 176         }
 177     }
 178 
 179     /**
 180      * Collects the headers for the current part by parsing mesage stream.
 181      *
 182      * @return headers for the current part
 183      */
 184     private InternetHeaders readHeaders() {
 185         if (!eof) {
 186             fillBuf();
 187         }
 188         return new InternetHeaders(new LineInputStream());
 189     }
 190 
 191     /**
 192      * Reads and saves the part of the current attachment part's content.
 193      * At the end of this method, buf should have the remaining data
 194      * at index 0.
 195      *
 196      * @return a chunk of the part's content
 197      *
 198      */
 199     private ByteBuffer readBody() {
 200         if (!eof) {
 201             fillBuf();
 202         }
 203         int start = match(buf, 0, len);     // matches boundary
 204         if (start == -1) {
 205             // No boundary is found
 206             assert eof || len >= config.chunkSize;
 207             int chunkSize = eof ? len : config.chunkSize;
 208             if (eof) {
 209                 done = true;
 210                 throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
 211             }
 212             return adjustBuf(chunkSize, len-chunkSize);
 213         }
 214         // Found boundary.
 215         // Is it at the start of a line ?
 216         int chunkLen = start;
 217         if (bol && start == 0) {
 218             // nothing to do
 219         } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) {
 220             --chunkLen;
 221             if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') {
 222                 --chunkLen;
 223             }
 224         } else {
 225            return adjustBuf(start+1, len-start-1);  // boundary is not at beginning of a line
 226         }
 227 
 228         if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') {
 229             state = STATE.END_PART;
 230             done = true;
 231             return adjustBuf(chunkLen, 0);
 232         }
 233 
 234         // Consider all the whitespace in boundary+whitespace+"\r\n"
 235         int lwsp = 0;
 236         for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
 237             ++lwsp;
 238         }
 239 
 240         // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n"
 241         if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') {
 242             state = STATE.END_PART;
 243             return adjustBuf(chunkLen, len-start-bl-lwsp-1);
 244         } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') {
 245             state = STATE.END_PART;
 246             return adjustBuf(chunkLen, len-start-bl-lwsp-2);
 247         } else if (start+bl+lwsp+1 < len) {
 248             return adjustBuf(chunkLen+1, len-chunkLen-1);       // boundary string in a part data
 249         } else if (eof) {
 250             done = true;
 251             throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
 252         }
 253 
 254         // Some more data needed to determine if it is indeed a proper boundary
 255         return adjustBuf(chunkLen, len-chunkLen);
 256     }
 257 
 258     /**
 259      * Returns a chunk from the original buffer. A new buffer is
 260      * created with the remaining bytes.
 261      *
 262      * @param chunkSize create a chunk with these many bytes
 263      * @param remaining bytes from the end of the buffer that need to be copied to
 264      *        the beginning of the new buffer
 265      * @return chunk
 266      */
 267     private ByteBuffer adjustBuf(int chunkSize, int remaining) {
 268         assert buf != null;
 269         assert chunkSize >= 0;
 270         assert remaining >= 0;
 271 
 272         byte[] temp = buf;
 273         // create a new buf and adjust it without this chunk
 274         createBuf(remaining);
 275         System.arraycopy(temp, len-remaining, buf, 0, remaining);
 276         len = remaining;
 277 
 278         return ByteBuffer.wrap(temp, 0, chunkSize);
 279     }
 280 
 281     private void createBuf(int min) {
 282         buf = new byte[min < capacity ? capacity : min];
 283     }
 284 
 285     /**
 286      * Skips the preamble to find the first attachment part
 287      */
 288     private void skipPreamble() {
 289 
 290         while(true) {
 291             if (!eof) {
 292                 fillBuf();
 293             }
 294             int start = match(buf, 0, len);     // matches boundary
 295             if (start == -1) {
 296                 // No boundary is found
 297                 if (eof) {
 298                     throw new MIMEParsingException("Missing start boundary");
 299                 } else {
 300                     adjustBuf(len-bl+1, bl-1);
 301                     continue;
 302                 }
 303             }
 304 
 305             if (start > config.chunkSize) {
 306                 adjustBuf(start, len-start);
 307                 continue;
 308             }
 309             // Consider all the whitespace boundary+whitespace+"\r\n"
 310             int lwsp = 0;
 311             for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
 312                 ++lwsp;
 313             }
 314             // Check for \n or \r\n
 315             if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) {
 316                 if (buf[start+bl+lwsp] == '\n') {
 317                     adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1);
 318                     break;
 319                 } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') {
 320                     adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2);
 321                     break;
 322                 }
 323             }
 324             adjustBuf(start+1, len-start-1);
 325         }
 326         if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);}
 327     }
 328 
 329     private static byte[] getBytes(String s) {
 330         char [] chars= s.toCharArray();
 331         int size = chars.length;
 332         byte[] bytes = new byte[size];
 333 
 334         for (int i = 0; i < size;) {
 335             bytes[i] = (byte) chars[i++];
 336         }
 337         return bytes;
 338     }
 339 
 340         /**
 341      * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
 342      *
 343      * Pre calculates arrays needed to generate the bad character
 344      * shift and the good suffix shift. Only the last seven bits
 345      * are used to see if chars match; This keeps the tables small
 346      * and covers the heavily used ASCII range, but occasionally
 347      * results in an aliased match for the bad character shift.
 348      */
 349     private void compileBoundaryPattern() {
 350         int i, j;
 351 
 352         // Precalculate part of the bad character shift
 353         // It is a table for where in the pattern each
 354         // lower 7-bit value occurs
 355         for (i = 0; i < bndbytes.length; i++) {
 356             bcs[bndbytes[i]&0x7F] = i + 1;
 357         }
 358 
 359         // Precalculate the good suffix shift
 360         // i is the shift amount being considered
 361 NEXT:   for (i = bndbytes.length; i > 0; i--) {
 362             // j is the beginning index of suffix being considered
 363             for (j = bndbytes.length - 1; j >= i; j--) {
 364                 // Testing for good suffix
 365                 if (bndbytes[j] == bndbytes[j-i]) {
 366                     // src[j..len] is a good suffix
 367                     gss[j-1] = i;
 368                 } else {
 369                     // No match. The array has already been
 370                     // filled up with correct values before.
 371                     continue NEXT;
 372                 }
 373             }
 374             // This fills up the remaining of optoSft
 375             // any suffix can not have larger shift amount
 376             // then its sub-suffix. Why???
 377             while (j > 0) {
 378                 gss[--j] = i;
 379             }
 380         }
 381         // Set the guard value because of unicode compression
 382         gss[bndbytes.length -1] = 1;
 383     }
 384 
 385     /**
 386      * Finds the boundary in the given buffer using Boyer-Moore algo.
 387      * Copied from java.util.regex.Pattern.java
 388      *
 389      * @param mybuf boundary to be searched in this mybuf
 390      * @param off start index in mybuf
 391      * @param len number of bytes in mybuf
 392      *
 393      * @return -1 if there is no match or index where the match starts
 394      */
 395     private int match(byte[] mybuf, int off, int len) {
 396         int last = len - bndbytes.length;
 397 
 398         // Loop over all possible match positions in text
 399 NEXT:   while (off <= last) {
 400             // Loop over pattern from right to left
 401             for (int j = bndbytes.length - 1; j >= 0; j--) {
 402                 byte ch = mybuf[off+j];
 403                 if (ch != bndbytes[j]) {
 404                     // Shift search to the right by the maximum of the
 405                     // bad character shift and the good suffix shift
 406                     off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]);
 407                     continue NEXT;
 408                 }
 409             }
 410             // Entire pattern matched starting at off
 411             return off;
 412         }
 413         return -1;
 414     }
 415 
 416     /**
 417      * Fills the remaining buf to the full capacity
 418      */
 419     private void fillBuf() {
 420         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);}
 421         assert !eof;
 422         while(len < buf.length) {
 423             int read;
 424             try {
 425                 read = in.read(buf, len, buf.length-len);
 426             } catch(IOException ioe) {
 427                 throw new MIMEParsingException(ioe);
 428             }
 429             if (read == -1) {
 430                 eof = true;
 431                 try {
 432                     if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");}
 433                     in.close();
 434                 } catch(IOException ioe) {
 435                     throw new MIMEParsingException(ioe);
 436                 }
 437                 break;
 438             } else {
 439                 len += read;
 440             }
 441         }
 442         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);}
 443     }
 444 
 445     private void doubleBuf() {
 446         byte[] temp = new byte[2*len];
 447         System.arraycopy(buf, 0, temp, 0, len);
 448         buf = temp;
 449         if (!eof) {
 450             fillBuf();
 451         }
 452     }
 453 
 454     class LineInputStream {
 455         private int offset;
 456 
 457         /*
 458          * Read a line containing only ASCII characters from the input
 459          * stream. A line is terminated by a CR or NL or CR-NL sequence.
 460          * A common error is a CR-CR-NL sequence, which will also terminate
 461          * a line.
 462          * The line terminator is not returned as part of the returned
 463          * String. Returns null if no data is available. <p>
 464          *
 465          * This class is similar to the deprecated
 466          * <code>DataInputStream.readLine()</code>
 467          */
 468         public String readLine() throws IOException {
 469 
 470             int hdrLen = 0;
 471             int lwsp = 0;
 472             while(offset+hdrLen < len) {
 473                 if (buf[offset+hdrLen] == '\n') {
 474                     lwsp = 1;
 475                     break;
 476                 }
 477                 if (offset+hdrLen+1 == len) {
 478                     doubleBuf();
 479                 }
 480                 if (offset+hdrLen+1 >= len) {   // No more data in the stream
 481                     assert eof;
 482                     return null;
 483                 }
 484                 if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') {
 485                     lwsp = 2;
 486                     break;
 487                 }
 488                 ++hdrLen;
 489             }
 490             if (hdrLen == 0) {
 491                 adjustBuf(offset+lwsp, len-offset-lwsp);
 492                 return null;
 493             }
 494 
 495             String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING);
 496             offset += hdrLen+lwsp;
 497             return hdr;
 498         }
 499 
 500     }
 501 
 502 }