New src/java.xml.bind/share/classes/com/sun/xml/internal/org/jvnet/mimepull/MIMEParser.java

   1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.org.jvnet.mimepull;
  27 
  28 import java.io.InputStream;
  29 import java.io.IOException;
  30 import java.util.*;
  31 import java.util.logging.Logger;
  32 import java.nio.ByteBuffer;
  33 import java.util.logging.Level;
  34 
/**
 * Pull parser for MIME messages. Applications can use the pull API to
 * continue parsing a MIME message lazily.
 *
 * <p>For example:
 *
 * <pre>{@code
 * MIMEParser parser = ...
 * Iterator<MIMEEvent> it = parser.iterator();
 * while (it.hasNext()) {
 *   MIMEEvent event = it.next();
 *   ...
 * }
 * }</pre>
 *
 * @author Jitendra Kotamraju
 */
  53 class MIMEParser implements Iterable<MIMEEvent> {
  54 
  55     private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());
  56 
  57     private static final String HEADER_ENCODING = "ISO8859-1";
  58 
  59     // Actually, the grammar doesn't support whitespace characters
  60     // after boundary. But the mail implementation checks for it.
  61     // We will only check for these many whitespace characters after boundary
  62     private static final int NO_LWSP = 1000;
  63     private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE}
  64     private STATE state = STATE.START_MESSAGE;
  65 
  66     private final InputStream in;
  67     private final byte[] bndbytes;
  68     private final int bl;
  69     private final MIMEConfig config;
  70     private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
  71     private final int[] gss;                // BnM algo : Good Suffix Shift table
  72 
  73     /**
  74      * Have we parsed the data from our InputStream yet?
  75      */
  76     private boolean parsed;
  77 
  78     /*
  79      * Read and process body partsList until we see the
  80      * terminating boundary line (or EOF).
  81      */
  82     private boolean done = false;
  83 
  84     private boolean eof;
  85     private final int capacity;
  86     private byte[] buf;
  87     private int len;
  88     private boolean bol;        // beginning of the line
  89 
  90     /*
  91      * Parses the MIME content. At the EOF, it also closes input stream
  92      */
  93     MIMEParser(InputStream in, String boundary, MIMEConfig config) {
  94         this.in = in;
  95         this.bndbytes = getBytes("--"+boundary);
  96         bl = bndbytes.length;
  97         this.config = config;
  98         gss = new int[bl];
  99         compileBoundaryPattern();
 100 
 101         // \r\n + boundary + "--\r\n" + lots of LWSP
 102         capacity = config.chunkSize+2+bl+4+NO_LWSP;
 103         createBuf(capacity);
 104     }
 105 
 106     /**
 107      * Returns iterator for the parsing events. Use the iterator to advance
 108      * the parsing.
 109      *
 110      * @return iterator for parsing events
 111      */
 112     @Override
 113     public Iterator<MIMEEvent> iterator() {
 114         return new MIMEEventIterator();
 115     }
 116 
 117     class MIMEEventIterator implements Iterator<MIMEEvent> {
 118 
 119         @Override
 120         public boolean hasNext() {
 121             return !parsed;
 122         }
 123 
 124         @Override
 125         public MIMEEvent next() {
 126 
 127             if (parsed) {
 128                 throw new NoSuchElementException();
 129             }
 130 
 131             switch(state) {
 132                 case START_MESSAGE :
 133                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);}
 134                     state = STATE.SKIP_PREAMBLE;
 135                     return MIMEEvent.START_MESSAGE;
 136 
 137                 case SKIP_PREAMBLE :
 138                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);}
 139                     skipPreamble();
 140                     // fall through
 141                 case START_PART :
 142                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);}
 143                     state = STATE.HEADERS;
 144                     return MIMEEvent.START_PART;
 145 
 146                 case HEADERS :
 147                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);}
 148                     InternetHeaders ih = readHeaders();
 149                     state = STATE.BODY;
 150                     bol = true;
 151                     return new MIMEEvent.Headers(ih);
 152 
 153                 case BODY :
 154                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);}
 155                     ByteBuffer buf = readBody();
 156                     bol = false;
 157                     return new MIMEEvent.Content(buf);
 158 
 159                 case END_PART :
 160                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);}
 161                     if (done) {
 162                         state = STATE.END_MESSAGE;
 163                     } else {
 164                         state = STATE.START_PART;
 165                     }
 166                     return MIMEEvent.END_PART;
 167 
 168                 case END_MESSAGE :
 169                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);}
 170                     parsed = true;
 171                     return MIMEEvent.END_MESSAGE;
 172 
 173                 default :
 174                     throw new MIMEParsingException("Unknown Parser state = "+state);
 175             }
 176         }
 177 
 178         @Override
 179         public void remove() {
 180             throw new UnsupportedOperationException();
 181         }
 182     }
 183 
 184     /**
 185      * Collects the headers for the current part by parsing mesage stream.
 186      *
 187      * @return headers for the current part
 188      */
 189     private InternetHeaders readHeaders() {
 190         if (!eof) {
 191             fillBuf();
 192         }
 193         return new InternetHeaders(new LineInputStream());
 194     }
 195 
 196     /**
 197      * Reads and saves the part of the current attachment part's content.
 198      * At the end of this method, buf should have the remaining data
 199      * at index 0.
 200      *
 201      * @return a chunk of the part's content
 202      *
 203      */
 204     private ByteBuffer readBody() {
 205         if (!eof) {
 206             fillBuf();
 207         }
 208         int start = match(buf, 0, len);     // matches boundary
 209         if (start == -1) {
 210             // No boundary is found
 211             assert eof || len >= config.chunkSize;
 212             int chunkSize = eof ? len : config.chunkSize;
 213             if (eof) {
 214                 done = true;
 215                 throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
 216             }
 217             return adjustBuf(chunkSize, len-chunkSize);
 218         }
 219         // Found boundary.
 220         // Is it at the start of a line ?
 221         int chunkLen = start;
 222         if (bol && start == 0) {
 223             // nothing to do
 224         } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) {
 225             --chunkLen;
 226             if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') {
 227                 --chunkLen;
 228             }
 229         } else {
 230            return adjustBuf(start+1, len-start-1);  // boundary is not at beginning of a line
 231         }
 232 
 233         if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') {
 234             state = STATE.END_PART;
 235             done = true;
 236             return adjustBuf(chunkLen, 0);
 237         }
 238 
 239         // Consider all the whitespace in boundary+whitespace+"\r\n"
 240         int lwsp = 0;
 241         for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
 242             ++lwsp;
 243         }
 244 
 245         // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n"
 246         if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') {
 247             state = STATE.END_PART;
 248             return adjustBuf(chunkLen, len-start-bl-lwsp-1);
 249         } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') {
 250             state = STATE.END_PART;
 251             return adjustBuf(chunkLen, len-start-bl-lwsp-2);
 252         } else if (start+bl+lwsp+1 < len) {
 253             return adjustBuf(chunkLen+1, len-chunkLen-1);       // boundary string in a part data
 254         } else if (eof) {
 255             done = true;
 256             throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
 257         }
 258 
 259         // Some more data needed to determine if it is indeed a proper boundary
 260         return adjustBuf(chunkLen, len-chunkLen);
 261     }
 262 
 263     /**
 264      * Returns a chunk from the original buffer. A new buffer is
 265      * created with the remaining bytes.
 266      *
 267      * @param chunkSize create a chunk with these many bytes
 268      * @param remaining bytes from the end of the buffer that need to be copied to
 269      *        the beginning of the new buffer
 270      * @return chunk
 271      */
 272     private ByteBuffer adjustBuf(int chunkSize, int remaining) {
 273         assert buf != null;
 274         assert chunkSize >= 0;
 275         assert remaining >= 0;
 276 
 277         byte[] temp = buf;
 278         // create a new buf and adjust it without this chunk
 279         createBuf(remaining);
 280         System.arraycopy(temp, len-remaining, buf, 0, remaining);
 281         len = remaining;
 282 
 283         return ByteBuffer.wrap(temp, 0, chunkSize);
 284     }
 285 
 286     private void createBuf(int min) {
 287         buf = new byte[min < capacity ? capacity : min];
 288     }
 289 
 290     /**
 291      * Skips the preamble to find the first attachment part
 292      */
 293     private void skipPreamble() {
 294 
 295         while(true) {
 296             if (!eof) {
 297                 fillBuf();
 298             }
 299             int start = match(buf, 0, len);     // matches boundary
 300             if (start == -1) {
 301                 // No boundary is found
 302                 if (eof) {
 303                     throw new MIMEParsingException("Missing start boundary");
 304                 } else {
 305                     adjustBuf(len-bl+1, bl-1);
 306                     continue;
 307                 }
 308             }
 309 
 310             if (start > config.chunkSize) {
 311                 adjustBuf(start, len-start);
 312                 continue;
 313             }
 314             // Consider all the whitespace boundary+whitespace+"\r\n"
 315             int lwsp = 0;
 316             for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
 317                 ++lwsp;
 318             }
 319             // Check for \n or \r\n
 320             if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) {
 321                 if (buf[start+bl+lwsp] == '\n') {
 322                     adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1);
 323                     break;
 324                 } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') {
 325                     adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2);
 326                     break;
 327                 }
 328             }
 329             adjustBuf(start+1, len-start-1);
 330         }
 331         if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);}
 332     }
 333 
 334     private static byte[] getBytes(String s) {
 335         char [] chars= s.toCharArray();
 336         int size = chars.length;
 337         byte[] bytes = new byte[size];
 338 
 339         for (int i = 0; i < size;) {
 340             bytes[i] = (byte) chars[i++];
 341         }
 342         return bytes;
 343     }
 344 
 345         /**
 346      * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
 347      *
 348      * Pre calculates arrays needed to generate the bad character
 349      * shift and the good suffix shift. Only the last seven bits
 350      * are used to see if chars match; This keeps the tables small
 351      * and covers the heavily used ASCII range, but occasionally
 352      * results in an aliased match for the bad character shift.
 353      */
 354     private void compileBoundaryPattern() {
 355         int i, j;
 356 
 357         // Precalculate part of the bad character shift
 358         // It is a table for where in the pattern each
 359         // lower 7-bit value occurs
 360         for (i = 0; i < bndbytes.length; i++) {
 361             bcs[bndbytes[i]&0x7F] = i + 1;
 362         }
 363 
 364         // Precalculate the good suffix shift
 365         // i is the shift amount being considered
 366 NEXT:   for (i = bndbytes.length; i > 0; i--) {
 367             // j is the beginning index of suffix being considered
 368             for (j = bndbytes.length - 1; j >= i; j--) {
 369                 // Testing for good suffix
 370                 if (bndbytes[j] == bndbytes[j-i]) {
 371                     // src[j..len] is a good suffix
 372                     gss[j-1] = i;
 373                 } else {
 374                     // No match. The array has already been
 375                     // filled up with correct values before.
 376                     continue NEXT;
 377                 }
 378             }
 379             // This fills up the remaining of optoSft
 380             // any suffix can not have larger shift amount
 381             // then its sub-suffix. Why???
 382             while (j > 0) {
 383                 gss[--j] = i;
 384             }
 385         }
 386         // Set the guard value because of unicode compression
 387         gss[bndbytes.length -1] = 1;
 388     }
 389 
 390     /**
 391      * Finds the boundary in the given buffer using Boyer-Moore algo.
 392      * Copied from java.util.regex.Pattern.java
 393      *
 394      * @param mybuf boundary to be searched in this mybuf
 395      * @param off start index in mybuf
 396      * @param len number of bytes in mybuf
 397      *
 398      * @return -1 if there is no match or index where the match starts
 399      */
 400     private int match(byte[] mybuf, int off, int len) {
 401         int last = len - bndbytes.length;
 402 
 403         // Loop over all possible match positions in text
 404 NEXT:   while (off <= last) {
 405             // Loop over pattern from right to left
 406             for (int j = bndbytes.length - 1; j >= 0; j--) {
 407                 byte ch = mybuf[off+j];
 408                 if (ch != bndbytes[j]) {
 409                     // Shift search to the right by the maximum of the
 410                     // bad character shift and the good suffix shift
 411                     off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]);
 412                     continue NEXT;
 413                 }
 414             }
 415             // Entire pattern matched starting at off
 416             return off;
 417         }
 418         return -1;
 419     }
 420 
 421     /**
 422      * Fills the remaining buf to the full capacity
 423      */
 424     private void fillBuf() {
 425         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);}
 426         assert !eof;
 427         while(len < buf.length) {
 428             int read;
 429             try {
 430                 read = in.read(buf, len, buf.length-len);
 431             } catch(IOException ioe) {
 432                 throw new MIMEParsingException(ioe);
 433             }
 434             if (read == -1) {
 435                 eof = true;
 436                 try {
 437                     if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");}
 438                     in.close();
 439                 } catch(IOException ioe) {
 440                     throw new MIMEParsingException(ioe);
 441                 }
 442                 break;
 443             } else {
 444                 len += read;
 445             }
 446         }
 447         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);}
 448     }
 449 
 450     private void doubleBuf() {
 451         byte[] temp = new byte[2*len];
 452         System.arraycopy(buf, 0, temp, 0, len);
 453         buf = temp;
 454         if (!eof) {
 455             fillBuf();
 456         }
 457     }
 458 
 459     class LineInputStream {
 460         private int offset;
 461 
 462         /*
 463          * Read a line containing only ASCII characters from the input
 464          * stream. A line is terminated by a CR or NL or CR-NL sequence.
 465          * A common error is a CR-CR-NL sequence, which will also terminate
 466          * a line.
 467          * The line terminator is not returned as part of the returned
 468          * String. Returns null if no data is available. <p>
 469          *
 470          * This class is similar to the deprecated
 471          * <code>DataInputStream.readLine()</code>
 472          */
 473         public String readLine() throws IOException {
 474 
 475             int hdrLen = 0;
 476             int lwsp = 0;
 477             while(offset+hdrLen < len) {
 478                 if (buf[offset+hdrLen] == '\n') {
 479                     lwsp = 1;
 480                     break;
 481                 }
 482                 if (offset+hdrLen+1 == len) {
 483                     doubleBuf();
 484                 }
 485                 if (offset+hdrLen+1 >= len) {   // No more data in the stream
 486                     assert eof;
 487                     return null;
 488                 }
 489                 if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') {
 490                     lwsp = 2;
 491                     break;
 492                 }
 493                 ++hdrLen;
 494             }
 495             if (hdrLen == 0) {
 496                 adjustBuf(offset+lwsp, len-offset-lwsp);
 497                 return null;
 498             }
 499 
 500             String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING);
 501             offset += hdrLen+lwsp;
 502             return hdr;
 503         }
 504 
 505     }
 506 
 507 }