1 /* 2 * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * @(#)MimeUtility.java 1.45 03/03/10 28 */ 29 30 31 32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet; 33 34 import java.io.*; 35 import java.util.*; 36 37 import javax.activation.DataHandler; 38 import javax.activation.DataSource; 39 40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException; 41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*; 42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil; 43 44 /** 45 * This is a utility class that provides various MIME related 46 * functionality. <p> 47 * 48 * There are a set of methods to encode and decode MIME headers as 49 * per RFC 2047. 
A brief description of how such headers are handled is
 * given below: <p>
 *
 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
 * characters. Headers that contain non US-ASCII characters must be
 * encoded so that they contain only US-ASCII characters. Basically,
 * this process involves using either BASE64 or QP to encode certain
 * characters. RFC 2047 describes this in detail. <p>
 *
 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
 * subset of Unicode (and occupies the range 0 - 127). A String
 * that contains only ASCII characters is already mail-safe. If the
 * String contains non US-ASCII characters, it must be encoded. An
 * additional complexity in this step is that since Unicode is not
 * yet a widely used charset, one might want to first charset-encode
 * the String into another charset and then do the transfer-encoding.
 * <p>
 * Note that to get the actual bytes of a mail-safe String (say,
 * for sending over SMTP), one must do
 * <p><blockquote><pre>
 *
 *      byte[] bytes = string.getBytes("iso-8859-1");
 *
 * </pre></blockquote><p>
 *
 * The <code>setHeader</code> and <code>addHeader</code> methods
 * on MimeMessage and MimeBodyPart assume that the given header values
 * are Unicode strings that contain only US-ASCII characters. Hence
 * the callers of those methods must ensure that the values they pass
 * do not contain non US-ASCII characters. The methods in this class
 * help do this. <p>
 *
 * The <code>getHeader</code> family of methods on MimeMessage and
 * MimeBodyPart return the raw header value. These might be encoded
 * as per RFC 2047, and if so, must be decoded into Unicode Strings.
 * The methods in this class help to do this. <p>
 *
 * Several System properties control strict conformance to the MIME
 * spec. Note that these are not session properties but must be set
 * globally as System properties.
<p>
 *
 * The <code>mail.mime.decodetext.strict</code> property controls
 * decoding of MIME encoded words. The MIME spec requires that encoded
 * words start at the beginning of a whitespace separated word. Some
 * mailers incorrectly include encoded words in the middle of a word.
 * If the <code>mail.mime.decodetext.strict</code> System property is
 * set to <code>"false"</code>, an attempt will be made to decode these
 * illegal encoded words. The default is true. <p>
 *
 * The <code>mail.mime.encodeeol.strict</code> property controls the
 * choice of Content-Transfer-Encoding for MIME parts that are not of
 * type "text". Often such parts will contain textual data for which
 * an encoding that allows normal end of line conventions is appropriate.
 * In rare cases, such a part will appear to contain entirely textual
 * data, but will require an encoding that preserves CR and LF characters
 * without change. If the <code>mail.mime.encodeeol.strict</code>
 * System property is set to <code>"true"</code>, such an encoding will
 * be used when necessary. The default is false. <p>
 *
 * In addition, the <code>mail.mime.charset</code> System property can
 * be used to specify the default MIME charset to use for encoded words
 * and text parts that don't otherwise specify a charset. Normally, the
 * default MIME charset is derived from the default Java charset, as
 * specified in the <code>file.encoding</code> System property. Most
 * applications will have no need to explicitly set the default MIME
 * charset. In cases where the default MIME charset to be used for
 * mail messages is different from the charset used for files stored on
 * the system, this property should be set.
*
 * @version     1.45, 03/03/10
 * @author  John Mani
 * @author  Bill Shannon
 */

public class MimeUtility {

    // This class cannot be instantiated
    private MimeUtility() { }

    // Passed as the second argument of checkAscii(InputStream, ...) by
    // getEncoding(DataSource); presumably means "examine the whole
    // stream" -- confirm against checkAscii.
    public static final int ALL = -1;

    // Initial capacity of the ByteArrayOutputStream used by doEncode.
    private static final int BUFFER_SIZE = 1024;
    // When false, decodeText and decodeWord also try to decode
    // encoded-words embedded in the middle of a token.
    private static boolean decodeStrict = true;
    // Handed to AsciiOutputStream when getEncoding(DataHandler) scans a
    // non-text part.
    private static boolean encodeEolStrict = false;
    // When true, doEncode separates consecutive encoded-words with
    // "\r\n " instead of a single space.
    private static boolean foldEncodedWords = false;
    // When false, fold and unfold return their argument unchanged.
    private static boolean foldText = true;

    // Read the tuning flags from System properties once, at class load.
    // A property that is absent, or whose value is not the recognised
    // literal, leaves the flag at the default noted below.
    static {
        try {
            String s = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
            // default to true
            decodeStrict = s == null || !s.equalsIgnoreCase("false");
            s = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
            // default to false
            encodeEolStrict = s != null && s.equalsIgnoreCase("true");
            s = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
            // default to false
            foldEncodedWords = s != null && s.equalsIgnoreCase("true");
            s = SAAJUtil.getSystemProperty("mail.mime.foldtext");
            // default to true
            foldText = s == null || !s.equalsIgnoreCase("false");
        } catch (SecurityException sex) {
            // ignore it; flags not yet assigned keep their initial values
        }
    }


    /**
     * Get the content-transfer-encoding that should be applied
     * to the input stream of this datasource, to make it mailsafe. <p>
     *
     * The algorithm used here is: <br>
     * <ul>
     * <li>
     * If the primary type of this datasource is "text" and if all
     * the bytes in its input stream are US-ASCII, then the encoding
     * is "7bit". If more than half of the bytes are non-US-ASCII, then
     * the encoding is "base64". If less than half of the bytes are
     * non-US-ASCII, then the encoding is "quoted-printable".
     * <li>
     * If the primary type of this datasource is not "text", then if
     * all the bytes of its input stream are US-ASCII, the encoding
     * is "7bit". If there is even one non-US-ASCII character, the
     * encoding is "base64".
173 * </ul> 174 * 175 * @param ds DataSource 176 * @return the encoding. This is either "7bit", 177 * "quoted-printable" or "base64" 178 */ 179 public static String getEncoding(DataSource ds) { 180 ContentType cType = null; 181 InputStream is = null; 182 String encoding = null; 183 184 try { 185 cType = new ContentType(ds.getContentType()); 186 is = ds.getInputStream(); 187 } catch (Exception ex) { 188 return "base64"; // what else ?! 189 } 190 191 boolean isText = cType.match("text/*"); 192 // if not text, stop processing when we see non-ASCII 193 int i = checkAscii(is, ALL, !isText); 194 switch (i) { 195 case ALL_ASCII: 196 encoding = "7bit"; // all ascii 197 break; 198 case MOSTLY_ASCII: 199 encoding = "quoted-printable"; // mostly ascii 200 break; 201 default: 202 encoding = "base64"; // mostly binary 203 break; 204 } 205 206 // Close the input stream 207 try { 208 is.close(); 209 } catch (IOException ioex) { } 210 211 return encoding; 212 } 213 214 /** 215 * Same as <code>getEncoding(DataSource)</code> except that instead 216 * of reading the data from an <code>InputStream</code> it uses the 217 * <code>writeTo</code> method to examine the data. This is more 218 * efficient in the common case of a <code>DataHandler</code> 219 * created with an object and a MIME type (for example, a 220 * "text/plain" String) because all the I/O is done in this 221 * thread. In the case requiring an <code>InputStream</code> the 222 * <code>DataHandler</code> uses a thread, a pair of pipe streams, 223 * and the <code>writeTo</code> method to produce the data. <p> 224 * 225 * @since JavaMail 1.2 226 */ 227 public static String getEncoding(DataHandler dh) { 228 ContentType cType = null; 229 String encoding = null; 230 231 /* 232 * Try to pick the most efficient means of determining the 233 * encoding. If this DataHandler was created using a DataSource, 234 * the getEncoding(DataSource) method is typically faster. 
If 235 * the DataHandler was created with an object, this method is 236 * much faster. To distinguish the two cases, we use a heuristic. 237 * A DataHandler created with an object will always have a null name. 238 * A DataHandler created with a DataSource will usually have a 239 * non-null name. 240 * 241 * XXX - This is actually quite a disgusting hack, but it makes 242 * a common case run over twice as fast. 243 */ 244 if (dh.getName() != null) 245 return getEncoding(dh.getDataSource()); 246 247 try { 248 cType = new ContentType(dh.getContentType()); 249 } catch (Exception ex) { 250 return "base64"; // what else ?! 251 } 252 253 if (cType.match("text/*")) { 254 // Check all of the available bytes 255 AsciiOutputStream aos = new AsciiOutputStream(false, false); 256 try { 257 dh.writeTo(aos); 258 } catch (IOException ex) { } // ignore it 259 switch (aos.getAscii()) { 260 case ALL_ASCII: 261 encoding = "7bit"; // all ascii 262 break; 263 case MOSTLY_ASCII: 264 encoding = "quoted-printable"; // mostly ascii 265 break; 266 default: 267 encoding = "base64"; // mostly binary 268 break; 269 } 270 } else { // not "text" 271 // Check all of available bytes, break out if we find 272 // at least one non-US-ASCII character 273 AsciiOutputStream aos = 274 new AsciiOutputStream(true, encodeEolStrict); 275 try { 276 dh.writeTo(aos); 277 } catch (IOException ex) { } // ignore it 278 if (aos.getAscii() == ALL_ASCII) // all ascii 279 encoding = "7bit"; 280 else // found atleast one non-ascii character, use b64 281 encoding = "base64"; 282 } 283 284 return encoding; 285 } 286 287 /** 288 * Decode the given input stream. The Input stream returned is 289 * the decoded input stream. All the encodings defined in RFC 2045 290 * are supported here. They include "base64", "quoted-printable", 291 * "7bit", "8bit", and "binary". In addition, "uuencode" is also 292 * supported. 293 * 294 * @param is input stream 295 * @param encoding the encoding of the stream. 
296 * @return decoded input stream. 297 */ 298 public static InputStream decode(InputStream is, String encoding) 299 throws MessagingException { 300 if (encoding.equalsIgnoreCase("base64")) 301 return new BASE64DecoderStream(is); 302 else if (encoding.equalsIgnoreCase("quoted-printable")) 303 return new QPDecoderStream(is); 304 else if (encoding.equalsIgnoreCase("uuencode") || 305 encoding.equalsIgnoreCase("x-uuencode") || 306 encoding.equalsIgnoreCase("x-uue")) 307 return new UUDecoderStream(is); 308 else if (encoding.equalsIgnoreCase("binary") || 309 encoding.equalsIgnoreCase("7bit") || 310 encoding.equalsIgnoreCase("8bit")) 311 return is; 312 else 313 throw new MessagingException("Unknown encoding: " + encoding); 314 } 315 316 /** 317 * Wrap an encoder around the given output stream. 318 * All the encodings defined in RFC 2045 are supported here. 319 * They include "base64", "quoted-printable", "7bit", "8bit" and 320 * "binary". In addition, "uuencode" is also supported. 321 * 322 * @param os output stream 323 * @param encoding the encoding of the stream. 324 * @return output stream that applies the 325 * specified encoding. 326 */ 327 public static OutputStream encode(OutputStream os, String encoding) 328 throws MessagingException { 329 if (encoding == null) 330 return os; 331 else if (encoding.equalsIgnoreCase("base64")) 332 return new BASE64EncoderStream(os); 333 else if (encoding.equalsIgnoreCase("quoted-printable")) 334 return new QPEncoderStream(os); 335 else if (encoding.equalsIgnoreCase("uuencode") || 336 encoding.equalsIgnoreCase("x-uuencode") || 337 encoding.equalsIgnoreCase("x-uue")) 338 return new UUEncoderStream(os); 339 else if (encoding.equalsIgnoreCase("binary") || 340 encoding.equalsIgnoreCase("7bit") || 341 encoding.equalsIgnoreCase("8bit")) 342 return os; 343 else 344 throw new MessagingException("Unknown encoding: " +encoding); 345 } 346 347 /** 348 * Wrap an encoder around the given output stream. 
349 * All the encodings defined in RFC 2045 are supported here. 350 * They include "base64", "quoted-printable", "7bit", "8bit" and 351 * "binary". In addition, "uuencode" is also supported. 352 * The <code>filename</code> parameter is used with the "uuencode" 353 * encoding and is included in the encoded output. 354 * 355 * @param os output stream 356 * @param encoding the encoding of the stream. 357 * @param filename name for the file being encoded (only used 358 * with uuencode) 359 * @return output stream that applies the 360 * specified encoding. 361 * @since JavaMail 1.2 362 */ 363 public static OutputStream encode(OutputStream os, String encoding, 364 String filename) 365 throws MessagingException { 366 if (encoding == null) 367 return os; 368 else if (encoding.equalsIgnoreCase("base64")) 369 return new BASE64EncoderStream(os); 370 else if (encoding.equalsIgnoreCase("quoted-printable")) 371 return new QPEncoderStream(os); 372 else if (encoding.equalsIgnoreCase("uuencode") || 373 encoding.equalsIgnoreCase("x-uuencode") || 374 encoding.equalsIgnoreCase("x-uue")) 375 return new UUEncoderStream(os, filename); 376 else if (encoding.equalsIgnoreCase("binary") || 377 encoding.equalsIgnoreCase("7bit") || 378 encoding.equalsIgnoreCase("8bit")) 379 return os; 380 else 381 throw new MessagingException("Unknown encoding: " +encoding); 382 } 383 384 /** 385 * Encode a RFC 822 "text" token into mail-safe form as per 386 * RFC 2047. <p> 387 * 388 * The given Unicode string is examined for non US-ASCII 389 * characters. If the string contains only US-ASCII characters, 390 * it is returned as-is. If the string contains non US-ASCII 391 * characters, it is first character-encoded using the platform's 392 * default charset, then transfer-encoded using either the B or 393 * Q encoding. The resulting bytes are then returned as a Unicode 394 * string containing only ASCII characters. 
<p> 395 * 396 * Note that this method should be used to encode only 397 * "unstructured" RFC 822 headers. <p> 398 * 399 * Example of usage: 400 * <p><blockquote><pre> 401 * 402 * MimeBodyPart part = ... 403 * String rawvalue = "FooBar Mailer, Japanese version 1.1" 404 * try { 405 * // If we know for sure that rawvalue contains only US-ASCII 406 * // characters, we can skip the encoding part 407 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue)); 408 * } catch (UnsupportedEncodingException e) { 409 * // encoding failure 410 * } catch (MessagingException me) { 411 * // setHeader() failure 412 * } 413 * 414 * </pre></blockquote><p> 415 * 416 * @param text unicode string 417 * @return Unicode string containing only US-ASCII characters 418 * @exception UnsupportedEncodingException if the encoding fails 419 */ 420 public static String encodeText(String text) 421 throws UnsupportedEncodingException { 422 return encodeText(text, null, null); 423 } 424 425 /** 426 * Encode a RFC 822 "text" token into mail-safe form as per 427 * RFC 2047. <p> 428 * 429 * The given Unicode string is examined for non US-ASCII 430 * characters. If the string contains only US-ASCII characters, 431 * it is returned as-is. If the string contains non US-ASCII 432 * characters, it is first character-encoded using the specified 433 * charset, then transfer-encoded using either the B or Q encoding. 434 * The resulting bytes are then returned as a Unicode string 435 * containing only ASCII characters. <p> 436 * 437 * Note that this method should be used to encode only 438 * "unstructured" RFC 822 headers. 439 * 440 * @param text the header value 441 * @param charset the charset. If this parameter is null, the 442 * platform's default chatset is used. 443 * @param encoding the encoding to be used. Currently supported 444 * values are "B" and "Q". 
If this parameter is null, then 445 * the "Q" encoding is used if most of characters to be 446 * encoded are in the ASCII charset, otherwise "B" encoding 447 * is used. 448 * @return Unicode string containing only US-ASCII characters 449 */ 450 public static String encodeText(String text, String charset, 451 String encoding) 452 throws UnsupportedEncodingException { 453 return encodeWord(text, charset, encoding, false); 454 } 455 456 /** 457 * Decode "unstructured" headers, that is, headers that are defined 458 * as '*text' as per RFC 822. <p> 459 * 460 * The string is decoded using the algorithm specified in 461 * RFC 2047, Section 6.1.1. If the charset-conversion fails 462 * for any sequence, an UnsupportedEncodingException is thrown. 463 * If the String is not an RFC 2047 style encoded header, it is 464 * returned as-is <p> 465 * 466 * Example of usage: 467 * <p><blockquote><pre> 468 * 469 * MimeBodyPart part = ... 470 * String rawvalue = null; 471 * String value = null; 472 * try { 473 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null) 474 * value = MimeUtility.decodeText(rawvalue); 475 * } catch (UnsupportedEncodingException e) { 476 * // Don't care 477 * value = rawvalue; 478 * } catch (MessagingException me) { } 479 * 480 * return value; 481 * 482 * </pre></blockquote><p> 483 * 484 * @param etext the possibly encoded value 485 * @exception UnsupportedEncodingException if the charset 486 * conversion failed. 487 */ 488 public static String decodeText(String etext) 489 throws UnsupportedEncodingException { 490 /* 491 * We look for sequences separated by "linear-white-space". 492 * (as per RFC 2047, Section 6.1.1) 493 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL. 494 */ 495 String lwsp = " \t\n\r"; 496 StringTokenizer st; 497 498 /* 499 * First, lets do a quick run thru the string and check 500 * whether the sequence "=?" exists at all. 
If none exists, 501 * we know there are no encoded-words in here and we can just 502 * return the string as-is, without suffering thru the later 503 * decoding logic. 504 * This handles the most common case of unencoded headers 505 * efficiently. 506 */ 507 if (etext.indexOf("=?") == -1) 508 return etext; 509 510 // Encoded words found. Start decoding ... 511 512 st = new StringTokenizer(etext, lwsp, true); 513 StringBuffer sb = new StringBuffer(); // decode buffer 514 StringBuffer wsb = new StringBuffer(); // white space buffer 515 boolean prevWasEncoded = false; 516 517 while (st.hasMoreTokens()) { 518 char c; 519 String s = st.nextToken(); 520 // If whitespace, append it to the whitespace buffer 521 if (((c = s.charAt(0)) == ' ') || (c == '\t') || 522 (c == '\r') || (c == '\n')) 523 wsb.append(c); 524 else { 525 // Check if token is an 'encoded-word' .. 526 String word; 527 try { 528 word = decodeWord(s); 529 // Yes, this IS an 'encoded-word'. 530 if (!prevWasEncoded && wsb.length() > 0) { 531 // if the previous word was also encoded, we 532 // should ignore the collected whitespace. Else 533 // we include the whitespace as well. 534 sb.append(wsb); 535 } 536 prevWasEncoded = true; 537 } catch (ParseException pex) { 538 // This is NOT an 'encoded-word'. 539 word = s; 540 // possibly decode inner encoded words 541 if (!decodeStrict) 542 word = decodeInnerWords(word); 543 // include colleced whitespace .. 544 if (wsb.length() > 0) 545 sb.append(wsb); 546 prevWasEncoded = false; 547 } 548 sb.append(word); // append the actual word 549 wsb.setLength(0); // reset wsb for reuse 550 } 551 } 552 return sb.toString(); 553 } 554 555 /** 556 * Encode a RFC 822 "word" token into mail-safe form as per 557 * RFC 2047. <p> 558 * 559 * The given Unicode string is examined for non US-ASCII 560 * characters. If the string contains only US-ASCII characters, 561 * it is returned as-is. 
If the string contains non US-ASCII 562 * characters, it is first character-encoded using the platform's 563 * default charset, then transfer-encoded using either the B or 564 * Q encoding. The resulting bytes are then returned as a Unicode 565 * string containing only ASCII characters. <p> 566 * 567 * This method is meant to be used when creating RFC 822 "phrases". 568 * The InternetAddress class, for example, uses this to encode 569 * it's 'phrase' component. 570 * 571 * @param text unicode string 572 * @return Array of Unicode strings containing only US-ASCII 573 * characters. 574 * @exception UnsupportedEncodingException if the encoding fails 575 */ 576 public static String encodeWord(String word) 577 throws UnsupportedEncodingException { 578 return encodeWord(word, null, null); 579 } 580 581 /** 582 * Encode a RFC 822 "word" token into mail-safe form as per 583 * RFC 2047. <p> 584 * 585 * The given Unicode string is examined for non US-ASCII 586 * characters. If the string contains only US-ASCII characters, 587 * it is returned as-is. If the string contains non US-ASCII 588 * characters, it is first character-encoded using the specified 589 * charset, then transfer-encoded using either the B or Q encoding. 590 * The resulting bytes are then returned as a Unicode string 591 * containing only ASCII characters. <p> 592 * 593 * @param text unicode string 594 * @param charset the MIME charset 595 * @param encoding the encoding to be used. Currently supported 596 * values are "B" and "Q". If this parameter is null, then 597 * the "Q" encoding is used if most of characters to be 598 * encoded are in the ASCII charset, otherwise "B" encoding 599 * is used. 
600 * @return Unicode string containing only US-ASCII characters 601 * @exception UnsupportedEncodingException if the encoding fails 602 */ 603 public static String encodeWord(String word, String charset, 604 String encoding) 605 throws UnsupportedEncodingException { 606 return encodeWord(word, charset, encoding, true); 607 } 608 609 /* 610 * Encode the given string. The parameter 'encodingWord' should 611 * be true if a RFC 822 "word" token is being encoded and false if a 612 * RFC 822 "text" token is being encoded. This is because the 613 * "Q" encoding defined in RFC 2047 has more restrictions when 614 * encoding "word" tokens. (Sigh) 615 */ 616 private static String encodeWord(String string, String charset, 617 String encoding, boolean encodingWord) 618 throws UnsupportedEncodingException { 619 620 // If 'string' contains only US-ASCII characters, just 621 // return it. 622 int ascii = checkAscii(string); 623 if (ascii == ALL_ASCII) 624 return string; 625 626 // Else, apply the specified charset conversion. 627 String jcharset; 628 if (charset == null) { // use default charset 629 jcharset = getDefaultJavaCharset(); // the java charset 630 charset = getDefaultMIMECharset(); // the MIME equivalent 631 } else // MIME charset -> java charset 632 jcharset = javaCharset(charset); 633 634 // If no transfer-encoding is specified, figure one out. 635 if (encoding == null) { 636 if (ascii != MOSTLY_NONASCII) 637 encoding = "Q"; 638 else 639 encoding = "B"; 640 } 641 642 boolean b64; 643 if (encoding.equalsIgnoreCase("B")) 644 b64 = true; 645 else if (encoding.equalsIgnoreCase("Q")) 646 b64 = false; 647 else 648 throw new UnsupportedEncodingException( 649 "Unknown transfer encoding: " + encoding); 650 651 StringBuffer outb = new StringBuffer(); // the output buffer 652 doEncode(string, b64, jcharset, 653 // As per RFC 2047, size of an encoded string should not 654 // exceed 75 bytes. 
655 // 7 = size of "=?", '?', 'B'/'Q', '?', "?=" 656 75 - 7 - charset.length(), // the available space 657 "=?" + charset + "?" + encoding + "?", // prefix 658 true, encodingWord, outb); 659 660 return outb.toString(); 661 } 662 663 private static void doEncode(String string, boolean b64, 664 String jcharset, int avail, String prefix, 665 boolean first, boolean encodingWord, StringBuffer buf) 666 throws UnsupportedEncodingException { 667 668 // First find out what the length of the encoded version of 669 // 'string' would be. 670 byte[] bytes = string.getBytes(jcharset); 671 int len; 672 if (b64) // "B" encoding 673 len = BEncoderStream.encodedLength(bytes); 674 else // "Q" 675 len = QEncoderStream.encodedLength(bytes, encodingWord); 676 677 int size; 678 if ((len > avail) && ((size = string.length()) > 1)) { 679 // If the length is greater than 'avail', split 'string' 680 // into two and recurse. 681 doEncode(string.substring(0, size/2), b64, jcharset, 682 avail, prefix, first, encodingWord, buf); 683 doEncode(string.substring(size/2, size), b64, jcharset, 684 avail, prefix, false, encodingWord, buf); 685 } else { 686 // length <= than 'avail'. 
Encode the given string 687 ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); 688 OutputStream eos; // the encoder 689 if (b64) // "B" encoding 690 eos = new BEncoderStream(os); 691 else // "Q" encoding 692 eos = new QEncoderStream(os, encodingWord); 693 694 try { // do the encoding 695 eos.write(bytes); 696 eos.close(); 697 } catch (IOException ioex) { } 698 699 byte[] encodedBytes = os.toByteArray(); // the encoded stuff 700 // Now write out the encoded (all ASCII) bytes into our 701 // StringBuffer 702 if (!first) // not the first line of this sequence 703 if (foldEncodedWords) 704 buf.append("\r\n "); // start a continuation line 705 else 706 buf.append(" "); // line will be folded later 707 708 buf.append(prefix); 709 for (int i = 0; i < encodedBytes.length; i++) 710 buf.append((char)encodedBytes[i]); 711 buf.append("?="); // terminate the current sequence 712 } 713 } 714 715 /** 716 * The string is parsed using the rules in RFC 2047 for parsing 717 * an "encoded-word". If the parse fails, a ParseException is 718 * thrown. Otherwise, it is transfer-decoded, and then 719 * charset-converted into Unicode. If the charset-conversion 720 * fails, an UnsupportedEncodingException is thrown.<p> 721 * 722 * @param eword the possibly encoded value 723 * @exception ParseException if the string is not an 724 * encoded-word as per RFC 2047. 725 * @exception UnsupportedEncodingException if the charset 726 * conversion failed. 
727 */ 728 public static String decodeWord(String eword) 729 throws ParseException, UnsupportedEncodingException { 730 731 if (!eword.startsWith("=?")) // not an encoded word 732 throw new ParseException(); 733 734 // get charset 735 int start = 2; int pos; 736 if ((pos = eword.indexOf('?', start)) == -1) 737 throw new ParseException(); 738 String charset = javaCharset(eword.substring(start, pos)); 739 740 // get encoding 741 start = pos+1; 742 if ((pos = eword.indexOf('?', start)) == -1) 743 throw new ParseException(); 744 String encoding = eword.substring(start, pos); 745 746 // get encoded-sequence 747 start = pos+1; 748 if ((pos = eword.indexOf("?=", start)) == -1) 749 throw new ParseException(); 750 String word = eword.substring(start, pos); 751 752 try { 753 // Extract the bytes from word 754 ByteArrayInputStream bis = 755 new ByteArrayInputStream(ASCIIUtility.getBytes(word)); 756 757 // Get the appropriate decoder 758 InputStream is; 759 if (encoding.equalsIgnoreCase("B")) 760 is = new BASE64DecoderStream(bis); 761 else if (encoding.equalsIgnoreCase("Q")) 762 is = new QDecoderStream(bis); 763 else 764 throw new UnsupportedEncodingException( 765 "unknown encoding: " + encoding); 766 767 // For b64 & q, size of decoded word <= size of word. So 768 // the decoded bytes must fit into the 'bytes' array. This 769 // is certainly more efficient than writing bytes into a 770 // ByteArrayOutputStream and then pulling out the byte[] 771 // from it. 
772 int count = bis.available(); 773 byte[] bytes = new byte[count]; 774 // count is set to the actual number of decoded bytes 775 count = is.read(bytes, 0, count); 776 777 // Finally, convert the decoded bytes into a String using 778 // the specified charset 779 String s = new String(bytes, 0, count, charset); 780 if (pos + 2 < eword.length()) { 781 // there's still more text in the string 782 String rest = eword.substring(pos + 2); 783 if (!decodeStrict) 784 rest = decodeInnerWords(rest); 785 s += rest; 786 } 787 return s; 788 } catch (UnsupportedEncodingException uex) { 789 // explicitly catch and rethrow this exception, otherwise 790 // the below IOException catch will swallow this up! 791 throw uex; 792 } catch (IOException ioex) { 793 // Shouldn't happen. 794 throw new ParseException(); 795 } catch (IllegalArgumentException iex) { 796 /* An unknown charset of the form ISO-XXX-XXX, will cause 797 * the JDK to throw an IllegalArgumentException ... Since the 798 * JDK will attempt to create a classname using this string, 799 * but valid classnames must not contain the character '-', 800 * and this results in an IllegalArgumentException, rather than 801 * the expected UnsupportedEncodingException. Yikes 802 */ 803 throw new UnsupportedEncodingException(); 804 } 805 } 806 807 /** 808 * Look for encoded words within a word. The MIME spec doesn't 809 * allow this, but many broken mailers, especially Japanese mailers, 810 * produce such incorrect encodings. 
811 */ 812 private static String decodeInnerWords(String word) 813 throws UnsupportedEncodingException { 814 int start = 0, i; 815 StringBuffer buf = new StringBuffer(); 816 while ((i = word.indexOf("=?", start)) >= 0) { 817 buf.append(word.substring(start, i)); 818 int end = word.indexOf("?=", i); 819 if (end < 0) 820 break; 821 String s = word.substring(i, end + 2); 822 try { 823 s = decodeWord(s); 824 } catch (ParseException pex) { 825 // ignore it, just use the original string 826 } 827 buf.append(s); 828 start = end + 2; 829 } 830 if (start == 0) 831 return word; 832 if (start < word.length()) 833 buf.append(word.substring(start)); 834 return buf.toString(); 835 } 836 837 /** 838 * A utility method to quote a word, if the word contains any 839 * characters from the specified 'specials' list.<p> 840 * 841 * The <code>HeaderTokenizer</code> class defines two special 842 * sets of delimiters - MIME and RFC 822. <p> 843 * 844 * This method is typically used during the generation of 845 * RFC 822 and MIME header fields. 846 * 847 * @param word word to be quoted 848 * @param specials the set of special characters 849 * @return the possibly quoted word 850 * @see javax.mail.internet.HeaderTokenizer#MIME 851 * @see javax.mail.internet.HeaderTokenizer#RFC822 852 */ 853 public static String quote(String word, String specials) { 854 int len = word.length(); 855 856 /* 857 * Look for any "bad" characters, Escape and 858 * quote the entire string if necessary. 
859 */ 860 boolean needQuoting = false; 861 for (int i = 0; i < len; i++) { 862 char c = word.charAt(i); 863 if (c == '"' || c == '\\' || c == '\r' || c == '\n') { 864 // need to escape them and then quote the whole string 865 StringBuffer sb = new StringBuffer(len + 3); 866 sb.append('"'); 867 sb.append(word.substring(0, i)); 868 int lastc = 0; 869 for (int j = i; j < len; j++) { 870 char cc = word.charAt(j); 871 if ((cc == '"') || (cc == '\\') || 872 (cc == '\r') || (cc == '\n')) 873 if (cc == '\n' && lastc == '\r') 874 ; // do nothing, CR was already escaped 875 else 876 sb.append('\\'); // Escape the character 877 sb.append(cc); 878 lastc = cc; 879 } 880 sb.append('"'); 881 return sb.toString(); 882 } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0) 883 // These characters cause the string to be quoted 884 needQuoting = true; 885 } 886 887 if (needQuoting) { 888 StringBuffer sb = new StringBuffer(len + 2); 889 sb.append('"').append(word).append('"'); 890 return sb.toString(); 891 } else 892 return word; 893 } 894 895 /** 896 * Fold a string at linear whitespace so that each line is no longer 897 * than 76 characters, if possible. If there are more than 76 898 * non-whitespace characters consecutively, the string is folded at 899 * the first whitespace after that sequence. The parameter 900 * <code>used</code> indicates how many characters have been used in 901 * the current line; it is usually the length of the header name. <p> 902 * 903 * Note that line breaks in the string aren't escaped; they probably 904 * should be. 
905 * 906 * @param used characters used in line so far 907 * @param s the string to fold 908 * @return the folded string 909 */ 910 /*public*/ static String fold(int used, String s) { 911 if (!foldText) 912 return s; 913 914 int end; 915 char c; 916 // Strip trailing spaces 917 for (end = s.length() - 1; end >= 0; end--) { 918 c = s.charAt(end); 919 if (c != ' ' && c != '\t') 920 break; 921 } 922 if (end != s.length() - 1) 923 s = s.substring(0, end + 1); 924 925 // if the string fits now, just return it 926 if (used + s.length() <= 76) 927 return s; 928 929 // have to actually fold the string 930 StringBuffer sb = new StringBuffer(s.length() + 4); 931 char lastc = 0; 932 while (used + s.length() > 76) { 933 int lastspace = -1; 934 for (int i = 0; i < s.length(); i++) { 935 if (lastspace != -1 && used + i > 76) 936 break; 937 c = s.charAt(i); 938 if (c == ' ' || c == '\t') 939 if (!(lastc == ' ' || lastc == '\t')) 940 lastspace = i; 941 lastc = c; 942 } 943 if (lastspace == -1) { 944 // no space, use the whole thing 945 sb.append(s); 946 s = ""; 947 used = 0; 948 break; 949 } 950 sb.append(s.substring(0, lastspace)); 951 sb.append("\r\n"); 952 lastc = s.charAt(lastspace); 953 sb.append(lastc); 954 s = s.substring(lastspace + 1); 955 used = 1; 956 } 957 sb.append(s); 958 return sb.toString(); 959 } 960 961 /** 962 * Unfold a folded header. Any line breaks that aren't escaped and 963 * are followed by whitespace are removed. 
964 * 965 * @param s the string to unfold 966 * @return the unfolded string 967 */ 968 /*public*/ static String unfold(String s) { 969 if (!foldText) 970 return s; 971 972 StringBuffer sb = null; 973 int i; 974 while ((i = indexOfAny(s, "\r\n")) >= 0) { 975 int start = i; 976 int l = s.length(); 977 i++; // skip CR or NL 978 if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n') 979 i++; // skip LF 980 if (start == 0 || s.charAt(start - 1) != '\\') { 981 char c; 982 // if next line starts with whitespace, skip all of it 983 // XXX - always has to be true? 984 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) { 985 i++; // skip whitespace 986 while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) 987 i++; 988 if (sb == null) 989 sb = new StringBuffer(s.length()); 990 if (start != 0) { 991 sb.append(s.substring(0, start)); 992 sb.append(' '); 993 } 994 s = s.substring(i); 995 continue; 996 } 997 // it's not a continuation line, just leave it in 998 if (sb == null) 999 sb = new StringBuffer(s.length()); 1000 sb.append(s.substring(0, i)); 1001 s = s.substring(i); 1002 } else { 1003 // there's a backslash at "start - 1" 1004 // strip it out, but leave in the line break 1005 if (sb == null) 1006 sb = new StringBuffer(s.length()); 1007 sb.append(s.substring(0, start - 1)); 1008 sb.append(s.substring(start, i)); 1009 s = s.substring(i); 1010 } 1011 } 1012 if (sb != null) { 1013 sb.append(s); 1014 return sb.toString(); 1015 } else 1016 return s; 1017 } 1018 1019 /** 1020 * Return the first index of any of the characters in "any" in "s", 1021 * or -1 if none are found. 1022 * 1023 * This should be a method on String. 
1024 */ 1025 private static int indexOfAny(String s, String any) { 1026 return indexOfAny(s, any, 0); 1027 } 1028 1029 private static int indexOfAny(String s, String any, int start) { 1030 try { 1031 int len = s.length(); 1032 for (int i = start; i < len; i++) { 1033 if (any.indexOf(s.charAt(i)) >= 0) 1034 return i; 1035 } 1036 return -1; 1037 } catch (StringIndexOutOfBoundsException e) { 1038 return -1; 1039 } 1040 } 1041 1042 /** 1043 * Convert a MIME charset name into a valid Java charset name. <p> 1044 * 1045 * @param charset the MIME charset name 1046 * @return the Java charset equivalent. If a suitable mapping is 1047 * not available, the passed in charset is itself returned. 1048 */ 1049 public static String javaCharset(String charset) { 1050 if (mime2java == null || charset == null) 1051 // no mapping table, or charset parameter is null 1052 return charset; 1053 1054 String alias = (String)mime2java.get(charset.toLowerCase()); 1055 return alias == null ? charset : alias; 1056 } 1057 1058 /** 1059 * Convert a java charset into its MIME charset name. <p> 1060 * 1061 * Note that a future version of JDK (post 1.2) might provide 1062 * this functionality, in which case, we may deprecate this 1063 * method then. 1064 * 1065 * @param charset the JDK charset 1066 * @return the MIME/IANA equivalent. If a mapping 1067 * is not possible, the passed in charset itself 1068 * is returned. 1069 * @since JavaMail 1.1 1070 */ 1071 public static String mimeCharset(String charset) { 1072 if (java2mime == null || charset == null) 1073 // no mapping table or charset param is null 1074 return charset; 1075 1076 String alias = (String)java2mime.get(charset.toLowerCase()); 1077 return alias == null ? charset : alias; 1078 } 1079 1080 private static String defaultJavaCharset; 1081 private static String defaultMIMECharset; 1082 1083 /** 1084 * Get the default charset corresponding to the system's current 1085 * default locale. 
If the System property <code>mail.mime.charset</code> 1086 * is set, a system charset corresponding to this MIME charset will be 1087 * returned. <p> 1088 * 1089 * @return the default charset of the system's default locale, 1090 * as a Java charset. (NOT a MIME charset) 1091 * @since JavaMail 1.1 1092 */ 1093 public static String getDefaultJavaCharset() { 1094 if (defaultJavaCharset == null) { 1095 /* 1096 * If mail.mime.charset is set, it controls the default 1097 * Java charset as well. 1098 */ 1099 String mimecs = null; 1100 1101 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset"); 1102 1103 if (mimecs != null && mimecs.length() > 0) { 1104 defaultJavaCharset = javaCharset(mimecs); 1105 return defaultJavaCharset; 1106 } 1107 1108 try { 1109 defaultJavaCharset = System.getProperty("file.encoding", 1110 "8859_1"); 1111 } catch (SecurityException sex) { 1112 1113 class NullInputStream extends InputStream { 1114 public int read() { 1115 return 0; 1116 } 1117 } 1118 InputStreamReader reader = 1119 new InputStreamReader(new NullInputStream()); 1120 defaultJavaCharset = reader.getEncoding(); 1121 if (defaultJavaCharset == null) 1122 defaultJavaCharset = "8859_1"; 1123 } 1124 } 1125 1126 return defaultJavaCharset; 1127 } 1128 1129 /* 1130 * Get the default MIME charset for this locale. 1131 */ 1132 static String getDefaultMIMECharset() { 1133 if (defaultMIMECharset == null) { 1134 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset"); 1135 } 1136 if (defaultMIMECharset == null) 1137 defaultMIMECharset = mimeCharset(getDefaultJavaCharset()); 1138 return defaultMIMECharset; 1139 } 1140 1141 // Tables to map MIME charset names to Java names and vice versa. 
1142 // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset 1143 private static Hashtable mime2java; 1144 private static Hashtable java2mime; 1145 1146 static { 1147 java2mime = new Hashtable(40); 1148 mime2java = new Hashtable(10); 1149 1150 try { 1151 // Use this class's classloader to load the mapping file 1152 // XXX - we should use SecuritySupport, but it's in another package 1153 InputStream is = 1154 com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream( 1155 "/META-INF/javamail.charset.map"); 1156 1157 if (is != null) { 1158 is = new LineInputStream(is); 1159 1160 // Load the JDK-to-MIME charset mapping table 1161 loadMappings((LineInputStream)is, java2mime); 1162 1163 // Load the MIME-to-JDK charset mapping table 1164 loadMappings((LineInputStream)is, mime2java); 1165 } 1166 } catch (Exception ex) { } 1167 1168 // If we didn't load the tables, e.g., because we didn't have 1169 // permission, load them manually. The entries here should be 1170 // the same as the default javamail.charset.map. 
1171 if (java2mime.isEmpty()) { 1172 java2mime.put("8859_1", "ISO-8859-1"); 1173 java2mime.put("iso8859_1", "ISO-8859-1"); 1174 java2mime.put("ISO8859-1", "ISO-8859-1"); 1175 1176 java2mime.put("8859_2", "ISO-8859-2"); 1177 java2mime.put("iso8859_2", "ISO-8859-2"); 1178 java2mime.put("ISO8859-2", "ISO-8859-2"); 1179 1180 java2mime.put("8859_3", "ISO-8859-3"); 1181 java2mime.put("iso8859_3", "ISO-8859-3"); 1182 java2mime.put("ISO8859-3", "ISO-8859-3"); 1183 1184 java2mime.put("8859_4", "ISO-8859-4"); 1185 java2mime.put("iso8859_4", "ISO-8859-4"); 1186 java2mime.put("ISO8859-4", "ISO-8859-4"); 1187 1188 java2mime.put("8859_5", "ISO-8859-5"); 1189 java2mime.put("iso8859_5", "ISO-8859-5"); 1190 java2mime.put("ISO8859-5", "ISO-8859-5"); 1191 1192 java2mime.put("8859_6", "ISO-8859-6"); 1193 java2mime.put("iso8859_6", "ISO-8859-6"); 1194 java2mime.put("ISO8859-6", "ISO-8859-6"); 1195 1196 java2mime.put("8859_7", "ISO-8859-7"); 1197 java2mime.put("iso8859_7", "ISO-8859-7"); 1198 java2mime.put("ISO8859-7", "ISO-8859-7"); 1199 1200 java2mime.put("8859_8", "ISO-8859-8"); 1201 java2mime.put("iso8859_8", "ISO-8859-8"); 1202 java2mime.put("ISO8859-8", "ISO-8859-8"); 1203 1204 java2mime.put("8859_9", "ISO-8859-9"); 1205 java2mime.put("iso8859_9", "ISO-8859-9"); 1206 java2mime.put("ISO8859-9", "ISO-8859-9"); 1207 1208 java2mime.put("SJIS", "Shift_JIS"); 1209 java2mime.put("MS932", "Shift_JIS"); 1210 java2mime.put("JIS", "ISO-2022-JP"); 1211 java2mime.put("ISO2022JP", "ISO-2022-JP"); 1212 java2mime.put("EUC_JP", "euc-jp"); 1213 java2mime.put("KOI8_R", "koi8-r"); 1214 java2mime.put("EUC_CN", "euc-cn"); 1215 java2mime.put("EUC_TW", "euc-tw"); 1216 java2mime.put("EUC_KR", "euc-kr"); 1217 } 1218 if (mime2java.isEmpty()) { 1219 mime2java.put("iso-2022-cn", "ISO2022CN"); 1220 mime2java.put("iso-2022-kr", "ISO2022KR"); 1221 mime2java.put("utf-8", "UTF8"); 1222 mime2java.put("utf8", "UTF8"); 1223 mime2java.put("ja_jp.iso2022-7", "ISO2022JP"); 1224 mime2java.put("ja_jp.eucjp", "EUCJIS"); 
1225 mime2java.put("euc-kr", "KSC5601"); 1226 mime2java.put("euckr", "KSC5601"); 1227 mime2java.put("us-ascii", "ISO-8859-1"); 1228 mime2java.put("x-us-ascii", "ISO-8859-1"); 1229 } 1230 } 1231 1232 private static void loadMappings(LineInputStream is, Hashtable table) { 1233 String currLine; 1234 1235 while (true) { 1236 try { 1237 currLine = is.readLine(); 1238 } catch (IOException ioex) { 1239 break; // error in reading, stop 1240 } 1241 1242 if (currLine == null) // end of file, stop 1243 break; 1244 if (currLine.startsWith("--") && currLine.endsWith("--")) 1245 // end of this table 1246 break; 1247 1248 // ignore empty lines and comments 1249 if (currLine.trim().length() == 0 || currLine.startsWith("#")) 1250 continue; 1251 1252 // A valid entry is of the form <key><separator><value> 1253 // where, <separator> := SPACE | HT. Parse this 1254 StringTokenizer tk = new StringTokenizer(currLine, " \t"); 1255 try { 1256 String key = tk.nextToken(); 1257 String value = tk.nextToken(); 1258 table.put(key.toLowerCase(), value); 1259 } catch (NoSuchElementException nex) { } 1260 } 1261 } 1262 1263 static final int ALL_ASCII = 1; 1264 static final int MOSTLY_ASCII = 2; 1265 static final int MOSTLY_NONASCII = 3; 1266 1267 /** 1268 * Check if the given string contains non US-ASCII characters. 1269 * @param s string 1270 * @return ALL_ASCII if all characters in the string 1271 * belong to the US-ASCII charset. MOSTLY_ASCII 1272 * if more than half of the available characters 1273 * are US-ASCII characters. Else MOSTLY_NONASCII. 
1274 */ 1275 static int checkAscii(String s) { 1276 int ascii = 0, non_ascii = 0; 1277 int l = s.length(); 1278 1279 for (int i = 0; i < l; i++) { 1280 if (nonascii((int)s.charAt(i))) // non-ascii 1281 non_ascii++; 1282 else 1283 ascii++; 1284 } 1285 1286 if (non_ascii == 0) 1287 return ALL_ASCII; 1288 if (ascii > non_ascii) 1289 return MOSTLY_ASCII; 1290 1291 return MOSTLY_NONASCII; 1292 } 1293 1294 /** 1295 * Check if the given byte array contains non US-ASCII characters. 1296 * @param b byte array 1297 * @return ALL_ASCII if all characters in the string 1298 * belong to the US-ASCII charset. MOSTLY_ASCII 1299 * if more than half of the available characters 1300 * are US-ASCII characters. Else MOSTLY_NONASCII. 1301 * 1302 * XXX - this method is no longer used 1303 */ 1304 static int checkAscii(byte[] b) { 1305 int ascii = 0, non_ascii = 0; 1306 1307 for (int i=0; i < b.length; i++) { 1308 // The '&' operator automatically causes b[i] to be promoted 1309 // to an int, and we mask out the higher bytes in the int 1310 // so that the resulting value is not a negative integer. 1311 if (nonascii(b[i] & 0xff)) // non-ascii 1312 non_ascii++; 1313 else 1314 ascii++; 1315 } 1316 1317 if (non_ascii == 0) 1318 return ALL_ASCII; 1319 if (ascii > non_ascii) 1320 return MOSTLY_ASCII; 1321 1322 return MOSTLY_NONASCII; 1323 } 1324 1325 /** 1326 * Check if the given input stream contains non US-ASCII characters. 1327 * Upto <code>max</code> bytes are checked. If <code>max</code> is 1328 * set to <code>ALL</code>, then all the bytes available in this 1329 * input stream are checked. If <code>breakOnNonAscii</code> is true 1330 * the check terminates when the first non-US-ASCII character is 1331 * found and MOSTLY_NONASCII is returned. Else, the check continues 1332 * till <code>max</code> bytes or till the end of stream. 1333 * 1334 * @param is the input stream 1335 * @param max maximum bytes to check for. 
The special value 1336 * ALL indicates that all the bytes in this input 1337 * stream must be checked. 1338 * @param breakOnNonAscii if <code>true</code>, then terminate the 1339 * the check when the first non-US-ASCII character 1340 * is found. 1341 * @return ALL_ASCII if all characters in the string 1342 * belong to the US-ASCII charset. MOSTLY_ASCII 1343 * if more than half of the available characters 1344 * are US-ASCII characters. Else MOSTLY_NONASCII. 1345 */ 1346 static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) { 1347 int ascii = 0, non_ascii = 0; 1348 int len; 1349 int block = 4096; 1350 int linelen = 0; 1351 boolean longLine = false, badEOL = false; 1352 boolean checkEOL = encodeEolStrict && breakOnNonAscii; 1353 byte buf[] = null; 1354 if (max != 0) { 1355 block = (max == ALL) ? 4096 : Math.min(max, 4096); 1356 buf = new byte[block]; 1357 } 1358 while (max != 0) { 1359 try { 1360 if ((len = is.read(buf, 0, block)) == -1) 1361 break; 1362 int lastb = 0; 1363 for (int i = 0; i < len; i++) { 1364 // The '&' operator automatically causes b[i] to 1365 // be promoted to an int, and we mask out the higher 1366 // bytes in the int so that the resulting value is 1367 // not a negative integer. 1368 int b = buf[i] & 0xff; 1369 if (checkEOL && 1370 ((lastb == '\r' && b != '\n') || 1371 (lastb != '\r' && b == '\n'))) 1372 badEOL = true; 1373 if (b == '\r' || b == '\n') 1374 linelen = 0; 1375 else { 1376 linelen++; 1377 if (linelen > 998) // 1000 - CRLF 1378 longLine = true; 1379 } 1380 if (nonascii(b)) { // non-ascii 1381 if (breakOnNonAscii) // we are done 1382 return MOSTLY_NONASCII; 1383 else 1384 non_ascii++; 1385 } else 1386 ascii++; 1387 lastb = b; 1388 } 1389 } catch (IOException ioex) { 1390 break; 1391 } 1392 if (max != ALL) 1393 max -= len; 1394 } 1395 1396 if (max == 0 && breakOnNonAscii) 1397 // We have been told to break on the first non-ascii character. 
1398 // We haven't got any non-ascii character yet, but then we 1399 // have not checked all of the available bytes either. So we 1400 // cannot say for sure that this input stream is ALL_ASCII, 1401 // and hence we must play safe and return MOSTLY_NONASCII 1402 1403 return MOSTLY_NONASCII; 1404 1405 if (non_ascii == 0) { // no non-us-ascii characters so far 1406 // If we're looking at non-text data, and we saw CR without LF 1407 // or vice versa, consider this mostly non-ASCII so that it 1408 // will be base64 encoded (since the quoted-printable encoder 1409 // doesn't encode this case properly). 1410 if (badEOL) 1411 return MOSTLY_NONASCII; 1412 // if we've seen a long line, we degrade to mostly ascii 1413 else if (longLine) 1414 return MOSTLY_ASCII; 1415 else 1416 return ALL_ASCII; 1417 } 1418 if (ascii > non_ascii) // mostly ascii 1419 return MOSTLY_ASCII; 1420 return MOSTLY_NONASCII; 1421 } 1422 1423 static final boolean nonascii(int b) { 1424 return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t'); 1425 } 1426 } 1427 1428 /** 1429 * An OutputStream that determines whether the data written to 1430 * it is all ASCII, mostly ASCII, or mostly non-ASCII. 
1431 */ 1432 class AsciiOutputStream extends OutputStream { 1433 private boolean breakOnNonAscii; 1434 private int ascii = 0, non_ascii = 0; 1435 private int linelen = 0; 1436 private boolean longLine = false; 1437 private boolean badEOL = false; 1438 private boolean checkEOL = false; 1439 private int lastb = 0; 1440 private int ret = 0; 1441 1442 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) { 1443 this.breakOnNonAscii = breakOnNonAscii; 1444 checkEOL = encodeEolStrict && breakOnNonAscii; 1445 } 1446 1447 public void write(int b) throws IOException { 1448 check(b); 1449 } 1450 1451 public void write(byte b[]) throws IOException { 1452 write(b, 0, b.length); 1453 } 1454 1455 public void write(byte b[], int off, int len) throws IOException { 1456 len += off; 1457 for (int i = off; i < len ; i++) 1458 check(b[i]); 1459 } 1460 1461 private final void check(int b) throws IOException { 1462 b &= 0xff; 1463 if (checkEOL && 1464 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n'))) 1465 badEOL = true; 1466 if (b == '\r' || b == '\n') 1467 linelen = 0; 1468 else { 1469 linelen++; 1470 if (linelen > 998) // 1000 - CRLF 1471 longLine = true; 1472 } 1473 if (MimeUtility.nonascii(b)) { // non-ascii 1474 non_ascii++; 1475 if (breakOnNonAscii) { // we are done 1476 ret = MimeUtility.MOSTLY_NONASCII; 1477 throw new EOFException(); 1478 } 1479 } else 1480 ascii++; 1481 lastb = b; 1482 } 1483 1484 /** 1485 * Return ASCII-ness of data stream. 1486 */ 1487 public int getAscii() { 1488 if (ret != 0) 1489 return ret; 1490 // If we're looking at non-text data, and we saw CR without LF 1491 // or vice versa, consider this mostly non-ASCII so that it 1492 // will be base64 encoded (since the quoted-printable encoder 1493 // doesn't encode this case properly). 
1494 if (badEOL) 1495 return MimeUtility.MOSTLY_NONASCII; 1496 else if (non_ascii == 0) { // no non-us-ascii characters so far 1497 // if we've seen a long line, we degrade to mostly ascii 1498 if (longLine) 1499 return MimeUtility.MOSTLY_ASCII; 1500 else 1501 return MimeUtility.ALL_ASCII; 1502 } 1503 if (ascii > non_ascii) // mostly ascii 1504 return MimeUtility.MOSTLY_ASCII; 1505 return MimeUtility.MOSTLY_NONASCII; 1506 } 1507 }