1 /* 2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * @(#)MimeUtility.java 1.45 03/03/10 28 */ 29 30 31 32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet; 33 34 import java.io.*; 35 import java.util.*; 36 37 import javax.activation.DataHandler; 38 import javax.activation.DataSource; 39 40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException; 41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*; 42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil; 43 44 /** 45 * This is a utility class that provides various MIME related 46 * functionality. <p> 47 * 48 * There are a set of methods to encode and decode MIME headers as 49 * per RFC 2047. 
A brief description on handling such headers is 50 * given below: <p> 51 * 52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII 53 * characters. Headers that contain non US-ASCII characters must be 54 * encoded so that they contain only US-ASCII characters. Basically, 55 * this process involves using either BASE64 or QP to encode certain 56 * characters. RFC 2047 describes this in detail. <p> 57 * 58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a 59 * subset of Unicode (and occupies the range 0 - 127). A String 60 * that contains only ASCII characters is already mail-safe. If the 61 * String contains non US-ASCII characters, it must be encoded. An 62 * additional complexity in this step is that since Unicode is not 63 * yet a widely used charset, one might want to first charset-encode 64 * the String into another charset and then do the transfer-encoding. 65 * <p> 66 * Note that to get the actual bytes of a mail-safe String (say, 67 * for sending over SMTP), one must do 68 * <blockquote><pre> 69 * 70 * byte[] bytes = string.getBytes("iso-8859-1"); 71 * 72 * </pre></blockquote> 73 * 74 * The <code>setHeader</code> and <code>addHeader</code> methods 75 * on MimeMessage and MimeBodyPart assume that the given header values 76 * are Unicode strings that contain only US-ASCII characters. Hence 77 * the callers of those methods must insure that the values they pass 78 * do not contain non US-ASCII characters. The methods in this class 79 * help do this. <p> 80 * 81 * The <code>getHeader</code> family of methods on MimeMessage and 82 * MimeBodyPart return the raw header value. These might be encoded 83 * as per RFC 2047, and if so, must be decoded into Unicode Strings. 84 * The methods in this class help to do this. <p> 85 * 86 * Several System properties control strict conformance to the MIME 87 * spec. Note that these are not session properties but must be set 88 * globally as System properties. 
<p> 89 * 90 * The <code>mail.mime.decodetext.strict</code> property controls 91 * decoding of MIME encoded words. The MIME spec requires that encoded 92 * words start at the beginning of a whitespace separated word. Some 93 * mailers incorrectly include encoded words in the middle of a word. 94 * If the <code>mail.mime.decodetext.strict</code> System property is 95 * set to <code>"false"</code>, an attempt will be made to decode these 96 * illegal encoded words. The default is true. <p> 97 * 98 * The <code>mail.mime.encodeeol.strict</code> property controls the 99 * choice of Content-Transfer-Encoding for MIME parts that are not of 100 * type "text". Often such parts will contain textual data for which 101 * an encoding that allows normal end of line conventions is appropriate. 102 * In rare cases, such a part will appear to contain entirely textual 103 * data, but will require an encoding that preserves CR and LF characters 104 * without change. If the <code>mail.mime.encodeeol.strict</code> 105 * System property is set to <code>"true"</code>, such an encoding will 106 * be used when necessary. The default is false. <p> 107 * 108 * In addition, the <code>mail.mime.charset</code> System property can 109 * be used to specify the default MIME charset to use for encoded words 110 * and text parts that don't otherwise specify a charset. Normally, the 111 * default MIME charset is derived from the default Java charset, as 112 * specified in the <code>file.encoding</code> System property. Most 113 * applications will have no need to explicitly set the default MIME 114 * charset. In cases where the default MIME charset to be used for 115 * mail messages is different than the charset used for files stored on 116 * the system, this property should be set. 
117 * 118 * @version 1.45, 03/03/10 119 * @author John Mani 120 * @author Bill Shannon 121 */ 122 123 public class MimeUtility { 124 125 // This class cannot be instantiated 126 private MimeUtility() { } 127 128 public static final int ALL = -1; 129 130 private static final int BUFFER_SIZE = 1024; 131 private static boolean decodeStrict = true; 132 private static boolean encodeEolStrict = false; 133 private static boolean foldEncodedWords = false; 134 private static boolean foldText = true; 135 136 static { 137 try { 138 String s = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict"); 139 // default to true 140 decodeStrict = s == null || !s.equalsIgnoreCase("false"); 141 s = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict"); 142 // default to false 143 encodeEolStrict = s != null && s.equalsIgnoreCase("true"); 144 s = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords"); 145 // default to false 146 foldEncodedWords = s != null && s.equalsIgnoreCase("true"); 147 s = SAAJUtil.getSystemProperty("mail.mime.foldtext"); 148 // default to true 149 foldText = s == null || !s.equalsIgnoreCase("false"); 150 } catch (SecurityException sex) { 151 // ignore it 152 } 153 } 154 155 156 /** 157 * Get the content-transfer-encoding that should be applied 158 * to the input stream of this datasource, to make it mailsafe. <p> 159 * 160 * The algorithm used here is: <br> 161 * <ul> 162 * <li> 163 * If the primary type of this datasource is "text" and if all 164 * the bytes in its input stream are US-ASCII, then the encoding 165 * is "7bit". If more than half of the bytes are non-US-ASCII, then 166 * the encoding is "base64". If less than half of the bytes are 167 * non-US-ASCII, then the encoding is "quoted-printable". 168 * <li> 169 * If the primary type of this datasource is not "text", then if 170 * all the bytes of its input stream are US-ASCII, the encoding 171 * is "7bit". If there is even one non-US-ASCII character, the 172 * encoding is "base64". 
173 * </ul> 174 * 175 * @param ds DataSource 176 * @return the encoding. This is either "7bit", 177 * "quoted-printable" or "base64" 178 */ 179 public static String getEncoding(DataSource ds) { 180 ContentType cType = null; 181 InputStream is = null; 182 String encoding = null; 183 184 try { 185 cType = new ContentType(ds.getContentType()); 186 is = ds.getInputStream(); 187 } catch (Exception ex) { 188 return "base64"; // what else ?! 189 } 190 191 boolean isText = cType.match("text/*"); 192 // if not text, stop processing when we see non-ASCII 193 int i = checkAscii(is, ALL, !isText); 194 switch (i) { 195 case ALL_ASCII: 196 encoding = "7bit"; // all ascii 197 break; 198 case MOSTLY_ASCII: 199 encoding = "quoted-printable"; // mostly ascii 200 break; 201 default: 202 encoding = "base64"; // mostly binary 203 break; 204 } 205 206 // Close the input stream 207 try { 208 is.close(); 209 } catch (IOException ioex) { } 210 211 return encoding; 212 } 213 214 /** 215 * Same as <code>getEncoding(DataSource)</code> except that instead 216 * of reading the data from an <code>InputStream</code> it uses the 217 * <code>writeTo</code> method to examine the data. This is more 218 * efficient in the common case of a <code>DataHandler</code> 219 * created with an object and a MIME type (for example, a 220 * "text/plain" String) because all the I/O is done in this 221 * thread. In the case requiring an <code>InputStream</code> the 222 * <code>DataHandler</code> uses a thread, a pair of pipe streams, 223 * and the <code>writeTo</code> method to produce the data. <p> 224 * 225 * @param dh data handler 226 * 227 * @return encoding 228 * 229 * @since JavaMail 1.2 230 */ 231 public static String getEncoding(DataHandler dh) { 232 ContentType cType = null; 233 String encoding = null; 234 235 /* 236 * Try to pick the most efficient means of determining the 237 * encoding. If this DataHandler was created using a DataSource, 238 * the getEncoding(DataSource) method is typically faster. 
If 239 * the DataHandler was created with an object, this method is 240 * much faster. To distinguish the two cases, we use a heuristic. 241 * A DataHandler created with an object will always have a null name. 242 * A DataHandler created with a DataSource will usually have a 243 * non-null name. 244 * 245 * XXX - This is actually quite a disgusting hack, but it makes 246 * a common case run over twice as fast. 247 */ 248 if (dh.getName() != null) 249 return getEncoding(dh.getDataSource()); 250 251 try { 252 cType = new ContentType(dh.getContentType()); 253 } catch (Exception ex) { 254 return "base64"; // what else ?! 255 } 256 257 if (cType.match("text/*")) { 258 // Check all of the available bytes 259 AsciiOutputStream aos = new AsciiOutputStream(false, false); 260 try { 261 dh.writeTo(aos); 262 } catch (IOException ex) { } // ignore it 263 switch (aos.getAscii()) { 264 case ALL_ASCII: 265 encoding = "7bit"; // all ascii 266 break; 267 case MOSTLY_ASCII: 268 encoding = "quoted-printable"; // mostly ascii 269 break; 270 default: 271 encoding = "base64"; // mostly binary 272 break; 273 } 274 } else { // not "text" 275 // Check all of available bytes, break out if we find 276 // at least one non-US-ASCII character 277 AsciiOutputStream aos = 278 new AsciiOutputStream(true, encodeEolStrict); 279 try { 280 dh.writeTo(aos); 281 } catch (IOException ex) { } // ignore it 282 if (aos.getAscii() == ALL_ASCII) // all ascii 283 encoding = "7bit"; 284 else // found atleast one non-ascii character, use b64 285 encoding = "base64"; 286 } 287 288 return encoding; 289 } 290 291 /** 292 * Decode the given input stream. The Input stream returned is 293 * the decoded input stream. All the encodings defined in RFC 2045 294 * are supported here. They include "base64", "quoted-printable", 295 * "7bit", "8bit", and "binary". In addition, "uuencode" is also 296 * supported. 297 * 298 * @param is input stream 299 * @param encoding the encoding of the stream. 
300 * @return decoded input stream. 301 * @exception MessagingException in case of error 302 */ 303 public static InputStream decode(InputStream is, String encoding) 304 throws MessagingException { 305 if (encoding.equalsIgnoreCase("base64")) 306 return new BASE64DecoderStream(is); 307 else if (encoding.equalsIgnoreCase("quoted-printable")) 308 return new QPDecoderStream(is); 309 else if (encoding.equalsIgnoreCase("uuencode") || 310 encoding.equalsIgnoreCase("x-uuencode") || 311 encoding.equalsIgnoreCase("x-uue")) 312 return new UUDecoderStream(is); 313 else if (encoding.equalsIgnoreCase("binary") || 314 encoding.equalsIgnoreCase("7bit") || 315 encoding.equalsIgnoreCase("8bit")) 316 return is; 317 else 318 throw new MessagingException("Unknown encoding: " + encoding); 319 } 320 321 /** 322 * Wrap an encoder around the given output stream. 323 * All the encodings defined in RFC 2045 are supported here. 324 * They include "base64", "quoted-printable", "7bit", "8bit" and 325 * "binary". In addition, "uuencode" is also supported. 326 * 327 * @param os output stream 328 * @param encoding the encoding of the stream. 329 * @return output stream that applies the 330 * specified encoding. 
331 * @exception MessagingException in case of error 332 */ 333 public static OutputStream encode(OutputStream os, String encoding) 334 throws MessagingException { 335 if (encoding == null) 336 return os; 337 else if (encoding.equalsIgnoreCase("base64")) 338 return new BASE64EncoderStream(os); 339 else if (encoding.equalsIgnoreCase("quoted-printable")) 340 return new QPEncoderStream(os); 341 else if (encoding.equalsIgnoreCase("uuencode") || 342 encoding.equalsIgnoreCase("x-uuencode") || 343 encoding.equalsIgnoreCase("x-uue")) 344 return new UUEncoderStream(os); 345 else if (encoding.equalsIgnoreCase("binary") || 346 encoding.equalsIgnoreCase("7bit") || 347 encoding.equalsIgnoreCase("8bit")) 348 return os; 349 else 350 throw new MessagingException("Unknown encoding: " +encoding); 351 } 352 353 /** 354 * Wrap an encoder around the given output stream. 355 * All the encodings defined in RFC 2045 are supported here. 356 * They include "base64", "quoted-printable", "7bit", "8bit" and 357 * "binary". In addition, "uuencode" is also supported. 358 * The <code>filename</code> parameter is used with the "uuencode" 359 * encoding and is included in the encoded output. 360 * 361 * @param os output stream 362 * @param encoding the encoding of the stream. 363 * @param filename name for the file being encoded (only used 364 * with uuencode) 365 * @return output stream that applies the 366 * specified encoding. 
367 * @exception MessagingException in case of error 368 * @since JavaMail 1.2 369 */ 370 public static OutputStream encode(OutputStream os, String encoding, 371 String filename) 372 throws MessagingException { 373 if (encoding == null) 374 return os; 375 else if (encoding.equalsIgnoreCase("base64")) 376 return new BASE64EncoderStream(os); 377 else if (encoding.equalsIgnoreCase("quoted-printable")) 378 return new QPEncoderStream(os); 379 else if (encoding.equalsIgnoreCase("uuencode") || 380 encoding.equalsIgnoreCase("x-uuencode") || 381 encoding.equalsIgnoreCase("x-uue")) 382 return new UUEncoderStream(os, filename); 383 else if (encoding.equalsIgnoreCase("binary") || 384 encoding.equalsIgnoreCase("7bit") || 385 encoding.equalsIgnoreCase("8bit")) 386 return os; 387 else 388 throw new MessagingException("Unknown encoding: " +encoding); 389 } 390 391 /** 392 * Encode a RFC 822 "text" token into mail-safe form as per 393 * RFC 2047. <p> 394 * 395 * The given Unicode string is examined for non US-ASCII 396 * characters. If the string contains only US-ASCII characters, 397 * it is returned as-is. If the string contains non US-ASCII 398 * characters, it is first character-encoded using the platform's 399 * default charset, then transfer-encoded using either the B or 400 * Q encoding. The resulting bytes are then returned as a Unicode 401 * string containing only ASCII characters. <p> 402 * 403 * Note that this method should be used to encode only 404 * "unstructured" RFC 822 headers. <p> 405 * 406 * Example of usage: 407 * <blockquote><pre> 408 * 409 * MimeBodyPart part = ... 
410 * String rawvalue = "FooBar Mailer, Japanese version 1.1" 411 * try { 412 * // If we know for sure that rawvalue contains only US-ASCII 413 * // characters, we can skip the encoding part 414 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue)); 415 * } catch (UnsupportedEncodingException e) { 416 * // encoding failure 417 * } catch (MessagingException me) { 418 * // setHeader() failure 419 * } 420 * 421 * </pre></blockquote> 422 * 423 * @param text unicode string 424 * @return Unicode string containing only US-ASCII characters 425 * @exception UnsupportedEncodingException if the encoding fails 426 */ 427 public static String encodeText(String text) 428 throws UnsupportedEncodingException { 429 return encodeText(text, null, null); 430 } 431 432 /** 433 * Encode a RFC 822 "text" token into mail-safe form as per 434 * RFC 2047. <p> 435 * 436 * The given Unicode string is examined for non US-ASCII 437 * characters. If the string contains only US-ASCII characters, 438 * it is returned as-is. If the string contains non US-ASCII 439 * characters, it is first character-encoded using the specified 440 * charset, then transfer-encoded using either the B or Q encoding. 441 * The resulting bytes are then returned as a Unicode string 442 * containing only ASCII characters. <p> 443 * 444 * Note that this method should be used to encode only 445 * "unstructured" RFC 822 headers. 446 * 447 * @param text the header value 448 * @param charset the charset. If this parameter is null, the 449 * platform's default chatset is used. 450 * @param encoding the encoding to be used. Currently supported 451 * values are "B" and "Q". If this parameter is null, then 452 * the "Q" encoding is used if most of characters to be 453 * encoded are in the ASCII charset, otherwise "B" encoding 454 * is used. 
455 * @return Unicode string containing only US-ASCII characters 456 * @exception UnsupportedEncodingException in case of unsupported encoding 457 */ 458 public static String encodeText(String text, String charset, 459 String encoding) 460 throws UnsupportedEncodingException { 461 return encodeWord(text, charset, encoding, false); 462 } 463 464 /** 465 * Decode "unstructured" headers, that is, headers that are defined 466 * as '*text' as per RFC 822. <p> 467 * 468 * The string is decoded using the algorithm specified in 469 * RFC 2047, Section 6.1.1. If the charset-conversion fails 470 * for any sequence, an UnsupportedEncodingException is thrown. 471 * If the String is not an RFC 2047 style encoded header, it is 472 * returned as-is <p> 473 * 474 * Example of usage: 475 * <blockquote><pre> 476 * 477 * MimeBodyPart part = ... 478 * String rawvalue = null; 479 * String value = null; 480 * try { 481 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null) 482 * value = MimeUtility.decodeText(rawvalue); 483 * } catch (UnsupportedEncodingException e) { 484 * // Don't care 485 * value = rawvalue; 486 * } catch (MessagingException me) { } 487 * 488 * return value; 489 * 490 * </pre></blockquote> 491 * 492 * @param etext the possibly encoded value 493 * @return decoded text 494 * @exception UnsupportedEncodingException if the charset 495 * conversion failed. 496 */ 497 public static String decodeText(String etext) 498 throws UnsupportedEncodingException { 499 /* 500 * We look for sequences separated by "linear-white-space". 501 * (as per RFC 2047, Section 6.1.1) 502 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL. 503 */ 504 String lwsp = " \t\n\r"; 505 StringTokenizer st; 506 507 /* 508 * First, lets do a quick run thru the string and check 509 * whether the sequence "=?" exists at all. If none exists, 510 * we know there are no encoded-words in here and we can just 511 * return the string as-is, without suffering thru the later 512 * decoding logic. 
513 * This handles the most common case of unencoded headers 514 * efficiently. 515 */ 516 if (etext.indexOf("=?") == -1) 517 return etext; 518 519 // Encoded words found. Start decoding ... 520 521 st = new StringTokenizer(etext, lwsp, true); 522 StringBuilder sb = new StringBuilder(); // decode buffer 523 StringBuilder wsb = new StringBuilder(); // white space buffer 524 boolean prevWasEncoded = false; 525 526 while (st.hasMoreTokens()) { 527 char c; 528 String s = st.nextToken(); 529 // If whitespace, append it to the whitespace buffer 530 if (((c = s.charAt(0)) == ' ') || (c == '\t') || 531 (c == '\r') || (c == '\n')) 532 wsb.append(c); 533 else { 534 // Check if token is an 'encoded-word' .. 535 String word; 536 try { 537 word = decodeWord(s); 538 // Yes, this IS an 'encoded-word'. 539 if (!prevWasEncoded && wsb.length() > 0) { 540 // if the previous word was also encoded, we 541 // should ignore the collected whitespace. Else 542 // we include the whitespace as well. 543 sb.append(wsb); 544 } 545 prevWasEncoded = true; 546 } catch (ParseException pex) { 547 // This is NOT an 'encoded-word'. 548 word = s; 549 // possibly decode inner encoded words 550 if (!decodeStrict) 551 word = decodeInnerWords(word); 552 // include colleced whitespace .. 553 if (wsb.length() > 0) 554 sb.append(wsb); 555 prevWasEncoded = false; 556 } 557 sb.append(word); // append the actual word 558 wsb.setLength(0); // reset wsb for reuse 559 } 560 } 561 return sb.toString(); 562 } 563 564 /** 565 * Encode a RFC 822 "word" token into mail-safe form as per 566 * RFC 2047. <p> 567 * 568 * The given Unicode string is examined for non US-ASCII 569 * characters. If the string contains only US-ASCII characters, 570 * it is returned as-is. If the string contains non US-ASCII 571 * characters, it is first character-encoded using the platform's 572 * default charset, then transfer-encoded using either the B or 573 * Q encoding. 
The resulting bytes are then returned as a Unicode 574 * string containing only ASCII characters. <p> 575 * 576 * This method is meant to be used when creating RFC 822 "phrases". 577 * The InternetAddress class, for example, uses this to encode 578 * it's 'phrase' component. 579 * 580 * @param word unicode string 581 * @return Array of Unicode strings containing only US-ASCII 582 * characters. 583 * @exception UnsupportedEncodingException if the encoding fails 584 */ 585 public static String encodeWord(String word) 586 throws UnsupportedEncodingException { 587 return encodeWord(word, null, null); 588 } 589 590 /** 591 * Encode a RFC 822 "word" token into mail-safe form as per 592 * RFC 2047. <p> 593 * 594 * The given Unicode string is examined for non US-ASCII 595 * characters. If the string contains only US-ASCII characters, 596 * it is returned as-is. If the string contains non US-ASCII 597 * characters, it is first character-encoded using the specified 598 * charset, then transfer-encoded using either the B or Q encoding. 599 * The resulting bytes are then returned as a Unicode string 600 * containing only ASCII characters. <p> 601 * 602 * @param word unicode string 603 * @param charset the MIME charset 604 * @param encoding the encoding to be used. Currently supported 605 * values are "B" and "Q". If this parameter is null, then 606 * the "Q" encoding is used if most of characters to be 607 * encoded are in the ASCII charset, otherwise "B" encoding 608 * is used. 609 * @return Unicode string containing only US-ASCII characters 610 * @exception UnsupportedEncodingException if the encoding fails 611 */ 612 public static String encodeWord(String word, String charset, 613 String encoding) 614 throws UnsupportedEncodingException { 615 return encodeWord(word, charset, encoding, true); 616 } 617 618 /* 619 * Encode the given string. 
The parameter 'encodingWord' should 620 * be true if a RFC 822 "word" token is being encoded and false if a 621 * RFC 822 "text" token is being encoded. This is because the 622 * "Q" encoding defined in RFC 2047 has more restrictions when 623 * encoding "word" tokens. (Sigh) 624 */ 625 private static String encodeWord(String string, String charset, 626 String encoding, boolean encodingWord) 627 throws UnsupportedEncodingException { 628 629 // If 'string' contains only US-ASCII characters, just 630 // return it. 631 int ascii = checkAscii(string); 632 if (ascii == ALL_ASCII) 633 return string; 634 635 // Else, apply the specified charset conversion. 636 String jcharset; 637 if (charset == null) { // use default charset 638 jcharset = getDefaultJavaCharset(); // the java charset 639 charset = getDefaultMIMECharset(); // the MIME equivalent 640 } else // MIME charset -> java charset 641 jcharset = javaCharset(charset); 642 643 // If no transfer-encoding is specified, figure one out. 644 if (encoding == null) { 645 if (ascii != MOSTLY_NONASCII) 646 encoding = "Q"; 647 else 648 encoding = "B"; 649 } 650 651 boolean b64; 652 if (encoding.equalsIgnoreCase("B")) 653 b64 = true; 654 else if (encoding.equalsIgnoreCase("Q")) 655 b64 = false; 656 else 657 throw new UnsupportedEncodingException( 658 "Unknown transfer encoding: " + encoding); 659 660 StringBuilder outb = new StringBuilder(); // the output buffer 661 doEncode(string, b64, jcharset, 662 // As per RFC 2047, size of an encoded string should not 663 // exceed 75 bytes. 664 // 7 = size of "=?", '?', 'B'/'Q', '?', "?=" 665 75 - 7 - charset.length(), // the available space 666 "=?" + charset + "?" 
+ encoding + "?", // prefix 667 true, encodingWord, outb); 668 669 return outb.toString(); 670 } 671 672 private static void doEncode(String string, boolean b64, 673 String jcharset, int avail, String prefix, 674 boolean first, boolean encodingWord, StringBuilder buf) 675 throws UnsupportedEncodingException { 676 677 // First find out what the length of the encoded version of 678 // 'string' would be. 679 byte[] bytes = string.getBytes(jcharset); 680 int len; 681 if (b64) // "B" encoding 682 len = BEncoderStream.encodedLength(bytes); 683 else // "Q" 684 len = QEncoderStream.encodedLength(bytes, encodingWord); 685 686 int size; 687 if ((len > avail) && ((size = string.length()) > 1)) { 688 // If the length is greater than 'avail', split 'string' 689 // into two and recurse. 690 doEncode(string.substring(0, size/2), b64, jcharset, 691 avail, prefix, first, encodingWord, buf); 692 doEncode(string.substring(size/2, size), b64, jcharset, 693 avail, prefix, false, encodingWord, buf); 694 } else { 695 // length <= than 'avail'. 
Encode the given string 696 ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); 697 OutputStream eos; // the encoder 698 if (b64) // "B" encoding 699 eos = new BEncoderStream(os); 700 else // "Q" encoding 701 eos = new QEncoderStream(os, encodingWord); 702 703 try { // do the encoding 704 eos.write(bytes); 705 eos.close(); 706 } catch (IOException ioex) { } 707 708 byte[] encodedBytes = os.toByteArray(); // the encoded stuff 709 // Now write out the encoded (all ASCII) bytes into our 710 // StringBuffer 711 if (!first) // not the first line of this sequence 712 if (foldEncodedWords) 713 buf.append("\r\n "); // start a continuation line 714 else 715 buf.append(" "); // line will be folded later 716 717 buf.append(prefix); 718 for (int i = 0; i < encodedBytes.length; i++) 719 buf.append((char)encodedBytes[i]); 720 buf.append("?="); // terminate the current sequence 721 } 722 } 723 724 /** 725 * The string is parsed using the rules in RFC 2047 for parsing 726 * an "encoded-word". If the parse fails, a ParseException is 727 * thrown. Otherwise, it is transfer-decoded, and then 728 * charset-converted into Unicode. If the charset-conversion 729 * fails, an UnsupportedEncodingException is thrown.<p> 730 * 731 * @param eword the possibly encoded value 732 * @return deocoded word 733 * @exception ParseException if the string is not an 734 * encoded-word as per RFC 2047. 735 * @exception UnsupportedEncodingException if the charset 736 * conversion failed. 
737 */ 738 public static String decodeWord(String eword) 739 throws ParseException, UnsupportedEncodingException { 740 741 if (!eword.startsWith("=?")) // not an encoded word 742 throw new ParseException(); 743 744 // get charset 745 int start = 2; int pos; 746 if ((pos = eword.indexOf('?', start)) == -1) 747 throw new ParseException(); 748 String charset = javaCharset(eword.substring(start, pos)); 749 750 // get encoding 751 start = pos+1; 752 if ((pos = eword.indexOf('?', start)) == -1) 753 throw new ParseException(); 754 String encoding = eword.substring(start, pos); 755 756 // get encoded-sequence 757 start = pos+1; 758 if ((pos = eword.indexOf("?=", start)) == -1) 759 throw new ParseException(); 760 String word = eword.substring(start, pos); 761 762 try { 763 // Extract the bytes from word 764 ByteArrayInputStream bis = 765 new ByteArrayInputStream(ASCIIUtility.getBytes(word)); 766 767 // Get the appropriate decoder 768 InputStream is; 769 if (encoding.equalsIgnoreCase("B")) 770 is = new BASE64DecoderStream(bis); 771 else if (encoding.equalsIgnoreCase("Q")) 772 is = new QDecoderStream(bis); 773 else 774 throw new UnsupportedEncodingException( 775 "unknown encoding: " + encoding); 776 777 // For b64 & q, size of decoded word <= size of word. So 778 // the decoded bytes must fit into the 'bytes' array. This 779 // is certainly more efficient than writing bytes into a 780 // ByteArrayOutputStream and then pulling out the byte[] 781 // from it. 
782 int count = bis.available(); 783 byte[] bytes = new byte[count]; 784 // count is set to the actual number of decoded bytes 785 count = is.read(bytes, 0, count); 786 787 // Finally, convert the decoded bytes into a String using 788 // the specified charset 789 String s = new String(bytes, 0, count, charset); 790 if (pos + 2 < eword.length()) { 791 // there's still more text in the string 792 String rest = eword.substring(pos + 2); 793 if (!decodeStrict) 794 rest = decodeInnerWords(rest); 795 s += rest; 796 } 797 return s; 798 } catch (UnsupportedEncodingException uex) { 799 // explicitly catch and rethrow this exception, otherwise 800 // the below IOException catch will swallow this up! 801 throw uex; 802 } catch (IOException ioex) { 803 // Shouldn't happen. 804 throw new ParseException(); 805 } catch (IllegalArgumentException iex) { 806 /* An unknown charset of the form ISO-XXX-XXX, will cause 807 * the JDK to throw an IllegalArgumentException ... Since the 808 * JDK will attempt to create a classname using this string, 809 * but valid classnames must not contain the character '-', 810 * and this results in an IllegalArgumentException, rather than 811 * the expected UnsupportedEncodingException. Yikes 812 */ 813 throw new UnsupportedEncodingException(); 814 } 815 } 816 817 /** 818 * Look for encoded words within a word. The MIME spec doesn't 819 * allow this, but many broken mailers, especially Japanese mailers, 820 * produce such incorrect encodings. 
821 */ 822 private static String decodeInnerWords(String word) 823 throws UnsupportedEncodingException { 824 int start = 0, i; 825 StringBuilder buf = new StringBuilder(); 826 while ((i = word.indexOf("=?", start)) >= 0) { 827 buf.append(word.substring(start, i)); 828 int end = word.indexOf("?=", i); 829 if (end < 0) 830 break; 831 String s = word.substring(i, end + 2); 832 try { 833 s = decodeWord(s); 834 } catch (ParseException pex) { 835 // ignore it, just use the original string 836 } 837 buf.append(s); 838 start = end + 2; 839 } 840 if (start == 0) 841 return word; 842 if (start < word.length()) 843 buf.append(word.substring(start)); 844 return buf.toString(); 845 } 846 847 /** 848 * A utility method to quote a word, if the word contains any 849 * characters from the specified 'specials' list.<p> 850 * 851 * The <code>HeaderTokenizer</code> class defines two special 852 * sets of delimiters - MIME and RFC 822. <p> 853 * 854 * This method is typically used during the generation of 855 * RFC 822 and MIME header fields. 856 * 857 * @param word word to be quoted 858 * @param specials the set of special characters 859 * @return the possibly quoted word 860 * @see com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#MIME 861 * @see com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#RFC822 862 */ 863 public static String quote(String word, String specials) { 864 int len = word.length(); 865 866 /* 867 * Look for any "bad" characters, Escape and 868 * quote the entire string if necessary. 
869 */ 870 boolean needQuoting = false; 871 for (int i = 0; i < len; i++) { 872 char c = word.charAt(i); 873 if (c == '"' || c == '\\' || c == '\r' || c == '\n') { 874 // need to escape them and then quote the whole string 875 StringBuilder sb = new StringBuilder(len + 3); 876 sb.append('"'); 877 sb.append(word.substring(0, i)); 878 int lastc = 0; 879 for (int j = i; j < len; j++) { 880 char cc = word.charAt(j); 881 if ((cc == '"') || (cc == '\\') || 882 (cc == '\r') || (cc == '\n')) 883 if (cc == '\n' && lastc == '\r') 884 ; // do nothing, CR was already escaped 885 else 886 sb.append('\\'); // Escape the character 887 sb.append(cc); 888 lastc = cc; 889 } 890 sb.append('"'); 891 return sb.toString(); 892 } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0) 893 // These characters cause the string to be quoted 894 needQuoting = true; 895 } 896 897 if (needQuoting) { 898 StringBuilder sb = new StringBuilder(len + 2); 899 sb.append('"').append(word).append('"'); 900 return sb.toString(); 901 } else 902 return word; 903 } 904 905 /** 906 * Fold a string at linear whitespace so that each line is no longer 907 * than 76 characters, if possible. If there are more than 76 908 * non-whitespace characters consecutively, the string is folded at 909 * the first whitespace after that sequence. The parameter 910 * <code>used</code> indicates how many characters have been used in 911 * the current line; it is usually the length of the header name. <p> 912 * 913 * Note that line breaks in the string aren't escaped; they probably 914 * should be. 
915 * 916 * @param used characters used in line so far 917 * @param s the string to fold 918 * @return the folded string 919 */ 920 /*public*/ static String fold(int used, String s) { 921 if (!foldText) 922 return s; 923 924 int end; 925 char c; 926 // Strip trailing spaces 927 for (end = s.length() - 1; end >= 0; end--) { 928 c = s.charAt(end); 929 if (c != ' ' && c != '\t') 930 break; 931 } 932 if (end != s.length() - 1) 933 s = s.substring(0, end + 1); 934 935 // if the string fits now, just return it 936 if (used + s.length() <= 76) 937 return s; 938 939 // have to actually fold the string 940 StringBuilder sb = new StringBuilder(s.length() + 4); 941 char lastc = 0; 942 while (used + s.length() > 76) { 943 int lastspace = -1; 944 for (int i = 0; i < s.length(); i++) { 945 if (lastspace != -1 && used + i > 76) 946 break; 947 c = s.charAt(i); 948 if (c == ' ' || c == '\t') 949 if (!(lastc == ' ' || lastc == '\t')) 950 lastspace = i; 951 lastc = c; 952 } 953 if (lastspace == -1) { 954 // no space, use the whole thing 955 sb.append(s); 956 s = ""; 957 used = 0; 958 break; 959 } 960 sb.append(s.substring(0, lastspace)); 961 sb.append("\r\n"); 962 lastc = s.charAt(lastspace); 963 sb.append(lastc); 964 s = s.substring(lastspace + 1); 965 used = 1; 966 } 967 sb.append(s); 968 return sb.toString(); 969 } 970 971 /** 972 * Unfold a folded header. Any line breaks that aren't escaped and 973 * are followed by whitespace are removed. 
974 * 975 * @param s the string to unfold 976 * @return the unfolded string 977 */ 978 /*public*/ static String unfold(String s) { 979 if (!foldText) 980 return s; 981 982 StringBuilder sb = null; 983 int i; 984 while ((i = indexOfAny(s, "\r\n")) >= 0) { 985 int start = i; 986 int l = s.length(); 987 i++; // skip CR or NL 988 if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n') 989 i++; // skip LF 990 if (start == 0 || s.charAt(start - 1) != '\\') { 991 char c; 992 // if next line starts with whitespace, skip all of it 993 // XXX - always has to be true? 994 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) { 995 i++; // skip whitespace 996 while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) 997 i++; 998 if (sb == null) 999 sb = new StringBuilder(s.length()); 1000 if (start != 0) { 1001 sb.append(s.substring(0, start)); 1002 sb.append(' '); 1003 } 1004 s = s.substring(i); 1005 continue; 1006 } 1007 // it's not a continuation line, just leave it in 1008 if (sb == null) 1009 sb = new StringBuilder(s.length()); 1010 sb.append(s.substring(0, i)); 1011 s = s.substring(i); 1012 } else { 1013 // there's a backslash at "start - 1" 1014 // strip it out, but leave in the line break 1015 if (sb == null) 1016 sb = new StringBuilder(s.length()); 1017 sb.append(s.substring(0, start - 1)); 1018 sb.append(s.substring(start, i)); 1019 s = s.substring(i); 1020 } 1021 } 1022 if (sb != null) { 1023 sb.append(s); 1024 return sb.toString(); 1025 } else 1026 return s; 1027 } 1028 1029 /** 1030 * Return the first index of any of the characters in "any" in "s", 1031 * or -1 if none are found. 1032 * 1033 * This should be a method on String. 
1034 */ 1035 private static int indexOfAny(String s, String any) { 1036 return indexOfAny(s, any, 0); 1037 } 1038 1039 private static int indexOfAny(String s, String any, int start) { 1040 try { 1041 int len = s.length(); 1042 for (int i = start; i < len; i++) { 1043 if (any.indexOf(s.charAt(i)) >= 0) 1044 return i; 1045 } 1046 return -1; 1047 } catch (StringIndexOutOfBoundsException e) { 1048 return -1; 1049 } 1050 } 1051 1052 /** 1053 * Convert a MIME charset name into a valid Java charset name. <p> 1054 * 1055 * @param charset the MIME charset name 1056 * @return the Java charset equivalent. If a suitable mapping is 1057 * not available, the passed in charset is itself returned. 1058 */ 1059 public static String javaCharset(String charset) { 1060 if (mime2java == null || charset == null) 1061 // no mapping table, or charset parameter is null 1062 return charset; 1063 1064 String alias = mime2java.get(charset.toLowerCase()); 1065 return alias == null ? charset : alias; 1066 } 1067 1068 /** 1069 * Convert a java charset into its MIME charset name. <p> 1070 * 1071 * Note that a future version of JDK (post 1.2) might provide 1072 * this functionality, in which case, we may deprecate this 1073 * method then. 1074 * 1075 * @param charset the JDK charset 1076 * @return the MIME/IANA equivalent. If a mapping 1077 * is not possible, the passed in charset itself 1078 * is returned. 1079 * @since JavaMail 1.1 1080 */ 1081 public static String mimeCharset(String charset) { 1082 if (java2mime == null || charset == null) 1083 // no mapping table or charset param is null 1084 return charset; 1085 1086 String alias = java2mime.get(charset.toLowerCase()); 1087 return alias == null ? charset : alias; 1088 } 1089 1090 private static String defaultJavaCharset; 1091 private static String defaultMIMECharset; 1092 1093 /** 1094 * Get the default charset corresponding to the system's current 1095 * default locale. 
If the System property <code>mail.mime.charset</code> 1096 * is set, a system charset corresponding to this MIME charset will be 1097 * returned. <p> 1098 * 1099 * @return the default charset of the system's default locale, 1100 * as a Java charset. (NOT a MIME charset) 1101 * @since JavaMail 1.1 1102 */ 1103 public static String getDefaultJavaCharset() { 1104 if (defaultJavaCharset == null) { 1105 /* 1106 * If mail.mime.charset is set, it controls the default 1107 * Java charset as well. 1108 */ 1109 String mimecs = null; 1110 1111 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset"); 1112 1113 if (mimecs != null && mimecs.length() > 0) { 1114 defaultJavaCharset = javaCharset(mimecs); 1115 return defaultJavaCharset; 1116 } 1117 1118 try { 1119 defaultJavaCharset = System.getProperty("file.encoding", 1120 "8859_1"); 1121 } catch (SecurityException sex) { 1122 1123 class NullInputStream extends InputStream { 1124 @Override 1125 public int read() { 1126 return 0; 1127 } 1128 } 1129 InputStreamReader reader = 1130 new InputStreamReader(new NullInputStream()); 1131 defaultJavaCharset = reader.getEncoding(); 1132 if (defaultJavaCharset == null) 1133 defaultJavaCharset = "8859_1"; 1134 } 1135 } 1136 1137 return defaultJavaCharset; 1138 } 1139 1140 /* 1141 * Get the default MIME charset for this locale. 1142 */ 1143 static String getDefaultMIMECharset() { 1144 if (defaultMIMECharset == null) { 1145 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset"); 1146 } 1147 if (defaultMIMECharset == null) 1148 defaultMIMECharset = mimeCharset(getDefaultJavaCharset()); 1149 return defaultMIMECharset; 1150 } 1151 1152 // Tables to map MIME charset names to Java names and vice versa. 
1153 // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset 1154 private static Hashtable<String, String> mime2java; 1155 private static Hashtable<String, String> java2mime; 1156 1157 static { 1158 java2mime = new Hashtable<String, String>(40); 1159 mime2java = new Hashtable<String, String>(10); 1160 1161 try { 1162 // Use this class's classloader to load the mapping file 1163 // XXX - we should use SecuritySupport, but it's in another package 1164 InputStream is = 1165 com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream( 1166 "/META-INF/javamail.charset.map"); 1167 1168 if (is != null) { 1169 is = new LineInputStream(is); 1170 1171 // Load the JDK-to-MIME charset mapping table 1172 loadMappings((LineInputStream)is, java2mime); 1173 1174 // Load the MIME-to-JDK charset mapping table 1175 loadMappings((LineInputStream)is, mime2java); 1176 } 1177 } catch (Exception ex) { } 1178 1179 // If we didn't load the tables, e.g., because we didn't have 1180 // permission, load them manually. The entries here should be 1181 // the same as the default javamail.charset.map. 
1182 if (java2mime.isEmpty()) { 1183 java2mime.put("8859_1", "ISO-8859-1"); 1184 java2mime.put("iso8859_1", "ISO-8859-1"); 1185 java2mime.put("ISO8859-1", "ISO-8859-1"); 1186 1187 java2mime.put("8859_2", "ISO-8859-2"); 1188 java2mime.put("iso8859_2", "ISO-8859-2"); 1189 java2mime.put("ISO8859-2", "ISO-8859-2"); 1190 1191 java2mime.put("8859_3", "ISO-8859-3"); 1192 java2mime.put("iso8859_3", "ISO-8859-3"); 1193 java2mime.put("ISO8859-3", "ISO-8859-3"); 1194 1195 java2mime.put("8859_4", "ISO-8859-4"); 1196 java2mime.put("iso8859_4", "ISO-8859-4"); 1197 java2mime.put("ISO8859-4", "ISO-8859-4"); 1198 1199 java2mime.put("8859_5", "ISO-8859-5"); 1200 java2mime.put("iso8859_5", "ISO-8859-5"); 1201 java2mime.put("ISO8859-5", "ISO-8859-5"); 1202 1203 java2mime.put("8859_6", "ISO-8859-6"); 1204 java2mime.put("iso8859_6", "ISO-8859-6"); 1205 java2mime.put("ISO8859-6", "ISO-8859-6"); 1206 1207 java2mime.put("8859_7", "ISO-8859-7"); 1208 java2mime.put("iso8859_7", "ISO-8859-7"); 1209 java2mime.put("ISO8859-7", "ISO-8859-7"); 1210 1211 java2mime.put("8859_8", "ISO-8859-8"); 1212 java2mime.put("iso8859_8", "ISO-8859-8"); 1213 java2mime.put("ISO8859-8", "ISO-8859-8"); 1214 1215 java2mime.put("8859_9", "ISO-8859-9"); 1216 java2mime.put("iso8859_9", "ISO-8859-9"); 1217 java2mime.put("ISO8859-9", "ISO-8859-9"); 1218 1219 java2mime.put("SJIS", "Shift_JIS"); 1220 java2mime.put("MS932", "Shift_JIS"); 1221 java2mime.put("JIS", "ISO-2022-JP"); 1222 java2mime.put("ISO2022JP", "ISO-2022-JP"); 1223 java2mime.put("EUC_JP", "euc-jp"); 1224 java2mime.put("KOI8_R", "koi8-r"); 1225 java2mime.put("EUC_CN", "euc-cn"); 1226 java2mime.put("EUC_TW", "euc-tw"); 1227 java2mime.put("EUC_KR", "euc-kr"); 1228 } 1229 if (mime2java.isEmpty()) { 1230 mime2java.put("iso-2022-cn", "ISO2022CN"); 1231 mime2java.put("iso-2022-kr", "ISO2022KR"); 1232 mime2java.put("utf-8", "UTF8"); 1233 mime2java.put("utf8", "UTF8"); 1234 mime2java.put("ja_jp.iso2022-7", "ISO2022JP"); 1235 mime2java.put("ja_jp.eucjp", "EUCJIS"); 
1236 mime2java.put("euc-kr", "KSC5601"); 1237 mime2java.put("euckr", "KSC5601"); 1238 mime2java.put("us-ascii", "ISO-8859-1"); 1239 mime2java.put("x-us-ascii", "ISO-8859-1"); 1240 } 1241 } 1242 1243 private static void loadMappings(LineInputStream is, Hashtable<String, String> table) { 1244 String currLine; 1245 1246 while (true) { 1247 try { 1248 currLine = is.readLine(); 1249 } catch (IOException ioex) { 1250 break; // error in reading, stop 1251 } 1252 1253 if (currLine == null) // end of file, stop 1254 break; 1255 if (currLine.startsWith("--") && currLine.endsWith("--")) 1256 // end of this table 1257 break; 1258 1259 // ignore empty lines and comments 1260 if (currLine.trim().length() == 0 || currLine.startsWith("#")) 1261 continue; 1262 1263 // A valid entry is of the form <key><separator><value> 1264 // where, <separator> := SPACE | HT. Parse this 1265 StringTokenizer tk = new StringTokenizer(currLine, " \t"); 1266 try { 1267 String key = tk.nextToken(); 1268 String value = tk.nextToken(); 1269 table.put(key.toLowerCase(), value); 1270 } catch (NoSuchElementException nex) { } 1271 } 1272 } 1273 1274 static final int ALL_ASCII = 1; 1275 static final int MOSTLY_ASCII = 2; 1276 static final int MOSTLY_NONASCII = 3; 1277 1278 /** 1279 * Check if the given string contains non US-ASCII characters. 1280 * @param s string 1281 * @return ALL_ASCII if all characters in the string 1282 * belong to the US-ASCII charset. MOSTLY_ASCII 1283 * if more than half of the available characters 1284 * are US-ASCII characters. Else MOSTLY_NONASCII. 
1285 */ 1286 static int checkAscii(String s) { 1287 int ascii = 0, non_ascii = 0; 1288 int l = s.length(); 1289 1290 for (int i = 0; i < l; i++) { 1291 if (nonascii(s.charAt(i))) // non-ascii 1292 non_ascii++; 1293 else 1294 ascii++; 1295 } 1296 1297 if (non_ascii == 0) 1298 return ALL_ASCII; 1299 if (ascii > non_ascii) 1300 return MOSTLY_ASCII; 1301 1302 return MOSTLY_NONASCII; 1303 } 1304 1305 /** 1306 * Check if the given byte array contains non US-ASCII characters. 1307 * @param b byte array 1308 * @return ALL_ASCII if all characters in the string 1309 * belong to the US-ASCII charset. MOSTLY_ASCII 1310 * if more than half of the available characters 1311 * are US-ASCII characters. Else MOSTLY_NONASCII. 1312 * 1313 * XXX - this method is no longer used 1314 */ 1315 static int checkAscii(byte[] b) { 1316 int ascii = 0, non_ascii = 0; 1317 1318 for (int i=0; i < b.length; i++) { 1319 // The '&' operator automatically causes b[i] to be promoted 1320 // to an int, and we mask out the higher bytes in the int 1321 // so that the resulting value is not a negative integer. 1322 if (nonascii(b[i] & 0xff)) // non-ascii 1323 non_ascii++; 1324 else 1325 ascii++; 1326 } 1327 1328 if (non_ascii == 0) 1329 return ALL_ASCII; 1330 if (ascii > non_ascii) 1331 return MOSTLY_ASCII; 1332 1333 return MOSTLY_NONASCII; 1334 } 1335 1336 /** 1337 * Check if the given input stream contains non US-ASCII characters. 1338 * Upto <code>max</code> bytes are checked. If <code>max</code> is 1339 * set to <code>ALL</code>, then all the bytes available in this 1340 * input stream are checked. If <code>breakOnNonAscii</code> is true 1341 * the check terminates when the first non-US-ASCII character is 1342 * found and MOSTLY_NONASCII is returned. Else, the check continues 1343 * till <code>max</code> bytes or till the end of stream. 1344 * 1345 * @param is the input stream 1346 * @param max maximum bytes to check for. 
The special value 1347 * ALL indicates that all the bytes in this input 1348 * stream must be checked. 1349 * @param breakOnNonAscii if <code>true</code>, then terminate the 1350 * the check when the first non-US-ASCII character 1351 * is found. 1352 * @return ALL_ASCII if all characters in the string 1353 * belong to the US-ASCII charset. MOSTLY_ASCII 1354 * if more than half of the available characters 1355 * are US-ASCII characters. Else MOSTLY_NONASCII. 1356 */ 1357 static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) { 1358 int ascii = 0, non_ascii = 0; 1359 int len; 1360 int block = 4096; 1361 int linelen = 0; 1362 boolean longLine = false, badEOL = false; 1363 boolean checkEOL = encodeEolStrict && breakOnNonAscii; 1364 byte buf[] = null; 1365 if (max != 0) { 1366 block = (max == ALL) ? 4096 : Math.min(max, 4096); 1367 buf = new byte[block]; 1368 } 1369 while (max != 0) { 1370 try { 1371 if ((len = is.read(buf, 0, block)) == -1) 1372 break; 1373 int lastb = 0; 1374 for (int i = 0; i < len; i++) { 1375 // The '&' operator automatically causes b[i] to 1376 // be promoted to an int, and we mask out the higher 1377 // bytes in the int so that the resulting value is 1378 // not a negative integer. 1379 int b = buf[i] & 0xff; 1380 if (checkEOL && 1381 ((lastb == '\r' && b != '\n') || 1382 (lastb != '\r' && b == '\n'))) 1383 badEOL = true; 1384 if (b == '\r' || b == '\n') 1385 linelen = 0; 1386 else { 1387 linelen++; 1388 if (linelen > 998) // 1000 - CRLF 1389 longLine = true; 1390 } 1391 if (nonascii(b)) { // non-ascii 1392 if (breakOnNonAscii) // we are done 1393 return MOSTLY_NONASCII; 1394 else 1395 non_ascii++; 1396 } else 1397 ascii++; 1398 lastb = b; 1399 } 1400 } catch (IOException ioex) { 1401 break; 1402 } 1403 if (max != ALL) 1404 max -= len; 1405 } 1406 1407 if (max == 0 && breakOnNonAscii) 1408 // We have been told to break on the first non-ascii character. 
1409 // We haven't got any non-ascii character yet, but then we 1410 // have not checked all of the available bytes either. So we 1411 // cannot say for sure that this input stream is ALL_ASCII, 1412 // and hence we must play safe and return MOSTLY_NONASCII 1413 1414 return MOSTLY_NONASCII; 1415 1416 if (non_ascii == 0) { // no non-us-ascii characters so far 1417 // If we're looking at non-text data, and we saw CR without LF 1418 // or vice versa, consider this mostly non-ASCII so that it 1419 // will be base64 encoded (since the quoted-printable encoder 1420 // doesn't encode this case properly). 1421 if (badEOL) 1422 return MOSTLY_NONASCII; 1423 // if we've seen a long line, we degrade to mostly ascii 1424 else if (longLine) 1425 return MOSTLY_ASCII; 1426 else 1427 return ALL_ASCII; 1428 } 1429 if (ascii > non_ascii) // mostly ascii 1430 return MOSTLY_ASCII; 1431 return MOSTLY_NONASCII; 1432 } 1433 1434 static final boolean nonascii(int b) { 1435 return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t'); 1436 } 1437 } 1438 1439 /** 1440 * An OutputStream that determines whether the data written to 1441 * it is all ASCII, mostly ASCII, or mostly non-ASCII. 
1442 */ 1443 class AsciiOutputStream extends OutputStream { 1444 private boolean breakOnNonAscii; 1445 private int ascii = 0, non_ascii = 0; 1446 private int linelen = 0; 1447 private boolean longLine = false; 1448 private boolean badEOL = false; 1449 private boolean checkEOL = false; 1450 private int lastb = 0; 1451 private int ret = 0; 1452 1453 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) { 1454 this.breakOnNonAscii = breakOnNonAscii; 1455 checkEOL = encodeEolStrict && breakOnNonAscii; 1456 } 1457 1458 @Override 1459 public void write(int b) throws IOException { 1460 check(b); 1461 } 1462 1463 @Override 1464 public void write(byte b[]) throws IOException { 1465 write(b, 0, b.length); 1466 } 1467 1468 @Override 1469 public void write(byte b[], int off, int len) throws IOException { 1470 len += off; 1471 for (int i = off; i < len ; i++) 1472 check(b[i]); 1473 } 1474 1475 private final void check(int b) throws IOException { 1476 b &= 0xff; 1477 if (checkEOL && 1478 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n'))) 1479 badEOL = true; 1480 if (b == '\r' || b == '\n') 1481 linelen = 0; 1482 else { 1483 linelen++; 1484 if (linelen > 998) // 1000 - CRLF 1485 longLine = true; 1486 } 1487 if (MimeUtility.nonascii(b)) { // non-ascii 1488 non_ascii++; 1489 if (breakOnNonAscii) { // we are done 1490 ret = MimeUtility.MOSTLY_NONASCII; 1491 throw new EOFException(); 1492 } 1493 } else 1494 ascii++; 1495 lastb = b; 1496 } 1497 1498 /** 1499 * Return ASCII-ness of data stream. 1500 */ 1501 public int getAscii() { 1502 if (ret != 0) 1503 return ret; 1504 // If we're looking at non-text data, and we saw CR without LF 1505 // or vice versa, consider this mostly non-ASCII so that it 1506 // will be base64 encoded (since the quoted-printable encoder 1507 // doesn't encode this case properly). 
1508 if (badEOL) 1509 return MimeUtility.MOSTLY_NONASCII; 1510 else if (non_ascii == 0) { // no non-us-ascii characters so far 1511 // if we've seen a long line, we degrade to mostly ascii 1512 if (longLine) 1513 return MimeUtility.MOSTLY_ASCII; 1514 else 1515 return MimeUtility.ALL_ASCII; 1516 } 1517 if (ascii > non_ascii) // mostly ascii 1518 return MimeUtility.MOSTLY_ASCII; 1519 return MimeUtility.MOSTLY_NONASCII; 1520 } 1521 }