1 /*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
48 * There are a set of methods to encode and decode MIME headers as
49 * per RFC 2047. A brief description on handling such headers is
50 * given below: <p>
51 *
52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
53 * characters. Headers that contain non US-ASCII characters must be
54 * encoded so that they contain only US-ASCII characters. Basically,
55 * this process involves using either BASE64 or QP to encode certain
56 * characters. RFC 2047 describes this in detail. <p>
57 *
58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
59 * subset of Unicode (and occupies the range 0 - 127). A String
60 * that contains only ASCII characters is already mail-safe. If the
61 * String contains non US-ASCII characters, it must be encoded. An
62 * additional complexity in this step is that since Unicode is not
63 * yet a widely used charset, one might want to first charset-encode
64 * the String into another charset and then do the transfer-encoding.
65 * <p>
66 * Note that to get the actual bytes of a mail-safe String (say,
67 * for sending over SMTP), one must do
68 * <p><blockquote><pre>
69 *
70 * byte[] bytes = string.getBytes("iso-8859-1");
71 *
72 * </pre></blockquote><p>
73 *
74 * The <code>setHeader</code> and <code>addHeader</code> methods
75 * on MimeMessage and MimeBodyPart assume that the given header values
76 * are Unicode strings that contain only US-ASCII characters. Hence
77 * the callers of those methods must ensure that the values they pass
78 * do not contain non US-ASCII characters. The methods in this class
79 * help do this. <p>
80 *
81 * The <code>getHeader</code> family of methods on MimeMessage and
82 * MimeBodyPart return the raw header value. These might be encoded
83 * as per RFC 2047, and if so, must be decoded into Unicode Strings.
84 * The methods in this class help to do this. <p>
85 *
86 * Several System properties control strict conformance to the MIME
87 * spec. Note that these are not session properties but must be set
88 * globally as System properties. <p>
89 *
90 * The <code>mail.mime.decodetext.strict</code> property controls
91 * decoding of MIME encoded words. The MIME spec requires that encoded
92 * words start at the beginning of a whitespace separated word. Some
205
206 // Close the input stream
207 try {
208 is.close();
209 } catch (IOException ioex) { }
210
211 return encoding;
212 }
213
214 /**
215 * Same as <code>getEncoding(DataSource)</code> except that instead
216 * of reading the data from an <code>InputStream</code> it uses the
217 * <code>writeTo</code> method to examine the data. This is more
218 * efficient in the common case of a <code>DataHandler</code>
219 * created with an object and a MIME type (for example, a
220 * "text/plain" String) because all the I/O is done in this
221 * thread. In the case requiring an <code>InputStream</code> the
222 * <code>DataHandler</code> uses a thread, a pair of pipe streams,
223 * and the <code>writeTo</code> method to produce the data. <p>
224 *
225 * @since JavaMail 1.2
226 */
227 public static String getEncoding(DataHandler dh) {
228 ContentType cType = null;
229 String encoding = null;
230
231 /*
232 * Try to pick the most efficient means of determining the
233 * encoding. If this DataHandler was created using a DataSource,
234 * the getEncoding(DataSource) method is typically faster. If
235 * the DataHandler was created with an object, this method is
236 * much faster. To distinguish the two cases, we use a heuristic.
237 * A DataHandler created with an object will always have a null name.
238 * A DataHandler created with a DataSource will usually have a
239 * non-null name.
240 *
241 * XXX - This is actually quite a disgusting hack, but it makes
242 * a common case run over twice as fast.
243 */
244 if (dh.getName() != null)
277 } catch (IOException ex) { } // ignore it
278 if (aos.getAscii() == ALL_ASCII) // all ascii
279 encoding = "7bit";
280 else // found atleast one non-ascii character, use b64
281 encoding = "base64";
282 }
283
284 return encoding;
285 }
286
287 /**
288 * Decode the given input stream. The Input stream returned is
289 * the decoded input stream. All the encodings defined in RFC 2045
290 * are supported here. They include "base64", "quoted-printable",
291 * "7bit", "8bit", and "binary". In addition, "uuencode" is also
292 * supported.
293 *
294 * @param is input stream
295 * @param encoding the encoding of the stream.
296 * @return decoded input stream.
297 */
298 public static InputStream decode(InputStream is, String encoding)
299 throws MessagingException {
300 if (encoding.equalsIgnoreCase("base64"))
301 return new BASE64DecoderStream(is);
302 else if (encoding.equalsIgnoreCase("quoted-printable"))
303 return new QPDecoderStream(is);
304 else if (encoding.equalsIgnoreCase("uuencode") ||
305 encoding.equalsIgnoreCase("x-uuencode") ||
306 encoding.equalsIgnoreCase("x-uue"))
307 return new UUDecoderStream(is);
308 else if (encoding.equalsIgnoreCase("binary") ||
309 encoding.equalsIgnoreCase("7bit") ||
310 encoding.equalsIgnoreCase("8bit"))
311 return is;
312 else
313 throw new MessagingException("Unknown encoding: " + encoding);
314 }
315
316 /**
317 * Wrap an encoder around the given output stream.
318 * All the encodings defined in RFC 2045 are supported here.
319 * They include "base64", "quoted-printable", "7bit", "8bit" and
320 * "binary". In addition, "uuencode" is also supported.
321 *
322 * @param os output stream
323 * @param encoding the encoding of the stream.
324 * @return output stream that applies the
325 * specified encoding.
326 */
327 public static OutputStream encode(OutputStream os, String encoding)
328 throws MessagingException {
329 if (encoding == null)
330 return os;
331 else if (encoding.equalsIgnoreCase("base64"))
332 return new BASE64EncoderStream(os);
333 else if (encoding.equalsIgnoreCase("quoted-printable"))
334 return new QPEncoderStream(os);
335 else if (encoding.equalsIgnoreCase("uuencode") ||
336 encoding.equalsIgnoreCase("x-uuencode") ||
337 encoding.equalsIgnoreCase("x-uue"))
338 return new UUEncoderStream(os);
339 else if (encoding.equalsIgnoreCase("binary") ||
340 encoding.equalsIgnoreCase("7bit") ||
341 encoding.equalsIgnoreCase("8bit"))
342 return os;
343 else
344 throw new MessagingException("Unknown encoding: " +encoding);
345 }
346
347 /**
348 * Wrap an encoder around the given output stream.
349 * All the encodings defined in RFC 2045 are supported here.
350 * They include "base64", "quoted-printable", "7bit", "8bit" and
351 * "binary". In addition, "uuencode" is also supported.
352 * The <code>filename</code> parameter is used with the "uuencode"
353 * encoding and is included in the encoded output.
354 *
355 * @param os output stream
356 * @param encoding the encoding of the stream.
357 * @param filename name for the file being encoded (only used
358 * with uuencode)
359 * @return output stream that applies the
360 * specified encoding.
361 * @since JavaMail 1.2
362 */
363 public static OutputStream encode(OutputStream os, String encoding,
364 String filename)
365 throws MessagingException {
366 if (encoding == null)
367 return os;
368 else if (encoding.equalsIgnoreCase("base64"))
369 return new BASE64EncoderStream(os);
370 else if (encoding.equalsIgnoreCase("quoted-printable"))
371 return new QPEncoderStream(os);
372 else if (encoding.equalsIgnoreCase("uuencode") ||
373 encoding.equalsIgnoreCase("x-uuencode") ||
374 encoding.equalsIgnoreCase("x-uue"))
375 return new UUEncoderStream(os, filename);
376 else if (encoding.equalsIgnoreCase("binary") ||
377 encoding.equalsIgnoreCase("7bit") ||
378 encoding.equalsIgnoreCase("8bit"))
379 return os;
380 else
381 throw new MessagingException("Unknown encoding: " +encoding);
382 }
383
384 /**
385 * Encode a RFC 822 "text" token into mail-safe form as per
386 * RFC 2047. <p>
387 *
388 * The given Unicode string is examined for non US-ASCII
389 * characters. If the string contains only US-ASCII characters,
390 * it is returned as-is. If the string contains non US-ASCII
391 * characters, it is first character-encoded using the platform's
392 * default charset, then transfer-encoded using either the B or
393 * Q encoding. The resulting bytes are then returned as a Unicode
394 * string containing only ASCII characters. <p>
395 *
396 * Note that this method should be used to encode only
397 * "unstructured" RFC 822 headers. <p>
398 *
399 * Example of usage:
400 * <p><blockquote><pre>
401 *
402 * MimeBodyPart part = ...
403 * String rawvalue = "FooBar Mailer, Japanese version 1.1"
404 * try {
405 * // If we know for sure that rawvalue contains only US-ASCII
406 * // characters, we can skip the encoding part
407 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
408 * } catch (UnsupportedEncodingException e) {
409 * // encoding failure
410 * } catch (MessagingException me) {
411 * // setHeader() failure
412 * }
413 *
414 * </pre></blockquote><p>
415 *
416 * @param text unicode string
417 * @return Unicode string containing only US-ASCII characters
418 * @exception UnsupportedEncodingException if the encoding fails
419 */
420 public static String encodeText(String text)
421 throws UnsupportedEncodingException {
422 return encodeText(text, null, null);
423 }
424
425 /**
426 * Encode a RFC 822 "text" token into mail-safe form as per
427 * RFC 2047. <p>
428 *
429 * The given Unicode string is examined for non US-ASCII
430 * characters. If the string contains only US-ASCII characters,
431 * it is returned as-is. If the string contains non US-ASCII
432 * characters, it is first character-encoded using the specified
433 * charset, then transfer-encoded using either the B or Q encoding.
434 * The resulting bytes are then returned as a Unicode string
435 * containing only ASCII characters. <p>
436 *
437 * Note that this method should be used to encode only
438 * "unstructured" RFC 822 headers.
439 *
440 * @param text the header value
441 * @param charset the charset. If this parameter is null, the
442 * platform's default chatset is used.
443 * @param encoding the encoding to be used. Currently supported
444 * values are "B" and "Q". If this parameter is null, then
445 * the "Q" encoding is used if most of characters to be
446 * encoded are in the ASCII charset, otherwise "B" encoding
447 * is used.
448 * @return Unicode string containing only US-ASCII characters
449 */
450 public static String encodeText(String text, String charset,
451 String encoding)
452 throws UnsupportedEncodingException {
453 return encodeWord(text, charset, encoding, false);
454 }
455
456 /**
457 * Decode "unstructured" headers, that is, headers that are defined
458 * as '*text' as per RFC 822. <p>
459 *
460 * The string is decoded using the algorithm specified in
461 * RFC 2047, Section 6.1.1. If the charset-conversion fails
462 * for any sequence, an UnsupportedEncodingException is thrown.
463 * If the String is not an RFC 2047 style encoded header, it is
464 * returned as-is <p>
465 *
466 * Example of usage:
467 * <p><blockquote><pre>
468 *
469 * MimeBodyPart part = ...
470 * String rawvalue = null;
471 * String value = null;
472 * try {
473 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
474 * value = MimeUtility.decodeText(rawvalue);
475 * } catch (UnsupportedEncodingException e) {
476 * // Don't care
477 * value = rawvalue;
478 * } catch (MessagingException me) { }
479 *
480 * return value;
481 *
482 * </pre></blockquote><p>
483 *
484 * @param etext the possibly encoded value
485 * @exception UnsupportedEncodingException if the charset
486 * conversion failed.
487 */
488 public static String decodeText(String etext)
489 throws UnsupportedEncodingException {
490 /*
491 * We look for sequences separated by "linear-white-space".
492 * (as per RFC 2047, Section 6.1.1)
493 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
494 */
495 String lwsp = " \t\n\r";
496 StringTokenizer st;
497
498 /*
499 * First, lets do a quick run thru the string and check
500 * whether the sequence "=?" exists at all. If none exists,
501 * we know there are no encoded-words in here and we can just
502 * return the string as-is, without suffering thru the later
503 * decoding logic.
504 * This handles the most common case of unencoded headers
551 }
552 return sb.toString();
553 }
554
555 /**
556 * Encode a RFC 822 "word" token into mail-safe form as per
557 * RFC 2047. <p>
558 *
559 * The given Unicode string is examined for non US-ASCII
560 * characters. If the string contains only US-ASCII characters,
561 * it is returned as-is. If the string contains non US-ASCII
562 * characters, it is first character-encoded using the platform's
563 * default charset, then transfer-encoded using either the B or
564 * Q encoding. The resulting bytes are then returned as a Unicode
565 * string containing only ASCII characters. <p>
566 *
567 * This method is meant to be used when creating RFC 822 "phrases".
568 * The InternetAddress class, for example, uses this to encode
569 * it's 'phrase' component.
570 *
571 * @param text unicode string
572 * @return Array of Unicode strings containing only US-ASCII
573 * characters.
574 * @exception UnsupportedEncodingException if the encoding fails
575 */
576 public static String encodeWord(String word)
577 throws UnsupportedEncodingException {
578 return encodeWord(word, null, null);
579 }
580
581 /**
582 * Encode a RFC 822 "word" token into mail-safe form as per
583 * RFC 2047. <p>
584 *
585 * The given Unicode string is examined for non US-ASCII
586 * characters. If the string contains only US-ASCII characters,
587 * it is returned as-is. If the string contains non US-ASCII
588 * characters, it is first character-encoded using the specified
589 * charset, then transfer-encoded using either the B or Q encoding.
590 * The resulting bytes are then returned as a Unicode string
591 * containing only ASCII characters. <p>
592 *
593 * @param text unicode string
594 * @param charset the MIME charset
595 * @param encoding the encoding to be used. Currently supported
596 * values are "B" and "Q". If this parameter is null, then
597 * the "Q" encoding is used if most of characters to be
598 * encoded are in the ASCII charset, otherwise "B" encoding
599 * is used.
600 * @return Unicode string containing only US-ASCII characters
601 * @exception UnsupportedEncodingException if the encoding fails
602 */
603 public static String encodeWord(String word, String charset,
604 String encoding)
605 throws UnsupportedEncodingException {
606 return encodeWord(word, charset, encoding, true);
607 }
608
609 /*
610 * Encode the given string. The parameter 'encodingWord' should
611 * be true if a RFC 822 "word" token is being encoded and false if a
612 * RFC 822 "text" token is being encoded. This is because the
613 * "Q" encoding defined in RFC 2047 has more restrictions when
703 if (foldEncodedWords)
704 buf.append("\r\n "); // start a continuation line
705 else
706 buf.append(" "); // line will be folded later
707
708 buf.append(prefix);
709 for (int i = 0; i < encodedBytes.length; i++)
710 buf.append((char)encodedBytes[i]);
711 buf.append("?="); // terminate the current sequence
712 }
713 }
714
715 /**
716 * The string is parsed using the rules in RFC 2047 for parsing
717 * an "encoded-word". If the parse fails, a ParseException is
718 * thrown. Otherwise, it is transfer-decoded, and then
719 * charset-converted into Unicode. If the charset-conversion
720 * fails, an UnsupportedEncodingException is thrown.<p>
721 *
722 * @param eword the possibly encoded value
723 * @exception ParseException if the string is not an
724 * encoded-word as per RFC 2047.
725 * @exception UnsupportedEncodingException if the charset
726 * conversion failed.
727 */
728 public static String decodeWord(String eword)
729 throws ParseException, UnsupportedEncodingException {
730
731 if (!eword.startsWith("=?")) // not an encoded word
732 throw new ParseException();
733
734 // get charset
735 int start = 2; int pos;
736 if ((pos = eword.indexOf('?', start)) == -1)
737 throw new ParseException();
738 String charset = javaCharset(eword.substring(start, pos));
739
740 // get encoding
741 start = pos+1;
742 if ((pos = eword.indexOf('?', start)) == -1)
830 if (start == 0)
831 return word;
832 if (start < word.length())
833 buf.append(word.substring(start));
834 return buf.toString();
835 }
836
837 /**
838 * A utility method to quote a word, if the word contains any
839 * characters from the specified 'specials' list.<p>
840 *
841 * The <code>HeaderTokenizer</code> class defines two special
842 * sets of delimiters - MIME and RFC 822. <p>
843 *
844 * This method is typically used during the generation of
845 * RFC 822 and MIME header fields.
846 *
847 * @param word word to be quoted
848 * @param specials the set of special characters
849 * @return the possibly quoted word
850 * @see javax.mail.internet.HeaderTokenizer#MIME
851 * @see javax.mail.internet.HeaderTokenizer#RFC822
852 */
853 public static String quote(String word, String specials) {
854 int len = word.length();
855
856 /*
857 * Look for any "bad" characters, Escape and
858 * quote the entire string if necessary.
859 */
860 boolean needQuoting = false;
861 for (int i = 0; i < len; i++) {
862 char c = word.charAt(i);
863 if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
864 // need to escape them and then quote the whole string
865 StringBuilder sb = new StringBuilder(len + 3);
866 sb.append('"');
867 sb.append(word.substring(0, i));
868 int lastc = 0;
869 for (int j = i; j < len; j++) {
870 char cc = word.charAt(j);
871 if ((cc == '"') || (cc == '\\') ||
1094 if (defaultJavaCharset == null) {
1095 /*
1096 * If mail.mime.charset is set, it controls the default
1097 * Java charset as well.
1098 */
1099 String mimecs = null;
1100
1101 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
1102
1103 if (mimecs != null && mimecs.length() > 0) {
1104 defaultJavaCharset = javaCharset(mimecs);
1105 return defaultJavaCharset;
1106 }
1107
1108 try {
1109 defaultJavaCharset = System.getProperty("file.encoding",
1110 "8859_1");
1111 } catch (SecurityException sex) {
1112
1113 class NullInputStream extends InputStream {
1114 public int read() {
1115 return 0;
1116 }
1117 }
1118 InputStreamReader reader =
1119 new InputStreamReader(new NullInputStream());
1120 defaultJavaCharset = reader.getEncoding();
1121 if (defaultJavaCharset == null)
1122 defaultJavaCharset = "8859_1";
1123 }
1124 }
1125
1126 return defaultJavaCharset;
1127 }
1128
1129 /*
1130 * Get the default MIME charset for this locale.
1131 */
1132 static String getDefaultMIMECharset() {
1133 if (defaultMIMECharset == null) {
1260 }
1261 }
1262
1263 static final int ALL_ASCII = 1;
1264 static final int MOSTLY_ASCII = 2;
1265 static final int MOSTLY_NONASCII = 3;
1266
1267 /**
1268 * Check if the given string contains non US-ASCII characters.
1269 * @param s string
1270 * @return ALL_ASCII if all characters in the string
1271 * belong to the US-ASCII charset. MOSTLY_ASCII
1272 * if more than half of the available characters
1273 * are US-ASCII characters. Else MOSTLY_NONASCII.
1274 */
1275 static int checkAscii(String s) {
1276 int ascii = 0, non_ascii = 0;
1277 int l = s.length();
1278
1279 for (int i = 0; i < l; i++) {
1280 if (nonascii((int)s.charAt(i))) // non-ascii
1281 non_ascii++;
1282 else
1283 ascii++;
1284 }
1285
1286 if (non_ascii == 0)
1287 return ALL_ASCII;
1288 if (ascii > non_ascii)
1289 return MOSTLY_ASCII;
1290
1291 return MOSTLY_NONASCII;
1292 }
1293
1294 /**
1295 * Check if the given byte array contains non US-ASCII characters.
1296 * @param b byte array
1297 * @return ALL_ASCII if all characters in the string
1298 * belong to the US-ASCII charset. MOSTLY_ASCII
1299 * if more than half of the available characters
1300 * are US-ASCII characters. Else MOSTLY_NONASCII.
1427
1428 /**
1429 * An OutputStream that determines whether the data written to
1430 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1431 */
1432 class AsciiOutputStream extends OutputStream {
1433 private boolean breakOnNonAscii;
1434 private int ascii = 0, non_ascii = 0;
1435 private int linelen = 0;
1436 private boolean longLine = false;
1437 private boolean badEOL = false;
1438 private boolean checkEOL = false;
1439 private int lastb = 0;
1440 private int ret = 0;
1441
1442 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1443 this.breakOnNonAscii = breakOnNonAscii;
1444 checkEOL = encodeEolStrict && breakOnNonAscii;
1445 }
1446
1447 public void write(int b) throws IOException {
1448 check(b);
1449 }
1450
1451 public void write(byte b[]) throws IOException {
1452 write(b, 0, b.length);
1453 }
1454
1455 public void write(byte b[], int off, int len) throws IOException {
1456 len += off;
1457 for (int i = off; i < len ; i++)
1458 check(b[i]);
1459 }
1460
1461 private final void check(int b) throws IOException {
1462 b &= 0xff;
1463 if (checkEOL &&
1464 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1465 badEOL = true;
1466 if (b == '\r' || b == '\n')
1467 linelen = 0;
1468 else {
1469 linelen++;
1470 if (linelen > 998) // 1000 - CRLF
1471 longLine = true;
1472 }
1473 if (MimeUtility.nonascii(b)) { // non-ascii
1474 non_ascii++;
|
1 /*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
48 * There are a set of methods to encode and decode MIME headers as
49 * per RFC 2047. A brief description on handling such headers is
50 * given below: <p>
51 *
52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
53 * characters. Headers that contain non US-ASCII characters must be
54 * encoded so that they contain only US-ASCII characters. Basically,
55 * this process involves using either BASE64 or QP to encode certain
56 * characters. RFC 2047 describes this in detail. <p>
57 *
58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
59 * subset of Unicode (and occupies the range 0 - 127). A String
60 * that contains only ASCII characters is already mail-safe. If the
61 * String contains non US-ASCII characters, it must be encoded. An
62 * additional complexity in this step is that since Unicode is not
63 * yet a widely used charset, one might want to first charset-encode
64 * the String into another charset and then do the transfer-encoding.
65 * <p>
66 * Note that to get the actual bytes of a mail-safe String (say,
67 * for sending over SMTP), one must do
68 * <blockquote><pre>
69 *
70 * byte[] bytes = string.getBytes("iso-8859-1");
71 *
72 * </pre></blockquote>
73 *
74 * The <code>setHeader</code> and <code>addHeader</code> methods
75 * on MimeMessage and MimeBodyPart assume that the given header values
76 * are Unicode strings that contain only US-ASCII characters. Hence
77 * the callers of those methods must ensure that the values they pass
78 * do not contain non US-ASCII characters. The methods in this class
79 * help do this. <p>
80 *
81 * The <code>getHeader</code> family of methods on MimeMessage and
82 * MimeBodyPart return the raw header value. These might be encoded
83 * as per RFC 2047, and if so, must be decoded into Unicode Strings.
84 * The methods in this class help to do this. <p>
85 *
86 * Several System properties control strict conformance to the MIME
87 * spec. Note that these are not session properties but must be set
88 * globally as System properties. <p>
89 *
90 * The <code>mail.mime.decodetext.strict</code> property controls
91 * decoding of MIME encoded words. The MIME spec requires that encoded
92 * words start at the beginning of a whitespace separated word. Some
205
206 // Close the input stream
207 try {
208 is.close();
209 } catch (IOException ioex) { }
210
211 return encoding;
212 }
213
214 /**
215 * Same as <code>getEncoding(DataSource)</code> except that instead
216 * of reading the data from an <code>InputStream</code> it uses the
217 * <code>writeTo</code> method to examine the data. This is more
218 * efficient in the common case of a <code>DataHandler</code>
219 * created with an object and a MIME type (for example, a
220 * "text/plain" String) because all the I/O is done in this
221 * thread. In the case requiring an <code>InputStream</code> the
222 * <code>DataHandler</code> uses a thread, a pair of pipe streams,
223 * and the <code>writeTo</code> method to produce the data. <p>
224 *
225 * @param dh data handler
226 *
227 * @return encoding
228 *
229 * @since JavaMail 1.2
230 */
231 public static String getEncoding(DataHandler dh) {
232 ContentType cType = null;
233 String encoding = null;
234
235 /*
236 * Try to pick the most efficient means of determining the
237 * encoding. If this DataHandler was created using a DataSource,
238 * the getEncoding(DataSource) method is typically faster. If
239 * the DataHandler was created with an object, this method is
240 * much faster. To distinguish the two cases, we use a heuristic.
241 * A DataHandler created with an object will always have a null name.
242 * A DataHandler created with a DataSource will usually have a
243 * non-null name.
244 *
245 * XXX - This is actually quite a disgusting hack, but it makes
246 * a common case run over twice as fast.
247 */
248 if (dh.getName() != null)
281 } catch (IOException ex) { } // ignore it
282 if (aos.getAscii() == ALL_ASCII) // all ascii
283 encoding = "7bit";
284 else // found atleast one non-ascii character, use b64
285 encoding = "base64";
286 }
287
288 return encoding;
289 }
290
291 /**
292 * Decode the given input stream. The Input stream returned is
293 * the decoded input stream. All the encodings defined in RFC 2045
294 * are supported here. They include "base64", "quoted-printable",
295 * "7bit", "8bit", and "binary". In addition, "uuencode" is also
296 * supported.
297 *
298 * @param is input stream
299 * @param encoding the encoding of the stream.
300 * @return decoded input stream.
301 * @exception MessagingException in case of error
302 */
303 public static InputStream decode(InputStream is, String encoding)
304 throws MessagingException {
305 if (encoding.equalsIgnoreCase("base64"))
306 return new BASE64DecoderStream(is);
307 else if (encoding.equalsIgnoreCase("quoted-printable"))
308 return new QPDecoderStream(is);
309 else if (encoding.equalsIgnoreCase("uuencode") ||
310 encoding.equalsIgnoreCase("x-uuencode") ||
311 encoding.equalsIgnoreCase("x-uue"))
312 return new UUDecoderStream(is);
313 else if (encoding.equalsIgnoreCase("binary") ||
314 encoding.equalsIgnoreCase("7bit") ||
315 encoding.equalsIgnoreCase("8bit"))
316 return is;
317 else
318 throw new MessagingException("Unknown encoding: " + encoding);
319 }
320
321 /**
322 * Wrap an encoder around the given output stream.
323 * All the encodings defined in RFC 2045 are supported here.
324 * They include "base64", "quoted-printable", "7bit", "8bit" and
325 * "binary". In addition, "uuencode" is also supported.
326 *
327 * @param os output stream
328 * @param encoding the encoding of the stream.
329 * @return output stream that applies the
330 * specified encoding.
331 * @exception MessagingException in case of error
332 */
333 public static OutputStream encode(OutputStream os, String encoding)
334 throws MessagingException {
335 if (encoding == null)
336 return os;
337 else if (encoding.equalsIgnoreCase("base64"))
338 return new BASE64EncoderStream(os);
339 else if (encoding.equalsIgnoreCase("quoted-printable"))
340 return new QPEncoderStream(os);
341 else if (encoding.equalsIgnoreCase("uuencode") ||
342 encoding.equalsIgnoreCase("x-uuencode") ||
343 encoding.equalsIgnoreCase("x-uue"))
344 return new UUEncoderStream(os);
345 else if (encoding.equalsIgnoreCase("binary") ||
346 encoding.equalsIgnoreCase("7bit") ||
347 encoding.equalsIgnoreCase("8bit"))
348 return os;
349 else
350 throw new MessagingException("Unknown encoding: " +encoding);
351 }
352
353 /**
354 * Wrap an encoder around the given output stream.
355 * All the encodings defined in RFC 2045 are supported here.
356 * They include "base64", "quoted-printable", "7bit", "8bit" and
357 * "binary". In addition, "uuencode" is also supported.
358 * The <code>filename</code> parameter is used with the "uuencode"
359 * encoding and is included in the encoded output.
360 *
361 * @param os output stream
362 * @param encoding the encoding of the stream.
363 * @param filename name for the file being encoded (only used
364 * with uuencode)
365 * @return output stream that applies the
366 * specified encoding.
367 * @exception MessagingException in case of error
368 * @since JavaMail 1.2
369 */
370 public static OutputStream encode(OutputStream os, String encoding,
371 String filename)
372 throws MessagingException {
373 if (encoding == null)
374 return os;
375 else if (encoding.equalsIgnoreCase("base64"))
376 return new BASE64EncoderStream(os);
377 else if (encoding.equalsIgnoreCase("quoted-printable"))
378 return new QPEncoderStream(os);
379 else if (encoding.equalsIgnoreCase("uuencode") ||
380 encoding.equalsIgnoreCase("x-uuencode") ||
381 encoding.equalsIgnoreCase("x-uue"))
382 return new UUEncoderStream(os, filename);
383 else if (encoding.equalsIgnoreCase("binary") ||
384 encoding.equalsIgnoreCase("7bit") ||
385 encoding.equalsIgnoreCase("8bit"))
386 return os;
387 else
388 throw new MessagingException("Unknown encoding: " +encoding);
389 }
390
391 /**
392 * Encode a RFC 822 "text" token into mail-safe form as per
393 * RFC 2047. <p>
394 *
395 * The given Unicode string is examined for non US-ASCII
396 * characters. If the string contains only US-ASCII characters,
397 * it is returned as-is. If the string contains non US-ASCII
398 * characters, it is first character-encoded using the platform's
399 * default charset, then transfer-encoded using either the B or
400 * Q encoding. The resulting bytes are then returned as a Unicode
401 * string containing only ASCII characters. <p>
402 *
403 * Note that this method should be used to encode only
404 * "unstructured" RFC 822 headers. <p>
405 *
406 * Example of usage:
407 * <blockquote><pre>
408 *
409 * MimeBodyPart part = ...
410 * String rawvalue = "FooBar Mailer, Japanese version 1.1"
411 * try {
412 * // If we know for sure that rawvalue contains only US-ASCII
413 * // characters, we can skip the encoding part
414 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
415 * } catch (UnsupportedEncodingException e) {
416 * // encoding failure
417 * } catch (MessagingException me) {
418 * // setHeader() failure
419 * }
420 *
421 * </pre></blockquote>
422 *
423 * @param text unicode string
424 * @return Unicode string containing only US-ASCII characters
425 * @exception UnsupportedEncodingException if the encoding fails
426 */
427 public static String encodeText(String text)
428 throws UnsupportedEncodingException {
429 return encodeText(text, null, null);
430 }
431
432 /**
433 * Encode a RFC 822 "text" token into mail-safe form as per
434 * RFC 2047. <p>
435 *
436 * The given Unicode string is examined for non US-ASCII
437 * characters. If the string contains only US-ASCII characters,
438 * it is returned as-is. If the string contains non US-ASCII
439 * characters, it is first character-encoded using the specified
440 * charset, then transfer-encoded using either the B or Q encoding.
441 * The resulting bytes are then returned as a Unicode string
442 * containing only ASCII characters. <p>
443 *
444 * Note that this method should be used to encode only
445 * "unstructured" RFC 822 headers.
446 *
447 * @param text the header value
448 * @param charset the charset. If this parameter is null, the
449 * platform's default chatset is used.
450 * @param encoding the encoding to be used. Currently supported
451 * values are "B" and "Q". If this parameter is null, then
452 * the "Q" encoding is used if most of characters to be
453 * encoded are in the ASCII charset, otherwise "B" encoding
454 * is used.
455 * @return Unicode string containing only US-ASCII characters
456 * @exception UnsupportedEncodingException in case of unsupported encoding
457 */
458 public static String encodeText(String text, String charset,
459 String encoding)
460 throws UnsupportedEncodingException {
461 return encodeWord(text, charset, encoding, false);
462 }
463
464 /**
465 * Decode "unstructured" headers, that is, headers that are defined
466 * as '*text' as per RFC 822. <p>
467 *
468 * The string is decoded using the algorithm specified in
469 * RFC 2047, Section 6.1.1. If the charset-conversion fails
470 * for any sequence, an UnsupportedEncodingException is thrown.
471 * If the String is not an RFC 2047 style encoded header, it is
472 * returned as-is. <p>
473 *
474 * Example of usage:
475 * <blockquote><pre>
476 *
477 * MimeBodyPart part = ...
478 * String rawvalue = null;
479 * String value = null;
480 * try {
481 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
482 * value = MimeUtility.decodeText(rawvalue);
483 * } catch (UnsupportedEncodingException e) {
484 * // Don't care
485 * value = rawvalue;
486 * } catch (MessagingException me) { }
487 *
488 * return value;
489 *
490 * </pre></blockquote>
491 *
492 * @param etext the possibly encoded value
493 * @return decoded text
494 * @exception UnsupportedEncodingException if the charset
495 * conversion failed.
496 */
497 public static String decodeText(String etext)
498 throws UnsupportedEncodingException {
499 /*
500 * We look for sequences separated by "linear-white-space".
501 * (as per RFC 2047, Section 6.1.1)
502 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
503 */
504 String lwsp = " \t\n\r";
505 StringTokenizer st;
506
507 /*
508 * First, lets do a quick run thru the string and check
509 * whether the sequence "=?" exists at all. If none exists,
510 * we know there are no encoded-words in here and we can just
511 * return the string as-is, without suffering thru the later
512 * decoding logic.
513 * This handles the most common case of unencoded headers
560 }
561 return sb.toString();
562 }
563
564 /**
565 * Encode a RFC 822 "word" token into mail-safe form as per
566 * RFC 2047. <p>
567 *
568 * The given Unicode string is examined for non US-ASCII
569 * characters. If the string contains only US-ASCII characters,
570 * it is returned as-is. If the string contains non US-ASCII
571 * characters, it is first character-encoded using the platform's
572 * default charset, then transfer-encoded using either the B or
573 * Q encoding. The resulting bytes are then returned as a Unicode
574 * string containing only ASCII characters. <p>
575 *
576 * This method is meant to be used when creating RFC 822 "phrases".
577 * The InternetAddress class, for example, uses this to encode
578 * it's 'phrase' component.
579 *
580 * @param word unicode string
581 * @return Array of Unicode strings containing only US-ASCII
582 * characters.
583 * @exception UnsupportedEncodingException if the encoding fails
584 */
585 public static String encodeWord(String word)
586 throws UnsupportedEncodingException {
587 return encodeWord(word, null, null);
588 }
589
590 /**
591 * Encode a RFC 822 "word" token into mail-safe form as per
592 * RFC 2047. <p>
593 *
594 * The given Unicode string is examined for non US-ASCII
595 * characters. If the string contains only US-ASCII characters,
596 * it is returned as-is. If the string contains non US-ASCII
597 * characters, it is first character-encoded using the specified
598 * charset, then transfer-encoded using either the B or Q encoding.
599 * The resulting bytes are then returned as a Unicode string
600 * containing only ASCII characters. <p>
601 *
602 * @param word unicode string
603 * @param charset the MIME charset
604 * @param encoding the encoding to be used. Currently supported
605 * values are "B" and "Q". If this parameter is null, then
606 * the "Q" encoding is used if most of characters to be
607 * encoded are in the ASCII charset, otherwise "B" encoding
608 * is used.
609 * @return Unicode string containing only US-ASCII characters
610 * @exception UnsupportedEncodingException if the encoding fails
611 */
612 public static String encodeWord(String word, String charset,
613 String encoding)
614 throws UnsupportedEncodingException {
615 return encodeWord(word, charset, encoding, true);
616 }
617
618 /*
619 * Encode the given string. The parameter 'encodingWord' should
620 * be true if a RFC 822 "word" token is being encoded and false if a
621 * RFC 822 "text" token is being encoded. This is because the
622 * "Q" encoding defined in RFC 2047 has more restrictions when
712 if (foldEncodedWords)
713 buf.append("\r\n "); // start a continuation line
714 else
715 buf.append(" "); // line will be folded later
716
717 buf.append(prefix);
718 for (int i = 0; i < encodedBytes.length; i++)
719 buf.append((char)encodedBytes[i]);
720 buf.append("?="); // terminate the current sequence
721 }
722 }
723
724 /**
725 * The string is parsed using the rules in RFC 2047 for parsing
726 * an "encoded-word". If the parse fails, a ParseException is
727 * thrown. Otherwise, it is transfer-decoded, and then
728 * charset-converted into Unicode. If the charset-conversion
729 * fails, an UnsupportedEncodingException is thrown.<p>
730 *
731 * @param eword the possibly encoded value
732 * @return decoded word
733 * @exception ParseException if the string is not an
734 * encoded-word as per RFC 2047.
735 * @exception UnsupportedEncodingException if the charset
736 * conversion failed.
737 */
738 public static String decodeWord(String eword)
739 throws ParseException, UnsupportedEncodingException {
740
741 if (!eword.startsWith("=?")) // not an encoded word
742 throw new ParseException();
743
744 // get charset
745 int start = 2; int pos;
746 if ((pos = eword.indexOf('?', start)) == -1)
747 throw new ParseException();
748 String charset = javaCharset(eword.substring(start, pos));
749
750 // get encoding
751 start = pos+1;
752 if ((pos = eword.indexOf('?', start)) == -1)
840 if (start == 0)
841 return word;
842 if (start < word.length())
843 buf.append(word.substring(start));
844 return buf.toString();
845 }
846
847 /**
848 * A utility method to quote a word, if the word contains any
849 * characters from the specified 'specials' list.<p>
850 *
851 * The <code>HeaderTokenizer</code> class defines two special
852 * sets of delimiters - MIME and RFC 822. <p>
853 *
854 * This method is typically used during the generation of
855 * RFC 822 and MIME header fields.
856 *
857 * @param word word to be quoted
858 * @param specials the set of special characters
859 * @return the possibly quoted word
860 * @see com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#MIME
861 * @see com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#RFC822
862 */
863 public static String quote(String word, String specials) {
864 int len = word.length();
865
866 /*
867 * Look for any "bad" characters, Escape and
868 * quote the entire string if necessary.
869 */
870 boolean needQuoting = false;
871 for (int i = 0; i < len; i++) {
872 char c = word.charAt(i);
873 if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
874 // need to escape them and then quote the whole string
875 StringBuilder sb = new StringBuilder(len + 3);
876 sb.append('"');
877 sb.append(word.substring(0, i));
878 int lastc = 0;
879 for (int j = i; j < len; j++) {
880 char cc = word.charAt(j);
881 if ((cc == '"') || (cc == '\\') ||
1104 if (defaultJavaCharset == null) {
1105 /*
1106 * If mail.mime.charset is set, it controls the default
1107 * Java charset as well.
1108 */
1109 String mimecs = null;
1110
1111 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
1112
1113 if (mimecs != null && mimecs.length() > 0) {
1114 defaultJavaCharset = javaCharset(mimecs);
1115 return defaultJavaCharset;
1116 }
1117
1118 try {
1119 defaultJavaCharset = System.getProperty("file.encoding",
1120 "8859_1");
1121 } catch (SecurityException sex) {
1122
1123 class NullInputStream extends InputStream {
1124 @Override
1125 public int read() {
1126 return 0;
1127 }
1128 }
1129 InputStreamReader reader =
1130 new InputStreamReader(new NullInputStream());
1131 defaultJavaCharset = reader.getEncoding();
1132 if (defaultJavaCharset == null)
1133 defaultJavaCharset = "8859_1";
1134 }
1135 }
1136
1137 return defaultJavaCharset;
1138 }
1139
1140 /*
1141 * Get the default MIME charset for this locale.
1142 */
1143 static String getDefaultMIMECharset() {
1144 if (defaultMIMECharset == null) {
1271 }
1272 }
1273
1274 static final int ALL_ASCII = 1; // checkAscii result: no non-ASCII character was found
1275 static final int MOSTLY_ASCII = 2; // checkAscii result: ASCII characters outnumber non-ASCII ones
1276 static final int MOSTLY_NONASCII = 3; // checkAscii result: non-ASCII characters are at least as many as ASCII ones
1277
1278 /**
1279 * Check if the given string contains non US-ASCII characters.
1280 * @param s string
1281 * @return ALL_ASCII if all characters in the string
1282 * belong to the US-ASCII charset. MOSTLY_ASCII
1283 * if more than half of the available characters
1284 * are US-ASCII characters. Else MOSTLY_NONASCII.
1285 */
1286 static int checkAscii(String s) {
1287 int ascii = 0, non_ascii = 0;
1288 int l = s.length();
1289
1290 for (int i = 0; i < l; i++) {
1291 if (nonascii(s.charAt(i))) // non-ascii
1292 non_ascii++;
1293 else
1294 ascii++;
1295 }
1296
1297 if (non_ascii == 0)
1298 return ALL_ASCII;
1299 if (ascii > non_ascii)
1300 return MOSTLY_ASCII;
1301
1302 return MOSTLY_NONASCII;
1303 }
1304
1305 /**
1306 * Check if the given byte array contains non US-ASCII characters.
1307 * @param b byte array
1308 * @return ALL_ASCII if all characters in the string
1309 * belong to the US-ASCII charset. MOSTLY_ASCII
1310 * if more than half of the available characters
1311 * are US-ASCII characters. Else MOSTLY_NONASCII.
1438
1439 /**
1440 * An OutputStream that determines whether the data written to
1441 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1442 */
1443 class AsciiOutputStream extends OutputStream {
1444 private boolean breakOnNonAscii;
1445 private int ascii = 0, non_ascii = 0;
1446 private int linelen = 0;
1447 private boolean longLine = false;
1448 private boolean badEOL = false;
1449 private boolean checkEOL = false;
1450 private int lastb = 0;
1451 private int ret = 0;
1452
1453 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1454 this.breakOnNonAscii = breakOnNonAscii;
1455 checkEOL = encodeEolStrict && breakOnNonAscii;
1456 }
1457
1458 @Override
1459 public void write(int b) throws IOException {
1460 check(b);
1461 }
1462
1463 @Override
1464 public void write(byte b[]) throws IOException {
1465 write(b, 0, b.length);
1466 }
1467
1468 @Override
1469 public void write(byte b[], int off, int len) throws IOException {
1470 len += off;
1471 for (int i = off; i < len ; i++)
1472 check(b[i]);
1473 }
1474
1475 private final void check(int b) throws IOException {
1476 b &= 0xff;
1477 if (checkEOL &&
1478 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1479 badEOL = true;
1480 if (b == '\r' || b == '\n')
1481 linelen = 0;
1482 else {
1483 linelen++;
1484 if (linelen > 998) // 1000 - CRLF
1485 longLine = true;
1486 }
1487 if (MimeUtility.nonascii(b)) { // non-ascii
1488 non_ascii++;
|