New jaxws/src/java.xml.ws/share/classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java

   1 /*
   2  * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * @(#)MimeUtility.java       1.45 03/03/10
  28  */
  29 
  30 
  31 
  32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
  33 
  34 import java.io.*;
  35 import java.util.*;
  36 
  37 import javax.activation.DataHandler;
  38 import javax.activation.DataSource;
  39 
  40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
  41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
  42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
  43 
  44 /**
  45  * This is a utility class that provides various MIME related
  46  * functionality. <p>
  47  *
  48  * There are a set of methods to encode and decode MIME headers as
  49  * per RFC 2047. A brief description on handling such headers is
  50  * given below: <p>
  51  *
  52  * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
  53  * characters. Headers that contain non US-ASCII characters must be
  54  * encoded so that they contain only US-ASCII characters. Basically,
  55  * this process involves using either BASE64 or QP to encode certain
  56  * characters. RFC 2047 describes this in detail. <p>
  57  *
  58  * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
  59  * subset of Unicode (and occupies the range 0 - 127). A String
  60  * that contains only ASCII characters is already mail-safe. If the
  61  * String contains non US-ASCII characters, it must be encoded. An
  62  * additional complexity in this step is that since Unicode is not
  63  * yet a widely used charset, one might want to first charset-encode
  64  * the String into another charset and then do the transfer-encoding.
  65  * <p>
  66  * Note that to get the actual bytes of a mail-safe String (say,
  67  * for sending over SMTP), one must do
  68  * <blockquote><pre>
  69  *
  70  *      byte[] bytes = string.getBytes("iso-8859-1");
  71  *
  72  * </pre></blockquote>
  73  *
  74  * The <code>setHeader</code> and <code>addHeader</code> methods
  75  * on MimeMessage and MimeBodyPart assume that the given header values
  76  * are Unicode strings that contain only US-ASCII characters. Hence
  77  * the callers of those methods must ensure that the values they pass
  78  * do not contain non US-ASCII characters. The methods in this class
  79  * help do this. <p>
  80  *
  81  * The <code>getHeader</code> family of methods on MimeMessage and
  82  * MimeBodyPart return the raw header value. These might be encoded
  83  * as per RFC 2047, and if so, must be decoded into Unicode Strings.
  84  * The methods in this class help to do this. <p>
  85  *
  86  * Several System properties control strict conformance to the MIME
  87  * spec.  Note that these are not session properties but must be set
  88  * globally as System properties. <p>
  89  *
  90  * The <code>mail.mime.decodetext.strict</code> property controls
  91  * decoding of MIME encoded words.  The MIME spec requires that encoded
  92  * words start at the beginning of a whitespace separated word.  Some
  93  * mailers incorrectly include encoded words in the middle of a word.
  94  * If the <code>mail.mime.decodetext.strict</code> System property is
  95  * set to <code>"false"</code>, an attempt will be made to decode these
  96  * illegal encoded words. The default is true. <p>
  97  *
  98  * The <code>mail.mime.encodeeol.strict</code> property controls the
  99  * choice of Content-Transfer-Encoding for MIME parts that are not of
 100  * type "text".  Often such parts will contain textual data for which
 101  * an encoding that allows normal end of line conventions is appropriate.
 102  * In rare cases, such a part will appear to contain entirely textual
 103  * data, but will require an encoding that preserves CR and LF characters
 104  * without change.  If the <code>mail.mime.encodeeol.strict</code>
 105  * System property is set to <code>"true"</code>, such an encoding will
 106  * be used when necessary.  The default is false. <p>
 107  *
 108  * In addition, the <code>mail.mime.charset</code> System property can
 109  * be used to specify the default MIME charset to use for encoded words
 110  * and text parts that don't otherwise specify a charset.  Normally, the
 111  * default MIME charset is derived from the default Java charset, as
 112  * specified in the <code>file.encoding</code> System property.  Most
 113  * applications will have no need to explicitly set the default MIME
 114  * charset.  In cases where the default MIME charset to be used for
 115  * mail messages is different than the charset used for files stored on
 116  * the system, this property should be set.
 117  *
 118  * @version 1.45, 03/03/10
 119  * @author  John Mani
 120  * @author  Bill Shannon
 121  */
 122 
 123 public class MimeUtility {
 124 
 125     // Holder of static utility methods only; the private constructor keeps it from being instantiated.
 126     private MimeUtility() { }
 127 
 128     public static final int ALL = -1; // handed to checkAscii by getEncoding(DataSource); presumably "examine every byte" - confirm
 129 
 130     private static final int BUFFER_SIZE = 1024; // initial size of the byte buffer used by doEncode
 131     private static boolean decodeStrict = true; // mail.mime.decodetext.strict, read in the static initializer
 132     private static boolean encodeEolStrict = false; // mail.mime.encodeeol.strict, read in the static initializer
 133     private static boolean foldEncodedWords = false; // mail.mime.foldencodedwords, read in the static initializer
 134     private static boolean foldText = true; // mail.mime.foldtext, read in the static initializer
 135 
 136     static {
 137         // Each flag keeps its default if its property cannot be read.
 138         try {
 139             String strictDecode =
 140                 SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
 141             decodeStrict = !"false".equalsIgnoreCase(strictDecode);
 142             String strictEol =
 143                 SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
 144             encodeEolStrict = "true".equalsIgnoreCase(strictEol);
 145             String foldWords =
 146                 SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
 147             foldEncodedWords = "true".equalsIgnoreCase(foldWords);
 148             String fold = SAAJUtil.getSystemProperty("mail.mime.foldtext");
 149             foldText = !"false".equalsIgnoreCase(fold);
 150         } catch (SecurityException ignored) {
 151             // property access denied: keep the values set so far
 152         }
 153     }
 154 
 155 
 156     /**
 157      * Get the content-transfer-encoding that should be applied
 158      * to the input stream of this datasource, to make it mailsafe. <p>
 159      *
 160      * The algorithm used here is: <br>
 161      * <ul>
 162      * <li>
 163      * If the primary type of this datasource is "text" and if all
 164      * the bytes in its input stream are US-ASCII, then the encoding
 165      * is "7bit". If more than half of the bytes are non-US-ASCII, then
 166      * the encoding is "base64". If less than half of the bytes are
 167      * non-US-ASCII, then the encoding is "quoted-printable".
 168      * <li>
 169      * If the primary type of this datasource is not "text", then if
 170      * all the bytes of its input stream are US-ASCII, the encoding
 171      * is "7bit". If there is even one non-US-ASCII character, the
 172      * encoding is "base64".
 173      * </ul>
 174      *
 175      * @param   ds      DataSource
 176      * @return          the encoding. This is either "7bit",
 177      *                  "quoted-printable" or "base64"
 178      */
 179     public static String getEncoding(DataSource ds) {
 180         ContentType contentType;
 181         InputStream in;
 182         try {
 183             contentType = new ContentType(ds.getContentType());
 184             in = ds.getInputStream();
 185         } catch (Exception ex) {
 186             // neither the type nor the data can be inspected:
 187             // fall back to base64 (what else ?!)
 188             return "base64";
 189         }
 190 
 191         // if not text, stop processing as soon as a non-ASCII
 192         // byte is seen
 193         boolean stopOnNonAscii = !contentType.match("text/*");
 194         int kind = checkAscii(in, ALL, stopOnNonAscii);
 195 
 196         String result;
 197         if (kind == ALL_ASCII) {
 198             result = "7bit";
 199         } else if (kind == MOSTLY_ASCII) {
 200             result = "quoted-printable";
 201         } else {
 202             result = "base64"; // mostly binary
 203         }
 204 
 205         try {
 206             in.close(); // the stream was opened here, so close it here
 207         } catch (IOException ignored) {
 208             // a failed close does not change the answer
 209         }
 210 
 211         return result;
 212     }
 213 
 214     /**
 215      * Same as <code>getEncoding(DataSource)</code> except that instead
 216      * of reading the data from an <code>InputStream</code> it uses the
 217      * <code>writeTo</code> method to examine the data.  This is more
 218      * efficient in the common case of a <code>DataHandler</code>
 219      * created with an object and a MIME type (for example, a
 220      * "text/plain" String) because all the I/O is done in this
 221      * thread.  In the case requiring an <code>InputStream</code> the
 222      * <code>DataHandler</code> uses a thread, a pair of pipe streams,
 223      * and the <code>writeTo</code> method to produce the data. <p>
 224      *
 225      * @param dh data handler
 226      *
 227      * @return encoding
 228      *
 229      * @since   JavaMail 1.2
 230      */
 231     public static String getEncoding(DataHandler dh) {
 232         ContentType cType = null;
 233         String encoding = null;
 234 
 235         /*
 236          * Try to pick the most efficient means of determining the
 237          * encoding.  If this DataHandler was created using a DataSource,
 238          * the getEncoding(DataSource) method is typically faster.  If
 239          * the DataHandler was created with an object, this method is
 240          * much faster.  To distinguish the two cases, we use a heuristic.
 241          * A DataHandler created with an object will always have a null name.
 242          * A DataHandler created with a DataSource will usually have a
 243          * non-null name.
 244          *
 245          * XXX - This is actually quite a disgusting hack, but it makes
 246          *       a common case run over twice as fast.
 247          */
 248         if (dh.getName() != null)
 249             return getEncoding(dh.getDataSource());
 250 
 251         try {
 252             cType = new ContentType(dh.getContentType());
 253         } catch (Exception ex) {
 254             return "base64"; // what else ?!
 255         }
 256 
 257         if (cType.match("text/*")) {
 258             // Check all of the available bytes
 259             AsciiOutputStream aos = new AsciiOutputStream(false, false);
 260             try {
 261                 dh.writeTo(aos);
 262             } catch (IOException ex) { }        // ignore it
 263             switch (aos.getAscii()) {
 264             case ALL_ASCII:
 265                 encoding = "7bit"; // all ascii
 266                 break;
 267             case MOSTLY_ASCII:
 268                 encoding = "quoted-printable"; // mostly ascii
 269                 break;
 270             default:
 271                 encoding = "base64"; // mostly binary
 272                 break;
 273             }
 274         } else { // not "text"
 275             // Check all of available bytes, break out if we find
 276             // at least one non-US-ASCII character
 277             AsciiOutputStream aos =
 278                         new AsciiOutputStream(true, encodeEolStrict);
 279             try {
 280                 dh.writeTo(aos);
 281             } catch (IOException ex) { }        // ignore it
 282             if (aos.getAscii() == ALL_ASCII) // all ascii
 283                 encoding = "7bit";
 284             else // found atleast one non-ascii character, use b64
 285                 encoding = "base64";
 286         }
 287 
 288         return encoding;
 289     }
 290 
 291     /**
 292      * Decode the given input stream. The Input stream returned is
 293      * the decoded input stream. All the encodings defined in RFC 2045
 294      * are supported here. They include "base64", "quoted-printable",
 295      * "7bit", "8bit", and "binary". In addition, "uuencode" is also
 296      * supported.
 297      *
 298      * @param   is              input stream
 299      * @param   encoding        the encoding of the stream.
 300      * @return                  decoded input stream.
 301      * @exception MessagingException in case of error
 302      */
 303     public static InputStream decode(InputStream is, String encoding)
 304                 throws MessagingException {
 305         if (encoding.equalsIgnoreCase("base64"))
 306             return new BASE64DecoderStream(is);
 307         else if (encoding.equalsIgnoreCase("quoted-printable"))
 308             return new QPDecoderStream(is);
 309         else if (encoding.equalsIgnoreCase("uuencode") ||
 310                  encoding.equalsIgnoreCase("x-uuencode") ||
 311                  encoding.equalsIgnoreCase("x-uue"))
 312             return new UUDecoderStream(is);
 313         else if (encoding.equalsIgnoreCase("binary") ||
 314                  encoding.equalsIgnoreCase("7bit") ||
 315                  encoding.equalsIgnoreCase("8bit"))
 316             return is;
 317         else
 318             throw new MessagingException("Unknown encoding: " + encoding);
 319     }
 320 
 321     /**
 322      * Wrap an encoder around the given output stream.
 323      * All the encodings defined in RFC 2045 are supported here.
 324      * They include "base64", "quoted-printable", "7bit", "8bit" and
 325      * "binary". In addition, "uuencode" is also supported.
 326      *
 327      * @param   os              output stream
 328      * @param   encoding        the encoding of the stream.
 329      * @return                  output stream that applies the
 330      *                          specified encoding.
 331      * @exception MessagingException in case of error
 332      */
 333     public static OutputStream encode(OutputStream os, String encoding)
 334                 throws MessagingException {
 335         if (encoding == null)
 336             return os;
 337         else if (encoding.equalsIgnoreCase("base64"))
 338             return new BASE64EncoderStream(os);
 339         else if (encoding.equalsIgnoreCase("quoted-printable"))
 340             return new QPEncoderStream(os);
 341         else if (encoding.equalsIgnoreCase("uuencode") ||
 342                  encoding.equalsIgnoreCase("x-uuencode") ||
 343                  encoding.equalsIgnoreCase("x-uue"))
 344             return new UUEncoderStream(os);
 345         else if (encoding.equalsIgnoreCase("binary") ||
 346                  encoding.equalsIgnoreCase("7bit") ||
 347                  encoding.equalsIgnoreCase("8bit"))
 348             return os;
 349         else
 350             throw new MessagingException("Unknown encoding: " +encoding);
 351     }
 352 
 353     /**
 354      * Wrap an encoder around the given output stream.
 355      * All the encodings defined in RFC 2045 are supported here.
 356      * They include "base64", "quoted-printable", "7bit", "8bit" and
 357      * "binary". In addition, "uuencode" is also supported.
 358      * The <code>filename</code> parameter is used with the "uuencode"
 359      * encoding and is included in the encoded output.
 360      *
 361      * @param   os              output stream
 362      * @param   encoding        the encoding of the stream.
 363      * @param   filename        name for the file being encoded (only used
 364      *                          with uuencode)
 365      * @return                  output stream that applies the
 366      *                          specified encoding.
 367      * @exception MessagingException in case of error
 368      * @since                   JavaMail 1.2
 369      */
 370     public static OutputStream encode(OutputStream os, String encoding,
 371                                       String filename)
 372                 throws MessagingException {
 373         if (encoding == null)
 374             return os;
 375         else if (encoding.equalsIgnoreCase("base64"))
 376             return new BASE64EncoderStream(os);
 377         else if (encoding.equalsIgnoreCase("quoted-printable"))
 378             return new QPEncoderStream(os);
 379         else if (encoding.equalsIgnoreCase("uuencode") ||
 380                  encoding.equalsIgnoreCase("x-uuencode") ||
 381                  encoding.equalsIgnoreCase("x-uue"))
 382             return new UUEncoderStream(os, filename);
 383         else if (encoding.equalsIgnoreCase("binary") ||
 384                  encoding.equalsIgnoreCase("7bit") ||
 385                  encoding.equalsIgnoreCase("8bit"))
 386             return os;
 387         else
 388             throw new MessagingException("Unknown encoding: " +encoding);
 389     }
 390 
 391     /**
 392      * Encode a RFC 822 "text" token into mail-safe form as per
 393      * RFC 2047. <p>
 394      *
 395      * The given Unicode string is examined for non US-ASCII
 396      * characters. If the string contains only US-ASCII characters,
 397      * it is returned as-is.  If the string contains non US-ASCII
 398      * characters, it is first character-encoded using the platform's
 399      * default charset, then transfer-encoded using either the B or
 400      * Q encoding. The resulting bytes are then returned as a Unicode
 401      * string containing only ASCII  characters. <p>
 402      *
 403      * Note that this method should be used to encode only
 404      * "unstructured" RFC 822 headers. <p>
 405      *
 406      * Example of usage:
 407      * <blockquote><pre>
 408      *
 409      *  MimeBodyPart part = ...
 410      *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
 411      *  try {
 412      *    // If we know for sure that rawvalue contains only US-ASCII
 413      *    // characters, we can skip the encoding part
 414      *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
 415      *  } catch (UnsupportedEncodingException e) {
 416      *    // encoding failure
 417      *  } catch (MessagingException me) {
 418      *   // setHeader() failure
 419      *  }
 420      *
 421      * </pre></blockquote>
 422      *
 423      * @param   text    unicode string
 424      * @return  Unicode string containing only US-ASCII characters
 425      * @exception UnsupportedEncodingException if the encoding fails
 426      */
 427     public static String encodeText(String text)
 428             throws UnsupportedEncodingException {
 429         return encodeWord(text, null, null, false); // "text" token, default charset, B/Q picked automatically
 430     }
 431 
 432     /**
 433      * Encode a RFC 822 "text" token into mail-safe form as per
 434      * RFC 2047. <p>
 435      *
 436      * The given Unicode string is examined for non US-ASCII
 437      * characters. If the string contains only US-ASCII characters,
 438      * it is returned as-is.  If the string contains non US-ASCII
 439      * characters, it is first character-encoded using the specified
 440      * charset, then transfer-encoded using either the B or Q encoding.
 441      * The resulting bytes are then returned as a Unicode string
 442      * containing only ASCII characters. <p>
 443      *
 444      * Note that this method should be used to encode only
 445      * "unstructured" RFC 822 headers.
 446      *
 447      * @param   text    the header value
 448      * @param   charset the charset. If this parameter is null, the
 449      *          platform's default charset is used.
 450      * @param   encoding the encoding to be used. Currently supported
 451      *          values are "B" and "Q". If this parameter is null, then
 452      *          the "Q" encoding is used if most of characters to be
 453      *          encoded are in the ASCII charset, otherwise "B" encoding
 454      *          is used.
 455      * @return  Unicode string containing only US-ASCII characters
 456      * @exception UnsupportedEncodingException in case of unsupported encoding
 457      */
 458     public static String encodeText(String text, String charset,
 459             String encoding) throws UnsupportedEncodingException {
 460         final boolean encodingWord = false; // a "text" token, not a "word"
 461         return encodeWord(text, charset, encoding, encodingWord);
 462     }
 463 
 464     /**
 465      * Decode "unstructured" headers, that is, headers that are defined
 466      * as '*text' as per RFC 822. <p>
 467      *
 468      * The string is decoded using the algorithm specified in
 469      * RFC 2047, Section 6.1.1. If the charset-conversion fails
 470      * for any sequence, an UnsupportedEncodingException is thrown.
 471      * If the String is not an RFC 2047 style encoded header, it is
 472      * returned as-is. <p>
 473      *
 474      * Example of usage:
 475      * <blockquote><pre>
 476      *
 477      *  MimeBodyPart part = ...
 478      *  String rawvalue = null;
 479      *  String  value = null;
 480      *  try {
 481      *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
 482      *      value = MimeUtility.decodeText(rawvalue);
 483      *  } catch (UnsupportedEncodingException e) {
 484      *      // Don't care
 485      *      value = rawvalue;
 486      *  } catch (MessagingException me) { }
 487      *
 488      *  return value;
 489      *
 490      * </pre></blockquote>
 491      *
 492      * @param   etext   the possibly encoded value
 493      * @return decoded text
 494      * @exception       UnsupportedEncodingException if the charset
 495      *                  conversion failed.
 496      */
 497     public static String decodeText(String etext)
 498                 throws UnsupportedEncodingException {
 499         /*
 500          * We look for sequences separated by "linear-white-space".
 501          * (as per RFC 2047, Section 6.1.1)
 502          * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
 503          */
 504         String lwsp = " \t\n\r";
 505         StringTokenizer st;
 506 
 507         /*
 508          * First, lets do a quick run thru the string and check
 509          * whether the sequence "=?"  exists at all. If none exists,
 510          * we know there are no encoded-words in here and we can just
 511          * return the string as-is, without suffering thru the later
 512          * decoding logic.
 513          * This handles the most common case of unencoded headers
 514          * efficiently.
 515          */
 516         if (etext.indexOf("=?") == -1)
 517             return etext;
 518 
 519         // Encoded words found. Start decoding ...
 520 
 521         st = new StringTokenizer(etext, lwsp, true);
 522         StringBuilder sb = new StringBuilder();  // decode buffer
 523         StringBuilder wsb = new StringBuilder(); // white space buffer
 524         boolean prevWasEncoded = false;
 525 
 526         while (st.hasMoreTokens()) {
 527             char c;
 528             String s = st.nextToken();
 529             // If whitespace, append it to the whitespace buffer
 530             if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
 531                 (c == '\r') || (c == '\n'))
 532                 wsb.append(c);
 533             else {
 534                 // Check if token is an 'encoded-word' ..
 535                 String word;
 536                 try {
 537                     word = decodeWord(s);
 538                     // Yes, this IS an 'encoded-word'.
 539                     if (!prevWasEncoded && wsb.length() > 0) {
 540                         // if the previous word was also encoded, we
 541                         // should ignore the collected whitespace. Else
 542                         // we include the whitespace as well.
 543                         sb.append(wsb);
 544                     }
 545                     prevWasEncoded = true;
 546                 } catch (ParseException pex) {
 547                     // This is NOT an 'encoded-word'.
 548                     word = s;
 549                     // possibly decode inner encoded words
 550                     if (!decodeStrict)
 551                         word = decodeInnerWords(word);
 552                     // include colleced whitespace ..
 553                     if (wsb.length() > 0)
 554                         sb.append(wsb);
 555                     prevWasEncoded = false;
 556                 }
 557                 sb.append(word); // append the actual word
 558                 wsb.setLength(0); // reset wsb for reuse
 559             }
 560         }
 561         return sb.toString();
 562     }
 563 
 564     /**
 565      * Encode a RFC 822 "word" token into mail-safe form as per
 566      * RFC 2047. <p>
 567      *
 568      * The given Unicode string is examined for non US-ASCII
 569      * characters. If the string contains only US-ASCII characters,
 570      * it is returned as-is.  If the string contains non US-ASCII
 571      * characters, it is first character-encoded using the platform's
 572      * default charset, then transfer-encoded using either the B or
 573      * Q encoding. The resulting bytes are then returned as a Unicode
 574      * string containing only ASCII  characters. <p>
 575      *
 576      * This method is meant to be used when creating RFC 822 "phrases".
 577      * The InternetAddress class, for example, uses this to encode
 578      * its 'phrase' component.
 579      *
 580      * @param   word    unicode string
 581      * @return  Unicode string containing only US-ASCII
 582      *          characters.
 583      * @exception UnsupportedEncodingException if the encoding fails
 584      */
 585     public static String encodeWord(String word)
 586             throws UnsupportedEncodingException {
 587         return encodeWord(word, null, null, true); // "word" token, default charset, B/Q picked automatically
 588     }
 589 
 590     /**
 591      * Encode a RFC 822 "word" token into mail-safe form as per
 592      * RFC 2047. <p>
 593      *
 594      * The given Unicode string is examined for non US-ASCII
 595      * characters. If the string contains only US-ASCII characters,
 596      * it is returned as-is.  If the string contains non US-ASCII
 597      * characters, it is first character-encoded using the specified
 598      * charset, then transfer-encoded using either the B or Q encoding.
 599      * The resulting bytes are then returned as a Unicode string
 600      * containing only ASCII characters. <p>
 601      *
 602      * @param   word    unicode string
 603      * @param   charset the MIME charset
 604      * @param   encoding the encoding to be used. Currently supported
 605      *          values are "B" and "Q". If this parameter is null, then
 606      *          the "Q" encoding is used if most of characters to be
 607      *          encoded are in the ASCII charset, otherwise "B" encoding
 608      *          is used.
 609      * @return  Unicode string containing only US-ASCII characters
 610      * @exception UnsupportedEncodingException if the encoding fails
 611      */
 612     public static String encodeWord(String word, String charset,
 613             String encoding) throws UnsupportedEncodingException {
 614         final boolean encodingWord = true; // a "word" token, not "text"
 615         return encodeWord(word, charset, encoding, encodingWord);
 616     }
 617 
 618     /*
 619      * Encode the given string. The parameter 'encodingWord' should
 620      * be true if a RFC 822 "word" token is being encoded and false if a
 621      * RFC 822 "text" token is being encoded. This is because the
 622      * "Q" encoding defined in RFC 2047 has more restrictions when
 623      * encoding "word" tokens. (Sigh)
 624      */
 625     private static String encodeWord(String string, String charset,
 626                                      String encoding, boolean encodingWord)
 627                         throws UnsupportedEncodingException {
 628         // A string of pure US-ASCII needs no encoding at all.
 629         int asciiKind = checkAscii(string);
 630         if (asciiKind == ALL_ASCII) {
 631             return string;
 632         }
 633 
 634         // Two names are needed for the charset: the Java one for
 635         // the conversion to bytes, and the MIME one that is written
 636         // into the encoded-word itself.
 637         String javaName;
 638         String mimeName;
 639         if (charset != null) {
 640             mimeName = charset;
 641             javaName = javaCharset(charset);
 642         } else {
 643             javaName = getDefaultJavaCharset();
 644             mimeName = getDefaultMIMECharset();
 645         }
 646 
 647         // Without an explicit transfer-encoding, "B" is used only
 648         // when the string is mostly non-ASCII, "Q" otherwise.
 649         String transfer = encoding;
 650         if (transfer == null) {
 651             transfer = (asciiKind == MOSTLY_NONASCII) ? "B" : "Q";
 652         }
 653 
 654         boolean useBase64 = transfer.equalsIgnoreCase("B");
 655         if (!useBase64 && !transfer.equalsIgnoreCase("Q")) {
 656             throw new UnsupportedEncodingException(
 657                         "Unknown transfer encoding: " + transfer);
 658         }
 659 
 660         // As per RFC 2047 an encoded string should not exceed 75
 661         // bytes, 7 of which are taken by "=?", '?', 'B'/'Q', '?', "?=".
 662         int room = 75 - 7 - mimeName.length();
 663         String wordStart = "=?" + mimeName + "?" + transfer + "?";
 664 
 665         StringBuilder encoded = new StringBuilder();
 666         doEncode(string, useBase64, javaName, room, wordStart,
 667                  true, encodingWord, encoded);
 668 
 669         return encoded.toString();
 670     }
 671 
 672     private static void doEncode(String string, boolean b64,
 673                 String jcharset, int avail, String prefix,
 674                 boolean first, boolean encodingWord, StringBuilder buf)
 675                         throws UnsupportedEncodingException {
 676         // Appends 'string' to 'buf' as one or more encoded-words, halving
 677         // it recursively until the encoded form of each piece fits 'avail'.
 678         // First find out what the length of the encoded version would be.
 679         byte[] bytes = string.getBytes(jcharset);
 680         int len;
 681         if (b64) // "B" encoding
 682             len = BEncoderStream.encodedLength(bytes);
 683         else // "Q"
 684             len = QEncoderStream.encodedLength(bytes, encodingWord);
 685 
 686         int size = string.length();
 687         int split = size / 2;
 688         // Never cut between the two halves of a surrogate pair: a lone
 689         // surrogate cannot be converted to the charset on its own, and
 690         // RFC 2047 forbids splitting a character across encoded-words.
 691         if (split > 0 && Character.isHighSurrogate(string.charAt(split - 1))
 692                 && Character.isLowSurrogate(string.charAt(split)))
 693             split++;
 694 
 695         if ((len > avail) && (split > 0) && (split < size)) {
 696             // Too long and still divisible: encode the parts separately.
 697             doEncode(string.substring(0, split), b64, jcharset,
 698                      avail, prefix, first, encodingWord, buf);
 699             doEncode(string.substring(split, size), b64, jcharset,
 700                      avail, prefix, false, encodingWord, buf);
 701         } else {
 702             // Fits, or cannot be divided any further. Encode the string.
 703             ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
 704             OutputStream eos; // the encoder
 705             if (b64) // "B" encoding
 706                 eos = new BEncoderStream(os);
 707             else // "Q" encoding
 708                 eos = new QEncoderStream(os, encodingWord);
 709             try { // do the encoding
 710                 eos.write(bytes);
 711                 eos.close();
 712             } catch (IOException ioex) { } // target is an in-memory buffer
 713 
 714             byte[] encodedBytes = os.toByteArray(); // the encoded stuff
 715             if (!first) // not the first word: continuation line or plain space
 716                 buf.append(foldEncodedWords ? "\r\n " : " ");
 717             buf.append(prefix);
 718             for (int i = 0; i < encodedBytes.length; i++)
 719                 buf.append((char)encodedBytes[i]);
 720             buf.append("?="); // terminate the current sequence
 721         }
 722     }
 723 
 724     /**
 725      * The string is parsed using the rules in RFC 2047 for parsing
 726      * an "encoded-word". If the parse fails, a ParseException is
 727      * thrown. Otherwise, it is transfer-decoded, and then
 728      * charset-converted into Unicode. If the charset-conversion
 729      * fails, an UnsupportedEncodingException is thrown.<p>
 730      *
 731      * @param   eword   the possibly encoded value
 732      * @return deocoded word
 733      * @exception       ParseException if the string is not an
 734      *                  encoded-word as per RFC 2047.
 735      * @exception       UnsupportedEncodingException if the charset
 736      *                  conversion failed.
 737      */
 738     public static String decodeWord(String eword)
 739                 throws ParseException, UnsupportedEncodingException {
 740 
 741         if (!eword.startsWith("=?")) // not an encoded word
 742             throw new ParseException();
 743 
 744         // get charset
 745         int start = 2; int pos;
 746         if ((pos = eword.indexOf('?', start)) == -1)
 747             throw new ParseException();
 748         String charset = javaCharset(eword.substring(start, pos));
 749 
 750         // get encoding
 751         start = pos+1;
 752         if ((pos = eword.indexOf('?', start)) == -1)
 753             throw new ParseException();
 754         String encoding = eword.substring(start, pos);
 755 
 756         // get encoded-sequence
 757         start = pos+1;
 758         if ((pos = eword.indexOf("?=", start)) == -1)
 759             throw new ParseException();
 760         String word = eword.substring(start, pos);
 761 
 762         try {
 763             // Extract the bytes from word
 764             ByteArrayInputStream bis =
 765                 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
 766 
 767             // Get the appropriate decoder
 768             InputStream is;
 769             if (encoding.equalsIgnoreCase("B"))
 770                 is = new BASE64DecoderStream(bis);
 771             else if (encoding.equalsIgnoreCase("Q"))
 772                 is = new QDecoderStream(bis);
 773             else
 774                 throw new UnsupportedEncodingException(
 775                                 "unknown encoding: " + encoding);
 776 
 777             // For b64 & q, size of decoded word <= size of word. So
 778             // the decoded bytes must fit into the 'bytes' array. This
 779             // is certainly more efficient than writing bytes into a
 780             // ByteArrayOutputStream and then pulling out the byte[]
 781             // from it.
 782             int count = bis.available();
 783             byte[] bytes = new byte[count];
 784             // count is set to the actual number of decoded bytes
 785             count = is.read(bytes, 0, count);
 786 
 787             // Finally, convert the decoded bytes into a String using
 788             // the specified charset
 789             String s = new String(bytes, 0, count, charset);
 790             if (pos + 2 < eword.length()) {
 791                 // there's still more text in the string
 792                 String rest = eword.substring(pos + 2);
 793                 if (!decodeStrict)
 794                     rest = decodeInnerWords(rest);
 795                 s += rest;
 796             }
 797             return s;
 798         } catch (UnsupportedEncodingException uex) {
 799             // explicitly catch and rethrow this exception, otherwise
 800             // the below IOException catch will swallow this up!
 801             throw uex;
 802         } catch (IOException ioex) {
 803             // Shouldn't happen.
 804             throw new ParseException();
 805         } catch (IllegalArgumentException iex) {
 806             /* An unknown charset of the form ISO-XXX-XXX, will cause
 807              * the JDK to throw an IllegalArgumentException ... Since the
 808              * JDK will attempt to create a classname using this string,
 809              * but valid classnames must not contain the character '-',
 810              * and this results in an IllegalArgumentException, rather than
 811              * the expected UnsupportedEncodingException. Yikes
 812              */
 813             throw new UnsupportedEncodingException();
 814         }
 815     }
 816 
 817     /**
 818      * Look for encoded words within a word.  The MIME spec doesn't
 819      * allow this, but many broken mailers, especially Japanese mailers,
 820      * produce such incorrect encodings.
 821      */
 822     private static String decodeInnerWords(String word)
 823                                 throws UnsupportedEncodingException {
 824         int start = 0, i;
 825         StringBuilder buf = new StringBuilder();
 826         while ((i = word.indexOf("=?", start)) >= 0) {
 827             buf.append(word.substring(start, i));
 828             int end = word.indexOf("?=", i);
 829             if (end < 0)
 830                 break;
 831             String s = word.substring(i, end + 2);
 832             try {
 833                 s = decodeWord(s);
 834             } catch (ParseException pex) {
 835                 // ignore it, just use the original string
 836             }
 837             buf.append(s);
 838             start = end + 2;
 839         }
 840         if (start == 0)
 841             return word;
 842         if (start < word.length())
 843             buf.append(word.substring(start));
 844         return buf.toString();
 845     }
 846 
 847     /**
 848      * A utility method to quote a word, if the word contains any
 849      * characters from the specified 'specials' list.<p>
 850      *
 851      * The <code>HeaderTokenizer</code> class defines two special
 852      * sets of delimiters - MIME and RFC 822. <p>
 853      *
 854      * This method is typically used during the generation of
 855      * RFC 822 and MIME header fields.
 856      *
 857      * @param   word    word to be quoted
 858      * @param   specials the set of special characters
 859      * @return          the possibly quoted word
 860      * @see     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#MIME
 861      * @see     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.HeaderTokenizer#RFC822
 862      */
 863     public static String quote(String word, String specials) {
 864         int len = word.length();
 865 
 866         /*
 867          * Look for any "bad" characters, Escape and
 868          *  quote the entire string if necessary.
 869          */
 870         boolean needQuoting = false;
 871         for (int i = 0; i < len; i++) {
 872             char c = word.charAt(i);
 873             if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
 874                 // need to escape them and then quote the whole string
 875                 StringBuilder sb = new StringBuilder(len + 3);
 876                 sb.append('"');
 877                 sb.append(word.substring(0, i));
 878                 int lastc = 0;
 879                 for (int j = i; j < len; j++) {
 880                     char cc = word.charAt(j);
 881                     if ((cc == '"') || (cc == '\\') ||
 882                         (cc == '\r') || (cc == '\n'))
 883                         if (cc == '\n' && lastc == '\r')
 884                             ;   // do nothing, CR was already escaped
 885                         else
 886                             sb.append('\\');    // Escape the character
 887                     sb.append(cc);
 888                     lastc = cc;
 889                 }
 890                 sb.append('"');
 891                 return sb.toString();
 892             } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
 893                 // These characters cause the string to be quoted
 894                 needQuoting = true;
 895         }
 896 
 897         if (needQuoting) {
 898             StringBuilder sb = new StringBuilder(len + 2);
 899             sb.append('"').append(word).append('"');
 900             return sb.toString();
 901         } else
 902             return word;
 903     }
 904 
 905     /**
 906      * Fold a string at linear whitespace so that each line is no longer
 907      * than 76 characters, if possible.  If there are more than 76
 908      * non-whitespace characters consecutively, the string is folded at
 909      * the first whitespace after that sequence.  The parameter
 910      * <code>used</code> indicates how many characters have been used in
 911      * the current line; it is usually the length of the header name. <p>
 912      *
 913      * Note that line breaks in the string aren't escaped; they probably
 914      * should be.
 915      *
 916      * @param   used    characters used in line so far
 917      * @param   s       the string to fold
 918      * @return          the folded string
 919      */
 920     /*public*/ static String fold(int used, String s) {
 921         if (!foldText)
 922             return s;
 923 
 924         int end;
 925         char c;
 926         // Strip trailing spaces
 927         for (end = s.length() - 1; end >= 0; end--) {
 928             c = s.charAt(end);
 929             if (c != ' ' && c != '\t')
 930                 break;
 931         }
 932         if (end != s.length() - 1)
 933             s = s.substring(0, end + 1);
 934 
 935         // if the string fits now, just return it
 936         if (used + s.length() <= 76)
 937             return s;
 938 
 939         // have to actually fold the string
 940         StringBuilder sb = new StringBuilder(s.length() + 4);
 941         char lastc = 0;
 942         while (used + s.length() > 76) {
 943             int lastspace = -1;
 944             for (int i = 0; i < s.length(); i++) {
 945                 if (lastspace != -1 && used + i > 76)
 946                     break;
 947                 c = s.charAt(i);
 948                 if (c == ' ' || c == '\t')
 949                     if (!(lastc == ' ' || lastc == '\t'))
 950                         lastspace = i;
 951                 lastc = c;
 952             }
 953             if (lastspace == -1) {
 954                 // no space, use the whole thing
 955                 sb.append(s);
 956                 s = "";
 957                 used = 0;
 958                 break;
 959             }
 960             sb.append(s.substring(0, lastspace));
 961             sb.append("\r\n");
 962             lastc = s.charAt(lastspace);
 963             sb.append(lastc);
 964             s = s.substring(lastspace + 1);
 965             used = 1;
 966         }
 967         sb.append(s);
 968         return sb.toString();
 969     }
 970 
 971     /**
 972      * Unfold a folded header.  Any line breaks that aren't escaped and
 973      * are followed by whitespace are removed.
 974      *
 975      * @param   s       the string to unfold
 976      * @return          the unfolded string
 977      */
 978     /*public*/ static String unfold(String s) {
 979         if (!foldText)
 980             return s;
 981 
 982         StringBuilder sb = null;
 983         int i;
 984         while ((i = indexOfAny(s, "\r\n")) >= 0) {
 985             int start = i;
 986             int l = s.length();
 987             i++;                // skip CR or NL
 988             if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
 989                 i++;    // skip LF
 990             if (start == 0 || s.charAt(start - 1) != '\\') {
 991                 char c;
 992                 // if next line starts with whitespace, skip all of it
 993                 // XXX - always has to be true?
 994                 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
 995                     i++;        // skip whitespace
 996                     while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
 997                         i++;
 998                     if (sb == null)
 999                         sb = new StringBuilder(s.length());
1000                     if (start != 0) {
1001                         sb.append(s.substring(0, start));
1002                         sb.append(' ');
1003                     }
1004                     s = s.substring(i);
1005                     continue;
1006                 }
1007                 // it's not a continuation line, just leave it in
1008                 if (sb == null)
1009                     sb = new StringBuilder(s.length());
1010                 sb.append(s.substring(0, i));
1011                 s = s.substring(i);
1012             } else {
1013                 // there's a backslash at "start - 1"
1014                 // strip it out, but leave in the line break
1015                 if (sb == null)
1016                     sb = new StringBuilder(s.length());
1017                 sb.append(s.substring(0, start - 1));
1018                 sb.append(s.substring(start, i));
1019                 s = s.substring(i);
1020             }
1021         }
1022         if (sb != null) {
1023             sb.append(s);
1024             return sb.toString();
1025         } else
1026             return s;
1027     }
1028 
1029     /**
1030      * Return the first index of any of the characters in "any" in "s",
1031      * or -1 if none are found.
1032      *
1033      * This should be a method on String.
1034      */
1035     private static int indexOfAny(String s, String any) {
1036         return indexOfAny(s, any, 0);
1037     }
1038 
1039     private static int indexOfAny(String s, String any, int start) {
1040         try {
1041             int len = s.length();
1042             for (int i = start; i < len; i++) {
1043                 if (any.indexOf(s.charAt(i)) >= 0)
1044                     return i;
1045             }
1046             return -1;
1047         } catch (StringIndexOutOfBoundsException e) {
1048             return -1;
1049         }
1050     }
1051 
1052     /**
1053      * Convert a MIME charset name into a valid Java charset name. <p>
1054      *
1055      * @param charset   the MIME charset name
1056      * @return  the Java charset equivalent. If a suitable mapping is
1057      *          not available, the passed in charset is itself returned.
1058      */
1059     public static String javaCharset(String charset) {
1060         if (mime2java == null || charset == null)
1061             // no mapping table, or charset parameter is null
1062             return charset;
1063 
1064         String alias = mime2java.get(charset.toLowerCase());
1065         return alias == null ? charset : alias;
1066     }
1067 
1068     /**
1069      * Convert a java charset into its MIME charset name. <p>
1070      *
1071      * Note that a future version of JDK (post 1.2) might provide
1072      * this functionality, in which case, we may deprecate this
1073      * method then.
1074      *
1075      * @param   charset    the JDK charset
1076      * @return          the MIME/IANA equivalent. If a mapping
1077      *                  is not possible, the passed in charset itself
1078      *                  is returned.
1079      * @since           JavaMail 1.1
1080      */
1081     public static String mimeCharset(String charset) {
1082         if (java2mime == null || charset == null)
1083             // no mapping table or charset param is null
1084             return charset;
1085 
1086         String alias = java2mime.get(charset.toLowerCase());
1087         return alias == null ? charset : alias;
1088     }
1089 
1090     private static String defaultJavaCharset;
1091     private static String defaultMIMECharset;
1092 
1093     /**
1094      * Get the default charset corresponding to the system's current
1095      * default locale.  If the System property <code>mail.mime.charset</code>
1096      * is set, a system charset corresponding to this MIME charset will be
1097      * returned. <p>
1098      *
1099      * @return  the default charset of the system's default locale,
1100      *          as a Java charset. (NOT a MIME charset)
1101      * @since   JavaMail 1.1
1102      */
1103     public static String getDefaultJavaCharset() {
1104         if (defaultJavaCharset == null) {
1105             /*
1106              * If mail.mime.charset is set, it controls the default
1107              * Java charset as well.
1108              */
1109             String mimecs = null;
1110 
1111             mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
1112 
1113             if (mimecs != null && mimecs.length() > 0) {
1114                 defaultJavaCharset = javaCharset(mimecs);
1115                 return defaultJavaCharset;
1116             }
1117 
1118             try {
1119                 defaultJavaCharset = System.getProperty("file.encoding",
1120                                                         "8859_1");
1121             } catch (SecurityException sex) {
1122 
1123                 class NullInputStream extends InputStream {
1124                     @Override
1125                    public int read() {
1126                         return 0;
1127                     }
1128                 }
1129                 InputStreamReader reader =
1130                         new InputStreamReader(new NullInputStream());
1131                 defaultJavaCharset = reader.getEncoding();
1132                 if (defaultJavaCharset == null)
1133                     defaultJavaCharset = "8859_1";
1134             }
1135         }
1136 
1137         return defaultJavaCharset;
1138     }
1139 
1140     /*
1141      * Get the default MIME charset for this locale.
1142      */
1143     static String getDefaultMIMECharset() {
1144         if (defaultMIMECharset == null) {
1145                 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
1146         }
1147         if (defaultMIMECharset == null)
1148             defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
1149         return defaultMIMECharset;
1150     }
1151 
1152     // Tables to map MIME charset names to Java names and vice versa.
1153     // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
1154     private static Hashtable<String, String> mime2java;
1155     private static Hashtable<String, String> java2mime;
1156 
1157     static {
1158         java2mime = new Hashtable<String, String>(40);
1159         mime2java = new Hashtable<String, String>(10);
1160 
1161         try {
1162             // Use this class's classloader to load the mapping file
1163             // XXX - we should use SecuritySupport, but it's in another package
1164             InputStream is =
1165                     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
1166                     "/META-INF/javamail.charset.map");
1167 
1168             if (is != null) {
1169                 is = new LineInputStream(is);
1170 
1171                 // Load the JDK-to-MIME charset mapping table
1172                 loadMappings((LineInputStream)is, java2mime);
1173 
1174                 // Load the MIME-to-JDK charset mapping table
1175                 loadMappings((LineInputStream)is, mime2java);
1176             }
1177         } catch (Exception ex) { }
1178 
1179         // If we didn't load the tables, e.g., because we didn't have
1180         // permission, load them manually.  The entries here should be
1181         // the same as the default javamail.charset.map.
1182         if (java2mime.isEmpty()) {
1183             java2mime.put("8859_1", "ISO-8859-1");
1184             java2mime.put("iso8859_1", "ISO-8859-1");
1185             java2mime.put("ISO8859-1", "ISO-8859-1");
1186 
1187             java2mime.put("8859_2", "ISO-8859-2");
1188             java2mime.put("iso8859_2", "ISO-8859-2");
1189             java2mime.put("ISO8859-2", "ISO-8859-2");
1190 
1191             java2mime.put("8859_3", "ISO-8859-3");
1192             java2mime.put("iso8859_3", "ISO-8859-3");
1193             java2mime.put("ISO8859-3", "ISO-8859-3");
1194 
1195             java2mime.put("8859_4", "ISO-8859-4");
1196             java2mime.put("iso8859_4", "ISO-8859-4");
1197             java2mime.put("ISO8859-4", "ISO-8859-4");
1198 
1199             java2mime.put("8859_5", "ISO-8859-5");
1200             java2mime.put("iso8859_5", "ISO-8859-5");
1201             java2mime.put("ISO8859-5", "ISO-8859-5");
1202 
1203             java2mime.put("8859_6", "ISO-8859-6");
1204             java2mime.put("iso8859_6", "ISO-8859-6");
1205             java2mime.put("ISO8859-6", "ISO-8859-6");
1206 
1207             java2mime.put("8859_7", "ISO-8859-7");
1208             java2mime.put("iso8859_7", "ISO-8859-7");
1209             java2mime.put("ISO8859-7", "ISO-8859-7");
1210 
1211             java2mime.put("8859_8", "ISO-8859-8");
1212             java2mime.put("iso8859_8", "ISO-8859-8");
1213             java2mime.put("ISO8859-8", "ISO-8859-8");
1214 
1215             java2mime.put("8859_9", "ISO-8859-9");
1216             java2mime.put("iso8859_9", "ISO-8859-9");
1217             java2mime.put("ISO8859-9", "ISO-8859-9");
1218 
1219             java2mime.put("SJIS", "Shift_JIS");
1220             java2mime.put("MS932", "Shift_JIS");
1221             java2mime.put("JIS", "ISO-2022-JP");
1222             java2mime.put("ISO2022JP", "ISO-2022-JP");
1223             java2mime.put("EUC_JP", "euc-jp");
1224             java2mime.put("KOI8_R", "koi8-r");
1225             java2mime.put("EUC_CN", "euc-cn");
1226             java2mime.put("EUC_TW", "euc-tw");
1227             java2mime.put("EUC_KR", "euc-kr");
1228         }
1229         if (mime2java.isEmpty()) {
1230             mime2java.put("iso-2022-cn", "ISO2022CN");
1231             mime2java.put("iso-2022-kr", "ISO2022KR");
1232             mime2java.put("utf-8", "UTF8");
1233             mime2java.put("utf8", "UTF8");
1234             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1235             mime2java.put("ja_jp.eucjp", "EUCJIS");
1236             mime2java.put("euc-kr", "KSC5601");
1237             mime2java.put("euckr", "KSC5601");
1238             mime2java.put("us-ascii", "ISO-8859-1");
1239             mime2java.put("x-us-ascii", "ISO-8859-1");
1240         }
1241     }
1242 
1243     private static void loadMappings(LineInputStream is, Hashtable<String, String> table) {
1244         String currLine;
1245 
1246         while (true) {
1247             try {
1248                 currLine = is.readLine();
1249             } catch (IOException ioex) {
1250                 break; // error in reading, stop
1251             }
1252 
1253             if (currLine == null) // end of file, stop
1254                 break;
1255             if (currLine.startsWith("--") && currLine.endsWith("--"))
1256                 // end of this table
1257                 break;
1258 
1259             // ignore empty lines and comments
1260             if (currLine.trim().length() == 0 || currLine.startsWith("#"))
1261                 continue;
1262 
1263             // A valid entry is of the form <key><separator><value>
1264             // where, <separator> := SPACE | HT. Parse this
1265             StringTokenizer tk = new StringTokenizer(currLine, " \t");
1266             try {
1267                 String key = tk.nextToken();
1268                 String value = tk.nextToken();
1269                 table.put(key.toLowerCase(), value);
1270             } catch (NoSuchElementException nex) { }
1271         }
1272     }
1273 
1274     static final int ALL_ASCII          = 1;
1275     static final int MOSTLY_ASCII       = 2;
1276     static final int MOSTLY_NONASCII    = 3;
1277 
1278     /**
1279      * Check if the given string contains non US-ASCII characters.
1280      * @param   s       string
1281      * @return          ALL_ASCII if all characters in the string
1282      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1283      *                  if more than half of the available characters
1284      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1285      */
1286     static int checkAscii(String s) {
1287         int ascii = 0, non_ascii = 0;
1288         int l = s.length();
1289 
1290         for (int i = 0; i < l; i++) {
1291             if (nonascii(s.charAt(i))) // non-ascii
1292                 non_ascii++;
1293             else
1294                 ascii++;
1295         }
1296 
1297         if (non_ascii == 0)
1298             return ALL_ASCII;
1299         if (ascii > non_ascii)
1300             return MOSTLY_ASCII;
1301 
1302         return MOSTLY_NONASCII;
1303     }
1304 
1305     /**
1306      * Check if the given byte array contains non US-ASCII characters.
1307      * @param   b       byte array
1308      * @return          ALL_ASCII if all characters in the string
1309      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1310      *                  if more than half of the available characters
1311      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1312      *
1313      * XXX - this method is no longer used
1314      */
1315     static int checkAscii(byte[] b) {
1316         int ascii = 0, non_ascii = 0;
1317 
1318         for (int i=0; i < b.length; i++) {
1319             // The '&' operator automatically causes b[i] to be promoted
1320             // to an int, and we mask out the higher bytes in the int
1321             // so that the resulting value is not a negative integer.
1322             if (nonascii(b[i] & 0xff)) // non-ascii
1323                 non_ascii++;
1324             else
1325                 ascii++;
1326         }
1327 
1328         if (non_ascii == 0)
1329             return ALL_ASCII;
1330         if (ascii > non_ascii)
1331             return MOSTLY_ASCII;
1332 
1333         return MOSTLY_NONASCII;
1334     }
1335 
1336     /**
1337      * Check if the given input stream contains non US-ASCII characters.
1338      * Upto <code>max</code> bytes are checked. If <code>max</code> is
1339      * set to <code>ALL</code>, then all the bytes available in this
1340      * input stream are checked. If <code>breakOnNonAscii</code> is true
1341      * the check terminates when the first non-US-ASCII character is
1342      * found and MOSTLY_NONASCII is returned. Else, the check continues
1343      * till <code>max</code> bytes or till the end of stream.
1344      *
1345      * @param   is      the input stream
1346      * @param   max     maximum bytes to check for. The special value
1347      *                  ALL indicates that all the bytes in this input
1348      *                  stream must be checked.
1349      * @param   breakOnNonAscii if <code>true</code>, then terminate the
1350      *                  the check when the first non-US-ASCII character
1351      *                  is found.
1352      * @return          ALL_ASCII if all characters in the string
1353      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1354      *                  if more than half of the available characters
1355      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1356      */
1357     static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
1358         int ascii = 0, non_ascii = 0;
1359         int len;
1360         int block = 4096;
1361         int linelen = 0;
1362         boolean longLine = false, badEOL = false;
1363         boolean checkEOL = encodeEolStrict && breakOnNonAscii;
1364         byte buf[] = null;
1365         if (max != 0) {
1366             block = (max == ALL) ? 4096 : Math.min(max, 4096);
1367             buf = new byte[block];
1368         }
1369         while (max != 0) {
1370             try {
1371                 if ((len = is.read(buf, 0, block)) == -1)
1372                     break;
1373                 int lastb = 0;
1374                 for (int i = 0; i < len; i++) {
1375                     // The '&' operator automatically causes b[i] to
1376                     // be promoted to an int, and we mask out the higher
1377                     // bytes in the int so that the resulting value is
1378                     // not a negative integer.
1379                     int b = buf[i] & 0xff;
1380                     if (checkEOL &&
1381                             ((lastb == '\r' && b != '\n') ||
1382                             (lastb != '\r' && b == '\n')))
1383                         badEOL = true;
1384                     if (b == '\r' || b == '\n')
1385                         linelen = 0;
1386                     else {
1387                         linelen++;
1388                         if (linelen > 998)      // 1000 - CRLF
1389                             longLine = true;
1390                     }
1391                     if (nonascii(b)) {  // non-ascii
1392                         if (breakOnNonAscii) // we are done
1393                             return MOSTLY_NONASCII;
1394                         else
1395                             non_ascii++;
1396                     } else
1397                         ascii++;
1398                     lastb = b;
1399                 }
1400             } catch (IOException ioex) {
1401                 break;
1402             }
1403             if (max != ALL)
1404                 max -= len;
1405         }
1406 
1407         if (max == 0 && breakOnNonAscii)
1408             // We have been told to break on the first non-ascii character.
1409             // We haven't got any non-ascii character yet, but then we
1410             // have not checked all of the available bytes either. So we
1411             // cannot say for sure that this input stream is ALL_ASCII,
1412             // and hence we must play safe and return MOSTLY_NONASCII
1413 
1414             return MOSTLY_NONASCII;
1415 
1416         if (non_ascii == 0) { // no non-us-ascii characters so far
1417             // If we're looking at non-text data, and we saw CR without LF
1418             // or vice versa, consider this mostly non-ASCII so that it
1419             // will be base64 encoded (since the quoted-printable encoder
1420             // doesn't encode this case properly).
1421             if (badEOL)
1422                 return MOSTLY_NONASCII;
1423             // if we've seen a long line, we degrade to mostly ascii
1424             else if (longLine)
1425                 return MOSTLY_ASCII;
1426             else
1427                 return ALL_ASCII;
1428         }
1429         if (ascii > non_ascii) // mostly ascii
1430             return MOSTLY_ASCII;
1431         return MOSTLY_NONASCII;
1432     }
1433 
1434     static final boolean nonascii(int b) {
1435         return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
1436     }
1437 }
1438 
1439 /**
1440  * An OutputStream that determines whether the data written to
1441  * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1442  */
1443 class AsciiOutputStream extends OutputStream {
1444     private boolean breakOnNonAscii;
1445     private int ascii = 0, non_ascii = 0;
1446     private int linelen = 0;
1447     private boolean longLine = false;
1448     private boolean badEOL = false;
1449     private boolean checkEOL = false;
1450     private int lastb = 0;
1451     private int ret = 0;
1452 
1453     public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1454         this.breakOnNonAscii = breakOnNonAscii;
1455         checkEOL = encodeEolStrict && breakOnNonAscii;
1456     }
1457 
1458     @Override
1459     public void write(int b) throws IOException {
1460         check(b);
1461     }
1462 
1463     @Override
1464     public void write(byte b[]) throws IOException {
1465         write(b, 0, b.length);
1466     }
1467 
1468     @Override
1469     public void write(byte b[], int off, int len) throws IOException {
1470         len += off;
1471         for (int i = off; i < len ; i++)
1472             check(b[i]);
1473     }
1474 
1475     private final void check(int b) throws IOException {
1476         b &= 0xff;
1477         if (checkEOL &&
1478                 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1479             badEOL = true;
1480         if (b == '\r' || b == '\n')
1481             linelen = 0;
1482         else {
1483             linelen++;
1484             if (linelen > 998)  // 1000 - CRLF
1485                 longLine = true;
1486         }
1487         if (MimeUtility.nonascii(b)) { // non-ascii
1488             non_ascii++;
1489             if (breakOnNonAscii) {      // we are done
1490                 ret = MimeUtility.MOSTLY_NONASCII;
1491                 throw new EOFException();
1492             }
1493         } else
1494             ascii++;
1495         lastb = b;
1496     }
1497 
1498     /**
1499      * Return ASCII-ness of data stream.
1500      */
1501     public int getAscii() {
1502         if (ret != 0)
1503             return ret;
1504         // If we're looking at non-text data, and we saw CR without LF
1505         // or vice versa, consider this mostly non-ASCII so that it
1506         // will be base64 encoded (since the quoted-printable encoder
1507         // doesn't encode this case properly).
1508         if (badEOL)
1509             return MimeUtility.MOSTLY_NONASCII;
1510         else if (non_ascii == 0) { // no non-us-ascii characters so far
1511             // if we've seen a long line, we degrade to mostly ascii
1512             if (longLine)
1513                 return MimeUtility.MOSTLY_ASCII;
1514             else
1515                 return MimeUtility.ALL_ASCII;
1516         }
1517         if (ascii > non_ascii) // mostly ascii
1518             return MimeUtility.MOSTLY_ASCII;
1519         return MimeUtility.MOSTLY_NONASCII;
1520     }
1521 }