1 /*
   2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  * @(#)MimeUtility.java       1.45 03/03/10
  28  */
  29 
  30 
  31 
  32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
  33 
  34 import java.io.*;
  35 import java.util.*;
  36 
  37 import javax.activation.DataHandler;
  38 import javax.activation.DataSource;
  39 
  40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
  41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
  42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
  43 
  44 /**
  45  * This is a utility class that provides various MIME related
  46  * functionality. <p>
  47  *
  48  * There are a set of methods to encode and decode MIME headers as
  49  * per RFC 2047. A brief description on handling such headers is
  50  * given below: <p>
  51  *
  52  * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
  53  * characters. Headers that contain non US-ASCII characters must be
  54  * encoded so that they contain only US-ASCII characters. Basically,
  55  * this process involves using either BASE64 or QP to encode certain
  56  * characters. RFC 2047 describes this in detail. <p>
  57  *
  58  * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
  59  * subset of Unicode (and occupies the range 0 - 127). A String
  60  * that contains only ASCII characters is already mail-safe. If the
  61  * String contains non US-ASCII characters, it must be encoded. An
  62  * additional complexity in this step is that since Unicode is not
  63  * yet a widely used charset, one might want to first charset-encode
  64  * the String into another charset and then do the transfer-encoding.
  65  * <p>
  66  * Note that to get the actual bytes of a mail-safe String (say,
  67  * for sending over SMTP), one must do
  68  * <p><blockquote><pre>
  69  *
  70  *      byte[] bytes = string.getBytes("iso-8859-1");
  71  *
  72  * </pre></blockquote><p>
  73  *
  74  * The <code>setHeader</code> and <code>addHeader</code> methods
  75  * on MimeMessage and MimeBodyPart assume that the given header values
  76  * are Unicode strings that contain only US-ASCII characters. Hence
 * the callers of those methods must ensure that the values they pass
  78  * do not contain non US-ASCII characters. The methods in this class
  79  * help do this. <p>
  80  *
  81  * The <code>getHeader</code> family of methods on MimeMessage and
  82  * MimeBodyPart return the raw header value. These might be encoded
  83  * as per RFC 2047, and if so, must be decoded into Unicode Strings.
  84  * The methods in this class help to do this. <p>
  85  *
  86  * Several System properties control strict conformance to the MIME
  87  * spec.  Note that these are not session properties but must be set
  88  * globally as System properties. <p>
  89  *
  90  * The <code>mail.mime.decodetext.strict</code> property controls
  91  * decoding of MIME encoded words.  The MIME spec requires that encoded
  92  * words start at the beginning of a whitespace separated word.  Some
  93  * mailers incorrectly include encoded words in the middle of a word.
  94  * If the <code>mail.mime.decodetext.strict</code> System property is
  95  * set to <code>"false"</code>, an attempt will be made to decode these
  96  * illegal encoded words. The default is true. <p>
  97  *
  98  * The <code>mail.mime.encodeeol.strict</code> property controls the
  99  * choice of Content-Transfer-Encoding for MIME parts that are not of
 100  * type "text".  Often such parts will contain textual data for which
 101  * an encoding that allows normal end of line conventions is appropriate.
 102  * In rare cases, such a part will appear to contain entirely textual
 103  * data, but will require an encoding that preserves CR and LF characters
 * without change.  If the <code>mail.mime.encodeeol.strict</code>
 105  * System property is set to <code>"true"</code>, such an encoding will
 106  * be used when necessary.  The default is false. <p>
 107  *
 108  * In addition, the <code>mail.mime.charset</code> System property can
 109  * be used to specify the default MIME charset to use for encoded words
 110  * and text parts that don't otherwise specify a charset.  Normally, the
 111  * default MIME charset is derived from the default Java charset, as
 112  * specified in the <code>file.encoding</code> System property.  Most
 113  * applications will have no need to explicitly set the default MIME
 114  * charset.  In cases where the default MIME charset to be used for
 115  * mail messages is different than the charset used for files stored on
 116  * the system, this property should be set.
 117  *
 118  * @version 1.45, 03/03/10
 119  * @author  John Mani
 120  * @author  Bill Shannon
 121  */
 122 
 123 public class MimeUtility {
 124 
 125     // This class cannot be instantiated
 126     private MimeUtility() { }
 127 
 128     public static final int ALL = -1;
 129 
 130     private static final int BUFFER_SIZE = 1024;
 131     private static boolean decodeStrict = true;
 132     private static boolean encodeEolStrict = false;
 133     private static boolean foldEncodedWords = false;
 134     private static boolean foldText = true;
 135 
 136     static {
 137         try {
 138             String s = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
 139             // default to true
 140             decodeStrict = s == null || !s.equalsIgnoreCase("false");
 141             s = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
 142             // default to false
 143             encodeEolStrict = s != null && s.equalsIgnoreCase("true");
 144             s = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
 145             // default to false
 146             foldEncodedWords = s != null && s.equalsIgnoreCase("true");
 147             s = SAAJUtil.getSystemProperty("mail.mime.foldtext");
 148             // default to true
 149             foldText = s == null || !s.equalsIgnoreCase("false");
 150         } catch (SecurityException sex) {
 151             // ignore it
 152         }
 153     }
 154 
 155 
 156     /**
 157      * Get the content-transfer-encoding that should be applied
 158      * to the input stream of this datasource, to make it mailsafe. <p>
 159      *
 160      * The algorithm used here is: <br>
 161      * <ul>
 162      * <li>
 163      * If the primary type of this datasource is "text" and if all
 164      * the bytes in its input stream are US-ASCII, then the encoding
 165      * is "7bit". If more than half of the bytes are non-US-ASCII, then
 166      * the encoding is "base64". If less than half of the bytes are
 167      * non-US-ASCII, then the encoding is "quoted-printable".
 168      * <li>
 169      * If the primary type of this datasource is not "text", then if
 170      * all the bytes of its input stream are US-ASCII, the encoding
 171      * is "7bit". If there is even one non-US-ASCII character, the
 172      * encoding is "base64".
 173      * </ul>
 174      *
 175      * @param   ds      DataSource
 176      * @return          the encoding. This is either "7bit",
 177      *                  "quoted-printable" or "base64"
 178      */
 179     public static String getEncoding(DataSource ds) {
 180         ContentType cType = null;
 181         InputStream is = null;
 182         String encoding = null;
 183 
 184         try {
 185             cType = new ContentType(ds.getContentType());
 186             is = ds.getInputStream();
 187         } catch (Exception ex) {
 188             return "base64"; // what else ?!
 189         }
 190 
 191         boolean isText = cType.match("text/*");
 192         // if not text, stop processing when we see non-ASCII
 193         int i = checkAscii(is, ALL, !isText);
 194         switch (i) {
 195         case ALL_ASCII:
 196             encoding = "7bit"; // all ascii
 197             break;
 198         case MOSTLY_ASCII:
 199             encoding = "quoted-printable"; // mostly ascii
 200             break;
 201         default:
 202             encoding = "base64"; // mostly binary
 203             break;
 204         }
 205 
 206         // Close the input stream
 207         try {
 208             is.close();
 209         } catch (IOException ioex) { }
 210 
 211         return encoding;
 212     }
 213 
 214     /**
 215      * Same as <code>getEncoding(DataSource)</code> except that instead
 216      * of reading the data from an <code>InputStream</code> it uses the
 217      * <code>writeTo</code> method to examine the data.  This is more
 218      * efficient in the common case of a <code>DataHandler</code>
 219      * created with an object and a MIME type (for example, a
 220      * "text/plain" String) because all the I/O is done in this
 221      * thread.  In the case requiring an <code>InputStream</code> the
 222      * <code>DataHandler</code> uses a thread, a pair of pipe streams,
 223      * and the <code>writeTo</code> method to produce the data. <p>
 224      *
 225      * @since   JavaMail 1.2
 226      */
 227     public static String getEncoding(DataHandler dh) {
 228         ContentType cType = null;
 229         String encoding = null;
 230 
 231         /*
 232          * Try to pick the most efficient means of determining the
 233          * encoding.  If this DataHandler was created using a DataSource,
 234          * the getEncoding(DataSource) method is typically faster.  If
 235          * the DataHandler was created with an object, this method is
 236          * much faster.  To distinguish the two cases, we use a heuristic.
 237          * A DataHandler created with an object will always have a null name.
 238          * A DataHandler created with a DataSource will usually have a
 239          * non-null name.
 240          *
 241          * XXX - This is actually quite a disgusting hack, but it makes
 242          *       a common case run over twice as fast.
 243          */
 244         if (dh.getName() != null)
 245             return getEncoding(dh.getDataSource());
 246 
 247         try {
 248             cType = new ContentType(dh.getContentType());
 249         } catch (Exception ex) {
 250             return "base64"; // what else ?!
 251         }
 252 
 253         if (cType.match("text/*")) {
 254             // Check all of the available bytes
 255             AsciiOutputStream aos = new AsciiOutputStream(false, false);
 256             try {
 257                 dh.writeTo(aos);
 258             } catch (IOException ex) { }        // ignore it
 259             switch (aos.getAscii()) {
 260             case ALL_ASCII:
 261                 encoding = "7bit"; // all ascii
 262                 break;
 263             case MOSTLY_ASCII:
 264                 encoding = "quoted-printable"; // mostly ascii
 265                 break;
 266             default:
 267                 encoding = "base64"; // mostly binary
 268                 break;
 269             }
 270         } else { // not "text"
 271             // Check all of available bytes, break out if we find
 272             // at least one non-US-ASCII character
 273             AsciiOutputStream aos =
 274                         new AsciiOutputStream(true, encodeEolStrict);
 275             try {
 276                 dh.writeTo(aos);
 277             } catch (IOException ex) { }        // ignore it
 278             if (aos.getAscii() == ALL_ASCII) // all ascii
 279                 encoding = "7bit";
 280             else // found atleast one non-ascii character, use b64
 281                 encoding = "base64";
 282         }
 283 
 284         return encoding;
 285     }
 286 
 287     /**
 288      * Decode the given input stream. The Input stream returned is
 289      * the decoded input stream. All the encodings defined in RFC 2045
 290      * are supported here. They include "base64", "quoted-printable",
 291      * "7bit", "8bit", and "binary". In addition, "uuencode" is also
 292      * supported.
 293      *
 294      * @param   is              input stream
 295      * @param   encoding        the encoding of the stream.
 296      * @return                  decoded input stream.
 297      */
 298     public static InputStream decode(InputStream is, String encoding)
 299                 throws MessagingException {
 300         if (encoding.equalsIgnoreCase("base64"))
 301             return new BASE64DecoderStream(is);
 302         else if (encoding.equalsIgnoreCase("quoted-printable"))
 303             return new QPDecoderStream(is);
 304         else if (encoding.equalsIgnoreCase("uuencode") ||
 305                  encoding.equalsIgnoreCase("x-uuencode") ||
 306                  encoding.equalsIgnoreCase("x-uue"))
 307             return new UUDecoderStream(is);
 308         else if (encoding.equalsIgnoreCase("binary") ||
 309                  encoding.equalsIgnoreCase("7bit") ||
 310                  encoding.equalsIgnoreCase("8bit"))
 311             return is;
 312         else
 313             throw new MessagingException("Unknown encoding: " + encoding);
 314     }
 315 
 316     /**
 317      * Wrap an encoder around the given output stream.
 318      * All the encodings defined in RFC 2045 are supported here.
 319      * They include "base64", "quoted-printable", "7bit", "8bit" and
 320      * "binary". In addition, "uuencode" is also supported.
 321      *
 322      * @param   os              output stream
 323      * @param   encoding        the encoding of the stream.
 324      * @return                  output stream that applies the
 325      *                          specified encoding.
 326      */
 327     public static OutputStream encode(OutputStream os, String encoding)
 328                 throws MessagingException {
 329         if (encoding == null)
 330             return os;
 331         else if (encoding.equalsIgnoreCase("base64"))
 332             return new BASE64EncoderStream(os);
 333         else if (encoding.equalsIgnoreCase("quoted-printable"))
 334             return new QPEncoderStream(os);
 335         else if (encoding.equalsIgnoreCase("uuencode") ||
 336                  encoding.equalsIgnoreCase("x-uuencode") ||
 337                  encoding.equalsIgnoreCase("x-uue"))
 338             return new UUEncoderStream(os);
 339         else if (encoding.equalsIgnoreCase("binary") ||
 340                  encoding.equalsIgnoreCase("7bit") ||
 341                  encoding.equalsIgnoreCase("8bit"))
 342             return os;
 343         else
 344             throw new MessagingException("Unknown encoding: " +encoding);
 345     }
 346 
 347     /**
 348      * Wrap an encoder around the given output stream.
 349      * All the encodings defined in RFC 2045 are supported here.
 350      * They include "base64", "quoted-printable", "7bit", "8bit" and
 351      * "binary". In addition, "uuencode" is also supported.
 352      * The <code>filename</code> parameter is used with the "uuencode"
 353      * encoding and is included in the encoded output.
 354      *
 355      * @param   os              output stream
 356      * @param   encoding        the encoding of the stream.
 357      * @param   filename        name for the file being encoded (only used
 358      *                          with uuencode)
 359      * @return                  output stream that applies the
 360      *                          specified encoding.
 361      * @since                   JavaMail 1.2
 362      */
 363     public static OutputStream encode(OutputStream os, String encoding,
 364                                       String filename)
 365                 throws MessagingException {
 366         if (encoding == null)
 367             return os;
 368         else if (encoding.equalsIgnoreCase("base64"))
 369             return new BASE64EncoderStream(os);
 370         else if (encoding.equalsIgnoreCase("quoted-printable"))
 371             return new QPEncoderStream(os);
 372         else if (encoding.equalsIgnoreCase("uuencode") ||
 373                  encoding.equalsIgnoreCase("x-uuencode") ||
 374                  encoding.equalsIgnoreCase("x-uue"))
 375             return new UUEncoderStream(os, filename);
 376         else if (encoding.equalsIgnoreCase("binary") ||
 377                  encoding.equalsIgnoreCase("7bit") ||
 378                  encoding.equalsIgnoreCase("8bit"))
 379             return os;
 380         else
 381             throw new MessagingException("Unknown encoding: " +encoding);
 382     }
 383 
 384     /**
 385      * Encode a RFC 822 "text" token into mail-safe form as per
 386      * RFC 2047. <p>
 387      *
 388      * The given Unicode string is examined for non US-ASCII
 389      * characters. If the string contains only US-ASCII characters,
 390      * it is returned as-is.  If the string contains non US-ASCII
 391      * characters, it is first character-encoded using the platform's
 392      * default charset, then transfer-encoded using either the B or
 393      * Q encoding. The resulting bytes are then returned as a Unicode
 394      * string containing only ASCII  characters. <p>
 395      *
 396      * Note that this method should be used to encode only
 397      * "unstructured" RFC 822 headers. <p>
 398      *
 399      * Example of usage:
 400      * <p><blockquote><pre>
 401      *
 402      *  MimeBodyPart part = ...
 403      *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
 404      *  try {
 405      *    // If we know for sure that rawvalue contains only US-ASCII
 406      *    // characters, we can skip the encoding part
 407      *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
 408      *  } catch (UnsupportedEncodingException e) {
 409      *    // encoding failure
 410      *  } catch (MessagingException me) {
 411      *   // setHeader() failure
 412      *  }
 413      *
 414      * </pre></blockquote><p>
 415      *
 416      * @param   text    unicode string
 417      * @return  Unicode string containing only US-ASCII characters
 418      * @exception UnsupportedEncodingException if the encoding fails
 419      */
 420     public static String encodeText(String text)
 421                         throws UnsupportedEncodingException {
 422         return encodeText(text, null, null);
 423     }
 424 
 425     /**
 426      * Encode a RFC 822 "text" token into mail-safe form as per
 427      * RFC 2047. <p>
 428      *
 429      * The given Unicode string is examined for non US-ASCII
 430      * characters. If the string contains only US-ASCII characters,
 431      * it is returned as-is.  If the string contains non US-ASCII
 432      * characters, it is first character-encoded using the specified
 433      * charset, then transfer-encoded using either the B or Q encoding.
 434      * The resulting bytes are then returned as a Unicode string
 435      * containing only ASCII characters. <p>
 436      *
 437      * Note that this method should be used to encode only
 438      * "unstructured" RFC 822 headers.
 439      *
 440      * @param   text    the header value
 441      * @param   charset the charset. If this parameter is null, the
     *          platform's default charset is used.
 443      * @param   encoding the encoding to be used. Currently supported
 444      *          values are "B" and "Q". If this parameter is null, then
 445      *          the "Q" encoding is used if most of characters to be
 446      *          encoded are in the ASCII charset, otherwise "B" encoding
 447      *          is used.
 448      * @return  Unicode string containing only US-ASCII characters
 449      */
 450     public static String encodeText(String text, String charset,
 451                                     String encoding)
 452                         throws UnsupportedEncodingException {
 453         return encodeWord(text, charset, encoding, false);
 454     }
 455 
 456     /**
 457      * Decode "unstructured" headers, that is, headers that are defined
 458      * as '*text' as per RFC 822. <p>
 459      *
 460      * The string is decoded using the algorithm specified in
 461      * RFC 2047, Section 6.1.1. If the charset-conversion fails
 462      * for any sequence, an UnsupportedEncodingException is thrown.
 463      * If the String is not an RFC 2047 style encoded header, it is
 464      * returned as-is <p>
 465      *
 466      * Example of usage:
 467      * <p><blockquote><pre>
 468      *
 469      *  MimeBodyPart part = ...
 470      *  String rawvalue = null;
 471      *  String  value = null;
 472      *  try {
 473      *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
 474      *      value = MimeUtility.decodeText(rawvalue);
 475      *  } catch (UnsupportedEncodingException e) {
 476      *      // Don't care
 477      *      value = rawvalue;
 478      *  } catch (MessagingException me) { }
 479      *
 480      *  return value;
 481      *
 482      * </pre></blockquote><p>
 483      *
 484      * @param   etext   the possibly encoded value
 485      * @exception       UnsupportedEncodingException if the charset
 486      *                  conversion failed.
 487      */
 488     public static String decodeText(String etext)
 489                 throws UnsupportedEncodingException {
 490         /*
 491          * We look for sequences separated by "linear-white-space".
 492          * (as per RFC 2047, Section 6.1.1)
 493          * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
 494          */
 495         String lwsp = " \t\n\r";
 496         StringTokenizer st;
 497 
 498         /*
 499          * First, lets do a quick run thru the string and check
 500          * whether the sequence "=?"  exists at all. If none exists,
 501          * we know there are no encoded-words in here and we can just
 502          * return the string as-is, without suffering thru the later
 503          * decoding logic.
 504          * This handles the most common case of unencoded headers
 505          * efficiently.
 506          */
 507         if (etext.indexOf("=?") == -1)
 508             return etext;
 509 
 510         // Encoded words found. Start decoding ...
 511 
 512         st = new StringTokenizer(etext, lwsp, true);
 513         StringBuffer sb = new StringBuffer();  // decode buffer
 514         StringBuffer wsb = new StringBuffer(); // white space buffer
 515         boolean prevWasEncoded = false;
 516 
 517         while (st.hasMoreTokens()) {
 518             char c;
 519             String s = st.nextToken();
 520             // If whitespace, append it to the whitespace buffer
 521             if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
 522                 (c == '\r') || (c == '\n'))
 523                 wsb.append(c);
 524             else {
 525                 // Check if token is an 'encoded-word' ..
 526                 String word;
 527                 try {
 528                     word = decodeWord(s);
 529                     // Yes, this IS an 'encoded-word'.
 530                     if (!prevWasEncoded && wsb.length() > 0) {
 531                         // if the previous word was also encoded, we
 532                         // should ignore the collected whitespace. Else
 533                         // we include the whitespace as well.
 534                         sb.append(wsb);
 535                     }
 536                     prevWasEncoded = true;
 537                 } catch (ParseException pex) {
 538                     // This is NOT an 'encoded-word'.
 539                     word = s;
 540                     // possibly decode inner encoded words
 541                     if (!decodeStrict)
 542                         word = decodeInnerWords(word);
 543                     // include colleced whitespace ..
 544                     if (wsb.length() > 0)
 545                         sb.append(wsb);
 546                     prevWasEncoded = false;
 547                 }
 548                 sb.append(word); // append the actual word
 549                 wsb.setLength(0); // reset wsb for reuse
 550             }
 551         }
 552         return sb.toString();
 553     }
 554 
 555     /**
 556      * Encode a RFC 822 "word" token into mail-safe form as per
 557      * RFC 2047. <p>
 558      *
 559      * The given Unicode string is examined for non US-ASCII
 560      * characters. If the string contains only US-ASCII characters,
 561      * it is returned as-is.  If the string contains non US-ASCII
 562      * characters, it is first character-encoded using the platform's
 563      * default charset, then transfer-encoded using either the B or
 564      * Q encoding. The resulting bytes are then returned as a Unicode
 565      * string containing only ASCII  characters. <p>
 566      *
 567      * This method is meant to be used when creating RFC 822 "phrases".
 568      * The InternetAddress class, for example, uses this to encode
     * its 'phrase' component.
 570      *
     * @param   word    unicode string
     * @return  Unicode string containing only US-ASCII
     *          characters.
 574      * @exception UnsupportedEncodingException if the encoding fails
 575      */
 576     public static String encodeWord(String word)
 577                         throws UnsupportedEncodingException {
 578         return encodeWord(word, null, null);
 579     }
 580 
 581     /**
 582      * Encode a RFC 822 "word" token into mail-safe form as per
 583      * RFC 2047. <p>
 584      *
 585      * The given Unicode string is examined for non US-ASCII
 586      * characters. If the string contains only US-ASCII characters,
 587      * it is returned as-is.  If the string contains non US-ASCII
 588      * characters, it is first character-encoded using the specified
 589      * charset, then transfer-encoded using either the B or Q encoding.
 590      * The resulting bytes are then returned as a Unicode string
 591      * containing only ASCII characters. <p>
 592      *
     * @param   word    unicode string
 594      * @param   charset the MIME charset
 595      * @param   encoding the encoding to be used. Currently supported
 596      *          values are "B" and "Q". If this parameter is null, then
 597      *          the "Q" encoding is used if most of characters to be
 598      *          encoded are in the ASCII charset, otherwise "B" encoding
 599      *          is used.
 600      * @return  Unicode string containing only US-ASCII characters
 601      * @exception UnsupportedEncodingException if the encoding fails
 602      */
 603     public static String encodeWord(String word, String charset,
 604                                     String encoding)
 605                         throws UnsupportedEncodingException {
 606         return encodeWord(word, charset, encoding, true);
 607     }
 608 
 609     /*
 610      * Encode the given string. The parameter 'encodingWord' should
 611      * be true if a RFC 822 "word" token is being encoded and false if a
 612      * RFC 822 "text" token is being encoded. This is because the
 613      * "Q" encoding defined in RFC 2047 has more restrictions when
 614      * encoding "word" tokens. (Sigh)
 615      */
 616     private static String encodeWord(String string, String charset,
 617                                      String encoding, boolean encodingWord)
 618                         throws UnsupportedEncodingException {
 619 
 620         // If 'string' contains only US-ASCII characters, just
 621         // return it.
 622         int ascii = checkAscii(string);
 623         if (ascii == ALL_ASCII)
 624             return string;
 625 
 626         // Else, apply the specified charset conversion.
 627         String jcharset;
 628         if (charset == null) { // use default charset
 629             jcharset = getDefaultJavaCharset(); // the java charset
 630             charset = getDefaultMIMECharset(); // the MIME equivalent
 631         } else // MIME charset -> java charset
 632             jcharset = javaCharset(charset);
 633 
 634         // If no transfer-encoding is specified, figure one out.
 635         if (encoding == null) {
 636             if (ascii != MOSTLY_NONASCII)
 637                 encoding = "Q";
 638             else
 639                 encoding = "B";
 640         }
 641 
 642         boolean b64;
 643         if (encoding.equalsIgnoreCase("B"))
 644             b64 = true;
 645         else if (encoding.equalsIgnoreCase("Q"))
 646             b64 = false;
 647         else
 648             throw new UnsupportedEncodingException(
 649                         "Unknown transfer encoding: " + encoding);
 650 
 651         StringBuffer outb = new StringBuffer(); // the output buffer
 652         doEncode(string, b64, jcharset,
 653                  // As per RFC 2047, size of an encoded string should not
 654                  // exceed 75 bytes.
 655                  // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
 656                  75 - 7 - charset.length(), // the available space
 657                  "=?" + charset + "?" + encoding + "?", // prefix
 658                  true, encodingWord, outb);
 659 
 660         return outb.toString();
 661     }
 662 
 663     private static void doEncode(String string, boolean b64,
 664                 String jcharset, int avail, String prefix,
 665                 boolean first, boolean encodingWord, StringBuffer buf)
 666                         throws UnsupportedEncodingException {
 667 
 668         // First find out what the length of the encoded version of
 669         // 'string' would be.
 670         byte[] bytes = string.getBytes(jcharset);
 671         int len;
 672         if (b64) // "B" encoding
 673             len = BEncoderStream.encodedLength(bytes);
 674         else // "Q"
 675             len = QEncoderStream.encodedLength(bytes, encodingWord);
 676 
 677         int size;
 678         if ((len > avail) && ((size = string.length()) > 1)) {
 679             // If the length is greater than 'avail', split 'string'
 680             // into two and recurse.
 681             doEncode(string.substring(0, size/2), b64, jcharset,
 682                      avail, prefix, first, encodingWord, buf);
 683             doEncode(string.substring(size/2, size), b64, jcharset,
 684                      avail, prefix, false, encodingWord, buf);
 685         } else {
 686             // length <= than 'avail'. Encode the given string
 687             ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
 688             OutputStream eos; // the encoder
 689             if (b64) // "B" encoding
 690                 eos = new BEncoderStream(os);
 691             else // "Q" encoding
 692                 eos = new QEncoderStream(os, encodingWord);
 693 
 694             try { // do the encoding
 695                 eos.write(bytes);
 696                 eos.close();
 697             } catch (IOException ioex) { }
 698 
 699             byte[] encodedBytes = os.toByteArray(); // the encoded stuff
 700             // Now write out the encoded (all ASCII) bytes into our
 701             // StringBuffer
 702             if (!first) // not the first line of this sequence
 703                 if (foldEncodedWords)
 704                     buf.append("\r\n "); // start a continuation line
 705                 else
 706                     buf.append(" "); // line will be folded later
 707 
 708             buf.append(prefix);
 709             for (int i = 0; i < encodedBytes.length; i++)
 710                 buf.append((char)encodedBytes[i]);
 711             buf.append("?="); // terminate the current sequence
 712         }
 713     }
 714 
 715     /**
 716      * The string is parsed using the rules in RFC 2047 for parsing
 717      * an "encoded-word". If the parse fails, a ParseException is
 718      * thrown. Otherwise, it is transfer-decoded, and then
 719      * charset-converted into Unicode. If the charset-conversion
 720      * fails, an UnsupportedEncodingException is thrown.<p>
 721      *
 722      * @param   eword   the possibly encoded value
 723      * @exception       ParseException if the string is not an
 724      *                  encoded-word as per RFC 2047.
 725      * @exception       UnsupportedEncodingException if the charset
 726      *                  conversion failed.
 727      */
 728     public static String decodeWord(String eword)
 729                 throws ParseException, UnsupportedEncodingException {
 730 
 731         if (!eword.startsWith("=?")) // not an encoded word
 732             throw new ParseException();
 733 
 734         // get charset
 735         int start = 2; int pos;
 736         if ((pos = eword.indexOf('?', start)) == -1)
 737             throw new ParseException();
 738         String charset = javaCharset(eword.substring(start, pos));
 739 
 740         // get encoding
 741         start = pos+1;
 742         if ((pos = eword.indexOf('?', start)) == -1)
 743             throw new ParseException();
 744         String encoding = eword.substring(start, pos);
 745 
 746         // get encoded-sequence
 747         start = pos+1;
 748         if ((pos = eword.indexOf("?=", start)) == -1)
 749             throw new ParseException();
 750         String word = eword.substring(start, pos);
 751 
 752         try {
 753             // Extract the bytes from word
 754             ByteArrayInputStream bis =
 755                 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
 756 
 757             // Get the appropriate decoder
 758             InputStream is;
 759             if (encoding.equalsIgnoreCase("B"))
 760                 is = new BASE64DecoderStream(bis);
 761             else if (encoding.equalsIgnoreCase("Q"))
 762                 is = new QDecoderStream(bis);
 763             else
 764                 throw new UnsupportedEncodingException(
 765                                 "unknown encoding: " + encoding);
 766 
 767             // For b64 & q, size of decoded word <= size of word. So
 768             // the decoded bytes must fit into the 'bytes' array. This
 769             // is certainly more efficient than writing bytes into a
 770             // ByteArrayOutputStream and then pulling out the byte[]
 771             // from it.
 772             int count = bis.available();
 773             byte[] bytes = new byte[count];
 774             // count is set to the actual number of decoded bytes
 775             count = is.read(bytes, 0, count);
 776 
 777             // Finally, convert the decoded bytes into a String using
 778             // the specified charset
 779             String s = new String(bytes, 0, count, charset);
 780             if (pos + 2 < eword.length()) {
 781                 // there's still more text in the string
 782                 String rest = eword.substring(pos + 2);
 783                 if (!decodeStrict)
 784                     rest = decodeInnerWords(rest);
 785                 s += rest;
 786             }
 787             return s;
 788         } catch (UnsupportedEncodingException uex) {
 789             // explicitly catch and rethrow this exception, otherwise
 790             // the below IOException catch will swallow this up!
 791             throw uex;
 792         } catch (IOException ioex) {
 793             // Shouldn't happen.
 794             throw new ParseException();
 795         } catch (IllegalArgumentException iex) {
 796             /* An unknown charset of the form ISO-XXX-XXX, will cause
 797              * the JDK to throw an IllegalArgumentException ... Since the
 798              * JDK will attempt to create a classname using this string,
 799              * but valid classnames must not contain the character '-',
 800              * and this results in an IllegalArgumentException, rather than
 801              * the expected UnsupportedEncodingException. Yikes
 802              */
 803             throw new UnsupportedEncodingException();
 804         }
 805     }
 806 
 807     /**
 808      * Look for encoded words within a word.  The MIME spec doesn't
 809      * allow this, but many broken mailers, especially Japanese mailers,
 810      * produce such incorrect encodings.
 811      */
 812     private static String decodeInnerWords(String word)
 813                                 throws UnsupportedEncodingException {
 814         int start = 0, i;
 815         StringBuffer buf = new StringBuffer();
 816         while ((i = word.indexOf("=?", start)) >= 0) {
 817             buf.append(word.substring(start, i));
 818             int end = word.indexOf("?=", i);
 819             if (end < 0)
 820                 break;
 821             String s = word.substring(i, end + 2);
 822             try {
 823                 s = decodeWord(s);
 824             } catch (ParseException pex) {
 825                 // ignore it, just use the original string
 826             }
 827             buf.append(s);
 828             start = end + 2;
 829         }
 830         if (start == 0)
 831             return word;
 832         if (start < word.length())
 833             buf.append(word.substring(start));
 834         return buf.toString();
 835     }
 836 
 837     /**
 838      * A utility method to quote a word, if the word contains any
 839      * characters from the specified 'specials' list.<p>
 840      *
 841      * The <code>HeaderTokenizer</code> class defines two special
 842      * sets of delimiters - MIME and RFC 822. <p>
 843      *
 844      * This method is typically used during the generation of
 845      * RFC 822 and MIME header fields.
 846      *
 847      * @param   word    word to be quoted
 848      * @param   specials the set of special characters
 849      * @return          the possibly quoted word
 850      * @see     javax.mail.internet.HeaderTokenizer#MIME
 851      * @see     javax.mail.internet.HeaderTokenizer#RFC822
 852      */
 853     public static String quote(String word, String specials) {
 854         int len = word.length();
 855 
 856         /*
 857          * Look for any "bad" characters, Escape and
 858          *  quote the entire string if necessary.
 859          */
 860         boolean needQuoting = false;
 861         for (int i = 0; i < len; i++) {
 862             char c = word.charAt(i);
 863             if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
 864                 // need to escape them and then quote the whole string
 865                 StringBuffer sb = new StringBuffer(len + 3);
 866                 sb.append('"');
 867                 sb.append(word.substring(0, i));
 868                 int lastc = 0;
 869                 for (int j = i; j < len; j++) {
 870                     char cc = word.charAt(j);
 871                     if ((cc == '"') || (cc == '\\') ||
 872                         (cc == '\r') || (cc == '\n'))
 873                         if (cc == '\n' && lastc == '\r')
 874                             ;   // do nothing, CR was already escaped
 875                         else
 876                             sb.append('\\');    // Escape the character
 877                     sb.append(cc);
 878                     lastc = cc;
 879                 }
 880                 sb.append('"');
 881                 return sb.toString();
 882             } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
 883                 // These characters cause the string to be quoted
 884                 needQuoting = true;
 885         }
 886 
 887         if (needQuoting) {
 888             StringBuffer sb = new StringBuffer(len + 2);
 889             sb.append('"').append(word).append('"');
 890             return sb.toString();
 891         } else
 892             return word;
 893     }
 894 
 895     /**
 896      * Fold a string at linear whitespace so that each line is no longer
 897      * than 76 characters, if possible.  If there are more than 76
 898      * non-whitespace characters consecutively, the string is folded at
 899      * the first whitespace after that sequence.  The parameter
 900      * <code>used</code> indicates how many characters have been used in
 901      * the current line; it is usually the length of the header name. <p>
 902      *
 903      * Note that line breaks in the string aren't escaped; they probably
 904      * should be.
 905      *
 906      * @param   used    characters used in line so far
 907      * @param   s       the string to fold
 908      * @return          the folded string
 909      */
 910     /*public*/ static String fold(int used, String s) {
 911         if (!foldText)
 912             return s;
 913 
 914         int end;
 915         char c;
 916         // Strip trailing spaces
 917         for (end = s.length() - 1; end >= 0; end--) {
 918             c = s.charAt(end);
 919             if (c != ' ' && c != '\t')
 920                 break;
 921         }
 922         if (end != s.length() - 1)
 923             s = s.substring(0, end + 1);
 924 
 925         // if the string fits now, just return it
 926         if (used + s.length() <= 76)
 927             return s;
 928 
 929         // have to actually fold the string
 930         StringBuffer sb = new StringBuffer(s.length() + 4);
 931         char lastc = 0;
 932         while (used + s.length() > 76) {
 933             int lastspace = -1;
 934             for (int i = 0; i < s.length(); i++) {
 935                 if (lastspace != -1 && used + i > 76)
 936                     break;
 937                 c = s.charAt(i);
 938                 if (c == ' ' || c == '\t')
 939                     if (!(lastc == ' ' || lastc == '\t'))
 940                         lastspace = i;
 941                 lastc = c;
 942             }
 943             if (lastspace == -1) {
 944                 // no space, use the whole thing
 945                 sb.append(s);
 946                 s = "";
 947                 used = 0;
 948                 break;
 949             }
 950             sb.append(s.substring(0, lastspace));
 951             sb.append("\r\n");
 952             lastc = s.charAt(lastspace);
 953             sb.append(lastc);
 954             s = s.substring(lastspace + 1);
 955             used = 1;
 956         }
 957         sb.append(s);
 958         return sb.toString();
 959     }
 960 
 961     /**
 962      * Unfold a folded header.  Any line breaks that aren't escaped and
 963      * are followed by whitespace are removed.
 964      *
 965      * @param   s       the string to unfold
 966      * @return          the unfolded string
 967      */
 968     /*public*/ static String unfold(String s) {
 969         if (!foldText)
 970             return s;
 971 
 972         StringBuffer sb = null;
 973         int i;
 974         while ((i = indexOfAny(s, "\r\n")) >= 0) {
 975             int start = i;
 976             int l = s.length();
 977             i++;                // skip CR or NL
 978             if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
 979                 i++;    // skip LF
 980             if (start == 0 || s.charAt(start - 1) != '\\') {
 981                 char c;
 982                 // if next line starts with whitespace, skip all of it
 983                 // XXX - always has to be true?
 984                 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
 985                     i++;        // skip whitespace
 986                     while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
 987                         i++;
 988                     if (sb == null)
 989                         sb = new StringBuffer(s.length());
 990                     if (start != 0) {
 991                         sb.append(s.substring(0, start));
 992                         sb.append(' ');
 993                     }
 994                     s = s.substring(i);
 995                     continue;
 996                 }
 997                 // it's not a continuation line, just leave it in
 998                 if (sb == null)
 999                     sb = new StringBuffer(s.length());
1000                 sb.append(s.substring(0, i));
1001                 s = s.substring(i);
1002             } else {
1003                 // there's a backslash at "start - 1"
1004                 // strip it out, but leave in the line break
1005                 if (sb == null)
1006                     sb = new StringBuffer(s.length());
1007                 sb.append(s.substring(0, start - 1));
1008                 sb.append(s.substring(start, i));
1009                 s = s.substring(i);
1010             }
1011         }
1012         if (sb != null) {
1013             sb.append(s);
1014             return sb.toString();
1015         } else
1016             return s;
1017     }
1018 
1019     /**
1020      * Return the first index of any of the characters in "any" in "s",
1021      * or -1 if none are found.
1022      *
1023      * This should be a method on String.
1024      */
1025     private static int indexOfAny(String s, String any) {
1026         return indexOfAny(s, any, 0);
1027     }
1028 
1029     private static int indexOfAny(String s, String any, int start) {
1030         try {
1031             int len = s.length();
1032             for (int i = start; i < len; i++) {
1033                 if (any.indexOf(s.charAt(i)) >= 0)
1034                     return i;
1035             }
1036             return -1;
1037         } catch (StringIndexOutOfBoundsException e) {
1038             return -1;
1039         }
1040     }
1041 
1042     /**
1043      * Convert a MIME charset name into a valid Java charset name. <p>
1044      *
1045      * @param charset   the MIME charset name
1046      * @return  the Java charset equivalent. If a suitable mapping is
1047      *          not available, the passed in charset is itself returned.
1048      */
1049     public static String javaCharset(String charset) {
1050         if (mime2java == null || charset == null)
1051             // no mapping table, or charset parameter is null
1052             return charset;
1053 
1054         String alias = (String)mime2java.get(charset.toLowerCase());
1055         return alias == null ? charset : alias;
1056     }
1057 
1058     /**
1059      * Convert a java charset into its MIME charset name. <p>
1060      *
1061      * Note that a future version of JDK (post 1.2) might provide
1062      * this functionality, in which case, we may deprecate this
1063      * method then.
1064      *
1065      * @param   charset    the JDK charset
1066      * @return          the MIME/IANA equivalent. If a mapping
1067      *                  is not possible, the passed in charset itself
1068      *                  is returned.
1069      * @since           JavaMail 1.1
1070      */
1071     public static String mimeCharset(String charset) {
1072         if (java2mime == null || charset == null)
1073             // no mapping table or charset param is null
1074             return charset;
1075 
1076         String alias = (String)java2mime.get(charset.toLowerCase());
1077         return alias == null ? charset : alias;
1078     }
1079 
    // Value computed by getDefaultJavaCharset(); null until that method
    // has resolved it for the first time.
    private static String defaultJavaCharset;
    // Value computed by getDefaultMIMECharset(); null until that method
    // has resolved it for the first time.
    private static String defaultMIMECharset;
1082 
1083     /**
1084      * Get the default charset corresponding to the system's current
1085      * default locale.  If the System property <code>mail.mime.charset</code>
1086      * is set, a system charset corresponding to this MIME charset will be
1087      * returned. <p>
1088      *
1089      * @return  the default charset of the system's default locale,
1090      *          as a Java charset. (NOT a MIME charset)
1091      * @since   JavaMail 1.1
1092      */
1093     public static String getDefaultJavaCharset() {
1094         if (defaultJavaCharset == null) {
1095             /*
1096              * If mail.mime.charset is set, it controls the default
1097              * Java charset as well.
1098              */
1099             String mimecs = null;
1100 
1101             mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
1102 
1103             if (mimecs != null && mimecs.length() > 0) {
1104                 defaultJavaCharset = javaCharset(mimecs);
1105                 return defaultJavaCharset;
1106             }
1107 
1108             try {
1109                 defaultJavaCharset = System.getProperty("file.encoding",
1110                                                         "8859_1");
1111             } catch (SecurityException sex) {
1112 
1113                 class NullInputStream extends InputStream {
1114                     public int read() {
1115                         return 0;
1116                     }
1117                 }
1118                 InputStreamReader reader =
1119                         new InputStreamReader(new NullInputStream());
1120                 defaultJavaCharset = reader.getEncoding();
1121                 if (defaultJavaCharset == null)
1122                     defaultJavaCharset = "8859_1";
1123             }
1124         }
1125 
1126         return defaultJavaCharset;
1127     }
1128 
1129     /*
1130      * Get the default MIME charset for this locale.
1131      */
1132     static String getDefaultMIMECharset() {
1133         if (defaultMIMECharset == null) {
1134                 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
1135         }
1136         if (defaultMIMECharset == null)
1137             defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
1138         return defaultMIMECharset;
1139     }
1140 
1141     // Tables to map MIME charset names to Java names and vice versa.
1142     // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
1143     private static Hashtable mime2java;
1144     private static Hashtable java2mime;
1145 
1146     static {
1147         java2mime = new Hashtable(40);
1148         mime2java = new Hashtable(10);
1149 
1150         try {
1151             // Use this class's classloader to load the mapping file
1152             // XXX - we should use SecuritySupport, but it's in another package
1153             InputStream is =
1154                     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
1155                     "/META-INF/javamail.charset.map");
1156 
1157             if (is != null) {
1158                 is = new LineInputStream(is);
1159 
1160                 // Load the JDK-to-MIME charset mapping table
1161                 loadMappings((LineInputStream)is, java2mime);
1162 
1163                 // Load the MIME-to-JDK charset mapping table
1164                 loadMappings((LineInputStream)is, mime2java);
1165             }
1166         } catch (Exception ex) { }
1167 
1168         // If we didn't load the tables, e.g., because we didn't have
1169         // permission, load them manually.  The entries here should be
1170         // the same as the default javamail.charset.map.
1171         if (java2mime.isEmpty()) {
1172             java2mime.put("8859_1", "ISO-8859-1");
1173             java2mime.put("iso8859_1", "ISO-8859-1");
1174             java2mime.put("ISO8859-1", "ISO-8859-1");
1175 
1176             java2mime.put("8859_2", "ISO-8859-2");
1177             java2mime.put("iso8859_2", "ISO-8859-2");
1178             java2mime.put("ISO8859-2", "ISO-8859-2");
1179 
1180             java2mime.put("8859_3", "ISO-8859-3");
1181             java2mime.put("iso8859_3", "ISO-8859-3");
1182             java2mime.put("ISO8859-3", "ISO-8859-3");
1183 
1184             java2mime.put("8859_4", "ISO-8859-4");
1185             java2mime.put("iso8859_4", "ISO-8859-4");
1186             java2mime.put("ISO8859-4", "ISO-8859-4");
1187 
1188             java2mime.put("8859_5", "ISO-8859-5");
1189             java2mime.put("iso8859_5", "ISO-8859-5");
1190             java2mime.put("ISO8859-5", "ISO-8859-5");
1191 
1192             java2mime.put("8859_6", "ISO-8859-6");
1193             java2mime.put("iso8859_6", "ISO-8859-6");
1194             java2mime.put("ISO8859-6", "ISO-8859-6");
1195 
1196             java2mime.put("8859_7", "ISO-8859-7");
1197             java2mime.put("iso8859_7", "ISO-8859-7");
1198             java2mime.put("ISO8859-7", "ISO-8859-7");
1199 
1200             java2mime.put("8859_8", "ISO-8859-8");
1201             java2mime.put("iso8859_8", "ISO-8859-8");
1202             java2mime.put("ISO8859-8", "ISO-8859-8");
1203 
1204             java2mime.put("8859_9", "ISO-8859-9");
1205             java2mime.put("iso8859_9", "ISO-8859-9");
1206             java2mime.put("ISO8859-9", "ISO-8859-9");
1207 
1208             java2mime.put("SJIS", "Shift_JIS");
1209             java2mime.put("MS932", "Shift_JIS");
1210             java2mime.put("JIS", "ISO-2022-JP");
1211             java2mime.put("ISO2022JP", "ISO-2022-JP");
1212             java2mime.put("EUC_JP", "euc-jp");
1213             java2mime.put("KOI8_R", "koi8-r");
1214             java2mime.put("EUC_CN", "euc-cn");
1215             java2mime.put("EUC_TW", "euc-tw");
1216             java2mime.put("EUC_KR", "euc-kr");
1217         }
1218         if (mime2java.isEmpty()) {
1219             mime2java.put("iso-2022-cn", "ISO2022CN");
1220             mime2java.put("iso-2022-kr", "ISO2022KR");
1221             mime2java.put("utf-8", "UTF8");
1222             mime2java.put("utf8", "UTF8");
1223             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1224             mime2java.put("ja_jp.eucjp", "EUCJIS");
1225             mime2java.put("euc-kr", "KSC5601");
1226             mime2java.put("euckr", "KSC5601");
1227             mime2java.put("us-ascii", "ISO-8859-1");
1228             mime2java.put("x-us-ascii", "ISO-8859-1");
1229         }
1230     }
1231 
1232     private static void loadMappings(LineInputStream is, Hashtable table) {
1233         String currLine;
1234 
1235         while (true) {
1236             try {
1237                 currLine = is.readLine();
1238             } catch (IOException ioex) {
1239                 break; // error in reading, stop
1240             }
1241 
1242             if (currLine == null) // end of file, stop
1243                 break;
1244             if (currLine.startsWith("--") && currLine.endsWith("--"))
1245                 // end of this table
1246                 break;
1247 
1248             // ignore empty lines and comments
1249             if (currLine.trim().length() == 0 || currLine.startsWith("#"))
1250                 continue;
1251 
1252             // A valid entry is of the form <key><separator><value>
1253             // where, <separator> := SPACE | HT. Parse this
1254             StringTokenizer tk = new StringTokenizer(currLine, " \t");
1255             try {
1256                 String key = tk.nextToken();
1257                 String value = tk.nextToken();
1258                 table.put(key.toLowerCase(), value);
1259             } catch (NoSuchElementException nex) { }
1260         }
1261     }
1262 
    // Result codes returned by the checkAscii methods.
    // no non-ASCII character was seen
    static final int ALL_ASCII          = 1;
    // non-ASCII characters were seen, but ASCII ones are in the majority
    static final int MOSTLY_ASCII       = 2;
    // ASCII characters are not in the majority
    static final int MOSTLY_NONASCII    = 3;
1266 
1267     /**
1268      * Check if the given string contains non US-ASCII characters.
1269      * @param   s       string
1270      * @return          ALL_ASCII if all characters in the string
1271      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1272      *                  if more than half of the available characters
1273      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1274      */
1275     static int checkAscii(String s) {
1276         int ascii = 0, non_ascii = 0;
1277         int l = s.length();
1278 
1279         for (int i = 0; i < l; i++) {
1280             if (nonascii((int)s.charAt(i))) // non-ascii
1281                 non_ascii++;
1282             else
1283                 ascii++;
1284         }
1285 
1286         if (non_ascii == 0)
1287             return ALL_ASCII;
1288         if (ascii > non_ascii)
1289             return MOSTLY_ASCII;
1290 
1291         return MOSTLY_NONASCII;
1292     }
1293 
1294     /**
1295      * Check if the given byte array contains non US-ASCII characters.
1296      * @param   b       byte array
1297      * @return          ALL_ASCII if all characters in the string
1298      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1299      *                  if more than half of the available characters
1300      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1301      *
1302      * XXX - this method is no longer used
1303      */
1304     static int checkAscii(byte[] b) {
1305         int ascii = 0, non_ascii = 0;
1306 
1307         for (int i=0; i < b.length; i++) {
1308             // The '&' operator automatically causes b[i] to be promoted
1309             // to an int, and we mask out the higher bytes in the int
1310             // so that the resulting value is not a negative integer.
1311             if (nonascii(b[i] & 0xff)) // non-ascii
1312                 non_ascii++;
1313             else
1314                 ascii++;
1315         }
1316 
1317         if (non_ascii == 0)
1318             return ALL_ASCII;
1319         if (ascii > non_ascii)
1320             return MOSTLY_ASCII;
1321 
1322         return MOSTLY_NONASCII;
1323     }
1324 
1325     /**
1326      * Check if the given input stream contains non US-ASCII characters.
     * Up to <code>max</code> bytes are checked. If <code>max</code> is
1328      * set to <code>ALL</code>, then all the bytes available in this
1329      * input stream are checked. If <code>breakOnNonAscii</code> is true
1330      * the check terminates when the first non-US-ASCII character is
1331      * found and MOSTLY_NONASCII is returned. Else, the check continues
1332      * till <code>max</code> bytes or till the end of stream.
1333      *
1334      * @param   is      the input stream
1335      * @param   max     maximum bytes to check for. The special value
1336      *                  ALL indicates that all the bytes in this input
1337      *                  stream must be checked.
     * @param   breakOnNonAscii if <code>true</code>, then terminate
     *                  the check when the first non-US-ASCII character
     *                  is found.
1341      * @return          ALL_ASCII if all characters in the string
1342      *                  belong to the US-ASCII charset. MOSTLY_ASCII
1343      *                  if more than half of the available characters
1344      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
1345      */
1346     static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
1347         int ascii = 0, non_ascii = 0;
1348         int len;
1349         int block = 4096;
1350         int linelen = 0;
1351         boolean longLine = false, badEOL = false;
1352         boolean checkEOL = encodeEolStrict && breakOnNonAscii;
1353         byte buf[] = null;
1354         if (max != 0) {
1355             block = (max == ALL) ? 4096 : Math.min(max, 4096);
1356             buf = new byte[block];
1357         }
1358         while (max != 0) {
1359             try {
1360                 if ((len = is.read(buf, 0, block)) == -1)
1361                     break;
1362                 int lastb = 0;
1363                 for (int i = 0; i < len; i++) {
1364                     // The '&' operator automatically causes b[i] to
1365                     // be promoted to an int, and we mask out the higher
1366                     // bytes in the int so that the resulting value is
1367                     // not a negative integer.
1368                     int b = buf[i] & 0xff;
1369                     if (checkEOL &&
1370                             ((lastb == '\r' && b != '\n') ||
1371                             (lastb != '\r' && b == '\n')))
1372                         badEOL = true;
1373                     if (b == '\r' || b == '\n')
1374                         linelen = 0;
1375                     else {
1376                         linelen++;
1377                         if (linelen > 998)      // 1000 - CRLF
1378                             longLine = true;
1379                     }
1380                     if (nonascii(b)) {  // non-ascii
1381                         if (breakOnNonAscii) // we are done
1382                             return MOSTLY_NONASCII;
1383                         else
1384                             non_ascii++;
1385                     } else
1386                         ascii++;
1387                     lastb = b;
1388                 }
1389             } catch (IOException ioex) {
1390                 break;
1391             }
1392             if (max != ALL)
1393                 max -= len;
1394         }
1395 
1396         if (max == 0 && breakOnNonAscii)
1397             // We have been told to break on the first non-ascii character.
1398             // We haven't got any non-ascii character yet, but then we
1399             // have not checked all of the available bytes either. So we
1400             // cannot say for sure that this input stream is ALL_ASCII,
1401             // and hence we must play safe and return MOSTLY_NONASCII
1402 
1403             return MOSTLY_NONASCII;
1404 
1405         if (non_ascii == 0) { // no non-us-ascii characters so far
1406             // If we're looking at non-text data, and we saw CR without LF
1407             // or vice versa, consider this mostly non-ASCII so that it
1408             // will be base64 encoded (since the quoted-printable encoder
1409             // doesn't encode this case properly).
1410             if (badEOL)
1411                 return MOSTLY_NONASCII;
1412             // if we've seen a long line, we degrade to mostly ascii
1413             else if (longLine)
1414                 return MOSTLY_ASCII;
1415             else
1416                 return ALL_ASCII;
1417         }
1418         if (ascii > non_ascii) // mostly ascii
1419             return MOSTLY_ASCII;
1420         return MOSTLY_NONASCII;
1421     }
1422 
1423     static final boolean nonascii(int b) {
1424         return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
1425     }
1426 }
1427 
1428 /**
1429  * An OutputStream that determines whether the data written to
1430  * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1431  */
1432 class AsciiOutputStream extends OutputStream {
1433     private boolean breakOnNonAscii;
1434     private int ascii = 0, non_ascii = 0;
1435     private int linelen = 0;
1436     private boolean longLine = false;
1437     private boolean badEOL = false;
1438     private boolean checkEOL = false;
1439     private int lastb = 0;
1440     private int ret = 0;
1441 
1442     public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1443         this.breakOnNonAscii = breakOnNonAscii;
1444         checkEOL = encodeEolStrict && breakOnNonAscii;
1445     }
1446 
1447     public void write(int b) throws IOException {
1448         check(b);
1449     }
1450 
1451     public void write(byte b[]) throws IOException {
1452         write(b, 0, b.length);
1453     }
1454 
1455     public void write(byte b[], int off, int len) throws IOException {
1456         len += off;
1457         for (int i = off; i < len ; i++)
1458             check(b[i]);
1459     }
1460 
1461     private final void check(int b) throws IOException {
1462         b &= 0xff;
1463         if (checkEOL &&
1464                 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1465             badEOL = true;
1466         if (b == '\r' || b == '\n')
1467             linelen = 0;
1468         else {
1469             linelen++;
1470             if (linelen > 998)  // 1000 - CRLF
1471                 longLine = true;
1472         }
1473         if (MimeUtility.nonascii(b)) { // non-ascii
1474             non_ascii++;
1475             if (breakOnNonAscii) {      // we are done
1476                 ret = MimeUtility.MOSTLY_NONASCII;
1477                 throw new EOFException();
1478             }
1479         } else
1480             ascii++;
1481         lastb = b;
1482     }
1483 
1484     /**
1485      * Return ASCII-ness of data stream.
1486      */
1487     public int getAscii() {
1488         if (ret != 0)
1489             return ret;
1490         // If we're looking at non-text data, and we saw CR without LF
1491         // or vice versa, consider this mostly non-ASCII so that it
1492         // will be base64 encoded (since the quoted-printable encoder
1493         // doesn't encode this case properly).
1494         if (badEOL)
1495             return MimeUtility.MOSTLY_NONASCII;
1496         else if (non_ascii == 0) { // no non-us-ascii characters so far
1497             // if we've seen a long line, we degrade to mostly ascii
1498             if (longLine)
1499                 return MimeUtility.MOSTLY_ASCII;
1500             else
1501                 return MimeUtility.ALL_ASCII;
1502         }
1503         if (ascii > non_ascii) // mostly ascii
1504             return MimeUtility.MOSTLY_ASCII;
1505         return MimeUtility.MOSTLY_NONASCII;
1506     }
1507 }
--- EOF ---