1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2004 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 /*
  21  * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $
  22  */
  23 package com.sun.org.apache.xml.internal.serializer;
  24 
  25 import java.io.InputStream;
  26 import java.io.OutputStream;
  27 import java.io.OutputStreamWriter;
  28 import java.io.UnsupportedEncodingException;
  29 import java.io.Writer;
  30 import java.io.BufferedWriter;
  31 import java.net.URL;
  32 import java.util.Enumeration;
  33 import java.util.HashMap;
  34 import java.util.Properties;
  35 import java.util.StringTokenizer;
  36 
  37 import com.sun.org.apache.xalan.internal.utils.SecuritySupport;
  38 
  39 /**
  40  * Provides information about encodings. Depends on the Java runtime
  41  * to provides writers for the different encodings, but can be used
  42  * to override encoding names and provide the last printable character
  43  * for each encoding.
  44  *
  45  * @version $Revision: 1.11 $ $Date: 2010-11-01 04:34:44 $
  46  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  47  */
  48 
  49 public final class Encodings extends Object
  50 {
  51 
  52     /**
  53      * The last printable character for unknown encodings.
  54      */
  55     private static final int m_defaultLastPrintable = 0x7F;
  56 
  57     /**
  58      * Standard filename for properties file with encodings data.
  59      */
  60     private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties";
  61 
  62     /**
  63      * Standard filename for properties file with encodings data.
  64      */
  65     private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings";
  66 
  67 
  68     /**
  69      * Returns a writer for the specified encoding based on
  70      * an output stream.
  71      *
  72      * @param output The output stream
  73      * @param encoding The encoding
  74      * @return A suitable writer
  75      * @throws UnsupportedEncodingException There is no convertor
  76      *  to support this encoding
  77      */
  78     static Writer getWriter(OutputStream output, String encoding)
  79         throws UnsupportedEncodingException
  80     {
  81 
  82         for (int i = 0; i < _encodings.length; ++i)
  83         {
  84             if (_encodings[i].name.equalsIgnoreCase(encoding))
  85             {
  86                 try
  87                 {
  88                     return new BufferedWriter(new OutputStreamWriter(
  89                         output,
  90                         _encodings[i].javaName));
  91                 }
  92                 catch (java.lang.IllegalArgumentException iae) // java 1.1.8
  93                 {
  94                     // keep trying
  95                 }
  96                 catch (UnsupportedEncodingException usee)
  97                 {
  98 
  99                     // keep trying
 100                 }
 101             }
 102         }
 103 
 104         try
 105         {
 106             return new BufferedWriter(new OutputStreamWriter(output, encoding));
 107         }
 108         catch (java.lang.IllegalArgumentException iae) // java 1.1.8
 109         {
 110             throw new UnsupportedEncodingException(encoding);
 111         }
 112     }
 113 
 114 
 115     /**
 116      * Returns the last printable character for an unspecified
 117      * encoding.
 118      *
 119      * @return the default size
 120      */
 121     public static int getLastPrintable()
 122     {
 123         return m_defaultLastPrintable;
 124     }
 125 
 126 
 127 
 128     /**
 129      * Returns the EncodingInfo object for the specified
 130      * encoding.
 131      * <p>
 132      * This is not a public API.
 133      *
 134      * @param encoding The encoding
 135      * @return The object that is used to determine if
 136      * characters are in the given encoding.
 137      * @xsl.usage internal
 138      */
 139     static EncodingInfo getEncodingInfo(String encoding)
 140     {
 141         EncodingInfo ei;
 142 
 143         String normalizedEncoding = toUpperCaseFast(encoding);
 144         ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
 145         if (ei == null)
 146             ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
 147         if (ei == null) {
 148             // We shouldn't have to do this, but just in case.
 149             ei = new EncodingInfo(null,null);
 150         }
 151 
 152         return ei;
 153     }
 154 
 155     /**
 156      * A fast and cheap way to uppercase a String that is
 157      * only made of printable ASCII characters.
 158      * <p>
 159      * This is not a public API.
 160      * @param s a String of ASCII characters
 161      * @return an uppercased version of the input String,
 162      * possibly the same String.
 163      * @xsl.usage internal
 164      */
 165     static private String toUpperCaseFast(final String s) {
 166 
 167         boolean different = false;
 168         final int mx = s.length();
 169                 char[] chars = new char[mx];
 170         for (int i=0; i < mx; i++) {
 171                 char ch = s.charAt(i);
 172             // is the character a lower case ASCII one?
 173                 if ('a' <= ch && ch <= 'z') {
 174                 // a cheap and fast way to uppercase that is good enough
 175                         ch = (char) (ch + ('A' - 'a'));
 176                         different = true; // the uppercased String is different
 177                 }
 178                 chars[i] = ch;
 179         }
 180 
 181         // A little optimization, don't call String.valueOf() if
 182         // the uppercased string is the same as the input string.
 183         final String upper;
 184         if (different)
 185                 upper = String.valueOf(chars);
 186         else
 187                 upper = s;
 188 
 189         return upper;
 190     }
 191 
 192     /** The default encoding, ISO style, ISO style.   */
 193     static final String DEFAULT_MIME_ENCODING = "UTF-8";
 194 
 195     /**
 196      * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
 197      * attribute specifies the preferred encoding to use for outputting the result
 198      * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
 199      * For other values, if the XSLT processor does not support the specified
 200      * encoding it may signal an error; if it does not signal an error it should
 201      * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
 202      * whose name does not match the EncName production of the XML Recommendation
 203      * [XML]. If no encoding attribute is specified, then the XSLT processor should
 204      * use either UTF-8 or UTF-16."
 205      *
 206      * @param encoding Reference to java-style encoding string, which may be null,
 207      * in which case a default will be found.
 208      *
 209      * @return The ISO-style encoding string, or null if failure.
 210      */
 211     static String getMimeEncoding(String encoding)
 212     {
 213 
 214         if (null == encoding)
 215         {
 216             try
 217             {
 218 
 219                 // Get the default system character encoding.  This may be
 220                 // incorrect if they passed in a writer, but right now there
 221                 // seems to be no way to get the encoding from a writer.
 222                 encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8");
 223 
 224                 if (null != encoding)
 225                 {
 226 
 227                     /*
 228                     * See if the mime type is equal to UTF8.  If you don't
 229                     * do that, then  convertJava2MimeEncoding will convert
 230                     * 8859_1 to "ISO-8859-1", which is not what we want,
 231                     * I think, and I don't think I want to alter the tables
 232                     * to convert everything to UTF-8.
 233                     */
 234                     String jencoding =
 235                         (encoding.equalsIgnoreCase("Cp1252")
 236                             || encoding.equalsIgnoreCase("ISO8859_1")
 237                             || encoding.equalsIgnoreCase("8859_1")
 238                             || encoding.equalsIgnoreCase("UTF8"))
 239                             ? DEFAULT_MIME_ENCODING
 240                             : convertJava2MimeEncoding(encoding);
 241 
 242                     encoding =
 243                         (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
 244                 }
 245                 else
 246                 {
 247                     encoding = DEFAULT_MIME_ENCODING;
 248                 }
 249             }
 250             catch (SecurityException se)
 251             {
 252                 encoding = DEFAULT_MIME_ENCODING;
 253             }
 254         }
 255         else
 256         {
 257             encoding = convertJava2MimeEncoding(encoding);
 258         }
 259 
 260         return encoding;
 261     }
 262 
 263     /**
 264      * Try the best we can to convert a Java encoding to a XML-style encoding.
 265      *
 266      * @param encoding non-null reference to encoding string, java style.
 267      *
 268      * @return ISO-style encoding string.
 269      */
 270     private static String convertJava2MimeEncoding(String encoding)
 271     {
 272         EncodingInfo enc =
 273             (EncodingInfo) _encodingTableKeyJava.get(encoding.toUpperCase());
 274         if (null != enc)
 275             return enc.name;
 276         return encoding;
 277     }
 278 
 279     /**
 280      * Try the best we can to convert a Java encoding to a XML-style encoding.
 281      *
 282      * @param encoding non-null reference to encoding string, java style.
 283      *
 284      * @return ISO-style encoding string.
 285      */
 286     public static String convertMime2JavaEncoding(String encoding)
 287     {
 288 
 289         for (int i = 0; i < _encodings.length; ++i)
 290         {
 291             if (_encodings[i].name.equalsIgnoreCase(encoding))
 292             {
 293                 return _encodings[i].javaName;
 294             }
 295         }
 296 
 297         return encoding;
 298     }
 299 
 300     /**
 301      * Load a list of all the supported encodings.
 302      *
 303      * System property "encodings" formatted using URL syntax may define an
 304      * external encodings list. Thanks to Sergey Ushakov for the code
 305      * contribution!
 306      */
 307     private static EncodingInfo[] loadEncodingInfo()
 308     {
 309         try
 310         {
 311             String urlString = null;
 312             InputStream is = null;
 313 
 314             try
 315             {
 316                 urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
 317             }
 318             catch (SecurityException e)
 319             {
 320             }
 321 
 322             if (urlString != null && urlString.length() > 0) {
 323                 URL url = new URL(urlString);
 324                 is = url.openStream();
 325             }
 326 
 327             if (is == null) {
 328                 is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
 329             }
 330 
 331             Properties props = new Properties();
 332             if (is != null) {
 333                 props.load(is);
 334                 is.close();
 335             } else {
 336                 // Seems to be no real need to force failure here, let the
 337                 // system do its best... The issue is not really very critical,
 338                 // and the output will be in any case _correct_ though maybe not
 339                 // always human-friendly... :)
 340                 // But maybe report/log the resource problem?
 341                 // Any standard ways to report/log errors (in static context)?
 342             }
 343 
 344             int totalEntries = props.size();
 345             int totalMimeNames = 0;
 346             Enumeration keys = props.keys();
 347             for (int i = 0; i < totalEntries; ++i)
 348             {
 349                 String javaName = (String) keys.nextElement();
 350                 String val = props.getProperty(javaName);
 351                 totalMimeNames++;
 352                 int pos = val.indexOf(' ');
 353                 for (int j = 0; j < pos; ++j)
 354                     if (val.charAt(j) == ',')
 355                         totalMimeNames++;
 356             }
 357             EncodingInfo[] ret = new EncodingInfo[totalMimeNames];
 358             int j = 0;
 359             keys = props.keys();
 360             for (int i = 0; i < totalEntries; ++i)
 361             {
 362                 String javaName = (String) keys.nextElement();
 363                 String val = props.getProperty(javaName);
 364                 int pos = val.indexOf(' ');
 365                 String mimeName;
 366                 //int lastPrintable;
 367                 if (pos < 0)
 368                 {
 369                     // Maybe report/log this problem?
 370                     //  "Last printable character not defined for encoding " +
 371                     //  mimeName + " (" + val + ")" ...
 372                     mimeName = val;
 373                     //lastPrintable = 0x00FF;
 374                 }
 375                 else
 376                 {
 377                     //lastPrintable =
 378                     //    Integer.decode(val.substring(pos).trim()).intValue();
 379                     StringTokenizer st =
 380                         new StringTokenizer(val.substring(0, pos), ",");
 381                     for (boolean first = true;
 382                         st.hasMoreTokens();
 383                         first = false)
 384                     {
 385                         mimeName = st.nextToken();
 386                         ret[j] =
 387                             new EncodingInfo(mimeName, javaName);
 388                         _encodingTableKeyMime.put(
 389                             mimeName.toUpperCase(),
 390                             ret[j]);
 391                         if (first)
 392                             _encodingTableKeyJava.put(
 393                                 javaName.toUpperCase(),
 394                                 ret[j]);
 395                         j++;
 396                     }
 397                 }
 398             }
 399             return ret;
 400         }
 401         catch (java.net.MalformedURLException mue)
 402         {
 403             throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue);
 404         }
 405         catch (java.io.IOException ioe)
 406         {
 407             throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
 408         }
 409     }
 410 
 411     /**
 412      * Return true if the character is the high member of a surrogate pair.
 413      * <p>
 414      * This is not a public API.
 415      * @param ch the character to test
 416      * @xsl.usage internal
 417      */
 418     static boolean isHighUTF16Surrogate(char ch) {
 419         return ('\uD800' <= ch && ch <= '\uDBFF');
 420     }
 421     /**
 422      * Return true if the character is the low member of a surrogate pair.
 423      * <p>
 424      * This is not a public API.
 425      * @param ch the character to test
 426      * @xsl.usage internal
 427      */
 428     static boolean isLowUTF16Surrogate(char ch) {
 429         return ('\uDC00' <= ch && ch <= '\uDFFF');
 430     }
 431     /**
 432      * Return the unicode code point represented by the high/low surrogate pair.
 433      * <p>
 434      * This is not a public API.
 435      * @param highSurrogate the high char of the high/low pair
 436      * @param lowSurrogate the low char of the high/low pair
 437      * @xsl.usage internal
 438      */
 439     static int toCodePoint(char highSurrogate, char lowSurrogate) {
 440         int codePoint =
 441             ((highSurrogate - 0xd800) << 10)
 442                 + (lowSurrogate - 0xdc00)
 443                 + 0x10000;
 444         return codePoint;
 445     }
 446     /**
 447      * Return the unicode code point represented by the char.
 448      * A bit of a dummy method, since all it does is return the char,
 449      * but as an int value.
 450      * <p>
 451      * This is not a public API.
 452      * @param ch the char.
 453      * @xsl.usage internal
 454      */
 455     static int toCodePoint(char ch) {
 456         int codePoint = ch;
 457         return codePoint;
 458     }
 459 
 460     private static final HashMap _encodingTableKeyJava = new HashMap();
 461     private static final HashMap _encodingTableKeyMime = new HashMap();
 462     private static final EncodingInfo[] _encodings = loadEncodingInfo();
 463 }