1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2004 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 /*
  21  * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $
  22  */
  23 package com.sun.org.apache.xml.internal.serializer;
  24 
  25 import java.io.InputStream;
  26 import java.io.OutputStream;
  27 import java.io.OutputStreamWriter;
  28 import java.io.UnsupportedEncodingException;
  29 import java.io.Writer;
  30 import java.io.BufferedWriter;
  31 import java.net.URL;
  32 import java.util.Enumeration;
  33 import java.util.HashMap;
  34 import java.util.Properties;
  35 import java.util.StringTokenizer;
  36 import java.io.IOException;
  37 import java.net.MalformedURLException;
  38 import java.nio.charset.Charset;
  39 import java.nio.charset.IllegalCharsetNameException;
  40 import java.nio.charset.UnsupportedCharsetException;
  41 import java.util.Collections;
  42 import java.util.Map;
  43 import java.util.Map.Entry;
  44 
  45 import com.sun.org.apache.xalan.internal.utils.SecuritySupport;
  46 
  47 /**
 * Provides information about encodings. Depends on the Java runtime
 * to provide writers for the different encodings, but can be used
  50  * to override encoding names and provide the last printable character
  51  * for each encoding.
  52  *
  53  * @version $Revision: 1.11 $ $Date: 2010-11-01 04:34:44 $
  54  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  55  */
  56 
  57 public final class Encodings extends Object
  58 {
  59 
  60     /**
  61      * The last printable character for unknown encodings.
  62      */
  63     private static final int m_defaultLastPrintable = 0x7F;
  64 
  65     /**
  66      * Standard filename for properties file with encodings data.
  67      */
  68     private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties";
  69 
  70     /**
  71      * Standard filename for properties file with encodings data.
  72      */
  73     private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings";
  74 
  75 
  76     /**
  77      * Returns a writer for the specified encoding based on
  78      * an output stream.
  79      *
  80      * @param output The output stream
  81      * @param encoding The encoding
  82      * @return A suitable writer
  83      * @throws UnsupportedEncodingException There is no convertor
  84      *  to support this encoding
  85      */
  86     static Writer getWriter(OutputStream output, String encoding)
  87         throws UnsupportedEncodingException
  88     {
  89 
  90         final EncodingInfo ei = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
  91         if (ei != null) {
  92             try {
  93                 return new BufferedWriter(new OutputStreamWriter(
  94                         output, ei.javaName));
  95             } catch (UnsupportedEncodingException usee) {
  96                 // keep trying
  97             }
  98         }
  99 
 100         return new BufferedWriter(new OutputStreamWriter(output, encoding));
 101     }
 102 
 103 
 104     /**
 105      * Returns the last printable character for an unspecified
 106      * encoding.
 107      *
 108      * @return the default size
 109      */
 110     public static int getLastPrintable()
 111     {
 112         return m_defaultLastPrintable;
 113     }
 114 
 115 
 116 
 117     /**
 118      * Returns the EncodingInfo object for the specified
 119      * encoding.
 120      * <p>
 121      * This is not a public API.
 122      *
 123      * @param encoding The encoding
 124      * @return The object that is used to determine if
 125      * characters are in the given encoding.
 126      * @xsl.usage internal
 127      */
 128     static EncodingInfo getEncodingInfo(String encoding)
 129     {
 130         EncodingInfo ei;
 131 
 132         String normalizedEncoding = toUpperCaseFast(encoding);
 133         ei = _encodingInfos.findEncoding(normalizedEncoding);
 134         if (ei == null) {
 135             // We shouldn't have to do this, but just in case.
 136             try {
 137                 // This may happen if the caller tries to use
 138                 // an encoding that wasn't registered in the
 139                 // (java name)->(preferred mime name) mapping file.
 140                 // In that case we attempt to load the charset for the
 141                 // given encoding, and if that succeeds - we create a new
 142                 // EncodingInfo instance - assuming the canonical name
 143                 // of the charset can be used as the mime name.
 144                 final Charset c = Charset.forName(encoding);
 145                 final String name = c.name();
 146                 ei = new EncodingInfo(name, name);
 147                 _encodingInfos.putEncoding(normalizedEncoding, ei);
 148             } catch (IllegalCharsetNameException | UnsupportedCharsetException x) {
 149                 ei = new EncodingInfo(null,null);
 150             }
 151         }
 152 
 153         return ei;
 154     }
 155 
 156     /**
 157      * A fast and cheap way to uppercase a String that is
 158      * only made of printable ASCII characters.
 159      * <p>
 160      * This is not a public API.
 161      * @param s a String of ASCII characters
 162      * @return an uppercased version of the input String,
 163      * possibly the same String.
 164      * @xsl.usage internal
 165      */
 166     static private String toUpperCaseFast(final String s) {
 167 
 168         boolean different = false;
 169         final int mx = s.length();
 170                 char[] chars = new char[mx];
 171         for (int i=0; i < mx; i++) {
 172                 char ch = s.charAt(i);
 173             // is the character a lower case ASCII one?
 174                 if ('a' <= ch && ch <= 'z') {
 175                 // a cheap and fast way to uppercase that is good enough
 176                         ch = (char) (ch + ('A' - 'a'));
 177                         different = true; // the uppercased String is different
 178                 }
 179                 chars[i] = ch;
 180         }
 181 
 182         // A little optimization, don't call String.valueOf() if
 183         // the uppercased string is the same as the input string.
 184         final String upper;
 185         if (different)
 186                 upper = String.valueOf(chars);
 187         else
 188                 upper = s;
 189 
 190         return upper;
 191     }
 192 
 193     /** The default encoding, ISO style, ISO style.   */
 194     static final String DEFAULT_MIME_ENCODING = "UTF-8";
 195 
 196     /**
 197      * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
 198      * attribute specifies the preferred encoding to use for outputting the result
 199      * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
 200      * For other values, if the XSLT processor does not support the specified
 201      * encoding it may signal an error; if it does not signal an error it should
 202      * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
 203      * whose name does not match the EncName production of the XML Recommendation
 204      * [XML]. If no encoding attribute is specified, then the XSLT processor should
 205      * use either UTF-8 or UTF-16."
 206      *
 207      * @param encoding Reference to java-style encoding string, which may be null,
 208      * in which case a default will be found.
 209      *
 210      * @return The ISO-style encoding string, or null if failure.
 211      */
 212     static String getMimeEncoding(String encoding)
 213     {
 214 
 215         if (null == encoding)
 216         {
 217             try
 218             {
 219 
 220                 // Get the default system character encoding.  This may be
 221                 // incorrect if they passed in a writer, but right now there
 222                 // seems to be no way to get the encoding from a writer.
 223                 encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8");
 224 
 225                 if (null != encoding)
 226                 {
 227 
 228                     /*
 229                     * See if the mime type is equal to UTF8.  If you don't
 230                     * do that, then  convertJava2MimeEncoding will convert
 231                     * 8859_1 to "ISO-8859-1", which is not what we want,
 232                     * I think, and I don't think I want to alter the tables
 233                     * to convert everything to UTF-8.
 234                     */
 235                     String jencoding =
 236                         (encoding.equalsIgnoreCase("Cp1252")
 237                             || encoding.equalsIgnoreCase("ISO8859_1")
 238                             || encoding.equalsIgnoreCase("8859_1")
 239                             || encoding.equalsIgnoreCase("UTF8"))
 240                             ? DEFAULT_MIME_ENCODING
 241                             : convertJava2MimeEncoding(encoding);
 242 
 243                     encoding =
 244                         (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
 245                 }
 246                 else
 247                 {
 248                     encoding = DEFAULT_MIME_ENCODING;
 249                 }
 250             }
 251             catch (SecurityException se)
 252             {
 253                 encoding = DEFAULT_MIME_ENCODING;
 254             }
 255         }
 256         else
 257         {
 258             encoding = convertJava2MimeEncoding(encoding);
 259         }
 260 
 261         return encoding;
 262     }
 263 
 264     /**
 265      * Try the best we can to convert a Java encoding to a XML-style encoding.
 266      *
 267      * @param encoding non-null reference to encoding string, java style.
 268      *
 269      * @return ISO-style encoding string.
 270      */
 271     private static String convertJava2MimeEncoding(String encoding)
 272     {
 273         final EncodingInfo enc =
 274              _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding));
 275         if (null != enc)
 276             return enc.name;
 277         return encoding;
 278     }
 279 
 280     /**
 281      * Try the best we can to convert a Java encoding to a XML-style encoding.
 282      *
 283      * @param encoding non-null reference to encoding string, java style.
 284      *
 285      * @return ISO-style encoding string.
 286      */
 287     public static String convertMime2JavaEncoding(String encoding)
 288     {
 289         final EncodingInfo info = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
 290         return info != null ? info.javaName : encoding;
 291     }
 292 
 293     // Using an inner static class here prevent initialization races
 294     // where the hash maps could be used before they were populated.
 295     //
 296     private final static class EncodingInfos {
 297         // These maps are final and not modified after initialization.
 298         private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>();
 299         private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>();
 300         // This map will be added to after initialization: make sure it's
 301         // thread-safe. This map should not be used frequently - only in cases
 302         // where the mapping requested was not declared in the Encodings.properties
 303         // file.
 304         private final Map<String, EncodingInfo> _encodingDynamicTable =
 305                 Collections.synchronizedMap(new HashMap<String, EncodingInfo>());
 306 
 307         private EncodingInfos() {
 308             loadEncodingInfo();
 309         }
 310 
 311         // Opens the file/resource containing java charset name -> preferred mime
 312         // name mapping and returns it as an InputStream.
 313         private InputStream openEncodingsFileStream() throws MalformedURLException, IOException {
 314             String urlString = null;
 315             InputStream is = null;
 316 
 317             try {
 318                 urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
 319             } catch (SecurityException e) {
 320             }
 321 
 322             if (urlString != null && urlString.length() > 0) {
 323                 URL url = new URL(urlString);
 324                 is = url.openStream();
 325             }
 326 
 327             if (is == null) {
 328                 is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
 329             }
 330             return is;
 331         }
 332 
 333         // Loads the Properties resource containing the mapping:
 334         //    java charset name -> preferred mime name
 335         // and returns it.
 336         private Properties loadProperties() throws MalformedURLException, IOException {
 337             Properties props = new Properties();
 338             final InputStream is = openEncodingsFileStream();
 339             try {
 340                 if (is != null) {
 341                     props.load(is);
 342                 } else {
 343                     // Seems to be no real need to force failure here, let the
 344                     // system do its best... The issue is not really very critical,
 345                     // and the output will be in any case _correct_ though maybe not
 346                     // always human-friendly... :)
 347                     // But maybe report/log the resource problem?
 348                     // Any standard ways to report/log errors (in static context)?
 349                 }
 350             } finally {
 351                 if (is != null) {
 352                     is.close();
 353                 }
 354             }
 355             return props;
 356         }
 357 
 358         // Parses the mime list associated to a java charset name.
 359         // The first mime name in the list is supposed to be the preferred
 360         // mime name.
 361         private String[] parseMimeTypes(String val) {
 362             int pos = val.indexOf(' ');
 363             //int lastPrintable;
 364             if (pos < 0) {
 365                 // Maybe report/log this problem?
 366                 //  "Last printable character not defined for encoding " +
 367                 //  mimeName + " (" + val + ")" ...
 368                 return new String[] { val };
 369                 //lastPrintable = 0x00FF;
 370             }
 371             //lastPrintable =
 372             //    Integer.decode(val.substring(pos).trim()).intValue();
 373             StringTokenizer st =
 374                     new StringTokenizer(val.substring(0, pos), ",");
 375             String[] values = new String[st.countTokens()];
 376             for (int i=0; st.hasMoreTokens(); i++) {
 377                 values[i] = st.nextToken();
 378             }
 379             return values;
 380         }
 381 
 382         // This method here attempts to find the canonical charset name for the
 383         // the given name - which is supposed to be either a java name or a mime
 384         // name.
 385         // For that, it attempts to load the charset using the given name, and
 386         // then returns the charset's canonical name.
 387         // If the charset could not be loaded from the given name,
 388         // the method returns null.
 389         private String findCharsetNameFor(String name) {
 390             try {
 391                 return Charset.forName(name).name();
 392             } catch (Exception x) {
 393                 return null;
 394             }
 395         }
 396 
 397         // This method here attempts to find the canonical charset name for the
 398         // the set javaName+mimeNames - which are supposed to all refer to the
 399         // same charset.
 400         // For that it attempts to load the charset using the javaName, and if
 401         // not found, attempts again using each of the mime names in turn.
 402         // If the charset could be loaded from the javaName, then the javaName
 403         // itself is returned as charset name. Otherwise, each of the mime names
 404         // is tried in turn, until a charset can be loaded from one of the names,
 405         // and the loaded charset's canonical name is returned.
 406         // If no charset can be loaded from either the javaName or one of the
 407         // mime names, then null is returned.
 408         //
 409         // Note that the returned name is the 'java' name that will be used in
 410         // instances of EncodingInfo.
 411         // This is important because EncodingInfo uses that 'java name' later on
 412         // in calls to String.getBytes(javaName).
 413         // As it happens, sometimes only one element of the set mime names/javaName
 414         // is known by Charset: sometimes only one of the mime names is known,
 415         // sometime only the javaName is known, sometimes all are known.
 416         //
 417         // By using this method here, we fix the problem where one of the mime
 418         // names is known but the javaName is unknown, by associating the charset
 419         // loaded from one of the mime names with the unrecognized javaName.
 420         //
 421         // When none of the mime names or javaName are known - there's not much we can
 422         // do... It can mean that this encoding is not supported for this
 423         // OS. If such a charset is ever use it will result in having all characters
 424         // escaped.
 425         //
 426         private String findCharsetNameFor(String javaName, String[] mimes) {
 427             String cs = findCharsetNameFor(javaName);
 428             if (cs != null) return javaName;
 429             for (String m : mimes) {
 430                 cs = findCharsetNameFor(m);
 431                 if (cs != null) break;
 432             }
 433             return cs;
 434         }
 435 
 436         /**
 437          * Loads a list of all the supported encodings.
 438          *
 439          * System property "encodings" formatted using URL syntax may define an
 440          * external encodings list. Thanks to Sergey Ushakov for the code
 441          * contribution!
 442          */
 443         private void loadEncodingInfo() {
 444             try {
 445                 // load (java name)->(preferred mime name) mapping.
 446                 final Properties props = loadProperties();
 447 
 448                 // create instances of EncodingInfo from the loaded mapping
 449                 Enumeration keys = props.keys();
 450                 Map<String, EncodingInfo> canonicals = new HashMap<>();
 451                 while (keys.hasMoreElements()) {
 452                     final String javaName = (String) keys.nextElement();
 453                     final String[] mimes = parseMimeTypes(props.getProperty(javaName));
 454 
 455                     final String charsetName = findCharsetNameFor(javaName, mimes);
 456                     if (charsetName != null) {
 457                         final String kj = toUpperCaseFast(javaName);
 458                         final String kc = toUpperCaseFast(charsetName);
 459                         for (int i = 0; i < mimes.length; ++i) {
 460                             final String mimeName = mimes[i];
 461                             final String km = toUpperCaseFast(mimeName);
 462                             EncodingInfo info = new EncodingInfo(mimeName, charsetName);
 463                             _encodingTableKeyMime.put(km, info);
 464                             if (!canonicals.containsKey(kc)) {
 465                                 // canonicals will map the charset name to
 466                                 //   the info containing the prefered mime name
 467                                 //   (the preferred mime name is the first mime
 468                                 //   name in the list).
 469                                 canonicals.put(kc, info);
 470                                 _encodingTableKeyJava.put(kc, info);
 471                             }
 472                             _encodingTableKeyJava.put(kj, info);
 473                         }
 474                     } else {
 475                         // None of the java or mime names on the line were
 476                         // recognized => this charset is not supported?
 477                     }
 478                 }
 479 
 480                 // Fix up the _encodingTableKeyJava so that the info mapped to
 481                 // the java name contains the preferred mime name.
 482                 // (a given java name can correspond to several mime name,
 483                 //  but we want the _encodingTableKeyJava to point to the
 484                 //  preferred mime name).
 485                 for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) {
 486                     e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName)));
 487                 }
 488 
 489             } catch (java.net.MalformedURLException mue) {
 490                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue);
 491             } catch (java.io.IOException ioe) {
 492                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
 493             }
 494         }
 495 
 496         EncodingInfo findEncoding(String normalizedEncoding) {
 497             EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding);
 498             if (info == null) {
 499                 info = _encodingTableKeyMime.get(normalizedEncoding);
 500             }
 501             if (info == null) {
 502                 info = _encodingDynamicTable.get(normalizedEncoding);
 503             }
 504             return info;
 505         }
 506 
 507         EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) {
 508             return _encodingTableKeyMime.get(normalizedMimeName);
 509         }
 510 
 511         EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) {
 512             return _encodingTableKeyJava.get(normalizedJavaName);
 513         }
 514 
 515         void putEncoding(String key, EncodingInfo info) {
 516             _encodingDynamicTable.put(key, info);
 517         }
 518     }
 519 
 520     /**
 521      * Return true if the character is the high member of a surrogate pair.
 522      * <p>
 523      * This is not a public API.
 524      * @param ch the character to test
 525      * @xsl.usage internal
 526      */
 527     static boolean isHighUTF16Surrogate(char ch) {
 528         return ('\uD800' <= ch && ch <= '\uDBFF');
 529     }
 530     /**
 531      * Return true if the character is the low member of a surrogate pair.
 532      * <p>
 533      * This is not a public API.
 534      * @param ch the character to test
 535      * @xsl.usage internal
 536      */
 537     static boolean isLowUTF16Surrogate(char ch) {
 538         return ('\uDC00' <= ch && ch <= '\uDFFF');
 539     }
 540     /**
 541      * Return the unicode code point represented by the high/low surrogate pair.
 542      * <p>
 543      * This is not a public API.
 544      * @param highSurrogate the high char of the high/low pair
 545      * @param lowSurrogate the low char of the high/low pair
 546      * @xsl.usage internal
 547      */
 548     static int toCodePoint(char highSurrogate, char lowSurrogate) {
 549         int codePoint =
 550             ((highSurrogate - 0xd800) << 10)
 551                 + (lowSurrogate - 0xdc00)
 552                 + 0x10000;
 553         return codePoint;
 554     }
 555     /**
 556      * Return the unicode code point represented by the char.
 557      * A bit of a dummy method, since all it does is return the char,
 558      * but as an int value.
 559      * <p>
 560      * This is not a public API.
 561      * @param ch the char.
 562      * @xsl.usage internal
 563      */
 564     static int toCodePoint(char ch) {
 565         int codePoint = ch;
 566         return codePoint;
 567     }
 568 
 569     private final static EncodingInfos _encodingInfos = new EncodingInfos();
 570 
 571 }