New src/com/sun/org/apache/xml/internal/serializer/Encodings.java

   1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2004 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 /*
  21  * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $
  22  */
  23 package com.sun.org.apache.xml.internal.serializer;
  24 
  25 import java.io.InputStream;
  26 import java.io.OutputStream;
  27 import java.io.OutputStreamWriter;
  28 import java.io.UnsupportedEncodingException;
  29 import java.io.Writer;
  30 import java.io.BufferedWriter;
  31 import java.net.URL;
  32 import java.util.Enumeration;
  33 import java.util.HashMap;
  34 import java.util.Properties;
  35 import java.util.StringTokenizer;
  36 import java.io.IOException;
  37 import java.net.MalformedURLException;
  38 import java.nio.charset.Charset;
  39 import java.nio.charset.IllegalCharsetNameException;
  40 import java.nio.charset.UnsupportedCharsetException;
  41 import java.util.Collections;
  42 import java.util.Map;
  43 import java.util.Map.Entry;
  44 
  45 import com.sun.org.apache.xalan.internal.utils.SecuritySupport;
  46 
  47 /**
  48  * Provides information about encodings. Depends on the Java runtime
  49  * to provide writers for the different encodings, but can be used
  50  * to override encoding names and provide the last printable character
  51  * for each encoding.
  52  *
  53  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  54  */
  55 
  56 public final class Encodings extends Object
  57 {
  58 
  59     /**
  60      * The last printable character for unknown encodings.
  61      */
  62     private static final int m_defaultLastPrintable = 0x7F;
  63 
  64     /**
  65      * Standard filename for properties file with encodings data.
  66      */
  67     private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties";
  68 
  69     /**
  70      * Name of the system property that may hold the URL of an alternative
  70      * encodings properties file, used in preference to the bundled one.
  71      */
  72     private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings";
  73 
  74 
  75     /**
  76      * Returns a writer for the specified encoding based on
  77      * an output stream.
  78      *
  79      * @param output The output stream
  80      * @param encoding The encoding
  81      * @return A suitable writer
  82      * @throws UnsupportedEncodingException There is no convertor
  83      *  to support this encoding
  84      */
  85     static Writer getWriter(OutputStream output, String encoding)
  86         throws UnsupportedEncodingException
  87     {
  88 
  89         final EncodingInfo ei = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
  90         if (ei != null) {
  91             try {
  92                 return new BufferedWriter(new OutputStreamWriter(
  93                         output, ei.javaName));
  94             } catch (UnsupportedEncodingException usee) {
  95                 // keep trying
  96             }
  97         }
  98 
  99         return new BufferedWriter(new OutputStreamWriter(output, encoding));
 100     }
 101 
 102 
 103     /**
 104      * Returns the last printable character for an unspecified
 105      * encoding.
 106      *
 107      * @return the default size
 108      */
 109     public static int getLastPrintable()
 110     {
 111         return m_defaultLastPrintable;
 112     }
 113 
 114 
 115 
 116     /**
 117      * Returns the EncodingInfo object for the specified
 118      * encoding.
 119      * <p>
 120      * This is not a public API.
 121      *
 122      * @param encoding The encoding
 123      * @return The object that is used to determine if
 124      * characters are in the given encoding.
 125      * @xsl.usage internal
 126      */
 127     static EncodingInfo getEncodingInfo(String encoding)
 128     {
 129         EncodingInfo ei;
 130 
 131         String normalizedEncoding = toUpperCaseFast(encoding);
 132         ei = _encodingInfos.findEncoding(normalizedEncoding);
 133         if (ei == null) {
 134             // We shouldn't have to do this, but just in case.
 135             try {
 136                 // This may happen if the caller tries to use
 137                 // an encoding that wasn't registered in the
 138                 // (java name)->(preferred mime name) mapping file.
 139                 // In that case we attempt to load the charset for the
 140                 // given encoding, and if that succeeds - we create a new
 141                 // EncodingInfo instance - assuming the canonical name
 142                 // of the charset can be used as the mime name.
 143                 final Charset c = Charset.forName(encoding);
 144                 final String name = c.name();
 145                 ei = new EncodingInfo(name, name);
 146                 _encodingInfos.putEncoding(normalizedEncoding, ei);
 147             } catch (IllegalCharsetNameException | UnsupportedCharsetException x) {
 148                 ei = new EncodingInfo(null,null);
 149             }
 150         }
 151 
 152         return ei;
 153     }
 154 
 155     /**
 156      * A fast and cheap way to uppercase a String that is
 157      * only made of printable ASCII characters.
 158      * <p>
 159      * This is not a public API.
 160      * @param s a String of ASCII characters
 161      * @return an uppercased version of the input String,
 162      * possibly the same String.
 163      * @xsl.usage internal
 164      */
 165     static private String toUpperCaseFast(final String s) {
 166 
 167         boolean different = false;
 168         final int mx = s.length();
 169                 char[] chars = new char[mx];
 170         for (int i=0; i < mx; i++) {
 171                 char ch = s.charAt(i);
 172             // is the character a lower case ASCII one?
 173                 if ('a' <= ch && ch <= 'z') {
 174                 // a cheap and fast way to uppercase that is good enough
 175                         ch = (char) (ch + ('A' - 'a'));
 176                         different = true; // the uppercased String is different
 177                 }
 178                 chars[i] = ch;
 179         }
 180 
 181         // A little optimization, don't call String.valueOf() if
 182         // the uppercased string is the same as the input string.
 183         final String upper;
 184         if (different)
 185                 upper = String.valueOf(chars);
 186         else
 187                 upper = s;
 188 
 189         return upper;
 190     }
 191 
 192     /** The default MIME encoding name, used when no better choice can be determined. */
 193     static final String DEFAULT_MIME_ENCODING = "UTF-8";
 194 
 195     /**
 196      * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
 197      * attribute specifies the preferred encoding to use for outputting the result
 198      * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
 199      * For other values, if the XSLT processor does not support the specified
 200      * encoding it may signal an error; if it does not signal an error it should
 201      * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
 202      * whose name does not match the EncName production of the XML Recommendation
 203      * [XML]. If no encoding attribute is specified, then the XSLT processor should
 204      * use either UTF-8 or UTF-16."
 205      *
 206      * @param encoding Reference to java-style encoding string, which may be null,
 207      * in which case a default will be found.
 208      *
 209      * @return The ISO-style encoding string, or null if failure.
 210      */
 211     static String getMimeEncoding(String encoding)
 212     {
 213 
 214         if (null == encoding)
 215         {
 216             try
 217             {
 218 
 219                 // Get the default system character encoding.  This may be
 220                 // incorrect if they passed in a writer, but right now there
 221                 // seems to be no way to get the encoding from a writer.
 222                 encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8");
 223 
 224                 if (null != encoding)
 225                 {
 226 
 227                     /*
 228                     * See if the mime type is equal to UTF8.  If you don't
 229                     * do that, then  convertJava2MimeEncoding will convert
 230                     * 8859_1 to "ISO-8859-1", which is not what we want,
 231                     * I think, and I don't think I want to alter the tables
 232                     * to convert everything to UTF-8.
 233                     */
 234                     String jencoding =
 235                         (encoding.equalsIgnoreCase("Cp1252")
 236                             || encoding.equalsIgnoreCase("ISO8859_1")
 237                             || encoding.equalsIgnoreCase("8859_1")
 238                             || encoding.equalsIgnoreCase("UTF8"))
 239                             ? DEFAULT_MIME_ENCODING
 240                             : convertJava2MimeEncoding(encoding);
 241 
 242                     encoding =
 243                         (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
 244                 }
 245                 else
 246                 {
 247                     encoding = DEFAULT_MIME_ENCODING;
 248                 }
 249             }
 250             catch (SecurityException se)
 251             {
 252                 encoding = DEFAULT_MIME_ENCODING;
 253             }
 254         }
 255         else
 256         {
 257             encoding = convertJava2MimeEncoding(encoding);
 258         }
 259 
 260         return encoding;
 261     }
 262 
 263     /**
 264      * Try the best we can to convert a Java encoding to a XML-style encoding.
 265      *
 266      * @param encoding non-null reference to encoding string, java style.
 267      *
 268      * @return ISO-style encoding string.
 269      */
 270     private static String convertJava2MimeEncoding(String encoding)
 271     {
 272         final EncodingInfo enc =
 273              _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding));
 274         if (null != enc)
 275             return enc.name;
 276         return encoding;
 277     }
 278 
 279     /**
 280      * Try the best we can to convert a Java encoding to a XML-style encoding.
 281      *
 282      * @param encoding non-null reference to encoding string, java style.
 283      *
 284      * @return ISO-style encoding string.
 285      */
 286     public static String convertMime2JavaEncoding(String encoding)
 287     {
 288         final EncodingInfo info = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
 289         return info != null ? info.javaName : encoding;
 290     }
 291 
 292     // Using an inner static class here prevents initialization races
 293     // where the hash maps could be used before they were populated.
 294     //
 295     private final static class EncodingInfos {
 296         // These maps are final and not modified after initialization.
 297         private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>();
 298         private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>();
 299         // This map will be added to after initialization: make sure it's
 300         // thread-safe. This map should not be used frequently - only in cases
 301         // where the mapping requested was not declared in the Encodings.properties
 302         // file.
 303         private final Map<String, EncodingInfo> _encodingDynamicTable =
 304                 Collections.synchronizedMap(new HashMap<String, EncodingInfo>());
 305 
 306         private EncodingInfos() {
 307             loadEncodingInfo();
 308         }
 309 
 310         // Opens the file/resource containing java charset name -> preferred mime
 311         // name mapping and returns it as an InputStream.
 312         private InputStream openEncodingsFileStream() throws MalformedURLException, IOException {
 313             String urlString = null;
 314             InputStream is = null;
 315 
 316             try {
 317                 urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
 318             } catch (SecurityException e) {
 319             }
 320 
 321             if (urlString != null && urlString.length() > 0) {
 322                 URL url = new URL(urlString);
 323                 is = url.openStream();
 324             }
 325 
 326             if (is == null) {
 327                 is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
 328             }
 329             return is;
 330         }
 331 
 332         // Loads the Properties resource containing the mapping:
 333         //    java charset name -> preferred mime name
 334         // and returns it.
 335         private Properties loadProperties() throws MalformedURLException, IOException {
 336             Properties props = new Properties();
 337             try (InputStream is = openEncodingsFileStream()) {
 338                 if (is != null) {
 339                     props.load(is);
 340                 } else {
 341                     // Seems to be no real need to force failure here, let the
 342                     // system do its best... The issue is not really very critical,
 343                     // and the output will be in any case _correct_ though maybe not
 344                     // always human-friendly... :)
 345                     // But maybe report/log the resource problem?
 346                     // Any standard ways to report/log errors (in static context)?
 347                 }
 348             }
 349             return props;
 350         }
 351 
 352         // Parses the mime list associated to a java charset name.
 353         // The first mime name in the list is supposed to be the preferred
 354         // mime name.
 355         private String[] parseMimeTypes(String val) {
 356             int pos = val.indexOf(' ');
 357             //int lastPrintable;
 358             if (pos < 0) {
 359                 // Maybe report/log this problem?
 360                 //  "Last printable character not defined for encoding " +
 361                 //  mimeName + " (" + val + ")" ...
 362                 return new String[] { val };
 363                 //lastPrintable = 0x00FF;
 364             }
 365             //lastPrintable =
 366             //    Integer.decode(val.substring(pos).trim()).intValue();
 367             StringTokenizer st =
 368                     new StringTokenizer(val.substring(0, pos), ",");
 369             String[] values = new String[st.countTokens()];
 370             for (int i=0; st.hasMoreTokens(); i++) {
 371                 values[i] = st.nextToken();
 372             }
 373             return values;
 374         }
 375 
 376         // This method here attempts to find the canonical charset name for the
 377         // the given name - which is supposed to be either a java name or a mime
 378         // name.
 379         // For that, it attempts to load the charset using the given name, and
 380         // then returns the charset's canonical name.
 381         // If the charset could not be loaded from the given name,
 382         // the method returns null.
 383         private String findCharsetNameFor(String name) {
 384             try {
 385                 return Charset.forName(name).name();
 386             } catch (Exception x) {
 387                 return null;
 388             }
 389         }
 390 
 391         // This method here attempts to find the canonical charset name for the
 392         // the set javaName+mimeNames - which are supposed to all refer to the
 393         // same charset.
 394         // For that it attempts to load the charset using the javaName, and if
 395         // not found, attempts again using each of the mime names in turn.
 396         // If the charset could be loaded from the javaName, then the javaName
 397         // itself is returned as charset name. Otherwise, each of the mime names
 398         // is tried in turn, until a charset can be loaded from one of the names,
 399         // and the loaded charset's canonical name is returned.
 400         // If no charset can be loaded from either the javaName or one of the
 401         // mime names, then null is returned.
 402         //
 403         // Note that the returned name is the 'java' name that will be used in
 404         // instances of EncodingInfo.
 405         // This is important because EncodingInfo uses that 'java name' later on
 406         // in calls to String.getBytes(javaName).
 407         // As it happens, sometimes only one element of the set mime names/javaName
 408         // is known by Charset: sometimes only one of the mime names is known,
 409         // sometime only the javaName is known, sometimes all are known.
 410         //
 411         // By using this method here, we fix the problem where one of the mime
 412         // names is known but the javaName is unknown, by associating the charset
 413         // loaded from one of the mime names with the unrecognized javaName.
 414         //
 415         // When none of the mime names or javaName are known - there's not much we can
 416         // do... It can mean that this encoding is not supported for this
 417         // OS. If such a charset is ever use it will result in having all characters
 418         // escaped.
 419         //
 420         private String findCharsetNameFor(String javaName, String[] mimes) {
 421             String cs = findCharsetNameFor(javaName);
 422             if (cs != null) return javaName;
 423             for (String m : mimes) {
 424                 cs = findCharsetNameFor(m);
 425                 if (cs != null) break;
 426             }
 427             return cs;
 428         }
 429 
 430         /**
 431          * Loads a list of all the supported encodings.
 432          *
 433          * System property "encodings" formatted using URL syntax may define an
 434          * external encodings list. Thanks to Sergey Ushakov for the code
 435          * contribution!
 436          */
 437         private void loadEncodingInfo() {
 438             try {
 439                 // load (java name)->(preferred mime name) mapping.
 440                 final Properties props = loadProperties();
 441 
 442                 // create instances of EncodingInfo from the loaded mapping
 443                 Enumeration keys = props.keys();
 444                 Map<String, EncodingInfo> canonicals = new HashMap<>();
 445                 while (keys.hasMoreElements()) {
 446                     final String javaName = (String) keys.nextElement();
 447                     final String[] mimes = parseMimeTypes(props.getProperty(javaName));
 448 
 449                     final String charsetName = findCharsetNameFor(javaName, mimes);
 450                     if (charsetName != null) {
 451                         final String kj = toUpperCaseFast(javaName);
 452                         final String kc = toUpperCaseFast(charsetName);
 453                         for (int i = 0; i < mimes.length; ++i) {
 454                             final String mimeName = mimes[i];
 455                             final String km = toUpperCaseFast(mimeName);
 456                             EncodingInfo info = new EncodingInfo(mimeName, charsetName);
 457                             _encodingTableKeyMime.put(km, info);
 458                             if (!canonicals.containsKey(kc)) {
 459                                 // canonicals will map the charset name to
 460                                 //   the info containing the prefered mime name
 461                                 //   (the preferred mime name is the first mime
 462                                 //   name in the list).
 463                                 canonicals.put(kc, info);
 464                                 _encodingTableKeyJava.put(kc, info);
 465                             }
 466                             _encodingTableKeyJava.put(kj, info);
 467                         }
 468                     } else {
 469                         // None of the java or mime names on the line were
 470                         // recognized => this charset is not supported?
 471                     }
 472                 }
 473 
 474                 // Fix up the _encodingTableKeyJava so that the info mapped to
 475                 // the java name contains the preferred mime name.
 476                 // (a given java name can correspond to several mime name,
 477                 //  but we want the _encodingTableKeyJava to point to the
 478                 //  preferred mime name).
 479                 for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) {
 480                     e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName)));
 481                 }
 482 
 483             } catch (java.net.MalformedURLException mue) {
 484                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue);
 485             } catch (java.io.IOException ioe) {
 486                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
 487             }
 488         }
 489 
 490         EncodingInfo findEncoding(String normalizedEncoding) {
 491             EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding);
 492             if (info == null) {
 493                 info = _encodingTableKeyMime.get(normalizedEncoding);
 494             }
 495             if (info == null) {
 496                 info = _encodingDynamicTable.get(normalizedEncoding);
 497             }
 498             return info;
 499         }
 500 
 501         EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) {
 502             return _encodingTableKeyMime.get(normalizedMimeName);
 503         }
 504 
 505         EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) {
 506             return _encodingTableKeyJava.get(normalizedJavaName);
 507         }
 508 
 509         void putEncoding(String key, EncodingInfo info) {
 510             _encodingDynamicTable.put(key, info);
 511         }
 512     }
 513 
 514     /**
 515      * Return true if the character is the high member of a surrogate pair.
 516      * <p>
 517      * This is not a public API.
 518      * @param ch the character to test
 519      * @xsl.usage internal
 520      */
 521     static boolean isHighUTF16Surrogate(char ch) {
 522         return ('\uD800' <= ch && ch <= '\uDBFF');
 523     }
 524     /**
 525      * Return true if the character is the low member of a surrogate pair.
 526      * <p>
 527      * This is not a public API.
 528      * @param ch the character to test
 529      * @xsl.usage internal
 530      */
 531     static boolean isLowUTF16Surrogate(char ch) {
 532         return ('\uDC00' <= ch && ch <= '\uDFFF');
 533     }
 534     /**
 535      * Return the unicode code point represented by the high/low surrogate pair.
 536      * <p>
 537      * This is not a public API.
 538      * @param highSurrogate the high char of the high/low pair
 539      * @param lowSurrogate the low char of the high/low pair
 540      * @xsl.usage internal
 541      */
 542     static int toCodePoint(char highSurrogate, char lowSurrogate) {
 543         int codePoint =
 544             ((highSurrogate - 0xd800) << 10)
 545                 + (lowSurrogate - 0xdc00)
 546                 + 0x10000;
 547         return codePoint;
 548     }
 549     /**
 550      * Return the unicode code point represented by the char.
 551      * A bit of a dummy method, since all it does is return the char,
 552      * but as an int value.
 553      * <p>
 554      * This is not a public API.
 555      * @param ch the char.
 556      * @xsl.usage internal
 557      */
 558     static int toCodePoint(char ch) {
 559         int codePoint = ch;
 560         return codePoint;
 561     }
 562 
 563     private final static EncodingInfos _encodingInfos = new EncodingInfos();
 564 
 565 }