1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Copyright 1999-2004 The Apache Software Foundation. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 /* 21 * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $ 22 */ 23 package com.sun.org.apache.xml.internal.serializer; 24 25 import java.io.InputStream; 26 import java.io.OutputStream; 27 import java.io.OutputStreamWriter; 28 import java.io.UnsupportedEncodingException; 29 import java.io.Writer; 30 import java.io.BufferedWriter; 31 import java.net.URL; 32 import java.util.Enumeration; 33 import java.util.HashMap; 34 import java.util.Properties; 35 import java.util.StringTokenizer; 36 import java.io.IOException; 37 import java.net.MalformedURLException; 38 import java.nio.charset.Charset; 39 import java.nio.charset.IllegalCharsetNameException; 40 import java.nio.charset.UnsupportedCharsetException; 41 import java.util.Collections; 42 import java.util.Map; 43 import java.util.Map.Entry; 44 45 import com.sun.org.apache.xalan.internal.utils.SecuritySupport; 46 47 /** 48 * Provides information about encodings. Depends on the Java runtime 49 * to provides writers for the different encodings, but can be used 50 * to override encoding names and provide the last printable character 51 * for each encoding. 52 * 53 * @version $Revision: 1.11 $ $Date: 2010-11-01 04:34:44 $ 54 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> 55 */ 56 57 public final class Encodings extends Object 58 { 59 60 /** 61 * The last printable character for unknown encodings. 62 */ 63 private static final int m_defaultLastPrintable = 0x7F; 64 65 /** 66 * Standard filename for properties file with encodings data. 67 */ 68 private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties"; 69 70 /** 71 * Standard filename for properties file with encodings data. 72 */ 73 private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings"; 74 75 76 /** 77 * Returns a writer for the specified encoding based on 78 * an output stream. 79 * 80 * @param output The output stream 81 * @param encoding The encoding 82 * @return A suitable writer 83 * @throws UnsupportedEncodingException There is no convertor 84 * to support this encoding 85 */ 86 static Writer getWriter(OutputStream output, String encoding) 87 throws UnsupportedEncodingException 88 { 89 90 final EncodingInfo ei = _encodingInfos.findEncoding(toUpperCaseFast(encoding)); 91 if (ei != null) { 92 try { 93 return new BufferedWriter(new OutputStreamWriter( 94 output, ei.javaName)); 95 } catch (UnsupportedEncodingException usee) { 96 // keep trying 97 } 98 } 99 100 return new BufferedWriter(new OutputStreamWriter(output, encoding)); 101 } 102 103 104 /** 105 * Returns the last printable character for an unspecified 106 * encoding. 107 * 108 * @return the default size 109 */ 110 public static int getLastPrintable() 111 { 112 return m_defaultLastPrintable; 113 } 114 115 116 117 /** 118 * Returns the EncodingInfo object for the specified 119 * encoding. 120 * <p> 121 * This is not a public API. 122 * 123 * @param encoding The encoding 124 * @return The object that is used to determine if 125 * characters are in the given encoding. 126 * @xsl.usage internal 127 */ 128 static EncodingInfo getEncodingInfo(String encoding) 129 { 130 EncodingInfo ei; 131 132 String normalizedEncoding = toUpperCaseFast(encoding); 133 ei = _encodingInfos.findEncoding(normalizedEncoding); 134 if (ei == null) { 135 // We shouldn't have to do this, but just in case. 136 try { 137 // This may happen if the caller tries to use 138 // an encoding that wasn't registered in the 139 // (java name)->(preferred mime name) mapping file. 140 // In that case we attempt to load the charset for the 141 // given encoding, and if that succeeds - we create a new 142 // EncodingInfo instance - assuming the canonical name 143 // of the charset can be used as the mime name. 144 final Charset c = Charset.forName(encoding); 145 final String name = c.name(); 146 ei = new EncodingInfo(name, name); 147 _encodingInfos.putEncoding(normalizedEncoding, ei); 148 } catch (IllegalCharsetNameException | UnsupportedCharsetException x) { 149 ei = new EncodingInfo(null,null); 150 } 151 } 152 153 return ei; 154 } 155 156 /** 157 * A fast and cheap way to uppercase a String that is 158 * only made of printable ASCII characters. 159 * <p> 160 * This is not a public API. 161 * @param s a String of ASCII characters 162 * @return an uppercased version of the input String, 163 * possibly the same String. 164 * @xsl.usage internal 165 */ 166 static private String toUpperCaseFast(final String s) { 167 168 boolean different = false; 169 final int mx = s.length(); 170 char[] chars = new char[mx]; 171 for (int i=0; i < mx; i++) { 172 char ch = s.charAt(i); 173 // is the character a lower case ASCII one? 174 if ('a' <= ch && ch <= 'z') { 175 // a cheap and fast way to uppercase that is good enough 176 ch = (char) (ch + ('A' - 'a')); 177 different = true; // the uppercased String is different 178 } 179 chars[i] = ch; 180 } 181 182 // A little optimization, don't call String.valueOf() if 183 // the uppercased string is the same as the input string. 184 final String upper; 185 if (different) 186 upper = String.valueOf(chars); 187 else 188 upper = s; 189 190 return upper; 191 } 192 193 /** The default encoding, ISO style, ISO style. */ 194 static final String DEFAULT_MIME_ENCODING = "UTF-8"; 195 196 /** 197 * Get the proper mime encoding. From the XSLT recommendation: "The encoding 198 * attribute specifies the preferred encoding to use for outputting the result 199 * tree. XSLT processors are required to respect values of UTF-8 and UTF-16. 200 * For other values, if the XSLT processor does not support the specified 201 * encoding it may signal an error; if it does not signal an error it should 202 * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding 203 * whose name does not match the EncName production of the XML Recommendation 204 * [XML]. If no encoding attribute is specified, then the XSLT processor should 205 * use either UTF-8 or UTF-16." 206 * 207 * @param encoding Reference to java-style encoding string, which may be null, 208 * in which case a default will be found. 209 * 210 * @return The ISO-style encoding string, or null if failure. 211 */ 212 static String getMimeEncoding(String encoding) 213 { 214 215 if (null == encoding) 216 { 217 try 218 { 219 220 // Get the default system character encoding. This may be 221 // incorrect if they passed in a writer, but right now there 222 // seems to be no way to get the encoding from a writer. 223 encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8"); 224 225 if (null != encoding) 226 { 227 228 /* 229 * See if the mime type is equal to UTF8. If you don't 230 * do that, then convertJava2MimeEncoding will convert 231 * 8859_1 to "ISO-8859-1", which is not what we want, 232 * I think, and I don't think I want to alter the tables 233 * to convert everything to UTF-8. 234 */ 235 String jencoding = 236 (encoding.equalsIgnoreCase("Cp1252") 237 || encoding.equalsIgnoreCase("ISO8859_1") 238 || encoding.equalsIgnoreCase("8859_1") 239 || encoding.equalsIgnoreCase("UTF8")) 240 ? DEFAULT_MIME_ENCODING 241 : convertJava2MimeEncoding(encoding); 242 243 encoding = 244 (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING; 245 } 246 else 247 { 248 encoding = DEFAULT_MIME_ENCODING; 249 } 250 } 251 catch (SecurityException se) 252 { 253 encoding = DEFAULT_MIME_ENCODING; 254 } 255 } 256 else 257 { 258 encoding = convertJava2MimeEncoding(encoding); 259 } 260 261 return encoding; 262 } 263 264 /** 265 * Try the best we can to convert a Java encoding to a XML-style encoding. 266 * 267 * @param encoding non-null reference to encoding string, java style. 268 * 269 * @return ISO-style encoding string. 270 */ 271 private static String convertJava2MimeEncoding(String encoding) 272 { 273 final EncodingInfo enc = 274 _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding)); 275 if (null != enc) 276 return enc.name; 277 return encoding; 278 } 279 280 /** 281 * Try the best we can to convert a Java encoding to a XML-style encoding. 282 * 283 * @param encoding non-null reference to encoding string, java style. 284 * 285 * @return ISO-style encoding string. 286 */ 287 public static String convertMime2JavaEncoding(String encoding) 288 { 289 final EncodingInfo info = _encodingInfos.findEncoding(toUpperCaseFast(encoding)); 290 return info != null ? info.javaName : encoding; 291 } 292 293 // Using an inner static class here prevent initialization races 294 // where the hash maps could be used before they were populated. 295 // 296 private final static class EncodingInfos { 297 // These maps are final and not modified after initialization. 298 private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>(); 299 private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>(); 300 // This map will be added to after initialization: make sure it's 301 // thread-safe. This map should not be used frequently - only in cases 302 // where the mapping requested was not declared in the Encodings.properties 303 // file. 304 private final Map<String, EncodingInfo> _encodingDynamicTable = 305 Collections.synchronizedMap(new HashMap<String, EncodingInfo>()); 306 307 private EncodingInfos() { 308 loadEncodingInfo(); 309 } 310 311 // Opens the file/resource containing java charset name -> preferred mime 312 // name mapping and returns it as an InputStream. 313 private InputStream openEncodingsFileStream() throws MalformedURLException, IOException { 314 String urlString = null; 315 InputStream is = null; 316 317 try { 318 urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, ""); 319 } catch (SecurityException e) { 320 } 321 322 if (urlString != null && urlString.length() > 0) { 323 URL url = new URL(urlString); 324 is = url.openStream(); 325 } 326 327 if (is == null) { 328 is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE); 329 } 330 return is; 331 } 332 333 // Loads the Properties resource containing the mapping: 334 // java charset name -> preferred mime name 335 // and returns it. 336 private Properties loadProperties() throws MalformedURLException, IOException { 337 Properties props = new Properties(); 338 final InputStream is = openEncodingsFileStream(); 339 try { 340 if (is != null) { 341 props.load(is); 342 } else { 343 // Seems to be no real need to force failure here, let the 344 // system do its best... The issue is not really very critical, 345 // and the output will be in any case _correct_ though maybe not 346 // always human-friendly... :) 347 // But maybe report/log the resource problem? 348 // Any standard ways to report/log errors (in static context)? 349 } 350 } finally { 351 if (is != null) { 352 is.close(); 353 } 354 } 355 return props; 356 } 357 358 // Parses the mime list associated to a java charset name. 359 // The first mime name in the list is supposed to be the preferred 360 // mime name. 361 private String[] parseMimeTypes(String val) { 362 int pos = val.indexOf(' '); 363 //int lastPrintable; 364 if (pos < 0) { 365 // Maybe report/log this problem? 366 // "Last printable character not defined for encoding " + 367 // mimeName + " (" + val + ")" ... 368 return new String[] { val }; 369 //lastPrintable = 0x00FF; 370 } 371 //lastPrintable = 372 // Integer.decode(val.substring(pos).trim()).intValue(); 373 StringTokenizer st = 374 new StringTokenizer(val.substring(0, pos), ","); 375 String[] values = new String[st.countTokens()]; 376 for (int i=0; st.hasMoreTokens(); i++) { 377 values[i] = st.nextToken(); 378 } 379 return values; 380 } 381 382 // This method here attempts to find the canonical charset name for the 383 // the given name - which is supposed to be either a java name or a mime 384 // name. 385 // For that, it attempts to load the charset using the given name, and 386 // then returns the charset's canonical name. 387 // If the charset could not be loaded from the given name, 388 // the method returns null. 389 private String findCharsetNameFor(String name) { 390 try { 391 return Charset.forName(name).name(); 392 } catch (Exception x) { 393 return null; 394 } 395 } 396 397 // This method here attempts to find the canonical charset name for the 398 // the set javaName+mimeNames - which are supposed to all refer to the 399 // same charset. 400 // For that it attempts to load the charset using the javaName, and if 401 // not found, attempts again using each of the mime names in turn. 402 // If the charset could be loaded from the javaName, then the javaName 403 // itself is returned as charset name. Otherwise, each of the mime names 404 // is tried in turn, until a charset can be loaded from one of the names, 405 // and the loaded charset's canonical name is returned. 406 // If no charset can be loaded from either the javaName or one of the 407 // mime names, then null is returned. 408 // 409 // Note that the returned name is the 'java' name that will be used in 410 // instances of EncodingInfo. 411 // This is important because EncodingInfo uses that 'java name' later on 412 // in calls to String.getBytes(javaName). 413 // As it happens, sometimes only one element of the set mime names/javaName 414 // is known by Charset: sometimes only one of the mime names is known, 415 // sometime only the javaName is known, sometimes all are known. 416 // 417 // By using this method here, we fix the problem where one of the mime 418 // names is known but the javaName is unknown, by associating the charset 419 // loaded from one of the mime names with the unrecognized javaName. 420 // 421 // When none of the mime names or javaName are known - there's not much we can 422 // do... It can mean that this encoding is not supported for this 423 // OS. If such a charset is ever use it will result in having all characters 424 // escaped. 425 // 426 private String findCharsetNameFor(String javaName, String[] mimes) { 427 String cs = findCharsetNameFor(javaName); 428 if (cs != null) return javaName; 429 for (String m : mimes) { 430 cs = findCharsetNameFor(m); 431 if (cs != null) break; 432 } 433 return cs; 434 } 435 436 /** 437 * Loads a list of all the supported encodings. 438 * 439 * System property "encodings" formatted using URL syntax may define an 440 * external encodings list. Thanks to Sergey Ushakov for the code 441 * contribution! 442 */ 443 private void loadEncodingInfo() { 444 try { 445 // load (java name)->(preferred mime name) mapping. 446 final Properties props = loadProperties(); 447 448 // create instances of EncodingInfo from the loaded mapping 449 Enumeration keys = props.keys(); 450 Map<String, EncodingInfo> canonicals = new HashMap<>(); 451 while (keys.hasMoreElements()) { 452 final String javaName = (String) keys.nextElement(); 453 final String[] mimes = parseMimeTypes(props.getProperty(javaName)); 454 455 final String charsetName = findCharsetNameFor(javaName, mimes); 456 if (charsetName != null) { 457 final String kj = toUpperCaseFast(javaName); 458 final String kc = toUpperCaseFast(charsetName); 459 for (int i = 0; i < mimes.length; ++i) { 460 final String mimeName = mimes[i]; 461 final String km = toUpperCaseFast(mimeName); 462 EncodingInfo info = new EncodingInfo(mimeName, charsetName); 463 _encodingTableKeyMime.put(km, info); 464 if (!canonicals.containsKey(kc)) { 465 // canonicals will map the charset name to 466 // the info containing the prefered mime name 467 // (the preferred mime name is the first mime 468 // name in the list). 469 canonicals.put(kc, info); 470 _encodingTableKeyJava.put(kc, info); 471 } 472 _encodingTableKeyJava.put(kj, info); 473 } 474 } else { 475 // None of the java or mime names on the line were 476 // recognized => this charset is not supported? 477 } 478 } 479 480 // Fix up the _encodingTableKeyJava so that the info mapped to 481 // the java name contains the preferred mime name. 482 // (a given java name can correspond to several mime name, 483 // but we want the _encodingTableKeyJava to point to the 484 // preferred mime name). 485 for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) { 486 e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName))); 487 } 488 489 } catch (java.net.MalformedURLException mue) { 490 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue); 491 } catch (java.io.IOException ioe) { 492 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe); 493 } 494 } 495 496 EncodingInfo findEncoding(String normalizedEncoding) { 497 EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding); 498 if (info == null) { 499 info = _encodingTableKeyMime.get(normalizedEncoding); 500 } 501 if (info == null) { 502 info = _encodingDynamicTable.get(normalizedEncoding); 503 } 504 return info; 505 } 506 507 EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) { 508 return _encodingTableKeyMime.get(normalizedMimeName); 509 } 510 511 EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) { 512 return _encodingTableKeyJava.get(normalizedJavaName); 513 } 514 515 void putEncoding(String key, EncodingInfo info) { 516 _encodingDynamicTable.put(key, info); 517 } 518 } 519 520 /** 521 * Return true if the character is the high member of a surrogate pair. 522 * <p> 523 * This is not a public API. 524 * @param ch the character to test 525 * @xsl.usage internal 526 */ 527 static boolean isHighUTF16Surrogate(char ch) { 528 return ('\uD800' <= ch && ch <= '\uDBFF'); 529 } 530 /** 531 * Return true if the character is the low member of a surrogate pair. 532 * <p> 533 * This is not a public API. 534 * @param ch the character to test 535 * @xsl.usage internal 536 */ 537 static boolean isLowUTF16Surrogate(char ch) { 538 return ('\uDC00' <= ch && ch <= '\uDFFF'); 539 } 540 /** 541 * Return the unicode code point represented by the high/low surrogate pair. 542 * <p> 543 * This is not a public API. 544 * @param highSurrogate the high char of the high/low pair 545 * @param lowSurrogate the low char of the high/low pair 546 * @xsl.usage internal 547 */ 548 static int toCodePoint(char highSurrogate, char lowSurrogate) { 549 int codePoint = 550 ((highSurrogate - 0xd800) << 10) 551 + (lowSurrogate - 0xdc00) 552 + 0x10000; 553 return codePoint; 554 } 555 /** 556 * Return the unicode code point represented by the char. 557 * A bit of a dummy method, since all it does is return the char, 558 * but as an int value. 559 * <p> 560 * This is not a public API. 561 * @param ch the char. 562 * @xsl.usage internal 563 */ 564 static int toCodePoint(char ch) { 565 int codePoint = ch; 566 return codePoint; 567 } 568 569 private final static EncodingInfos _encodingInfos = new EncodingInfos(); 570 571 }