/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $
 */
package com.sun.org.apache.xml.internal.serializer;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.io.BufferedWriter;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Properties;
import java.util.StringTokenizer;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;

import com.sun.org.apache.xalan.internal.utils.SecuritySupport;

/**
 * Provides information about encodings. Depends on the Java runtime
 * to provide writers for the different encodings, but can be used
 * to override encoding names and provide the last printable character
 * for each encoding.
 *
 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
 */
public final class Encodings extends Object
{

    /**
     * The last printable character for unknown encodings.
     */
    private static final int m_defaultLastPrintable = 0x7F;

    /**
     * Resource path of the bundled properties file with encodings data.
     */
    private static final String ENCODINGS_FILE =
        "com/sun/org/apache/xml/internal/serializer/Encodings.properties";

    /**
     * Name of the system property that may hold the URL of an external
     * encodings properties file. When it is set to a non-empty value the
     * file at that URL is loaded instead of {@link #ENCODINGS_FILE}.
     */
    private static final String ENCODINGS_PROP =
        "com.sun.org.apache.xalan.internal.serialize.encodings";


    /**
     * Returns a writer for the specified encoding based on
     * an output stream.
     * <p>
     * The name registered for the encoding (if any) is tried first; if the
     * runtime rejects it, the name supplied by the caller is tried as is.
     *
     * @param output The output stream
     * @param encoding The encoding
     * @return A suitable writer
     * @throws UnsupportedEncodingException There is no convertor
     *  to support this encoding
     */
    static Writer getWriter(OutputStream output, String encoding)
        throws UnsupportedEncodingException
    {
        final EncodingInfo ei = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
        if (ei != null) {
            try {
                return new BufferedWriter(new OutputStreamWriter(output, ei.javaName));
            } catch (UnsupportedEncodingException ignored) {
                // The registered Java name is not usable on this runtime:
                // keep trying with the name the caller passed in.
            }
        }

        return new BufferedWriter(new OutputStreamWriter(output, encoding));
    }


    /**
     * Returns the last printable character for an unspecified
     * encoding.
     *
     * @return the default last printable character
     */
    public static int getLastPrintable()
    {
        return m_defaultLastPrintable;
    }


    /**
     * Returns the EncodingInfo object for the specified
     * encoding.
     * <p>
     * If the encoding is not known to the loaded tables but the runtime can
     * load a charset for it, a new EncodingInfo is created from the charset's
     * canonical name and remembered for later lookups. If no charset can be
     * loaded either, an EncodingInfo built from two {@code null} names is
     * returned (and not remembered).
     * <p>
     * This is not a public API.
     *
     * @param encoding The encoding
     * @return The object that is used to determine if
     * characters are in the given encoding.
     * @xsl.usage internal
     */
    static EncodingInfo getEncodingInfo(String encoding)
    {
        final String normalizedEncoding = toUpperCaseFast(encoding);
        EncodingInfo ei = _encodingInfos.findEncoding(normalizedEncoding);
        if (ei == null) {
            // We shouldn't have to do this, but just in case.
            try {
                // This may happen if the caller tries to use
                // an encoding that wasn't registered in the
                // (java name)->(preferred mime name) mapping file.
                // In that case we attempt to load the charset for the
                // given encoding, and if that succeeds - we create a new
                // EncodingInfo instance - assuming the canonical name
                // of the charset can be used as the mime name.
                final Charset c = Charset.forName(encoding);
                final String name = c.name();
                ei = new EncodingInfo(name, name);
                _encodingInfos.putEncoding(normalizedEncoding, ei);
            } catch (IllegalCharsetNameException | UnsupportedCharsetException x) {
                ei = new EncodingInfo(null, null);
            }
        }

        return ei;
    }

    /**
     * A fast and cheap way to uppercase a String that is
     * only made of printable ASCII characters.
     * <p>
     * Only the characters 'a' to 'z' are changed; every other character is
     * copied unchanged. No array is allocated unless at least one character
     * actually needs to be changed.
     * <p>
     * This is not a public API.
     * @param s a String of ASCII characters
     * @return an uppercased version of the input String,
     * possibly the same String.
     * @xsl.usage internal
     */
    private static String toUpperCaseFast(final String s) {
        final int length = s.length();
        char[] chars = null; // allocated lazily, on the first lower case letter
        for (int i = 0; i < length; i++) {
            final char ch = s.charAt(i);
            // is the character a lower case ASCII one?
            if ('a' <= ch && ch <= 'z') {
                if (chars == null) {
                    chars = s.toCharArray();
                }
                // a cheap and fast way to uppercase that is good enough
                chars[i] = (char) (ch + ('A' - 'a'));
            }
        }

        // A little optimization: hand back the input itself when the
        // uppercased string would be the same as the input string.
        return (chars == null) ? s : String.valueOf(chars);
    }

    /** The default encoding, ISO style. */
    static final String DEFAULT_MIME_ENCODING = "UTF-8";

    /**
     * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
     * attribute specifies the preferred encoding to use for outputting the result
     * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
     * For other values, if the XSLT processor does not support the specified
     * encoding it may signal an error; if it does not signal an error it should
     * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
     * whose name does not match the EncName production of the XML Recommendation
     * [XML]. If no encoding attribute is specified, then the XSLT processor should
     * use either UTF-8 or UTF-16."
     * <p>
     * When {@code encoding} is {@code null} the value of the
     * {@code file.encoding} system property is used as the starting point,
     * and {@link #DEFAULT_MIME_ENCODING} is returned if that property cannot
     * be read or cannot be mapped.
     *
     * @param encoding Reference to java-style encoding string, which may be null,
     * in which case a default will be found.
     *
     * @return The ISO-style encoding string; a non-null input that has no
     * registered mapping is returned unchanged.
     */
    static String getMimeEncoding(String encoding)
    {
        if (null != encoding) {
            return convertJava2MimeEncoding(encoding);
        }

        try {
            // Get the default system character encoding.  This may be
            // incorrect if they passed in a writer, but right now there
            // seems to be no way to get the encoding from a writer.
            encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8");

            if (null != encoding) {
                /*
                 * See if the mime type is equal to UTF8.  If you don't
                 * do that, then convertJava2MimeEncoding will convert
                 * 8859_1 to "ISO-8859-1", which is not what we want,
                 * I think, and I don't think I want to alter the tables
                 * to convert everything to UTF-8.
                 */
                final boolean treatAsDefault =
                    encoding.equalsIgnoreCase("Cp1252")
                        || encoding.equalsIgnoreCase("ISO8859_1")
                        || encoding.equalsIgnoreCase("8859_1")
                        || encoding.equalsIgnoreCase("UTF8");
                final String jencoding = treatAsDefault
                    ? DEFAULT_MIME_ENCODING
                    : convertJava2MimeEncoding(encoding);

                encoding = (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
            } else {
                encoding = DEFAULT_MIME_ENCODING;
            }
        } catch (SecurityException se) {
            // Not allowed to read the system property: use the default.
            encoding = DEFAULT_MIME_ENCODING;
        }

        return encoding;
    }

    /**
     * Try the best we can to convert a Java encoding to a XML-style encoding.
     *
     * @param encoding non-null reference to encoding string, java style.
     *
     * @return ISO-style encoding string, or {@code encoding} itself when no
     * mapping is registered for it.
     */
    private static String convertJava2MimeEncoding(String encoding)
    {
        final EncodingInfo enc =
            _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding));
        if (null != enc) {
            return enc.name;
        }
        return encoding;
    }

    /**
     * Try the best we can to convert a XML-style (mime) encoding name to
     * the corresponding Java encoding name.
     * <p>
     * The lookup is case-insensitive for ASCII letters and accepts any name
     * known to the encoding tables.
     *
     * @param encoding non-null reference to encoding string, ISO style.
     *
     * @return Java-style encoding string, or {@code encoding} itself when no
     * mapping is registered for it.
     */
    public static String convertMime2JavaEncoding(String encoding)
    {
        final EncodingInfo info = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
        return info != null ? info.javaName : encoding;
    }

    // Using an inner static class here prevent initialization races
    // where the hash maps could be used before they were populated.
    //
    private static final class EncodingInfos {
        // These maps are final and not modified after initialization.
        private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>();
        private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>();
        // This map will be added to after initialization: make sure it's
        // thread-safe. This map should not be used frequently - only in cases
        // where the mapping requested was not declared in the Encodings.properties
        // file.
        private final Map<String, EncodingInfo> _encodingDynamicTable =
            Collections.synchronizedMap(new HashMap<String, EncodingInfo>());

        private EncodingInfos() {
            loadEncodingInfo();
        }

        // Opens the file/resource containing java charset name -> preferred mime
        // name mapping and returns it as an InputStream.
        // The URL named by the ENCODINGS_PROP system property is used when that
        // property is set to a non-empty value; otherwise the bundled
        // ENCODINGS_FILE resource is used. May return null if neither yields
        // a stream.
        private InputStream openEncodingsFileStream() throws MalformedURLException, IOException {
            String urlString = null;
            InputStream is = null;

            try {
                urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
            } catch (SecurityException ignored) {
                // Not allowed to read the property: behave as if it was not
                // set and fall back to the bundled resource below.
            }

            if (urlString != null && urlString.length() > 0) {
                final URL url = new URL(urlString);
                is = url.openStream();
            }

            if (is == null) {
                is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
            }
            return is;
        }

        // Loads the Properties resource containing the mapping:
        //    java charset name -> preferred mime name
        // and returns it. The returned Properties is empty when no stream
        // could be opened.
        private Properties loadProperties() throws MalformedURLException, IOException {
            final Properties props = new Properties();
            try (InputStream is = openEncodingsFileStream()) {
                if (is != null) {
                    props.load(is);
                } else {
                    // Seems to be no real need to force failure here, let the
                    // system do its best... The issue is not really very critical,
                    // and the output will be in any case _correct_ though maybe not
                    // always human-friendly... :)
                    // But maybe report/log the resource problem?
                    // Any standard ways to report/log errors (in static context)?
                }
            }
            return props;
        }

        // Parses the mime list associated to a java charset name.
        // The first mime name in the list is supposed to be the preferred
        // mime name.
        // The value is expected to be a comma separated list of mime names
        // followed by a space and further data; everything from the first
        // space on is ignored here. A value without any space is returned
        // whole as a single name.
        private String[] parseMimeTypes(String val) {
            final int pos = val.indexOf(' ');
            if (pos < 0) {
                // Maybe report/log this problem?
                //  "Last printable character not defined for encoding " +
                //  mimeName + " (" + val + ")" ...
                return new String[] { val };
            }
            final StringTokenizer st =
                new StringTokenizer(val.substring(0, pos), ",");
            final String[] values = new String[st.countTokens()];
            for (int i = 0; st.hasMoreTokens(); i++) {
                values[i] = st.nextToken();
            }
            return values;
        }

        // This method here attempts to find the canonical charset name for the
        // the given name - which is supposed to be either a java name or a mime
        // name.
        // For that, it attempts to load the charset using the given name, and
        // then returns the charset's canonical name.
        // If the charset could not be loaded from the given name,
        // the method returns null.
        private String findCharsetNameFor(String name) {
            try {
                return Charset.forName(name).name();
            } catch (Exception x) {
                // Deliberately broad: this runs while the enclosing class is
                // being initialized, so any failure to resolve a name is
                // treated as "charset not available" rather than propagated.
                return null;
            }
        }

        // This method here attempts to find the canonical charset name for the
        // the set javaName+mimeNames - which are supposed to all refer to the
        // same charset.
        // For that it attempts to load the charset using the javaName, and if
        // not found, attempts again using each of the mime names in turn.
        // If the charset could be loaded from the javaName, then the javaName
        // itself is returned as charset name. Otherwise, each of the mime names
        // is tried in turn, until a charset can be loaded from one of the names,
        // and the loaded charset's canonical name is returned.
        // If no charset can be loaded from either the javaName or one of the
        // mime names, then null is returned.
        //
        // Note that the returned name is the 'java' name that will be used in
        // instances of EncodingInfo.
        // This is important because EncodingInfo uses that 'java name' later on
        // in calls to String.getBytes(javaName).
        // As it happens, sometimes only one element of the set mime names/javaName
        // is known by Charset: sometimes only one of the mime names is known,
        // sometime only the javaName is known, sometimes all are known.
        //
        // By using this method here, we fix the problem where one of the mime
        // names is known but the javaName is unknown, by associating the charset
        // loaded from one of the mime names with the unrecognized javaName.
        //
        // When none of the mime names or javaName are known - there's not much we can
        // do... It can mean that this encoding is not supported for this
        // OS. If such a charset is ever use it will result in having all characters
        // escaped.
        //
        private String findCharsetNameFor(String javaName, String[] mimes) {
            String cs = findCharsetNameFor(javaName);
            if (cs != null) {
                return javaName;
            }
            for (String m : mimes) {
                cs = findCharsetNameFor(m);
                if (cs != null) {
                    break;
                }
            }
            return cs;
        }

        /**
         * Loads a list of all the supported encodings.
         *
         * The system property named by {@code ENCODINGS_PROP}, formatted
         * using URL syntax, may define an external encodings list. Thanks to
         * Sergey Ushakov for the code contribution!
         */
        private void loadEncodingInfo() {
            try {
                // load (java name)->(preferred mime name) mapping.
                final Properties props = loadProperties();

                // create instances of EncodingInfo from the loaded mapping
                final Enumeration<Object> keys = props.keys();
                final Map<String, EncodingInfo> canonicals = new HashMap<>();
                while (keys.hasMoreElements()) {
                    final String javaName = (String) keys.nextElement();
                    final String[] mimes = parseMimeTypes(props.getProperty(javaName));

                    final String charsetName = findCharsetNameFor(javaName, mimes);
                    if (charsetName == null) {
                        // None of the java or mime names on the line were
                        // recognized => this charset is not supported?
                        continue;
                    }

                    final String kj = toUpperCaseFast(javaName);
                    final String kc = toUpperCaseFast(charsetName);
                    for (final String mimeName : mimes) {
                        final String km = toUpperCaseFast(mimeName);
                        final EncodingInfo info = new EncodingInfo(mimeName, charsetName);
                        _encodingTableKeyMime.put(km, info);
                        if (!canonicals.containsKey(kc)) {
                            // canonicals will map the charset name to
                            //   the info containing the prefered mime name
                            //   (the preferred mime name is the first mime
                            //   name in the list).
                            canonicals.put(kc, info);
                            _encodingTableKeyJava.put(kc, info);
                        }
                        _encodingTableKeyJava.put(kj, info);
                    }
                }

                // Fix up the _encodingTableKeyJava so that the info mapped to
                // the java name contains the preferred mime name.
                // (a given java name can correspond to several mime name,
                //  but we want the _encodingTableKeyJava to point to the
                //  preferred mime name).
                for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) {
                    e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName)));
                }

            } catch (IOException ioe) {
                // Also covers MalformedURLException (a subclass of
                // IOException) raised for a bad ENCODINGS_PROP value.
                throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
            }
        }

        // Looks the normalized (upper-cased) name up in the java-keyed table,
        // then the mime-keyed table, then the dynamic table; returns null
        // when none of them knows it.
        EncodingInfo findEncoding(String normalizedEncoding) {
            EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding);
            if (info == null) {
                info = _encodingTableKeyMime.get(normalizedEncoding);
            }
            if (info == null) {
                info = _encodingDynamicTable.get(normalizedEncoding);
            }
            return info;
        }

        // Looks the normalized (upper-cased) name up in the mime-keyed table only.
        EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) {
            return _encodingTableKeyMime.get(normalizedMimeName);
        }

        // Looks the normalized (upper-cased) name up in the java-keyed table only.
        EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) {
            return _encodingTableKeyJava.get(normalizedJavaName);
        }

        // Registers an encoding discovered after initialization.
        void putEncoding(String key, EncodingInfo info) {
            _encodingDynamicTable.put(key, info);
        }
    }

    /**
     * Return true if the character is the high member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @return true if ch is in the range U+D800 to U+DBFF inclusive
     * @xsl.usage internal
     */
    static boolean isHighUTF16Surrogate(char ch) {
        return ('\uD800' <= ch && ch <= '\uDBFF');
    }

    /**
     * Return true if the character is the low member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @return true if ch is in the range U+DC00 to U+DFFF inclusive
     * @xsl.usage internal
     */
    static boolean isLowUTF16Surrogate(char ch) {
        return ('\uDC00' <= ch && ch <= '\uDFFF');
    }

    /**
     * Return the unicode code point represented by the high/low surrogate pair.
     * <p>
     * The arguments are not validated; the result is only meaningful when
     * they really are a high and a low surrogate.
     * <p>
     * This is not a public API.
     * @param highSurrogate the high char of the high/low pair
     * @param lowSurrogate the low char of the high/low pair
     * @return the combined code point
     * @xsl.usage internal
     */
    static int toCodePoint(char highSurrogate, char lowSurrogate) {
        return ((highSurrogate - 0xd800) << 10)
            + (lowSurrogate - 0xdc00)
            + 0x10000;
    }

    /**
     * Return the unicode code point represented by the char.
     * A bit of a dummy method, since all it does is return the char,
     * but as an int value.
     * <p>
     * This is not a public API.
     * @param ch the char.
     * @return the value of ch widened to an int
     * @xsl.usage internal
     */
    static int toCodePoint(char ch) {
        return ch;
    }

    private static final EncodingInfos _encodingInfos = new EncodingInfos();

}