1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Copyright 1999-2004 The Apache Software Foundation. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 /* 21 * $Id: CharInfo.java,v 1.2.4.1 2005/09/15 08:15:14 suresh_emailid Exp $ 22 */ 23 package com.sun.org.apache.xml.internal.serializer; 24 25 import java.io.BufferedReader; 26 import java.io.InputStream; 27 import java.io.InputStreamReader; 28 import java.io.UnsupportedEncodingException; 29 import java.net.URL; 30 import java.util.Enumeration; 31 import java.util.HashMap; 32 import java.util.PropertyResourceBundle; 33 import java.util.ResourceBundle; 34 import java.security.AccessController; 35 import java.security.PrivilegedAction; 36 37 import javax.xml.transform.TransformerException; 38 39 import com.sun.org.apache.xml.internal.serializer.utils.MsgKey; 40 import com.sun.org.apache.xml.internal.serializer.utils.SystemIDResolver; 41 import com.sun.org.apache.xml.internal.serializer.utils.Utils; 42 import com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException; 43 import com.sun.org.apache.xalan.internal.utils.ObjectFactory; 44 45 /** 46 * This class provides services that tell if a character should have 47 * special treatement, such as entity reference substitution or normalization 48 * of a newline character. It also provides character to entity reference 49 * lookup. 50 * 51 * DEVELOPERS: See Known Issue in the constructor. 52 * 53 * @xsl.usage internal 54 */ 55 final class CharInfo 56 { 57 /** Given a character, lookup a String to output (e.g. a decorated entity reference). */ 58 private HashMap m_charToString = new HashMap(); 59 60 /** 61 * The name of the HTML entities file. 62 * If specified, the file will be resource loaded with the default class loader. 63 */ 64 public static final String HTML_ENTITIES_RESOURCE = 65 "com.sun.org.apache.xml.internal.serializer.HTMLEntities"; 66 67 /** 68 * The name of the XML entities file. 69 * If specified, the file will be resource loaded with the default class loader. 70 */ 71 public static final String XML_ENTITIES_RESOURCE = 72 "com.sun.org.apache.xml.internal.serializer.XMLEntities"; 73 74 /** The horizontal tab character, which the parser should always normalize. */ 75 public static final char S_HORIZONAL_TAB = 0x09; 76 77 /** The linefeed character, which the parser should always normalize. */ 78 public static final char S_LINEFEED = 0x0A; 79 80 /** The carriage return character, which the parser should always normalize. */ 81 public static final char S_CARRIAGERETURN = 0x0D; 82 83 /** This flag is an optimization for HTML entities. It false if entities 84 * other than quot (34), amp (38), lt (60) and gt (62) are defined 85 * in the range 0 to 127. 86 * @xsl.usage internal 87 */ 88 final boolean onlyQuotAmpLtGt; 89 90 /** Copy the first 0,1 ... ASCII_MAX values into an array */ 91 private static final int ASCII_MAX = 128; 92 93 /** Array of values is faster access than a set of bits 94 * to quickly check ASCII characters in attribute values. 95 */ 96 private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX]; 97 98 /** Array of values is faster access than a set of bits 99 * to quickly check ASCII characters in text nodes. 100 */ 101 private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX]; 102 103 private boolean[] isCleanTextASCII = new boolean[ASCII_MAX]; 104 105 /** An array of bits to record if the character is in the set. 106 * Although information in this array is complete, the 107 * isSpecialAttrASCII array is used first because access to its values 108 * is common and faster. 109 */ 110 private int array_of_bits[] = createEmptySetOfIntegers(65535); 111 112 113 // 5 for 32 bit words, 6 for 64 bit words ... 114 /* 115 * This constant is used to shift an integer to quickly 116 * calculate which element its bit is stored in. 117 * 5 for 32 bit words (int) , 6 for 64 bit words (long) 118 */ 119 private static final int SHIFT_PER_WORD = 5; 120 121 /* 122 * A mask to get the low order bits which are used to 123 * calculate the value of the bit within a given word, 124 * that will represent the presence of the integer in the 125 * set. 126 * 127 * 0x1F for 32 bit words (int), 128 * or 0x3F for 64 bit words (long) 129 */ 130 private static final int LOW_ORDER_BITMASK = 0x1f; 131 132 /* 133 * This is used for optimizing the lookup of bits representing 134 * the integers in the set. It is the index of the first element 135 * in the array array_of_bits[] that is not used. 136 */ 137 private int firstWordNotUsed; 138 139 140 /** 141 * Constructor that reads in a resource file that describes the mapping of 142 * characters to entity references. 143 * This constructor is private, just to force the use 144 * of the getCharInfo(entitiesResource) factory 145 * 146 * Resource files must be encoded in UTF-8 and can either be properties 147 * files with a .properties extension assumed. Alternatively, they can 148 * have the following form, with no particular extension assumed: 149 * 150 * <pre> 151 * # First char # is a comment 152 * Entity numericValue 153 * quot 34 154 * amp 38 155 * </pre> 156 * 157 * @param entitiesResource Name of properties or resource file that should 158 * be loaded, which describes that mapping of characters to entity 159 * references. 160 */ 161 private CharInfo(String entitiesResource, String method) 162 { 163 this(entitiesResource, method, false); 164 } 165 166 private CharInfo(String entitiesResource, String method, boolean internal) 167 { 168 ResourceBundle entities = null; 169 boolean noExtraEntities = true; 170 171 // Make various attempts to interpret the parameter as a properties 172 // file or resource file, as follows: 173 // 174 // 1) attempt to load .properties file using ResourceBundle 175 // 2) try using the class loader to find the specified file a resource 176 // file 177 // 3) try treating the resource a URI 178 179 if (internal) { 180 try { 181 // Load entity property files by using PropertyResourceBundle, 182 // cause of security issure for applets 183 entities = PropertyResourceBundle.getBundle(entitiesResource); 184 } catch (Exception e) {} 185 } 186 187 if (entities != null) { 188 Enumeration keys = entities.getKeys(); 189 while (keys.hasMoreElements()){ 190 String name = (String) keys.nextElement(); 191 String value = entities.getString(name); 192 int code = Integer.parseInt(value); 193 defineEntity(name, (char) code); 194 if (extraEntity(code)) 195 noExtraEntities = false; 196 } 197 set(S_LINEFEED); 198 set(S_CARRIAGERETURN); 199 } else { 200 InputStream is = null; 201 202 // Load user specified resource file by using URL loading, it 203 // requires a valid URI as parameter 204 try { 205 if (internal) { 206 is = CharInfo.class.getResourceAsStream(entitiesResource); 207 } else { 208 ClassLoader cl = ObjectFactory.findClassLoader(); 209 if (cl == null) { 210 is = ClassLoader.getSystemResourceAsStream(entitiesResource); 211 } else { 212 is = cl.getResourceAsStream(entitiesResource); 213 } 214 215 if (is == null) { 216 try { 217 URL url = new URL(entitiesResource); 218 is = url.openStream(); 219 } catch (Exception e) {} 220 } 221 } 222 223 if (is == null) { 224 throw new RuntimeException( 225 Utils.messages.createMessage( 226 MsgKey.ER_RESOURCE_COULD_NOT_FIND, 227 new Object[] {entitiesResource, entitiesResource})); 228 } 229 230 // Fix Bugzilla#4000: force reading in UTF-8 231 // This creates the de facto standard that Xalan's resource 232 // files must be encoded in UTF-8. This should work in all 233 // JVMs. 234 // 235 // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which 236 // didn't implement the UTF-8 encoding. Theoretically, we should 237 // simply let it fail in that case, since the JVM is obviously 238 // broken if it doesn't support such a basic standard. But 239 // since there are still some users attempting to use VJ++ for 240 // development, we have dropped in a fallback which makes a 241 // second attempt using the platform's default encoding. In VJ++ 242 // this is apparently ASCII, which is subset of UTF-8... and 243 // since the strings we'll be reading here are also primarily 244 // limited to the 7-bit ASCII range (at least, in English 245 // versions of Xalan), this should work well enough to keep us 246 // on the air until we're ready to officially decommit from 247 // VJ++. 248 249 BufferedReader reader; 250 try { 251 reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); 252 } catch (UnsupportedEncodingException e) { 253 reader = new BufferedReader(new InputStreamReader(is)); 254 } 255 256 String line = reader.readLine(); 257 258 while (line != null) { 259 if (line.length() == 0 || line.charAt(0) == '#') { 260 line = reader.readLine(); 261 262 continue; 263 } 264 265 int index = line.indexOf(' '); 266 267 if (index > 1) { 268 String name = line.substring(0, index); 269 270 ++index; 271 272 if (index < line.length()) { 273 String value = line.substring(index); 274 index = value.indexOf(' '); 275 276 if (index > 0) { 277 value = value.substring(0, index); 278 } 279 280 int code = Integer.parseInt(value); 281 282 defineEntity(name, (char) code); 283 if (extraEntity(code)) 284 noExtraEntities = false; 285 } 286 } 287 288 line = reader.readLine(); 289 } 290 291 is.close(); 292 set(S_LINEFEED); 293 set(S_CARRIAGERETURN); 294 } catch (Exception e) { 295 throw new RuntimeException( 296 Utils.messages.createMessage( 297 MsgKey.ER_RESOURCE_COULD_NOT_LOAD, 298 new Object[] { entitiesResource, 299 e.toString(), 300 entitiesResource, 301 e.toString()})); 302 } finally { 303 if (is != null) { 304 try { 305 is.close(); 306 } catch (Exception except) {} 307 } 308 } 309 } 310 311 /* initialize the array isCleanTextASCII[] with a cache of values 312 * for use by ToStream.character(char[], int , int) 313 * and the array isSpecialTextASCII[] with the opposite values 314 * (all in the name of performance!) 315 */ 316 for (int ch = 0; ch <ASCII_MAX; ch++) 317 if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch))) 318 && (!get(ch))) || ('"' == ch)) 319 { 320 isCleanTextASCII[ch] = true; 321 isSpecialTextASCII[ch] = false; 322 } 323 else { 324 isCleanTextASCII[ch] = false; 325 isSpecialTextASCII[ch] = true; 326 } 327 328 329 330 onlyQuotAmpLtGt = noExtraEntities; 331 332 // initialize the array with a cache of the BitSet values 333 for (int i=0; i<ASCII_MAX; i++) 334 isSpecialAttrASCII[i] = get(i); 335 336 /* Now that we've used get(ch) just above to initialize the 337 * two arrays we will change by adding a tab to the set of 338 * special chars for XML (but not HTML!). 339 * We do this because a tab is always a 340 * special character in an XML attribute, 341 * but only a special character in XML text 342 * if it has an entity defined for it. 343 * This is the reason for this delay. 344 */ 345 if (Method.XML.equals(method)) 346 { 347 isSpecialAttrASCII[S_HORIZONAL_TAB] = true; 348 } 349 } 350 351 /** 352 * Defines a new character reference. The reference's name and value are 353 * supplied. Nothing happens if the character reference is already defined. 354 * <p>Unlike internal entities, character references are a string to single 355 * character mapping. They are used to map non-ASCII characters both on 356 * parsing and printing, primarily for HTML documents. '<amp;' is an 357 * example of a character reference.</p> 358 * 359 * @param name The entity's name 360 * @param value The entity's value 361 */ 362 private void defineEntity(String name, char value) 363 { 364 StringBuilder sb = new StringBuilder("&"); 365 sb.append(name); 366 sb.append(';'); 367 String entityString = sb.toString(); 368 369 defineChar2StringMapping(entityString, value); 370 } 371 372 /** 373 * Map a character to a String. For example given 374 * the character '>' this method would return the fully decorated 375 * entity name "<". 376 * Strings for entity references are loaded from a properties file, 377 * but additional mappings defined through calls to defineChar2String() 378 * are possible. Such entity reference mappings could be over-ridden. 379 * 380 * This is reusing a stored key object, in an effort to avoid 381 * heap activity. Unfortunately, that introduces a threading risk. 382 * Simplest fix for now is to make it a synchronized method, or to give 383 * up the reuse; I see very little performance difference between them. 384 * Long-term solution would be to replace the hashtable with a sparse array 385 * keyed directly from the character's integer value; see DTM's 386 * string pool for a related solution. 387 * 388 * @param value The character that should be resolved to 389 * a String, e.g. resolve '>' to "<". 390 * 391 * @return The String that the character is mapped to, or null if not found. 392 * @xsl.usage internal 393 */ 394 String getOutputStringForChar(char value) 395 { 396 CharKey charKey = new CharKey(); 397 charKey.setChar(value); 398 return (String) m_charToString.get(charKey); 399 } 400 401 /** 402 * Tell if the character argument that is from 403 * an attribute value should have special treatment. 404 * 405 * @param value the value of a character that is in an attribute value 406 * @return true if the character should have any special treatment, 407 * such as when writing out attribute values, 408 * or entity references. 409 * @xsl.usage internal 410 */ 411 final boolean isSpecialAttrChar(int value) 412 { 413 // for performance try the values in the boolean array first, 414 // this is faster access than the BitSet for common ASCII values 415 416 if (value < ASCII_MAX) 417 return isSpecialAttrASCII[value]; 418 419 // rather than java.util.BitSet, our private 420 // implementation is faster (and less general). 421 return get(value); 422 } 423 424 /** 425 * Tell if the character argument that is from a 426 * text node should have special treatment. 427 * 428 * @param value the value of a character that is in a text node 429 * @return true if the character should have any special treatment, 430 * such as when writing out attribute values, 431 * or entity references. 432 * @xsl.usage internal 433 */ 434 final boolean isSpecialTextChar(int value) 435 { 436 // for performance try the values in the boolean array first, 437 // this is faster access than the BitSet for common ASCII values 438 439 if (value < ASCII_MAX) 440 return isSpecialTextASCII[value]; 441 442 // rather than java.util.BitSet, our private 443 // implementation is faster (and less general). 444 return get(value); 445 } 446 447 /** 448 * This method is used to determine if an ASCII character in 449 * a text node (not an attribute value) is "clean". 450 * @param value the character to check (0 to 127). 451 * @return true if the character can go to the writer as-is 452 * @xsl.usage internal 453 */ 454 final boolean isTextASCIIClean(int value) 455 { 456 return isCleanTextASCII[value]; 457 } 458 459 // In the future one might want to use the array directly and avoid 460 // the method call, but I think the JIT alreay inlines this well enough 461 // so don't do it (for now) - bjm 462 // public final boolean[] getASCIIClean() 463 // { 464 // return isCleanTextASCII; 465 // } 466 467 468 private static CharInfo getCharInfoBasedOnPrivilege( 469 final String entitiesFileName, final String method, 470 final boolean internal){ 471 return (CharInfo) AccessController.doPrivileged( 472 new PrivilegedAction() { 473 public Object run() { 474 return new CharInfo(entitiesFileName, 475 method, internal);} 476 }); 477 } 478 479 /** 480 * Factory that reads in a resource file that describes the mapping of 481 * characters to entity references. 482 * 483 * Resource files must be encoded in UTF-8 and have a format like: 484 * <pre> 485 * # First char # is a comment 486 * Entity numericValue 487 * quot 34 488 * amp 38 489 * </pre> 490 * (Note: Why don't we just switch to .properties files? Oct-01 -sc) 491 * 492 * @param entitiesResource Name of entities resource file that should 493 * be loaded, which describes that mapping of characters to entity references. 494 * @param method the output method type, which should be one of "xml", "html", "text"... 495 * 496 * @xsl.usage internal 497 */ 498 static CharInfo getCharInfo(String entitiesFileName, String method) 499 { 500 CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName); 501 if (charInfo != null) { 502 return charInfo; 503 } 504 505 // try to load it internally - cache 506 try { 507 charInfo = getCharInfoBasedOnPrivilege(entitiesFileName, 508 method, true); 509 m_getCharInfoCache.put(entitiesFileName, charInfo); 510 return charInfo; 511 } catch (Exception e) {} 512 513 // try to load it externally - do not cache 514 try { 515 return getCharInfoBasedOnPrivilege(entitiesFileName, 516 method, false); 517 } catch (Exception e) {} 518 519 String absoluteEntitiesFileName; 520 521 if (entitiesFileName.indexOf(':') < 0) { 522 absoluteEntitiesFileName = 523 SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName); 524 } else { 525 try { 526 absoluteEntitiesFileName = 527 SystemIDResolver.getAbsoluteURI(entitiesFileName, null); 528 } catch (TransformerException te) { 529 throw new WrappedRuntimeException(te); 530 } 531 } 532 533 return getCharInfoBasedOnPrivilege(entitiesFileName, 534 method, false); 535 } 536 537 /** Table of user-specified char infos. */ 538 private static HashMap m_getCharInfoCache = new HashMap(); 539 540 /** 541 * Returns the array element holding the bit value for the 542 * given integer 543 * @param i the integer that might be in the set of integers 544 * 545 */ 546 private static int arrayIndex(int i) { 547 return (i >> SHIFT_PER_WORD); 548 } 549 550 /** 551 * For a given integer in the set it returns the single bit 552 * value used within a given word that represents whether 553 * the integer is in the set or not. 554 */ 555 private static int bit(int i) { 556 int ret = (1 << (i & LOW_ORDER_BITMASK)); 557 return ret; 558 } 559 560 /** 561 * Creates a new empty set of integers (characters) 562 * @param max the maximum integer to be in the set. 563 */ 564 private int[] createEmptySetOfIntegers(int max) { 565 firstWordNotUsed = 0; // an optimization 566 567 int[] arr = new int[arrayIndex(max - 1) + 1]; 568 return arr; 569 570 } 571 572 /** 573 * Adds the integer (character) to the set of integers. 574 * @param i the integer to add to the set, valid values are 575 * 0, 1, 2 ... up to the maximum that was specified at 576 * the creation of the set. 577 */ 578 private final void set(int i) { 579 setASCIIdirty(i); 580 581 int j = (i >> SHIFT_PER_WORD); // this word is used 582 int k = j + 1; 583 584 if(firstWordNotUsed < k) // for optimization purposes. 585 firstWordNotUsed = k; 586 587 array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK)); 588 } 589 590 591 /** 592 * Return true if the integer (character)is in the set of integers. 593 * 594 * This implementation uses an array of integers with 32 bits per 595 * integer. If a bit is set to 1 the corresponding integer is 596 * in the set of integers. 597 * 598 * @param i an integer that is tested to see if it is the 599 * set of integers, or not. 600 */ 601 private final boolean get(int i) { 602 603 boolean in_the_set = false; 604 int j = (i >> SHIFT_PER_WORD); // wordIndex(i) 605 // an optimization here, ... a quick test to see 606 // if this integer is beyond any of the words in use 607 if(j < firstWordNotUsed) 608 in_the_set = (array_of_bits[j] & 609 (1 << (i & LOW_ORDER_BITMASK)) 610 ) != 0; // 0L for 64 bit words 611 return in_the_set; 612 } 613 614 // record if there are any entities other than 615 // quot, amp, lt, gt (probably user defined) 616 /** 617 * @return true if the entity 618 * @param code The value of the character that has an entity defined 619 * for it. 620 */ 621 private boolean extraEntity(int entityValue) 622 { 623 boolean extra = false; 624 if (entityValue < 128) 625 { 626 switch (entityValue) 627 { 628 case 34 : // quot 629 case 38 : // amp 630 case 60 : // lt 631 case 62 : // gt 632 break; 633 default : // other entity in range 0 to 127 634 extra = true; 635 } 636 } 637 return extra; 638 } 639 640 /** 641 * If the character is a printable ASCII character then 642 * mark it as not clean and needing replacement with 643 * a String on output. 644 * @param ch 645 */ 646 private void setASCIIdirty(int j) 647 { 648 if (0 <= j && j < ASCII_MAX) 649 { 650 isCleanTextASCII[j] = false; 651 isSpecialTextASCII[j] = true; 652 } 653 } 654 655 /** 656 * If the character is a printable ASCII character then 657 * mark it as and not needing replacement with 658 * a String on output. 659 * @param ch 660 */ 661 private void setASCIIclean(int j) 662 { 663 if (0 <= j && j < ASCII_MAX) 664 { 665 isCleanTextASCII[j] = true; 666 isSpecialTextASCII[j] = false; 667 } 668 } 669 670 private void defineChar2StringMapping(String outputString, char inputChar) 671 { 672 CharKey character = new CharKey(inputChar); 673 m_charToString.put(character, outputString); 674 set(inputChar); 675 } 676 677 /** 678 * Simple class for fast lookup of char values, when used with 679 * hashtables. You can set the char, then use it as a key. 680 * 681 * This class is a copy of the one in com.sun.org.apache.xml.internal.utils. 682 * It exists to cut the serializers dependancy on that package. 683 * 684 * @xsl.usage internal 685 */ 686 private static class CharKey extends Object 687 { 688 689 /** String value */ 690 private char m_char; 691 692 /** 693 * Constructor CharKey 694 * 695 * @param key char value of this object. 696 */ 697 public CharKey(char key) 698 { 699 m_char = key; 700 } 701 702 /** 703 * Default constructor for a CharKey. 704 * 705 * @param key char value of this object. 706 */ 707 public CharKey() 708 { 709 } 710 711 /** 712 * Get the hash value of the character. 713 * 714 * @return hash value of the character. 715 */ 716 public final void setChar(char c) 717 { 718 m_char = c; 719 } 720 721 722 723 /** 724 * Get the hash value of the character. 725 * 726 * @return hash value of the character. 727 */ 728 public final int hashCode() 729 { 730 return (int)m_char; 731 } 732 733 /** 734 * Override of equals() for this object 735 * 736 * @param obj to compare to 737 * 738 * @return True if this object equals this string value 739 */ 740 public final boolean equals(Object obj) 741 { 742 return ((CharKey)obj).m_char == m_char; 743 } 744 } 745 746 747 }