/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Copyright 2003-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.sun.org.apache.xerces.internal.xinclude;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
import com.sun.org.apache.xerces.internal.util.EncodingMap;
import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
import com.sun.org.apache.xerces.internal.util.MessageFormatter;
import com.sun.org.apache.xerces.internal.util.XMLChar;
import com.sun.org.apache.xerces.internal.xni.XMLString;
import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;

/**
 * This class is used for reading resources requested in &lt;include&gt; elements,
 * when the parse attribute of the &lt;include&gt; element is "text". Using this
 * class will open the location, detect the encoding, and discard the byte order
 * mark, if applicable.
 *
 * REVISIT:
 * Much of the code in this class is taken from XMLEntityManager. It would be nice
 * if this code could be shared in some way. However, since XMLEntityManager is used
 * for reading files as XML, and this needs to read files as text, there would need
 * to be some refactoring done.
 *
 * @author Michael Glavassevich, IBM
 * @author Peter McCracken, IBM
 * @author Ankit Pasricha, IBM
 * @author Arun Yadav, Sun Microsystems Inc.
 *
 *
 * @see XIncludeHandler
 */
public class XIncludeTextReader {

    /** Reader obtained by {@link #parse()}; released by {@link #close()}. */
    private Reader fReader;

    /** Receives the text through characters() calls; may be null, in which case text is only validated. */
    private final XIncludeHandler fHandler;

    /** Source to read from; cleared once {@link #parse()} has opened it. */
    private XMLInputSource fSource;

    /** Used to obtain message formatters/locale and to report invalid characters. */
    private XMLErrorReporter fErrorReporter;

    /**
     * Transfer buffer. Its array is one char longer than the requested buffer
     * size so that parse() can append the low half of a surrogate pair that
     * was split across two reads.
     */
    private final XMLString fTempString;

    /**
     * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
     *
     * @param source The XMLInputSource to use.
     * @param handler The XIncludeHandler to use.
     * @param bufferSize The size of this text reader's buffer.
     */
    public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
        throws IOException {
        fHandler = handler;
        fSource = source;
        fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
    }

    /**
     * Sets the XMLErrorReporter used for reporting errors while
     * reading the text include.
     *
     * @param errorReporter the XMLErrorReporter to be used for
     * reporting errors.
     */
    public void setErrorReporter(XMLErrorReporter errorReporter) {
        fErrorReporter = errorReporter;
    }

    /**
     * Return the Reader for given XMLInputSource.
     *
     * The source's character stream is returned as is when present. Otherwise
     * the source's byte stream is used, or, failing that, a connection is
     * opened to the expanded system identifier. For the connection case the
     * encoding may be overridden by the content type reported by the
     * connection; in every other case the source's declared encoding is used,
     * defaulting to UTF-8.
     *
     * @param source The XMLInputSource to use.
     * @return a Reader over the text of the resource
     * @throws IOException if the resource cannot be opened or read, or if the
     *         resulting encoding is not a recognized IANA encoding
     */
    protected Reader getReader(XMLInputSource source) throws IOException {
        Reader characterStream = source.getCharacterStream();
        if (characterStream != null) {
            return characterStream;
        }

        String encoding = source.getEncoding();
        if (encoding == null) {
            encoding = "UTF-8";
        }

        InputStream byteStream = source.getByteStream();
        if (byteStream != null) {
            // Wrap the InputStream so that it is possible to rewind it.
            InputStream stream = byteStream;
            if (!(stream instanceof BufferedInputStream)) {
                stream = new BufferedInputStream(stream, fTempString.ch.length);
            }
            // The stream belongs to the caller's input source, so it is
            // deliberately not closed here if reader creation fails.
            return createReader(stream, encoding);
        }

        URLConnection urlCon = openConnection(source);

        // Wrap the InputStream so that it is possible to rewind it.
        InputStream stream = new BufferedInputStream(urlCon.getInputStream());
        try {
            // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
            String detectedEncoding = detectEncoding(urlCon.getContentType(), stream);
            if (detectedEncoding != null) {
                encoding = detectedEncoding;
            }
            // else fall back to the encoding attribute, or UTF-8.
            return createReader(stream, encoding);
        }
        catch (IOException e) {
            // We opened this stream ourselves and no Reader took ownership
            // of it, so nobody else would ever close it.
            closeQuietly(stream);
            throw e;
        }
        catch (RuntimeException e) {
            closeQuietly(stream);
            throw e;
        }
    }

    /**
     * Opens a connection to the expanded system identifier of the source.
     * For HTTP connections made on behalf of an HTTPInputSource, the source's
     * request properties and redirect preference are applied first.
     */
    private URLConnection openConnection(XMLInputSource source) throws IOException {
        String expandedSystemId =
            XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);

        URL url = new URL(expandedSystemId);
        URLConnection urlCon = url.openConnection();

        // If this is an HTTP connection attach any request properties to the request.
        if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
            final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
            final HTTPInputSource httpInputSource = (HTTPInputSource) source;

            // set request properties
            Iterator propIter = httpInputSource.getHTTPRequestProperties();
            while (propIter.hasNext()) {
                Map.Entry entry = (Map.Entry) propIter.next();
                urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
            }

            // set preference for redirection
            boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
            if (!followRedirects) {
                XMLEntityManager.setInstanceFollowRedirects(urlConnection, followRedirects);
            }
        }
        return urlCon;
    }

    /**
     * Determines the encoding implied by a raw content type.
     *
     * The encoding of such a resource is determined by:
     * 1 external encoding information, if available, otherwise
     *   -- the most common type of external information is the "charset"
     *      parameter of a MIME package
     * 2 if the media type of the resource is text/xml, application/xml, or
     *   matches the conventions text/*+xml or application/*+xml as described
     *   in XML Media Types [IETF RFC 3023], the encoding is recognized as
     *   specified in XML 1.0, otherwise
     * 3 the value of the encoding attribute if one exists, otherwise
     * 4 UTF-8.
     *
     * This method covers cases 1 and 2; a null result tells the caller to
     * apply case 3 or 4.
     *
     * @param rawContentType the content type reported by the connection;
     *        may be null when the connection does not know it
     * @param stream rewindable stream used for auto-detection
     * @return the detected encoding, or null if none could be determined
     */
    private String detectEncoding(String rawContentType, InputStream stream) throws IOException {
        if (rawContentType == null) {
            // Unknown content type: nothing to detect from.
            return null;
        }

        String contentType;
        String charset = null;

        // text/xml and application/xml offer only one optional parameter
        int index = rawContentType.indexOf(';');
        if (index != -1) {
            // this should be something like "text/xml"
            contentType = rawContentType.substring(0, index).trim();
            // this should be something like "charset=UTF-8"
            charset = parseCharset(rawContentType.substring(index + 1).trim());
        }
        else {
            contentType = rawContentType.trim();
        }

        if (contentType.equals("text/xml")) {
            // see RFC2376 or 3023, section 3.1
            return (charset != null) ? charset : "US-ASCII";
        }
        if (contentType.equals("application/xml")) {
            // see RFC2376 or 3023, section 3.2
            return (charset != null) ? charset : getEncodingName(stream);
        }
        if (contentType.endsWith("+xml")) {
            return getEncodingName(stream);
        }
        return null;
    }

    /**
     * Extracts the value of a "charset=..." content type parameter,
     * stripping one pair of matching single or double quotes if present.
     *
     * @param parameter the trimmed text following the ';' of a content type
     * @return the charset value, or null if the parameter is not a charset
     *         parameter or its value is empty
     */
    private static String parseCharset(String parameter) {
        final String prefix = "charset=";
        if (!parameter.startsWith(prefix)) {
            return null;
        }
        String charset = parameter.substring(prefix.length()).trim();

        // strip quotes, if present; needs at least an opening and a closing
        // quote, otherwise a lone quote character would match itself
        int length = charset.length();
        if (length >= 2) {
            char first = charset.charAt(0);
            char last = charset.charAt(length - 1);
            if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) {
                charset = charset.substring(1, length - 1);
            }
        }
        return (charset.length() > 0) ? charset : null;
    }

    /**
     * Creates the Reader that decodes the given stream using the given
     * encoding, after discarding a leading byte order mark.
     *
     * @param stream rewindable stream positioned at the start of the resource
     * @param encoding IANA encoding name, in any case
     * @throws IOException if the encoding is not a recognized IANA encoding
     *         or the stream cannot be read
     */
    private Reader createReader(InputStream stream, String encoding) throws IOException {
        encoding = encoding.toUpperCase(Locale.ENGLISH);

        // eat the Byte Order Mark
        encoding = consumeBOM(stream, encoding);

        // If the document is UTF-8 or US-ASCII use
        // the Xerces readers for these encodings. For
        // US-ASCII consult the encoding map since
        // this encoding has many aliases.
        if (encoding.equals("UTF-8")) {
            return new UTF8Reader(stream,
                fTempString.ch.length,
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
                fErrorReporter.getLocale() );
        }

        // Try to use a Java reader.
        String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);

        // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
        // The XIncludeHandler will report this as a ResourceError and then will
        // attempt to include a fallback if there is one.
        if (javaEncoding == null) {
            MessageFormatter aFormatter =
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
            Locale aLocale = fErrorReporter.getLocale();
            throw new IOException( aFormatter.formatMessage( aLocale,
                "EncodingDeclInvalid",
                new Object[] {encoding} ) );
        }
        else if (javaEncoding.equals("ASCII")) {
            return new ASCIIReader(stream,
                fTempString.ch.length,
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
                fErrorReporter.getLocale() );
        }

        return new InputStreamReader(stream, javaEncoding);
    }

    /**
     * Reads until {@code length} bytes have been stored in the buffer or the
     * stream stops delivering data. A single read() call may legitimately
     * return fewer bytes than requested, which would otherwise make
     * BOM/encoding detection miss its signature.
     *
     * @return the number of bytes actually read (0 at end of stream)
     */
    private static int readFully(InputStream stream, byte[] buffer, int length) throws IOException {
        int total = 0;
        while (total < length) {
            int n = stream.read(buffer, total, length - total);
            if (n <= 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /** Closes a stream on a failure path, where a secondary close error must not mask the primary one. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        }
        catch (IOException ignored) {
            // Intentionally ignored: the original exception is rethrown by the caller.
        }
    }

    /**
     * XMLEntityManager cares about endian-ness, since it creates its own optimized
     * readers. Since we're just using generic Java readers for now, we're not caring
     * about endian-ness.  If this changes, even more code needs to be copied from
     * XMLEntity manager. -- PJM
     *
     * Peeks at the first four bytes of the stream and rewinds it again.
     *
     * @param stream a stream supporting mark/reset
     * @return the auto-detected encoding name, or null if fewer than four
     *         bytes are available or nothing was detected
     */
    protected String getEncodingName(InputStream stream) throws IOException {
        final byte[] b4 = new byte[4];
        String encoding = null;

        // reset() throws if the stream is not rewindable; getReader() always
        // passes a BufferedInputStream.
        stream.mark(4);
        int count = readFully(stream, b4, 4);
        stream.reset();
        if (count == 4) {
            encoding = getEncodingName(b4);
        }

        return encoding;
    }

    /**
     * Removes the byte order mark from the stream, if
     * it exists and returns the encoding name.
     *
     * For "UTF-8" a leading EF BB BF is skipped. For encodings starting with
     * "UTF-16" a leading FE FF or FF FE is skipped and the matching
     * endian-specific name is returned. In all other cases the stream is
     * left at its original position and the encoding is returned unchanged.
     *
     * @param stream a stream supporting mark/reset
     * @param encoding the upper-cased encoding name
     * @return the encoding name to use for decoding
     * @throws IOException
     */
    protected String consumeBOM(InputStream stream, String encoding)
        throws IOException {

        byte[] b = new byte[3];
        int count = 0;
        stream.mark(3);
        if (encoding.equals("UTF-8")) {
            count = readFully(stream, b, 3);
            if (count == 3) {
                final int b0 = b[0] & 0xFF;
                final int b1 = b[1] & 0xFF;
                final int b2 = b[2] & 0xFF;
                if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
                    // First three bytes are not BOM, so reset.
                    stream.reset();
                }
            }
            else {
                stream.reset();
            }
        }
        else if (encoding.startsWith("UTF-16")) {
            count = readFully(stream, b, 2);
            if (count == 2) {
                final int b0 = b[0] & 0xFF;
                final int b1 = b[1] & 0xFF;
                if (b0 == 0xFE && b1 == 0xFF) {
                    return "UTF-16BE";
                }
                else if (b0 == 0xFF && b1 == 0xFE) {
                    return "UTF-16LE";
                }
            }
            // First two bytes are not BOM, so reset.
            stream.reset();
        }
        // We could do UTF-32, but since the getEncodingName() doesn't support that
        // we won't support it here.
        // To implement UTF-32, look for:  00 00 FE FF for big-endian
        //                             or  FF FE 00 00 for little-endian
        return encoding;
    }

    /**
     * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
     *          Is there any way we can share the code, without having it implemented twice?
     *          I think we should make it public and static in XMLEntityManager. --PJM
     *
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where appropriate.
     *
     * @param b4    The first four bytes of the input.
     * @return the encoding name, or null if no encoding could be detected
     */
    protected String getEncodingName(byte[] b4) {

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return "UTF-16LE";
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // other encodings
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        }

        // this signals us to use the value from the encoding attribute
        return null;

    } // getEncodingName(byte[]):String

    /**
     * Read the input stream as text, and pass the text on to the XIncludeHandler
     * using calls to characters().  This will read all of the text it can from the
     * resource.
     *
     * Every character is checked with {@link #isValid(int)}; invalid
     * characters and malformed surrogate pairs are reported to the error
     * reporter as fatal "InvalidCharInContent" errors.
     *
     * @throws IOException
     */
    public void parse() throws IOException {

        fReader = getReader(fSource);
        fSource = null;
        // The last slot of the buffer is kept free for the surrogate
        // boundary case handled below.
        int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
        while (readSize != -1) {
            for (int i = 0; i < readSize; ++i) {
                char ch = fTempString.ch[i];
                if (!isValid(ch)) {
                    if (XMLChar.isHighSurrogate(ch)) {
                        int ch2;
                        // retrieve next character
                        if (++i < readSize) {
                            ch2 = fTempString.ch[i];
                        }
                        // handle rare boundary case: the pair was split
                        // across two reads, so pull one more char into
                        // the reserved slot
                        else {
                            ch2 = fReader.read();
                            if (ch2 != -1) {
                                fTempString.ch[readSize++] = (char) ch2;
                            }
                        }
                        if (XMLChar.isLowSurrogate(ch2)) {
                            // convert surrogates to a supplemental character
                            int sup = XMLChar.supplemental(ch, (char)ch2);
                            if (!isValid(sup)) {
                                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                           "InvalidCharInContent",
                                                           new Object[] { Integer.toString(sup, 16) },
                                                           XMLErrorReporter.SEVERITY_FATAL_ERROR);
                            }
                        }
                        else {
                            fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                       "InvalidCharInContent",
                                                       new Object[] { Integer.toString(ch2, 16) },
                                                       XMLErrorReporter.SEVERITY_FATAL_ERROR);
                        }
                    }
                    else {
                        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                   "InvalidCharInContent",
                                                   new Object[] { Integer.toString(ch, 16) },
                                                   XMLErrorReporter.SEVERITY_FATAL_ERROR);
                    }
                }
            }
            if (fHandler != null && readSize > 0) {
                fTempString.offset = 0;
                fTempString.length = readSize;
                fHandler.characters(
                    fTempString,
                    fHandler.modifyAugmentations(null, true));
            }
            readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
        }

    }

    /**
     * Sets the input source on this text reader.
     *
     * @param source The XMLInputSource to use.
     */
    public void setInputSource(XMLInputSource source) {
        fSource = source;
    }

    /**
     * Closes the stream.  Call this after parse(), or when there is no longer any need
     * for this object.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        if (fReader != null) {
            fReader.close();
            fReader = null;
        }
    }

    /**
     * Returns true if the specified character is a valid XML character
     * as per the rules of XML 1.0.
     *
     * @param ch The character to check.
     */
    protected boolean isValid(int ch) {
        return XMLChar.isValid(ch);
    }

    /**
     * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
     * by the reader at a time and passed to the handler
     *
     * @param bufferSize The size of the buffer desired
     */
    protected void setBufferSize(int bufferSize) {
        // The backing array is one char larger than the requested size
        // (see parse()); reallocate only when the size actually changes.
        if (fTempString.ch.length != ++bufferSize) {
            fTempString.ch = new char[bufferSize];
        }
    }

}