1 /* 2 * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 3 * @LastModified: Oct 2017 4 */ 5 /* 6 * Licensed to the Apache Software Foundation (ASF) under one or more 7 * contributor license agreements. See the NOTICE file distributed with 8 * this work for additional information regarding copyright ownership. 9 * The ASF licenses this file to You under the Apache License, Version 2.0 10 * (the "License"); you may not use this file except in compliance with 11 * the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, software 16 * distributed under the License is distributed on an "AS IS" BASIS, 17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 * See the License for the specific language governing permissions and 19 * limitations under the License. 20 */ 21 22 package com.sun.org.apache.xerces.internal.xinclude; 23 24 import com.sun.org.apache.xerces.internal.impl.XMLEntityManager; 25 import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter; 26 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader; 27 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader; 28 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter; 29 import com.sun.org.apache.xerces.internal.util.EncodingMap; 30 import com.sun.org.apache.xerces.internal.util.HTTPInputSource; 31 import com.sun.org.apache.xerces.internal.util.MessageFormatter; 32 import com.sun.org.apache.xerces.internal.util.XMLChar; 33 import com.sun.org.apache.xerces.internal.xni.XMLString; 34 import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource; 35 import java.io.BufferedInputStream; 36 import java.io.IOException; 37 import java.io.InputStream; 38 import java.io.InputStreamReader; 39 import java.io.Reader; 40 import java.net.HttpURLConnection; 41 import java.net.URL; 42 import java.net.URLConnection; 43 import java.util.Iterator; 44 import java.util.Locale; 45 import java.util.Map; 46 47 /** 48 * This class is used for reading resources requested in <include> elements, 49 * when the parse attribute of the <include> element is "text". Using this 50 * class will open the location, detect the encoding, and discard the byte order 51 * mark, if applicable. 52 * 53 * REVISIT: 54 * Much of the code in this class is taken from XMLEntityManager. It would be nice 55 * if this code could be shared in some way. However, since XMLEntityManager is used 56 * for reading files as XML, and this needs to read files as text, there would need 57 * to be some refactoring done. 58 * 59 * @author Michael Glavassevich, IBM 60 * @author Peter McCracken, IBM 61 * @author Ankit Pasricha, IBM 62 * @author Arun Yadav, Sun Microsystems Inc. 63 * 64 * 65 * @see XIncludeHandler 66 */ 67 public class XIncludeTextReader { 68 69 private Reader fReader; 70 private XIncludeHandler fHandler; 71 private XMLInputSource fSource; 72 private XMLErrorReporter fErrorReporter; 73 private XMLString fTempString = new XMLString(); 74 75 /** 76 * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler. 77 * 78 * @param source The XMLInputSource to use. 79 * @param handler The XIncludeHandler to use. 80 * @param bufferSize The size of this text reader's buffer. 81 */ 82 public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize) 83 throws IOException { 84 fHandler = handler; 85 fSource = source; 86 fTempString = new XMLString(new char[bufferSize + 1], 0, 0); 87 } 88 89 /** 90 * Sets the XMLErrorReporter used for reporting errors while 91 * reading the text include. 92 * 93 * @param errorReporter the XMLErrorReporter to be used for 94 * reporting errors. 95 */ 96 public void setErrorReporter(XMLErrorReporter errorReporter) { 97 fErrorReporter = errorReporter; 98 } 99 100 /** 101 * Return the Reader for given XMLInputSource. 102 * 103 * @param source The XMLInputSource to use. 104 */ 105 protected Reader getReader(XMLInputSource source) throws IOException { 106 if (source.getCharacterStream() != null) { 107 return source.getCharacterStream(); 108 } 109 else { 110 InputStream stream = null; 111 112 String encoding = source.getEncoding(); 113 if (encoding == null) { 114 encoding = "UTF-8"; 115 } 116 if (source.getByteStream() != null) { 117 stream = source.getByteStream(); 118 // Wrap the InputStream so that it is possible to rewind it. 119 if (!(stream instanceof BufferedInputStream)) { 120 stream = new BufferedInputStream(stream, fTempString.ch.length); 121 } 122 } 123 else { 124 String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false); 125 126 URL url = new URL(expandedSystemId); 127 URLConnection urlCon = url.openConnection(); 128 129 // If this is an HTTP connection attach any request properties to the request. 130 if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) { 131 final HttpURLConnection urlConnection = (HttpURLConnection) urlCon; 132 final HTTPInputSource httpInputSource = (HTTPInputSource) source; 133 134 // set request properties 135 Iterator<Map.Entry<String, String>> propIter = httpInputSource.getHTTPRequestProperties(); 136 while (propIter.hasNext()) { 137 Map.Entry<String, String> entry = propIter.next(); 138 urlConnection.setRequestProperty(entry.getKey(), entry.getValue()); 139 } 140 141 // set preference for redirection 142 boolean followRedirects = httpInputSource.getFollowHTTPRedirects(); 143 if (!followRedirects) { 144 urlConnection.setInstanceFollowRedirects(followRedirects); 145 } 146 } 147 148 // Wrap the InputStream so that it is possible to rewind it. 149 stream = new BufferedInputStream(urlCon.getInputStream()); 150 151 // content type will be string like "text/xml; charset=UTF-8" or "text/xml" 152 String rawContentType = urlCon.getContentType(); 153 154 // text/xml and application/xml offer only one optional parameter 155 int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1; 156 157 String contentType = null; 158 String charset = null; 159 if (index != -1) { 160 // this should be something like "text/xml" 161 contentType = rawContentType.substring(0, index).trim(); 162 163 // this should be something like "charset=UTF-8", but we want to 164 // strip it down to just "UTF-8" 165 charset = rawContentType.substring(index + 1).trim(); 166 if (charset.startsWith("charset=")) { 167 // 8 is the length of "charset=" 168 charset = charset.substring(8).trim(); 169 // strip quotes, if present 170 if ((charset.charAt(0) == '"' 171 && charset.charAt(charset.length() - 1) == '"') 172 || (charset.charAt(0) == '\'' 173 && charset.charAt(charset.length() - 1) 174 == '\'')) { 175 charset = 176 charset.substring(1, charset.length() - 1); 177 } 178 } 179 else { 180 charset = null; 181 } 182 } 183 else { 184 contentType = rawContentType.trim(); 185 } 186 187 String detectedEncoding = null; 188 /** The encoding of such a resource is determined by: 189 1 external encoding information, if available, otherwise 190 -- the most common type of external information is the "charset" parameter of a MIME package 191 2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise 192 3 the value of the encoding attribute if one exists, otherwise 193 4 UTF-8. 194 **/ 195 if (contentType.equals("text/xml")) { 196 if (charset != null) { 197 detectedEncoding = charset; 198 } 199 else { 200 // see RFC2376 or 3023, section 3.1 201 detectedEncoding = "US-ASCII"; 202 } 203 } 204 else if (contentType.equals("application/xml")) { 205 if (charset != null) { 206 detectedEncoding = charset; 207 } 208 else { 209 // see RFC2376 or 3023, section 3.2 210 detectedEncoding = getEncodingName(stream); 211 } 212 } 213 else if (contentType.endsWith("+xml")) { 214 detectedEncoding = getEncodingName(stream); 215 } 216 217 if (detectedEncoding != null) { 218 encoding = detectedEncoding; 219 } 220 // else 3 or 4. 221 } 222 223 encoding = encoding.toUpperCase(Locale.ENGLISH); 224 225 // eat the Byte Order Mark 226 encoding = consumeBOM(stream, encoding); 227 228 // If the document is UTF-8 or US-ASCII use 229 // the Xerces readers for these encodings. For 230 // US-ASCII consult the encoding map since 231 // this encoding has many aliases. 232 if (encoding.equals("UTF-8")) { 233 return new UTF8Reader(stream, 234 fTempString.ch.length, 235 fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 236 fErrorReporter.getLocale() ); 237 } 238 239 // Try to use a Java reader. 240 String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding); 241 242 // If the specified encoding wasn't a recognized IANA encoding throw an IOException. 243 // The XIncludeHandler will report this as a ResourceError and then will 244 // attempt to include a fallback if there is one. 245 if (javaEncoding == null) { 246 MessageFormatter aFormatter = 247 fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN); 248 Locale aLocale = fErrorReporter.getLocale(); 249 throw new IOException( aFormatter.formatMessage( aLocale, 250 "EncodingDeclInvalid", 251 new Object[] {encoding} ) ); 252 } 253 else if (javaEncoding.equals("ASCII")) { 254 return new ASCIIReader(stream, 255 fTempString.ch.length, 256 fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), 257 fErrorReporter.getLocale() ); 258 } 259 260 return new InputStreamReader(stream, javaEncoding); 261 } 262 } 263 264 /** 265 * XMLEntityManager cares about endian-ness, since it creates its own optimized 266 * readers. Since we're just using generic Java readers for now, we're not caring 267 * about endian-ness. If this changes, even more code needs to be copied from 268 * XMLEntity manager. -- PJM 269 */ 270 protected String getEncodingName(InputStream stream) throws IOException { 271 final byte[] b4 = new byte[4]; 272 String encoding = null; 273 274 // this has the potential to throw an exception 275 // it will be fixed when we ensure the stream is rewindable (see note above) 276 stream.mark(4); 277 int count = stream.read(b4, 0, 4); 278 stream.reset(); 279 if (count == 4) { 280 encoding = getEncodingName(b4); 281 } 282 283 return encoding; 284 } 285 286 /** 287 * Removes the byte order mark from the stream, if 288 * it exists and returns the encoding name. 289 * 290 * @param stream 291 * @param encoding 292 * @throws IOException 293 */ 294 protected String consumeBOM(InputStream stream, String encoding) 295 throws IOException { 296 297 byte[] b = new byte[3]; 298 int count = 0; 299 stream.mark(3); 300 if (encoding.equals("UTF-8")) { 301 count = stream.read(b, 0, 3); 302 if (count == 3) { 303 final int b0 = b[0] & 0xFF; 304 final int b1 = b[1] & 0xFF; 305 final int b2 = b[2] & 0xFF; 306 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) { 307 // First three bytes are not BOM, so reset. 308 stream.reset(); 309 } 310 } 311 else { 312 stream.reset(); 313 } 314 } 315 else if (encoding.startsWith("UTF-16")) { 316 count = stream.read(b, 0, 2); 317 if (count == 2) { 318 final int b0 = b[0] & 0xFF; 319 final int b1 = b[1] & 0xFF; 320 if (b0 == 0xFE && b1 == 0xFF) { 321 return "UTF-16BE"; 322 } 323 else if (b0 == 0xFF && b1 == 0xFE) { 324 return "UTF-16LE"; 325 } 326 } 327 // First two bytes are not BOM, so reset. 328 stream.reset(); 329 } 330 // We could do UTF-32, but since the getEncodingName() doesn't support that 331 // we won't support it here. 332 // To implement UTF-32, look for: 00 00 FE FF for big-endian 333 // or FF FE 00 00 for little-endian 334 return encoding; 335 } 336 337 /** 338 * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager. 339 * Is there any way we can share the code, without having it implemented twice? 340 * I think we should make it public and static in XMLEntityManager. --PJM 341 * 342 * Returns the IANA encoding name that is auto-detected from 343 * the bytes specified, with the endian-ness of that encoding where appropriate. 344 * 345 * @param b4 The first four bytes of the input. 346 * @return the encoding name, or null if no encoding could be detected 347 */ 348 protected String getEncodingName(byte[] b4) { 349 350 // UTF-16, with BOM 351 int b0 = b4[0] & 0xFF; 352 int b1 = b4[1] & 0xFF; 353 if (b0 == 0xFE && b1 == 0xFF) { 354 // UTF-16, big-endian 355 return "UTF-16BE"; 356 } 357 if (b0 == 0xFF && b1 == 0xFE) { 358 // UTF-16, little-endian 359 return "UTF-16LE"; 360 } 361 362 // UTF-8 with a BOM 363 int b2 = b4[2] & 0xFF; 364 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 365 return "UTF-8"; 366 } 367 368 // other encodings 369 int b3 = b4[3] & 0xFF; 370 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 371 // UCS-4, big endian (1234) 372 return "ISO-10646-UCS-4"; 373 } 374 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 375 // UCS-4, little endian (4321) 376 return "ISO-10646-UCS-4"; 377 } 378 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 379 // UCS-4, unusual octet order (2143) 380 return "ISO-10646-UCS-4"; 381 } 382 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 383 // UCS-4, unusual octect order (3412) 384 return "ISO-10646-UCS-4"; 385 } 386 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 387 // UTF-16, big-endian, no BOM 388 // (or could turn out to be UCS-2... 389 return "UTF-16BE"; 390 } 391 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 392 // UTF-16, little-endian, no BOM 393 // (or could turn out to be UCS-2... 394 return "UTF-16LE"; 395 } 396 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 397 // EBCDIC 398 // a la xerces1, return CP037 instead of EBCDIC here 399 return "CP037"; 400 } 401 402 // this signals us to use the value from the encoding attribute 403 return null; 404 405 } // getEncodingName(byte[]):Object[] 406 407 /** 408 * Read the input stream as text, and pass the text on to the XIncludeHandler 409 * using calls to characters(). This will read all of the text it can from the 410 * resource. 411 * 412 * @throws IOException 413 */ 414 public void parse() throws IOException { 415 416 fReader = getReader(fSource); 417 fSource = null; 418 int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); 419 while (readSize != -1) { 420 for (int i = 0; i < readSize; ++i) { 421 char ch = fTempString.ch[i]; 422 if (!isValid(ch)) { 423 if (XMLChar.isHighSurrogate(ch)) { 424 int ch2; 425 // retrieve next character 426 if (++i < readSize) { 427 ch2 = fTempString.ch[i]; 428 } 429 // handle rare boundary case 430 else { 431 ch2 = fReader.read(); 432 if (ch2 != -1) { 433 fTempString.ch[readSize++] = (char) ch2; 434 } 435 } 436 if (XMLChar.isLowSurrogate(ch2)) { 437 // convert surrogates to a supplemental character 438 int sup = XMLChar.supplemental(ch, (char)ch2); 439 if (!isValid(sup)) { 440 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, 441 "InvalidCharInContent", 442 new Object[] { Integer.toString(sup, 16) }, 443 XMLErrorReporter.SEVERITY_FATAL_ERROR); 444 } 445 } 446 else { 447 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, 448 "InvalidCharInContent", 449 new Object[] { Integer.toString(ch2, 16) }, 450 XMLErrorReporter.SEVERITY_FATAL_ERROR); 451 } 452 } 453 else { 454 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, 455 "InvalidCharInContent", 456 new Object[] { Integer.toString(ch, 16) }, 457 XMLErrorReporter.SEVERITY_FATAL_ERROR); 458 } 459 } 460 } 461 if (fHandler != null && readSize > 0) { 462 fTempString.offset = 0; 463 fTempString.length = readSize; 464 fHandler.characters( 465 fTempString, 466 fHandler.modifyAugmentations(null, true)); 467 } 468 readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1); 469 } 470 471 } 472 473 /** 474 * Sets the input source on this text reader. 475 * 476 * @param source The XMLInputSource to use. 477 */ 478 public void setInputSource(XMLInputSource source) { 479 fSource = source; 480 } 481 482 /** 483 * Closes the stream. Call this after parse(), or when there is no longer any need 484 * for this object. 485 * 486 * @throws IOException 487 */ 488 public void close() throws IOException { 489 if (fReader != null) { 490 fReader.close(); 491 fReader = null; 492 } 493 } 494 495 /** 496 * Returns true if the specified character is a valid XML character 497 * as per the rules of XML 1.0. 498 * 499 * @param ch The character to check. 500 */ 501 protected boolean isValid(int ch) { 502 return XMLChar.isValid(ch); 503 } 504 505 /** 506 * Sets the buffer size property for the reader which decides the chunk sizes that are parsed 507 * by the reader at a time and passed to the handler 508 * 509 * @param bufferSize The size of the buffer desired 510 */ 511 protected void setBufferSize(int bufferSize) { 512 if (fTempString.ch.length != ++bufferSize) { 513 fTempString.ch = new char[bufferSize]; 514 } 515 } 516 517 }