1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * @LastModified: Oct 2017
   4  */
   5 /*
   6  * Licensed to the Apache Software Foundation (ASF) under one or more
   7  * contributor license agreements.  See the NOTICE file distributed with
   8  * this work for additional information regarding copyright ownership.
   9  * The ASF licenses this file to You under the Apache License, Version 2.0
  10  * (the "License"); you may not use this file except in compliance with
  11  * the License.  You may obtain a copy of the License at
  12  *
  13  *      http://www.apache.org/licenses/LICENSE-2.0
  14  *
  15  * Unless required by applicable law or agreed to in writing, software
  16  * distributed under the License is distributed on an "AS IS" BASIS,
  17  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  18  * See the License for the specific language governing permissions and
  19  * limitations under the License.
  20  */
  21 
  22 package com.sun.org.apache.xerces.internal.xinclude;
  23 
  24 import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
  25 import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
  26 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
  27 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
  28 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  29 import com.sun.org.apache.xerces.internal.util.EncodingMap;
  30 import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
  31 import com.sun.org.apache.xerces.internal.util.MessageFormatter;
  32 import com.sun.org.apache.xerces.internal.util.XMLChar;
  33 import com.sun.org.apache.xerces.internal.xni.XMLString;
  34 import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
  35 import java.io.BufferedInputStream;
  36 import java.io.IOException;
  37 import java.io.InputStream;
  38 import java.io.InputStreamReader;
  39 import java.io.Reader;
  40 import java.net.HttpURLConnection;
  41 import java.net.URL;
  42 import java.net.URLConnection;
  43 import java.util.Iterator;
  44 import java.util.Locale;
  45 import java.util.Map;
  46 
  47 /**
  48  * This class is used for reading resources requested in <include> elements,
  49  * when the parse attribute of the <include> element is "text".  Using this
  50  * class will open the location, detect the encoding, and discard the byte order
  51  * mark, if applicable.
  52  *
  53  * REVISIT:
  54  * Much of the code in this class is taken from XMLEntityManager.  It would be nice
  55  * if this code could be shared in some way.  However, since XMLEntityManager is used
  56  * for reading files as XML, and this needs to read files as text, there would need
  57  * to be some refactoring done.
  58  *
  59  * @author Michael Glavassevich, IBM
  60  * @author Peter McCracken, IBM
  61  * @author Ankit Pasricha, IBM
  62  * @author Arun Yadav, Sun Microsystems Inc.
  63  *
  64  *
  65  * @see XIncludeHandler
  66  */
  67 public class XIncludeTextReader {
  68 
  69     private Reader fReader;
  70     private XIncludeHandler fHandler;
  71     private XMLInputSource fSource;
  72     private XMLErrorReporter fErrorReporter;
  73     private XMLString fTempString = new XMLString();
  74 
  75     /**
  76      * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
  77      *
  78      * @param source The XMLInputSource to use.
  79      * @param handler The XIncludeHandler to use.
  80      * @param bufferSize The size of this text reader's buffer.
  81      */
  82     public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
  83         throws IOException {
  84         fHandler = handler;
  85         fSource = source;
  86         fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
  87     }
  88 
  89     /**
  90      * Sets the XMLErrorReporter used for reporting errors while
  91      * reading the text include.
  92      *
  93      * @param errorReporter the XMLErrorReporter to be used for
  94      * reporting errors.
  95      */
  96     public void setErrorReporter(XMLErrorReporter errorReporter) {
  97         fErrorReporter = errorReporter;
  98     }
  99 
 100     /**
 101      * Return the Reader for given XMLInputSource.
 102      *
 103      * @param source The XMLInputSource to use.
 104      */
 105     protected Reader getReader(XMLInputSource source) throws IOException {
 106         if (source.getCharacterStream() != null) {
 107             return source.getCharacterStream();
 108         }
 109         else {
 110             InputStream stream = null;
 111 
 112             String encoding = source.getEncoding();
 113             if (encoding == null) {
 114                 encoding = "UTF-8";
 115             }
 116             if (source.getByteStream() != null) {
 117                 stream = source.getByteStream();
 118                 // Wrap the InputStream so that it is possible to rewind it.
 119                 if (!(stream instanceof BufferedInputStream)) {
 120                     stream = new BufferedInputStream(stream, fTempString.ch.length);
 121                 }
 122             }
 123             else {
 124                 String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
 125 
 126                 URL url = new URL(expandedSystemId);
 127                 URLConnection urlCon = url.openConnection();
 128 
 129                 // If this is an HTTP connection attach any request properties to the request.
 130                 if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
 131                     final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
 132                     final HTTPInputSource httpInputSource = (HTTPInputSource) source;
 133 
 134                     // set request properties
 135                     Iterator<Map.Entry<String, String>> propIter = httpInputSource.getHTTPRequestProperties();
 136                     while (propIter.hasNext()) {
 137                         Map.Entry<String, String> entry = propIter.next();
 138                         urlConnection.setRequestProperty(entry.getKey(), entry.getValue());
 139                     }
 140 
 141                     // set preference for redirection
 142                     boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
 143                     if (!followRedirects) {
 144                         urlConnection.setInstanceFollowRedirects(followRedirects);
 145                     }
 146                 }
 147 
 148                 // Wrap the InputStream so that it is possible to rewind it.
 149                 stream = new BufferedInputStream(urlCon.getInputStream());
 150 
 151                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
 152                 String rawContentType = urlCon.getContentType();
 153 
 154                 // text/xml and application/xml offer only one optional parameter
 155                 int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
 156 
 157                 String contentType = null;
 158                 String charset = null;
 159                 if (index != -1) {
 160                     // this should be something like "text/xml"
 161                     contentType = rawContentType.substring(0, index).trim();
 162 
 163                     // this should be something like "charset=UTF-8", but we want to
 164                     // strip it down to just "UTF-8"
 165                     charset = rawContentType.substring(index + 1).trim();
 166                     if (charset.startsWith("charset=")) {
 167                         // 8 is the length of "charset="
 168                         charset = charset.substring(8).trim();
 169                         // strip quotes, if present
 170                         if ((charset.charAt(0) == '"'
 171                             && charset.charAt(charset.length() - 1) == '"')
 172                             || (charset.charAt(0) == '\''
 173                                 && charset.charAt(charset.length() - 1)
 174                                     == '\'')) {
 175                             charset =
 176                                 charset.substring(1, charset.length() - 1);
 177                         }
 178                     }
 179                     else {
 180                         charset = null;
 181                     }
 182                 }
 183                 else {
 184                     contentType = rawContentType.trim();
 185                 }
 186 
 187                 String detectedEncoding = null;
 188                 /**  The encoding of such a resource is determined by:
 189                     1 external encoding information, if available, otherwise
 190                          -- the most common type of external information is the "charset" parameter of a MIME package
 191                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
 192                     3 the value of the encoding attribute if one exists, otherwise
 193                     4 UTF-8.
 194                  **/
 195                 if (contentType.equals("text/xml")) {
 196                     if (charset != null) {
 197                         detectedEncoding = charset;
 198                     }
 199                     else {
 200                         // see RFC2376 or 3023, section 3.1
 201                         detectedEncoding = "US-ASCII";
 202                     }
 203                 }
 204                 else if (contentType.equals("application/xml")) {
 205                     if (charset != null) {
 206                         detectedEncoding = charset;
 207                     }
 208                     else {
 209                         // see RFC2376 or 3023, section 3.2
 210                         detectedEncoding = getEncodingName(stream);
 211                     }
 212                 }
 213                 else if (contentType.endsWith("+xml")) {
 214                     detectedEncoding = getEncodingName(stream);
 215                 }
 216 
 217                 if (detectedEncoding != null) {
 218                     encoding = detectedEncoding;
 219                 }
 220                 // else 3 or 4.
 221             }
 222 
 223             encoding = encoding.toUpperCase(Locale.ENGLISH);
 224 
 225             // eat the Byte Order Mark
 226             encoding = consumeBOM(stream, encoding);
 227 
 228             // If the document is UTF-8 or US-ASCII use
 229             // the Xerces readers for these encodings. For
 230             // US-ASCII consult the encoding map since
 231             // this encoding has many aliases.
 232             if (encoding.equals("UTF-8")) {
 233                 return new UTF8Reader(stream,
 234                     fTempString.ch.length,
 235                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
 236                     fErrorReporter.getLocale() );
 237             }
 238 
 239             // Try to use a Java reader.
 240             String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
 241 
 242             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
 243             // The XIncludeHandler will report this as a ResourceError and then will
 244             // attempt to include a fallback if there is one.
 245             if (javaEncoding == null) {
 246                 MessageFormatter aFormatter =
 247                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
 248                 Locale aLocale = fErrorReporter.getLocale();
 249                 throw new IOException( aFormatter.formatMessage( aLocale,
 250                     "EncodingDeclInvalid",
 251                     new Object[] {encoding} ) );
 252             }
 253             else if (javaEncoding.equals("ASCII")) {
 254                 return new ASCIIReader(stream,
 255                     fTempString.ch.length,
 256                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
 257                     fErrorReporter.getLocale() );
 258             }
 259 
 260             return new InputStreamReader(stream, javaEncoding);
 261         }
 262     }
 263 
 264     /**
 265      * XMLEntityManager cares about endian-ness, since it creates its own optimized
 266      * readers. Since we're just using generic Java readers for now, we're not caring
 267      * about endian-ness.  If this changes, even more code needs to be copied from
 268      * XMLEntity manager. -- PJM
 269      */
 270     protected String getEncodingName(InputStream stream) throws IOException {
 271         final byte[] b4 = new byte[4];
 272         String encoding = null;
 273 
 274         // this has the potential to throw an exception
 275         // it will be fixed when we ensure the stream is rewindable (see note above)
 276         stream.mark(4);
 277         int count = stream.read(b4, 0, 4);
 278         stream.reset();
 279         if (count == 4) {
 280             encoding = getEncodingName(b4);
 281         }
 282 
 283         return encoding;
 284     }
 285 
 286     /**
 287      * Removes the byte order mark from the stream, if
 288      * it exists and returns the encoding name.
 289      *
 290      * @param stream
 291      * @param encoding
 292      * @throws IOException
 293      */
 294     protected String consumeBOM(InputStream stream, String encoding)
 295         throws IOException {
 296 
 297         byte[] b = new byte[3];
 298         int count = 0;
 299         stream.mark(3);
 300         if (encoding.equals("UTF-8")) {
 301             count = stream.read(b, 0, 3);
 302             if (count == 3) {
 303                 final int b0 = b[0] & 0xFF;
 304                 final int b1 = b[1] & 0xFF;
 305                 final int b2 = b[2] & 0xFF;
 306                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
 307                     // First three bytes are not BOM, so reset.
 308                     stream.reset();
 309                 }
 310             }
 311             else {
 312                 stream.reset();
 313             }
 314         }
 315         else if (encoding.startsWith("UTF-16")) {
 316             count = stream.read(b, 0, 2);
 317             if (count == 2) {
 318                 final int b0 = b[0] & 0xFF;
 319                 final int b1 = b[1] & 0xFF;
 320                 if (b0 == 0xFE && b1 == 0xFF) {
 321                     return "UTF-16BE";
 322                 }
 323                 else if (b0 == 0xFF && b1 == 0xFE) {
 324                     return "UTF-16LE";
 325                 }
 326             }
 327             // First two bytes are not BOM, so reset.
 328             stream.reset();
 329         }
 330         // We could do UTF-32, but since the getEncodingName() doesn't support that
 331         // we won't support it here.
 332         // To implement UTF-32, look for:  00 00 FE FF for big-endian
 333         //                             or  FF FE 00 00 for little-endian
 334         return encoding;
 335     }
 336 
 337     /**
 338      * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
 339      *          Is there any way we can share the code, without having it implemented twice?
 340      *          I think we should make it public and static in XMLEntityManager. --PJM
 341      *
 342      * Returns the IANA encoding name that is auto-detected from
 343      * the bytes specified, with the endian-ness of that encoding where appropriate.
 344      *
 345      * @param b4    The first four bytes of the input.
 346      * @return the encoding name, or null if no encoding could be detected
 347      */
 348     protected String getEncodingName(byte[] b4) {
 349 
 350         // UTF-16, with BOM
 351         int b0 = b4[0] & 0xFF;
 352         int b1 = b4[1] & 0xFF;
 353         if (b0 == 0xFE && b1 == 0xFF) {
 354             // UTF-16, big-endian
 355             return "UTF-16BE";
 356         }
 357         if (b0 == 0xFF && b1 == 0xFE) {
 358             // UTF-16, little-endian
 359             return "UTF-16LE";
 360         }
 361 
 362         // UTF-8 with a BOM
 363         int b2 = b4[2] & 0xFF;
 364         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
 365             return "UTF-8";
 366         }
 367 
 368         // other encodings
 369         int b3 = b4[3] & 0xFF;
 370         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
 371             // UCS-4, big endian (1234)
 372             return "ISO-10646-UCS-4";
 373         }
 374         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
 375             // UCS-4, little endian (4321)
 376             return "ISO-10646-UCS-4";
 377         }
 378         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
 379             // UCS-4, unusual octet order (2143)
 380             return "ISO-10646-UCS-4";
 381         }
 382         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
 383             // UCS-4, unusual octect order (3412)
 384             return "ISO-10646-UCS-4";
 385         }
 386         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
 387             // UTF-16, big-endian, no BOM
 388             // (or could turn out to be UCS-2...
 389             return "UTF-16BE";
 390         }
 391         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
 392             // UTF-16, little-endian, no BOM
 393             // (or could turn out to be UCS-2...
 394             return "UTF-16LE";
 395         }
 396         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
 397             // EBCDIC
 398             // a la xerces1, return CP037 instead of EBCDIC here
 399             return "CP037";
 400         }
 401 
 402         // this signals us to use the value from the encoding attribute
 403         return null;
 404 
 405     } // getEncodingName(byte[]):Object[]
 406 
 407     /**
 408      * Read the input stream as text, and pass the text on to the XIncludeHandler
 409      * using calls to characters().  This will read all of the text it can from the
 410      * resource.
 411      *
 412      * @throws IOException
 413      */
 414     public void parse() throws IOException {
 415 
 416         fReader = getReader(fSource);
 417         fSource = null;
 418         int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
 419         while (readSize != -1) {
 420             for (int i = 0; i < readSize; ++i) {
 421                 char ch = fTempString.ch[i];
 422                 if (!isValid(ch)) {
 423                     if (XMLChar.isHighSurrogate(ch)) {
 424                         int ch2;
 425                         // retrieve next character
 426                         if (++i < readSize) {
 427                             ch2 = fTempString.ch[i];
 428                         }
 429                         // handle rare boundary case
 430                         else {
 431                             ch2 = fReader.read();
 432                             if (ch2 != -1) {
 433                                 fTempString.ch[readSize++] = (char) ch2;
 434                             }
 435                         }
 436                         if (XMLChar.isLowSurrogate(ch2)) {
 437                             // convert surrogates to a supplemental character
 438                             int sup = XMLChar.supplemental(ch, (char)ch2);
 439                             if (!isValid(sup)) {
 440                                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 441                                                            "InvalidCharInContent",
 442                                                            new Object[] { Integer.toString(sup, 16) },
 443                                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
 444                             }
 445                         }
 446                         else {
 447                             fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 448                                                        "InvalidCharInContent",
 449                                                        new Object[] { Integer.toString(ch2, 16) },
 450                                                        XMLErrorReporter.SEVERITY_FATAL_ERROR);
 451                         }
 452                     }
 453                     else {
 454                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 455                                                    "InvalidCharInContent",
 456                                                    new Object[] { Integer.toString(ch, 16) },
 457                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
 458                     }
 459                 }
 460             }
 461             if (fHandler != null && readSize > 0) {
 462                 fTempString.offset = 0;
 463                 fTempString.length = readSize;
 464                 fHandler.characters(
 465                     fTempString,
 466                     fHandler.modifyAugmentations(null, true));
 467             }
 468             readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
 469         }
 470 
 471     }
 472 
 473     /**
 474      * Sets the input source on this text reader.
 475      *
 476      * @param source The XMLInputSource to use.
 477      */
 478     public void setInputSource(XMLInputSource source) {
 479         fSource = source;
 480     }
 481 
 482     /**
 483      * Closes the stream.  Call this after parse(), or when there is no longer any need
 484      * for this object.
 485      *
 486      * @throws IOException
 487      */
 488     public void close() throws IOException {
 489         if (fReader != null) {
 490             fReader.close();
 491             fReader = null;
 492         }
 493     }
 494 
 495     /**
 496      * Returns true if the specified character is a valid XML character
 497      * as per the rules of XML 1.0.
 498      *
 499      * @param ch The character to check.
 500      */
 501     protected boolean isValid(int ch) {
 502         return XMLChar.isValid(ch);
 503     }
 504 
 505     /**
 506      * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
 507      * by the reader at a time and passed to the handler
 508      *
 509      * @param bufferSize The size of the buffer desired
 510      */
 511     protected void setBufferSize(int bufferSize) {
 512         if (fTempString.ch.length != ++bufferSize) {
 513             fTempString.ch = new char[bufferSize];
 514         }
 515     }
 516 
 517 }