1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 2003-2005 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 package com.sun.org.apache.xerces.internal.xinclude;
  21 
  22 import java.io.BufferedInputStream;
  23 import java.io.IOException;
  24 import java.io.InputStream;
  25 import java.io.InputStreamReader;
  26 import java.io.Reader;
  27 import java.net.HttpURLConnection;
  28 import java.net.URL;
  29 import java.net.URLConnection;
  30 import java.util.Iterator;
  31 import java.util.Locale;
  32 import java.util.Map;
  33 
  34 import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
  35 import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
  36 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
  37 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
  38 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  39 import com.sun.org.apache.xerces.internal.util.EncodingMap;
  40 import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
  41 import com.sun.org.apache.xerces.internal.util.MessageFormatter;
  42 import com.sun.org.apache.xerces.internal.util.XMLChar;
  43 import com.sun.org.apache.xerces.internal.xni.XMLString;
  44 import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
  45 
  46 /**
  47  * This class is used for reading resources requested in <include> elements,
  48  * when the parse attribute of the <include> element is "text".  Using this
  49  * class will open the location, detect the encoding, and discard the byte order
  50  * mark, if applicable.
  51  *
  52  * REVISIT:
  53  * Much of the code in this class is taken from XMLEntityManager.  It would be nice
  54  * if this code could be shared in some way.  However, since XMLEntityManager is used
  55  * for reading files as XML, and this needs to read files as text, there would need
  56  * to be some refactoring done.
  57  *
  58  * @author Michael Glavassevich, IBM
  59  * @author Peter McCracken, IBM
  60  * @author Ankit Pasricha, IBM
  61  * @author Arun Yadav, Sun Microsystems Inc.
  62  *
  63  *
  64  * @see XIncludeHandler
  65  */
  66 public class XIncludeTextReader {
  67 
  68     private Reader fReader;
  69     private XIncludeHandler fHandler;
  70     private XMLInputSource fSource;
  71     private XMLErrorReporter fErrorReporter;
  72     private XMLString fTempString = new XMLString();
  73 
  74     /**
  75      * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
  76      *
  77      * @param source The XMLInputSource to use.
  78      * @param handler The XIncludeHandler to use.
  79      * @param bufferSize The size of this text reader's buffer.
  80      */
  81     public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
  82         throws IOException {
  83         fHandler = handler;
  84         fSource = source;
  85         fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
  86     }
  87 
  88     /**
  89      * Sets the XMLErrorReporter used for reporting errors while
  90      * reading the text include.
  91      *
  92      * @param errorReporter the XMLErrorReporter to be used for
  93      * reporting errors.
  94      */
  95     public void setErrorReporter(XMLErrorReporter errorReporter) {
  96         fErrorReporter = errorReporter;
  97     }
  98 
  99     /**
 100      * Return the Reader for given XMLInputSource.
 101      *
 102      * @param source The XMLInputSource to use.
 103      */
 104     protected Reader getReader(XMLInputSource source) throws IOException {
 105         if (source.getCharacterStream() != null) {
 106             return source.getCharacterStream();
 107         }
 108         else {
 109             InputStream stream = null;
 110 
 111             String encoding = source.getEncoding();
 112             if (encoding == null) {
 113                 encoding = "UTF-8";
 114             }
 115             if (source.getByteStream() != null) {
 116                 stream = source.getByteStream();
 117                 // Wrap the InputStream so that it is possible to rewind it.
 118                 if (!(stream instanceof BufferedInputStream)) {
 119                     stream = new BufferedInputStream(stream, fTempString.ch.length);
 120                 }
 121             }
 122             else {
 123                 String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
 124 
 125                 URL url = new URL(expandedSystemId);
 126                 URLConnection urlCon = url.openConnection();
 127 
 128                 // If this is an HTTP connection attach any request properties to the request.
 129                 if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
 130                     final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
 131                     final HTTPInputSource httpInputSource = (HTTPInputSource) source;
 132 
 133                     // set request properties
 134                     Iterator propIter = httpInputSource.getHTTPRequestProperties();
 135                     while (propIter.hasNext()) {
 136                         Map.Entry entry = (Map.Entry) propIter.next();
 137                         urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
 138                     }
 139 
 140                     // set preference for redirection
 141                     boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
 142                     if (!followRedirects) {
 143                         XMLEntityManager.setInstanceFollowRedirects(urlConnection, followRedirects);
 144                     }
 145                 }
 146 
 147                 // Wrap the InputStream so that it is possible to rewind it.
 148                 stream = new BufferedInputStream(urlCon.getInputStream());
 149 
 150                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
 151                 String rawContentType = urlCon.getContentType();
 152 
 153                 // text/xml and application/xml offer only one optional parameter
 154                 int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
 155 
 156                 String contentType = null;
 157                 String charset = null;
 158                 if (index != -1) {
 159                     // this should be something like "text/xml"
 160                     contentType = rawContentType.substring(0, index).trim();
 161 
 162                     // this should be something like "charset=UTF-8", but we want to
 163                     // strip it down to just "UTF-8"
 164                     charset = rawContentType.substring(index + 1).trim();
 165                     if (charset.startsWith("charset=")) {
 166                         // 8 is the length of "charset="
 167                         charset = charset.substring(8).trim();
 168                         // strip quotes, if present
 169                         if ((charset.charAt(0) == '"'
 170                             && charset.charAt(charset.length() - 1) == '"')
 171                             || (charset.charAt(0) == '\''
 172                                 && charset.charAt(charset.length() - 1)
 173                                     == '\'')) {
 174                             charset =
 175                                 charset.substring(1, charset.length() - 1);
 176                         }
 177                     }
 178                     else {
 179                         charset = null;
 180                     }
 181                 }
 182                 else {
 183                     contentType = rawContentType.trim();
 184                 }
 185 
 186                 String detectedEncoding = null;
 187                 /**  The encoding of such a resource is determined by:
 188                     1 external encoding information, if available, otherwise
 189                          -- the most common type of external information is the "charset" parameter of a MIME package
 190                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
 191                     3 the value of the encoding attribute if one exists, otherwise
 192                     4 UTF-8.
 193                  **/
 194                 if (contentType.equals("text/xml")) {
 195                     if (charset != null) {
 196                         detectedEncoding = charset;
 197                     }
 198                     else {
 199                         // see RFC2376 or 3023, section 3.1
 200                         detectedEncoding = "US-ASCII";
 201                     }
 202                 }
 203                 else if (contentType.equals("application/xml")) {
 204                     if (charset != null) {
 205                         detectedEncoding = charset;
 206                     }
 207                     else {
 208                         // see RFC2376 or 3023, section 3.2
 209                         detectedEncoding = getEncodingName(stream);
 210                     }
 211                 }
 212                 else if (contentType.endsWith("+xml")) {
 213                     detectedEncoding = getEncodingName(stream);
 214                 }
 215 
 216                 if (detectedEncoding != null) {
 217                     encoding = detectedEncoding;
 218                 }
 219                 // else 3 or 4.
 220             }
 221 
 222             encoding = encoding.toUpperCase(Locale.ENGLISH);
 223 
 224             // eat the Byte Order Mark
 225             encoding = consumeBOM(stream, encoding);
 226 
 227             // If the document is UTF-8 or US-ASCII use
 228             // the Xerces readers for these encodings. For
 229             // US-ASCII consult the encoding map since
 230             // this encoding has many aliases.
 231             if (encoding.equals("UTF-8")) {
 232                 return new UTF8Reader(stream,
 233                     fTempString.ch.length,
 234                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
 235                     fErrorReporter.getLocale() );
 236             }
 237 
 238             // Try to use a Java reader.
 239             String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
 240 
 241             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
 242             // The XIncludeHandler will report this as a ResourceError and then will
 243             // attempt to include a fallback if there is one.
 244             if (javaEncoding == null) {
 245                 MessageFormatter aFormatter =
 246                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
 247                 Locale aLocale = fErrorReporter.getLocale();
 248                 throw new IOException( aFormatter.formatMessage( aLocale,
 249                     "EncodingDeclInvalid",
 250                     new Object[] {encoding} ) );
 251             }
 252             else if (javaEncoding.equals("ASCII")) {
 253                 return new ASCIIReader(stream,
 254                     fTempString.ch.length,
 255                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
 256                     fErrorReporter.getLocale() );
 257             }
 258 
 259             return new InputStreamReader(stream, javaEncoding);
 260         }
 261     }
 262 
 263     /**
 264      * XMLEntityManager cares about endian-ness, since it creates its own optimized
 265      * readers. Since we're just using generic Java readers for now, we're not caring
 266      * about endian-ness.  If this changes, even more code needs to be copied from
 267      * XMLEntity manager. -- PJM
 268      */
 269     protected String getEncodingName(InputStream stream) throws IOException {
 270         final byte[] b4 = new byte[4];
 271         String encoding = null;
 272 
 273         // this has the potential to throw an exception
 274         // it will be fixed when we ensure the stream is rewindable (see note above)
 275         stream.mark(4);
 276         int count = stream.read(b4, 0, 4);
 277         stream.reset();
 278         if (count == 4) {
 279             encoding = getEncodingName(b4);
 280         }
 281 
 282         return encoding;
 283     }
 284 
 285     /**
 286      * Removes the byte order mark from the stream, if
 287      * it exists and returns the encoding name.
 288      *
 289      * @param stream
 290      * @param encoding
 291      * @throws IOException
 292      */
 293     protected String consumeBOM(InputStream stream, String encoding)
 294         throws IOException {
 295 
 296         byte[] b = new byte[3];
 297         int count = 0;
 298         stream.mark(3);
 299         if (encoding.equals("UTF-8")) {
 300             count = stream.read(b, 0, 3);
 301             if (count == 3) {
 302                 final int b0 = b[0] & 0xFF;
 303                 final int b1 = b[1] & 0xFF;
 304                 final int b2 = b[2] & 0xFF;
 305                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
 306                     // First three bytes are not BOM, so reset.
 307                     stream.reset();
 308                 }
 309             }
 310             else {
 311                 stream.reset();
 312             }
 313         }
 314         else if (encoding.startsWith("UTF-16")) {
 315             count = stream.read(b, 0, 2);
 316             if (count == 2) {
 317                 final int b0 = b[0] & 0xFF;
 318                 final int b1 = b[1] & 0xFF;
 319                 if (b0 == 0xFE && b1 == 0xFF) {
 320                     return "UTF-16BE";
 321                 }
 322                 else if (b0 == 0xFF && b1 == 0xFE) {
 323                     return "UTF-16LE";
 324                 }
 325             }
 326             // First two bytes are not BOM, so reset.
 327             stream.reset();
 328         }
 329         // We could do UTF-32, but since the getEncodingName() doesn't support that
 330         // we won't support it here.
 331         // To implement UTF-32, look for:  00 00 FE FF for big-endian
 332         //                             or  FF FE 00 00 for little-endian
 333         return encoding;
 334     }
 335 
 336     /**
 337      * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
 338      *          Is there any way we can share the code, without having it implemented twice?
 339      *          I think we should make it public and static in XMLEntityManager. --PJM
 340      *
 341      * Returns the IANA encoding name that is auto-detected from
 342      * the bytes specified, with the endian-ness of that encoding where appropriate.
 343      *
 344      * @param b4    The first four bytes of the input.
 345      * @return the encoding name, or null if no encoding could be detected
 346      */
 347     protected String getEncodingName(byte[] b4) {
 348 
 349         // UTF-16, with BOM
 350         int b0 = b4[0] & 0xFF;
 351         int b1 = b4[1] & 0xFF;
 352         if (b0 == 0xFE && b1 == 0xFF) {
 353             // UTF-16, big-endian
 354             return "UTF-16BE";
 355         }
 356         if (b0 == 0xFF && b1 == 0xFE) {
 357             // UTF-16, little-endian
 358             return "UTF-16LE";
 359         }
 360 
 361         // UTF-8 with a BOM
 362         int b2 = b4[2] & 0xFF;
 363         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
 364             return "UTF-8";
 365         }
 366 
 367         // other encodings
 368         int b3 = b4[3] & 0xFF;
 369         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
 370             // UCS-4, big endian (1234)
 371             return "ISO-10646-UCS-4";
 372         }
 373         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
 374             // UCS-4, little endian (4321)
 375             return "ISO-10646-UCS-4";
 376         }
 377         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
 378             // UCS-4, unusual octet order (2143)
 379             return "ISO-10646-UCS-4";
 380         }
 381         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
 382             // UCS-4, unusual octect order (3412)
 383             return "ISO-10646-UCS-4";
 384         }
 385         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
 386             // UTF-16, big-endian, no BOM
 387             // (or could turn out to be UCS-2...
 388             return "UTF-16BE";
 389         }
 390         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
 391             // UTF-16, little-endian, no BOM
 392             // (or could turn out to be UCS-2...
 393             return "UTF-16LE";
 394         }
 395         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
 396             // EBCDIC
 397             // a la xerces1, return CP037 instead of EBCDIC here
 398             return "CP037";
 399         }
 400 
 401         // this signals us to use the value from the encoding attribute
 402         return null;
 403 
 404     } // getEncodingName(byte[]):Object[]
 405 
 406     /**
 407      * Read the input stream as text, and pass the text on to the XIncludeHandler
 408      * using calls to characters().  This will read all of the text it can from the
 409      * resource.
 410      *
 411      * @throws IOException
 412      */
 413     public void parse() throws IOException {
 414 
 415         fReader = getReader(fSource);
 416         fSource = null;
 417         int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
 418         while (readSize != -1) {
 419             for (int i = 0; i < readSize; ++i) {
 420                 char ch = fTempString.ch[i];
 421                 if (!isValid(ch)) {
 422                     if (XMLChar.isHighSurrogate(ch)) {
 423                         int ch2;
 424                         // retrieve next character
 425                         if (++i < readSize) {
 426                             ch2 = fTempString.ch[i];
 427                         }
 428                         // handle rare boundary case
 429                         else {
 430                             ch2 = fReader.read();
 431                             if (ch2 != -1) {
 432                                 fTempString.ch[readSize++] = (char) ch2;
 433                             }
 434                         }
 435                         if (XMLChar.isLowSurrogate(ch2)) {
 436                             // convert surrogates to a supplemental character
 437                             int sup = XMLChar.supplemental(ch, (char)ch2);
 438                             if (!isValid(sup)) {
 439                                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 440                                                            "InvalidCharInContent",
 441                                                            new Object[] { Integer.toString(sup, 16) },
 442                                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
 443                             }
 444                         }
 445                         else {
 446                             fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 447                                                        "InvalidCharInContent",
 448                                                        new Object[] { Integer.toString(ch2, 16) },
 449                                                        XMLErrorReporter.SEVERITY_FATAL_ERROR);
 450                         }
 451                     }
 452                     else {
 453                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
 454                                                    "InvalidCharInContent",
 455                                                    new Object[] { Integer.toString(ch, 16) },
 456                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
 457                     }
 458                 }
 459             }
 460             if (fHandler != null && readSize > 0) {
 461                 fTempString.offset = 0;
 462                 fTempString.length = readSize;
 463                 fHandler.characters(
 464                     fTempString,
 465                     fHandler.modifyAugmentations(null, true));
 466             }
 467             readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
 468         }
 469 
 470     }
 471 
 472     /**
 473      * Sets the input source on this text reader.
 474      *
 475      * @param source The XMLInputSource to use.
 476      */
 477     public void setInputSource(XMLInputSource source) {
 478         fSource = source;
 479     }
 480 
 481     /**
 482      * Closes the stream.  Call this after parse(), or when there is no longer any need
 483      * for this object.
 484      *
 485      * @throws IOException
 486      */
 487     public void close() throws IOException {
 488         if (fReader != null) {
 489             fReader.close();
 490             fReader = null;
 491         }
 492     }
 493 
 494     /**
 495      * Returns true if the specified character is a valid XML character
 496      * as per the rules of XML 1.0.
 497      *
 498      * @param ch The character to check.
 499      */
 500     protected boolean isValid(int ch) {
 501         return XMLChar.isValid(ch);
 502     }
 503 
 504     /**
 505      * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
 506      * by the reader at a time and passed to the handler
 507      *
 508      * @param bufferSize The size of the buffer desired
 509      */
 510     protected void setBufferSize(int bufferSize) {
 511         if (fTempString.ch.length != ++bufferSize) {
 512             fTempString.ch = new char[bufferSize];
 513         }
 514     }
 515 
 516 }