open Sdiff src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl

src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLEntityManager.java

   1 /*
   2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl ;
  22 
  23 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
  24 import com.sun.org.apache.xerces.internal.impl.io.UCSReader;

  25 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
  26 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  27 import com.sun.org.apache.xerces.internal.impl.validation.ValidationManager;
  28 import com.sun.org.apache.xerces.internal.util.*;
  29 import com.sun.org.apache.xerces.internal.util.URI;
  30 import com.sun.org.apache.xerces.internal.utils.XMLLimitAnalyzer;
  31 import com.sun.org.apache.xerces.internal.utils.XMLSecurityManager;
  32 import com.sun.org.apache.xerces.internal.utils.XMLSecurityPropertyManager;
  33 import com.sun.org.apache.xerces.internal.xni.Augmentations;
  34 import com.sun.org.apache.xerces.internal.xni.XMLResourceIdentifier;
  35 import com.sun.org.apache.xerces.internal.xni.XNIException;
  36 import com.sun.org.apache.xerces.internal.xni.parser.*;
  37 import com.sun.xml.internal.stream.Entity;
  38 import com.sun.xml.internal.stream.StaxEntityResolverWrapper;
  39 import com.sun.xml.internal.stream.StaxXMLInputSource;
  40 import com.sun.xml.internal.stream.XMLEntityStorage;
  41 import java.io.*;
  42 import java.net.HttpURLConnection;
  43 import java.net.URISyntaxException;
  44 import java.net.URL;

  72  * xni.
  73  * <p>
  74  * This component requires the following features and properties from the
  75  * component manager that uses it:
  76  * <ul>
  77  *  <li>http://xml.org/sax/features/validation</li>
  78  *  <li>http://xml.org/sax/features/external-general-entities</li>
  79  *  <li>http://xml.org/sax/features/external-parameter-entities</li>
  80  *  <li>http://apache.org/xml/features/allow-java-encodings</li>
  81  *  <li>http://apache.org/xml/properties/internal/symbol-table</li>
  82  *  <li>http://apache.org/xml/properties/internal/error-reporter</li>
  83  *  <li>http://apache.org/xml/properties/internal/entity-resolver</li>
  84  * </ul>
  85  *
  86  *
  87  * @author Andy Clark, IBM
  88  * @author Arnaud  Le Hors, IBM
  89  * @author K.Venugopal SUN Microsystems
  90  * @author Neeraj Bajaj SUN Microsystems
  91  * @author Sunitha Reddy SUN Microsystems
  92  * @LastModified: Nov 2018
  93  */
  94 public class XMLEntityManager implements XMLComponent, XMLEntityResolver {
  95 
  96     //
  97     // Constants
  98     //
  99 
  100     /** Default buffer size (8192). */
 101     public static final int DEFAULT_BUFFER_SIZE = 8192;
 102 
 103     /** Default buffer size before we've finished with the XMLDecl:  */
 104     public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
 105 
 106     /** Default internal entity buffer size (1024). */
 107     public static final int DEFAULT_INTERNAL_BUFFER_SIZE = 1024;
 108 
 109     // feature identifiers
 110 
 111     /** Feature identifier: validation. */
 112     protected static final String VALIDATION =

 395     protected Entity.ScannedEntity fCurrentEntity = null;
 396 
 397     /** identify if the InputSource is created by a resolver */
 398     boolean fISCreatedByResolver = false;
 399 
 400     // shared context
 401 
 402     protected XMLEntityStorage fEntityStorage ;
 403 
 404     protected final Object [] defaultEncoding = new Object[]{"UTF-8", null};
 405 
 406 
 407     // temp vars
 408 
  409     /** Resource identifier. */
 410     private final XMLResourceIdentifierImpl fResourceIdentifier = new XMLResourceIdentifierImpl();
 411 
 412     /** Augmentations for entities. */
 413     private final Augmentations fEntityAugs = new AugmentationsImpl();
 414 
 415     /** Pool of character buffers. */
 416     private CharacterBufferPool fBufferPool = new CharacterBufferPool(fBufferSize, DEFAULT_INTERNAL_BUFFER_SIZE);
 417 
 418     /** indicate whether Catalog should be used for resolving external resources */
 419     private boolean fUseCatalog = true;
 420     CatalogFeatures fCatalogFeatures;
 421     CatalogResolver fCatalogResolver;
 422 
 423     private String fCatalogFile;
 424     private String fDefer;
 425     private String fPrefer;
 426     private String fResolve;
 427 
 428     //
 429     // Constructors
 430     //
 431 
 432     /**
 433      * If this constructor is used to create the object, reset() should be invoked on this object
 434      */
 435     public XMLEntityManager() {
 436         //for entity managers not created by parsers
 437         fSecurityManager = new XMLSecurityManager(true);

 677 
 678                     stream = connect.getInputStream();
 679 
 680                     // REVISIT: If the URLConnection has external encoding
 681                     // information, we should be reading it here. It's located
 682                     // in the charset parameter of Content-Type. -- mrglavas
 683 
 684                     if (followRedirects) {
 685                         String redirect = connect.getURL().toString();
 686                         // E43: Check if the URL was redirected, and then
 687                         // update literal and expanded system IDs if needed.
 688                         if (!redirect.equals(expandedSystemId)) {
 689                             literalSystemId = redirect;
 690                             expandedSystemId = redirect;
 691                         }
 692                     }
 693                 }
 694             }
 695 
 696             // wrap this stream in RewindableInputStream
 697             stream = new RewindableInputStream(stream);

 698 
 699             // perform auto-detect of encoding if necessary
 700             if (encoding == null) {
 701                 // read first four bytes and determine encoding
 702                 final byte[] b4 = new byte[4];
 703                 int count = 0;
 704                 for (; count<4; count++ ) {
 705                     b4[count] = (byte)stream.read();
 706                 }
 707                 if (count == 4) {
 708                     Object [] encodingDesc = getEncodingName(b4, count);
 709                     encoding = (String)(encodingDesc[0]);
 710                     isBigEndian = (Boolean)(encodingDesc[1]);
 711 
 712                     stream.reset();

 713                     // Special case UTF-8 files with BOM created by Microsoft
 714                     // tools. It's more efficient to consume the BOM than make
 715                     // the reader perform extra checks. -Ac
 716                     if (count > 2 && encoding.equals("UTF-8")) {
 717                         int b0 = b4[0] & 0xFF;
 718                         int b1 = b4[1] & 0xFF;
 719                         int b2 = b4[2] & 0xFF;
 720                         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
 721                             // ignore first three bytes...
 722                             stream.skip(3);
 723                         }





 724                     }
 725                     reader = createReader(stream, encoding, isBigEndian);

 726                 } else {
 727                     reader = createReader(stream, encoding, isBigEndian);
 728                 }
 729             }
 730 
 731             // use specified encoding
 732             else {
 733                 encoding = encoding.toUpperCase(Locale.ENGLISH);
 734 
 735                 // If encoding is UTF-8, consume BOM if one is present.
 736                 if (encoding.equals("UTF-8")) {
 737                     final int[] b3 = new int[3];
 738                     int count = 0;
 739                     for (; count < 3; ++count) {
 740                         b3[count] = stream.read();
 741                         if (b3[count] == -1)
 742                             break;
 743                     }
 744                     if (count == 3) {
 745                         if (b3[0] != 0xEF || b3[1] != 0xBB || b3[2] != 0xBF) {
 746                             // First three bytes are not BOM, so reset.
 747                             stream.reset();
 748                         }
 749                     } else {
 750                         stream.reset();
 751                     }
 752                 }
 753                 // If encoding is UTF-16, we still need to read the first four bytes
 754                 // in order to discover the byte order.
 755                 else if (encoding.equals("UTF-16")) {
 756                     final int[] b4 = new int[4];
 757                     int count = 0;
 758                     for (; count < 4; ++count) {
 759                         b4[count] = stream.read();
 760                         if (b4[count] == -1)
 761                             break;
 762                     }
 763                     stream.reset();
 764 
 765                     String utf16Encoding = "UTF-16";
 766                     if (count >= 2) {
 767                         final int b0 = b4[0];
 768                         final int b1 = b4[1];
 769                         if (b0 == 0xFE && b1 == 0xFF) {
 770                             // UTF-16, big-endian
 771                             utf16Encoding = "UTF-16BE";
 772                             isBigEndian = Boolean.TRUE;

 773                         }
 774                         else if (b0 == 0xFF && b1 == 0xFE) {
 775                             // UTF-16, little-endian
 776                             utf16Encoding = "UTF-16LE";
 777                             isBigEndian = Boolean.FALSE;

 778                         }
 779                         else if (count == 4) {
 780                             final int b2 = b4[2];
 781                             final int b3 = b4[3];
 782                             if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
 783                                 // UTF-16, big-endian, no BOM
 784                                 utf16Encoding = "UTF-16BE";
 785                                 isBigEndian = Boolean.TRUE;
 786                             }
 787                             if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
 788                                 // UTF-16, little-endian, no BOM
 789                                 utf16Encoding = "UTF-16LE";
 790                                 isBigEndian = Boolean.FALSE;
 791                             }
 792                         }
 793                     }
 794                     reader = createReader(stream, utf16Encoding, isBigEndian);
 795                 }
 796                 // If encoding is UCS-4, we still need to read the first four bytes
 797                 // in order to discover the byte order.
 798                 else if (encoding.equals("ISO-10646-UCS-4")) {
 799                     final int[] b4 = new int[4];
 800                     int count = 0;
 801                     for (; count < 4; ++count) {
 802                         b4[count] = stream.read();
 803                         if (b4[count] == -1)
 804                             break;
 805                     }
 806                     stream.reset();
 807 
 808                     // Ignore unusual octet order for now.
 809                     if (count == 4) {
 810                         // UCS-4, big endian (1234)
 811                         if (b4[0] == 0x00 && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x3C) {
 812                             isBigEndian = Boolean.TRUE;
 813                         }
  814                         // UCS-4, little endian (4321)
 815                         else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x00) {
 816                             isBigEndian = Boolean.FALSE;
 817                         }
 818                     }
 819                 }
 820                 // If encoding is UCS-2, we still need to read the first four bytes
 821                 // in order to discover the byte order.
 822                 else if (encoding.equals("ISO-10646-UCS-2")) {
 823                     final int[] b4 = new int[4];
 824                     int count = 0;
 825                     for (; count < 4; ++count) {
 826                         b4[count] = stream.read();
 827                         if (b4[count] == -1)
 828                             break;
 829                     }
 830                     stream.reset();
 831 
 832                     if (count == 4) {
 833                         // UCS-2, big endian
 834                         if (b4[0] == 0x00 && b4[1] == 0x3C && b4[2] == 0x00 && b4[3] == 0x3F) {
 835                             isBigEndian = Boolean.TRUE;
 836                         }
 837                         // UCS-2, little endian
 838                         else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x3F && b4[3] == 0x00) {
 839                             isBigEndian = Boolean.FALSE;
 840                         }
 841                     }
 842                 }
 843 
 844                 reader = createReader(stream, encoding, isBigEndian);
 845             }
 846

1781                 fSymbolTable = (SymbolTable)value;
1782                 return;
1783             }
1784             if (suffixLength == Constants.ERROR_REPORTER_PROPERTY.length() &&
1785                 propertyId.endsWith(Constants.ERROR_REPORTER_PROPERTY)) {
1786                 fErrorReporter = (XMLErrorReporter)value;
1787                 return;
1788             }
1789             if (suffixLength == Constants.ENTITY_RESOLVER_PROPERTY.length() &&
1790                 propertyId.endsWith(Constants.ENTITY_RESOLVER_PROPERTY)) {
1791                 fEntityResolver = (XMLEntityResolver)value;
1792                 return;
1793             }
1794             if (suffixLength == Constants.BUFFER_SIZE_PROPERTY.length() &&
1795                 propertyId.endsWith(Constants.BUFFER_SIZE_PROPERTY)) {
1796                 Integer bufferSize = (Integer)value;
1797                 if (bufferSize != null &&
1798                     bufferSize.intValue() > DEFAULT_XMLDECL_BUFFER_SIZE) {
1799                     fBufferSize = bufferSize.intValue();
1800                     fEntityScanner.setBufferSize(fBufferSize);
1801                     fBufferPool.setExternalBufferSize(fBufferSize);
1802                 }
1803             }
1804             if (suffixLength == Constants.SECURITY_MANAGER_PROPERTY.length() &&
1805                 propertyId.endsWith(Constants.SECURITY_MANAGER_PROPERTY)) {
1806                 fSecurityManager = (XMLSecurityManager)value;
1807             }
1808         }
1809 
1810         //JAXP 1.5 properties
1811         if (propertyId.equals(XML_SECURITY_PROPERTY_MANAGER))
1812         {
1813             XMLSecurityPropertyManager spm = (XMLSecurityPropertyManager)value;
1814             fAccessExternalDTD = spm.getValue(XMLSecurityPropertyManager.Property.ACCESS_EXTERNAL_DTD);
1815             return;
1816         }
1817 
1818         //Catalog properties
1819         if (propertyId.equals(JdkXmlUtils.CATALOG_FILES)) {
1820             fCatalogFile = (String)value;
1821         } else if (propertyId.equals(JdkXmlUtils.CATALOG_DEFER)) {

2408         systemURI = (new java.net.URI(baseURI.toString())).resolve(systemURI);
2409 
2410         // return the string rep of the new uri (an absolute one)
2411         return systemURI.toString();
2412 
2413         // if any exception is thrown, it'll get thrown to the caller.
2414 
2415     } // expandSystemIdStrictOff(String,String):String
2416 
2417     //
2418     // Protected methods
2419     //
2420 
2421 
2422     /**
2423      * Returns the IANA encoding name that is auto-detected from
2424      * the bytes specified, with the endian-ness of that encoding where appropriate.
2425      *
2426      * @param b4    The first four bytes of the input.
2427      * @param count The number of bytes actually read.
2428      * @return a 2-element array:  the first element, an IANA-encoding string,
2429      *  the second element a Boolean which is true iff the document is big endian, false
2430      *  if it's little-endian, and null if the distinction isn't relevant.
2431      */
2432     protected Object[] getEncodingName(byte[] b4, int count) {
2433 
2434         if (count < 2) {
2435             return defaultEncoding;
2436         }
2437 
2438         // UTF-16, with BOM
2439         int b0 = b4[0] & 0xFF;
2440         int b1 = b4[1] & 0xFF;
2441         if (b0 == 0xFE && b1 == 0xFF) {
2442             // UTF-16, big-endian
2443             return new Object [] {"UTF-16BE", true};
2444         }
2445         if (b0 == 0xFF && b1 == 0xFE) {
2446             // UTF-16, little-endian
2447             return new Object [] {"UTF-16LE", false};
2448         }
2449 
2450         // default to UTF-8 if we don't have enough bytes to make a
2451         // good determination of the encoding
2452         if (count < 3) {
2453             return defaultEncoding;
2454         }
2455 
2456         // UTF-8 with a BOM
2457         int b2 = b4[2] & 0xFF;
2458         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2459             return defaultEncoding;
2460         }
2461 
2462         // default to UTF-8 if we don't have enough bytes to make a
2463         // good determination of the encoding
2464         if (count < 4) {
2465             return defaultEncoding;
2466         }
2467 
2468         // other encodings
2469         int b3 = b4[3] & 0xFF;
2470         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2471             // UCS-4, big endian (1234)
2472             return new Object [] {"ISO-10646-UCS-4", true};
2473         }
2474         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2475             // UCS-4, little endian (4321)
2476             return new Object [] {"ISO-10646-UCS-4", false};
2477         }
2478         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2479             // UCS-4, unusual octet order (2143)
2480             // REVISIT: What should this be?
2481             return new Object [] {"ISO-10646-UCS-4", null};
2482         }
2483         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2484             // UCS-4, unusual octect order (3412)
2485             // REVISIT: What should this be?
2486             return new Object [] {"ISO-10646-UCS-4", null};
2487         }
2488         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2489             // UTF-16, big-endian, no BOM
2490             // (or could turn out to be UCS-2...
2491             // REVISIT: What should this be?
2492             return new Object [] {"UTF-16BE", true};
2493         }
2494         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2495             // UTF-16, little-endian, no BOM
2496             // (or could turn out to be UCS-2...
2497             return new Object [] {"UTF-16LE", false};
2498         }
2499         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2500             // EBCDIC
2501             // a la xerces1, return CP037 instead of EBCDIC here
2502             return new Object [] {"CP037", null};
2503         }
2504 
2505         return defaultEncoding;

2506 
2507     } // getEncodingName(byte[],int):Object[]
2508 
2509     /**
2510      * Creates a reader capable of reading the given input stream in
2511      * the specified encoding.
2512      *
2513      * @param inputStream  The input stream.
2514      * @param encoding     The encoding name that the input stream is
2515      *                     encoded using. If the user has specified that
2516      *                     Java encoding names are allowed, then the
2517      *                     encoding name may be a Java encoding name;
2518      *                     otherwise, it is an ianaEncoding name.
2519      * @param isBigEndian   For encodings (like uCS-4), whose names cannot
2520      *                      specify a byte order, this tells whether the order is bigEndian.  null menas
2521      *                      unknown or not relevant.
2522      *
2523      * @return Returns a reader.
2524      */
2525     protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
2526     throws IOException {
2527 
2528         // normalize encoding name
2529         if (encoding == null) {
2530             encoding = "UTF-8";







2531         }
2532 
2533         // try to use an optimized reader
2534         String ENCODING = encoding.toUpperCase(Locale.ENGLISH);
2535         if (ENCODING.equals("UTF-8")) {
2536             if (DEBUG_ENCODINGS) {
2537                 System.out.println("$$$ creating UTF8Reader");
2538             }
2539             return new UTF8Reader(inputStream, fBufferSize, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() );
2540         }
2541         if (ENCODING.equals("US-ASCII")) {
2542             if (DEBUG_ENCODINGS) {
2543                 System.out.println("$$$ creating ASCIIReader");
2544             }
2545             return new ASCIIReader(inputStream, fBufferSize, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale());
2546         }
2547         if(ENCODING.equals("ISO-10646-UCS-4")) {
2548             if(isBigEndian != null) {
2549                 boolean isBE = isBigEndian.booleanValue();
2550                 if(isBE) {
2551                     return new UCSReader(inputStream, UCSReader.UCS4BE);
2552                 } else {
2553                     return new UCSReader(inputStream, UCSReader.UCS4LE);
2554                 }
2555             } else {
2556                 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,

2557                         "EncodingByteOrderUnsupported",
2558                         new Object[] { encoding },
2559                         XMLErrorReporter.SEVERITY_FATAL_ERROR);
2560             }
2561         }
2562         if(ENCODING.equals("ISO-10646-UCS-2")) {
2563             if(isBigEndian != null) { // sould never happen with this encoding...
2564                 boolean isBE = isBigEndian.booleanValue();
2565                 if(isBE) {
2566                     return new UCSReader(inputStream, UCSReader.UCS2BE);
2567                 } else {
2568                     return new UCSReader(inputStream, UCSReader.UCS2LE);
2569                 }
2570             } else {
2571                 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,

2572                         "EncodingByteOrderUnsupported",
2573                         new Object[] { encoding },
2574                         XMLErrorReporter.SEVERITY_FATAL_ERROR);
2575             }

2576         }
2577 
2578         // check for valid name
2579         boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
2580         boolean validJava = XMLChar.isValidJavaEncoding(encoding);
2581         if (!validIANA || (fAllowJavaEncodings && !validJava)) {
2582             fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,

2583                     "EncodingDeclInvalid",
2584                     new Object[] { encoding },
2585                     XMLErrorReporter.SEVERITY_FATAL_ERROR);
2586                     // NOTE: AndyH suggested that, on failure, we use ISO Latin 1
2587                     //       because every byte is a valid ISO Latin 1 character.
2588                     //       It may not translate correctly but if we failed on
2589                     //       the encoding anyway, then we're expecting the content
2590                     //       of the document to be bad. This will just prevent an
2591                     //       invalid UTF-8 sequence to be detected. This is only
2592                     //       important when continue-after-fatal-error is turned
2593                     //       on. -Ac
2594                     encoding = "ISO-8859-1";
2595         }
2596 
2597         // try to use a Java reader
2598         String javaEncoding = EncodingMap.getIANA2JavaMapping(ENCODING);
2599         if (javaEncoding == null) {
2600             if(fAllowJavaEncodings) {
2601                 javaEncoding = encoding;
2602             } else {
2603                 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,

2604                         "EncodingDeclInvalid",
2605                         new Object[] { encoding },
2606                         XMLErrorReporter.SEVERITY_FATAL_ERROR);
2607                         // see comment above.
2608                         javaEncoding = "ISO8859_1";
2609             }
2610         }
2611         if (DEBUG_ENCODINGS) {
2612             System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding);
2613             if (javaEncoding == encoding) {
2614                 System.out.print(" (IANA encoding)");
2615             }
2616             System.out.println();
2617         }
2618         return new BufferedReader( new InputStreamReader(inputStream, javaEncoding));
2619 
2620     } // createReader(InputStream,String, Boolean): Reader
2621 
2622 
2623     /**

2881                             }
2882                         }
2883                     }
2884                     if (fCurrentEntity.position == fCurrentEntity.count) {
2885                         System.out.print('^');
2886                     }
2887                     System.out.print('"');
2888                 }
2889                 System.out.print(']');
2890                 System.out.print(" @ ");
2891                 System.out.print(fCurrentEntity.lineNumber);
2892                 System.out.print(',');
2893                 System.out.print(fCurrentEntity.columnNumber);
2894             } else {
2895                 System.out.print("*NO CURRENT ENTITY*");
2896             }
2897         }
2898     } // print()
2899 
2900     /**
2901      * Buffer used in entity manager to reuse character arrays instead
2902      * of creating new ones every time.
2903      *
2904      * @xerces.internal
2905      *
2906      * @author Ankit Pasricha, IBM
2907      */
2908     private static class CharacterBuffer {
2909 
2910         /** character buffer */
2911         private char[] ch;
2912 
2913         /** whether the buffer is for an external or internal scanned entity */
2914         private boolean isExternal;
2915 
2916         public CharacterBuffer(boolean isExternal, int size) {
2917             this.isExternal = isExternal;
2918             ch = new char[size];
2919         }
2920     }
2921 
2922 
2923      /**
2924      * Stores a number of character buffers and provides it to the entity
2925      * manager to use when an entity is seen.
2926      *
2927      * @xerces.internal
2928      *
2929      * @author Ankit Pasricha, IBM
2930      */
2931     private static class CharacterBufferPool {
2932 
2933         private static final int DEFAULT_POOL_SIZE = 3;
2934 
2935         private CharacterBuffer[] fInternalBufferPool;
2936         private CharacterBuffer[] fExternalBufferPool;
2937 
2938         private int fExternalBufferSize;
2939         private int fInternalBufferSize;
2940         private int poolSize;
2941 
2942         private int fInternalTop;
2943         private int fExternalTop;
2944 
2945         public CharacterBufferPool(int externalBufferSize, int internalBufferSize) {
2946             this(DEFAULT_POOL_SIZE, externalBufferSize, internalBufferSize);
2947         }















































2948 
2949         public CharacterBufferPool(int poolSize, int externalBufferSize, int internalBufferSize) {
2950             fExternalBufferSize = externalBufferSize;
2951             fInternalBufferSize = internalBufferSize;
2952             this.poolSize = poolSize;
2953             init();
2954         }
2955 
2956         /** Initializes buffer pool. **/
2957         private void init() {
2958             fInternalBufferPool = new CharacterBuffer[poolSize];
2959             fExternalBufferPool = new CharacterBuffer[poolSize];
2960             fInternalTop = -1;
2961             fExternalTop = -1;
2962         }
2963 
2964         /** Retrieves buffer from pool. **/
2965         public CharacterBuffer getBuffer(boolean external) {
2966             if (external) {
2967                 if (fExternalTop > -1) {
2968                     return fExternalBufferPool[fExternalTop--];
2969                 }
2970                 else {
2971                     return new CharacterBuffer(true, fExternalBufferSize);
2972                 }
2973             }
2974             else {
2975                 if (fInternalTop > -1) {
2976                     return fInternalBufferPool[fInternalTop--];
2977                 }
2978                 else {
2979                     return new CharacterBuffer(false, fInternalBufferSize);
2980                 }
2981             }
2982         }
2983 
2984         /** Returns buffer to pool. **/
2985         public void returnToPool(CharacterBuffer buffer) {
2986             if (buffer.isExternal) {
2987                 if (fExternalTop < fExternalBufferPool.length - 1) {
2988                     fExternalBufferPool[++fExternalTop] = buffer;
2989                 }
2990             }
2991             else if (fInternalTop < fInternalBufferPool.length - 1) {
2992                 fInternalBufferPool[++fInternalTop] = buffer;
2993             }
2994         }
2995 
2996         /** Sets the size of external buffers and dumps the old pool. **/
2997         public void setExternalBufferSize(int bufferSize) {
2998             fExternalBufferSize = bufferSize;
2999             fExternalBufferPool = new CharacterBuffer[poolSize];
3000             fExternalTop = -1;
3001         }
3002     }
3003 
3004     /**
3005     * This class wraps the byte inputstreams we're presented with.
3006     * We need it because java.io.InputStreams don't provide
3007     * functionality to reread processed bytes, and they have a habit
3008     * of reading more than one character when you call their read()
3009     * methods.  This means that, once we discover the true (declared)
3010     * encoding of a document, we can neither backtrack to read the
3011     * whole doc again nor start reading where we are with a new
3012     * reader.
3013     *
3014     * This class allows rewinding an inputStream by allowing a mark
3015     * to be set, and the stream reset to that position.  <strong>The
3016     * class assumes that it needs to read one character per
 3017     * invocation when its read() method is invoked, but uses the
 3018     * underlying InputStream's read(byte[], offset, length) method--it
3019     * won't buffer data read this way!</strong>
3020     *
3021     * @xerces.internal
3022     *

3035         private int fMark;
3036 
3037         public RewindableInputStream(InputStream is) {
3038             fData = new byte[DEFAULT_XMLDECL_BUFFER_SIZE];
3039             fInputStream = is;
3040             fStartOffset = 0;
3041             fEndOffset = -1;
3042             fOffset = 0;
3043             fLength = 0;
3044             fMark = 0;
3045         }
3046 
3047         public void setStartOffset(int offset) {
3048             fStartOffset = offset;
3049         }
3050 
3051         public void rewind() {
3052             fOffset = fStartOffset;
3053         }
3054 
3055         public int read() throws IOException {
3056             int b = 0;
3057             if (fOffset < fLength) {
3058                 return fData[fOffset++] & 0xff;
3059             }
3060             if (fOffset == fEndOffset) {
3061                 return -1;
3062             }
3063             if (fOffset == fData.length) {
3064                 byte[] newData = new byte[fOffset << 1];
3065                 System.arraycopy(fData, 0, newData, 0, fOffset);
3066                 fData = newData;
3067             }
3068             b = fInputStream.read();
3069             if (b == -1) {
3070                 fEndOffset = fOffset;
3071                 return -1;
3072             }
3073             fData[fLength++] = (byte)b;
3074             fOffset++;
3075             return b & 0xff;
3076         }
3077 













3078         public int read(byte[] b, int off, int len) throws IOException {
3079             int bytesLeft = fLength - fOffset;
3080             if (bytesLeft == 0) {
3081                 if (fOffset == fEndOffset) {
3082                     return -1;
3083                 }
3084 
3085                 /**
3086                  * //System.out.println("fCurrentEntitty = " + fCurrentEntity );
3087                  * //System.out.println("fInputStream = " + fInputStream );
3088                  * // better get some more for the voracious reader... */
3089 
3090                 if(fCurrentEntity.mayReadChunks || !fCurrentEntity.xmlDeclChunkRead) {
3091 
3092                     if (!fCurrentEntity.xmlDeclChunkRead)
3093                     {
3094                         fCurrentEntity.xmlDeclChunkRead = true;
3095                         len = Entity.ScannedEntity.DEFAULT_XMLDECL_BUFFER_SIZE;
3096                     }
3097                     return fInputStream.read(b, off, len);
3098                 }
3099 
3100                 int returnedVal = read();
3101                 if(returnedVal == -1) {
3102                   fEndOffset = fOffset;
3103                   return -1;
3104                 }
3105                 b[off] = (byte)returnedVal;
3106                 return 1;
3107 
3108             }
3109             if (len < bytesLeft) {
3110                 if (len <= 0) {
3111                     return 0;
3112                 }
3113             } else {
3114                 len = bytesLeft;
3115             }
3116             if (b != null) {
3117                 System.arraycopy(fData, fOffset, b, off, len);
3118             }
3119             fOffset += len;
3120             return len;
3121         }
3122 
3123         public long skip(long n)
3124         throws IOException {
3125             int bytesLeft;
3126             if (n <= 0) {
3127                 return 0;
3128             }
3129             bytesLeft = fLength - fOffset;
3130             if (bytesLeft == 0) {
3131                 if (fOffset == fEndOffset) {
3132                     return 0;
3133                 }
3134                 return fInputStream.skip(n);
3135             }
3136             if (n <= bytesLeft) {
3137                 fOffset += n;
3138                 return n;
3139             }
3140             fOffset += bytesLeft;
3141             if (fOffset == fEndOffset) {
3142                 return bytesLeft;
3143             }
3144             n -= bytesLeft;
3145             /*
3146             * In a manner of speaking, when this class isn't permitting more
3147             * than one byte at a time to be read, it is "blocking".  The
3148             * available() method should indicate how much can be read without
3149             * blocking, so while we're in this mode, it should only indicate
3150             * that bytes in its buffer are available; otherwise, the result of
3151             * available() on the underlying InputStream is appropriate.
3152             */
3153             return fInputStream.skip(n) + bytesLeft;
3154         }
3155 
3156         public int available() throws IOException {
3157             int bytesLeft = fLength - fOffset;
3158             if (bytesLeft == 0) {
3159                 if (fOffset == fEndOffset) {
3160                     return -1;
3161                 }
3162                 return fCurrentEntity.mayReadChunks ? fInputStream.available()
3163                 : 0;
3164             }
3165             return bytesLeft;
3166         }
3167 
3168         public void mark(int howMuch) {
3169             fMark = fOffset;
3170         }
3171 
3172         public void reset() {
3173             fOffset = fMark;
3174             //test();
3175         }
3176 
3177         public boolean markSupported() {
3178             return true;
3179         }
3180 
3181         public void close() throws IOException {
3182             if (fInputStream != null) {
3183                 fInputStream.close();
3184                 fInputStream = null;
3185             }
3186         }
3187     } // end of RewindableInputStream class
3188 
3189     public void test(){
3190         //System.out.println("TESTING: Added familytree to entityManager");
3191         //Usecase1
3192         fEntityStorage.addExternalEntity("entityUsecase1",null,
3193                 "/space/home/stax/sun/6thJan2004/zephyr/data/test.txt",
3194                 "/space/home/stax/sun/6thJan2004/zephyr/data/entity.xml");

   1 /*
   2  * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl ;
  22 
  23 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
  24 import com.sun.org.apache.xerces.internal.impl.io.UCSReader;
  25 import com.sun.org.apache.xerces.internal.impl.io.UTF16Reader;
  26 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
  27 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  28 import com.sun.org.apache.xerces.internal.impl.validation.ValidationManager;
  29 import com.sun.org.apache.xerces.internal.util.*;
  30 import com.sun.org.apache.xerces.internal.util.URI;
  31 import com.sun.org.apache.xerces.internal.utils.XMLLimitAnalyzer;
  32 import com.sun.org.apache.xerces.internal.utils.XMLSecurityManager;
  33 import com.sun.org.apache.xerces.internal.utils.XMLSecurityPropertyManager;
  34 import com.sun.org.apache.xerces.internal.xni.Augmentations;
  35 import com.sun.org.apache.xerces.internal.xni.XMLResourceIdentifier;
  36 import com.sun.org.apache.xerces.internal.xni.XNIException;
  37 import com.sun.org.apache.xerces.internal.xni.parser.*;
  38 import com.sun.xml.internal.stream.Entity;
  39 import com.sun.xml.internal.stream.StaxEntityResolverWrapper;
  40 import com.sun.xml.internal.stream.StaxXMLInputSource;
  41 import com.sun.xml.internal.stream.XMLEntityStorage;
  42 import java.io.*;
  43 import java.net.HttpURLConnection;
  44 import java.net.URISyntaxException;
  45 import java.net.URL;

  73  * xni.
  74  * <p>
  75  * This component requires the following features and properties from the
  76  * component manager that uses it:
  77  * <ul>
  78  *  <li>http://xml.org/sax/features/validation</li>
  79  *  <li>http://xml.org/sax/features/external-general-entities</li>
  80  *  <li>http://xml.org/sax/features/external-parameter-entities</li>
  81  *  <li>http://apache.org/xml/features/allow-java-encodings</li>
  82  *  <li>http://apache.org/xml/properties/internal/symbol-table</li>
  83  *  <li>http://apache.org/xml/properties/internal/error-reporter</li>
  84  *  <li>http://apache.org/xml/properties/internal/entity-resolver</li>
  85  * </ul>
  86  *
  87  *
  88  * @author Andy Clark, IBM
  89  * @author Arnaud  Le Hors, IBM
  90  * @author K.Venugopal SUN Microsystems
  91  * @author Neeraj Bajaj SUN Microsystems
  92  * @author Sunitha Reddy SUN Microsystems
  93  * @LastModified: Apr 2019
  94  */
  95 public class XMLEntityManager implements XMLComponent, XMLEntityResolver {
  96 
  97     //
  98     // Constants
  99     //
 100 
 101     /** Default buffer size (8192). */
 102     public static final int DEFAULT_BUFFER_SIZE = 8192;
 103 
 104     /** Default buffer size before we've finished with the XMLDecl:  */
 105     public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
 106 
 107     /** Default internal entity buffer size (1024). */
 108     public static final int DEFAULT_INTERNAL_BUFFER_SIZE = 1024;
 109 
 110     // feature identifiers
 111 
 112     /** Feature identifier: validation. */
 113     protected static final String VALIDATION =

 396     protected Entity.ScannedEntity fCurrentEntity = null;
 397 
 398     /** identify if the InputSource is created by a resolver */
 399     boolean fISCreatedByResolver = false;
 400 
 401     // shared context
 402 
 403     protected XMLEntityStorage fEntityStorage ;
 404 
 405     protected final Object [] defaultEncoding = new Object[]{"UTF-8", null};
 406 
 407 
 408     // temp vars
 409 
 410     /** Resource identifier. */
 411     private final XMLResourceIdentifierImpl fResourceIdentifier = new XMLResourceIdentifierImpl();
 412 
 413     /** Augmentations for entities. */
 414     private final Augmentations fEntityAugs = new AugmentationsImpl();
 415 



 416     /** indicate whether Catalog should be used for resolving external resources */
 417     private boolean fUseCatalog = true;
 418     CatalogFeatures fCatalogFeatures;
 419     CatalogResolver fCatalogResolver;
 420 
 421     private String fCatalogFile;
 422     private String fDefer;
 423     private String fPrefer;
 424     private String fResolve;
 425 
 426     //
 427     // Constructors
 428     //
 429 
 430     /**
 431      * If this constructor is used to create the object, reset() should be invoked on this object
 432      */
 433     public XMLEntityManager() {
 434         //for entity managers not created by parsers
 435         fSecurityManager = new XMLSecurityManager(true);

 675 
 676                     stream = connect.getInputStream();
 677 
 678                     // REVISIT: If the URLConnection has external encoding
 679                     // information, we should be reading it here. It's located
 680                     // in the charset parameter of Content-Type. -- mrglavas
 681 
 682                     if (followRedirects) {
 683                         String redirect = connect.getURL().toString();
 684                         // E43: Check if the URL was redirected, and then
 685                         // update literal and expanded system IDs if needed.
 686                         if (!redirect.equals(expandedSystemId)) {
 687                             literalSystemId = redirect;
 688                             expandedSystemId = redirect;
 689                         }
 690                     }
 691                 }
 692             }
 693 
 694             // wrap this stream in RewindableInputStream
 695             RewindableInputStream rewindableStream = new RewindableInputStream(stream);
 696             stream = rewindableStream;
 697 
 698             // perform auto-detect of encoding if necessary
 699             if (encoding == null) {
 700                 // read first four bytes and determine encoding
 701                 final byte[] b4 = new byte[4];
 702                 int count = 0;
 703                 for (; count<4; count++ ) {
 704                     b4[count] = (byte)rewindableStream.readAndBuffer();
 705                 }
 706                 if (count == 4) {
 707                     final EncodingInfo info = getEncodingInfo(b4, count);
 708                     encoding = info.autoDetectedEncoding;
 709                     final String readerEncoding = info.readerEncoding;
 710                     isBigEndian = info.isBigEndian;
 711                     stream.reset();
 712                     if (info.hasBOM) {
 713                         // Special case UTF-8 files with BOM created by Microsoft
 714                         // tools. It's more efficient to consume the BOM than make
 715                         // the reader perform extra checks. -Ac
 716                         if (EncodingInfo.STR_UTF8.equals(readerEncoding)) {
 717                             // UTF-8 BOM: 0xEF 0xBB 0xBF




 718                             stream.skip(3);
 719                         }
 720                         // It's also more efficient to consume the UTF-16 BOM.
 721                         else if (EncodingInfo.STR_UTF16.equals(readerEncoding)) {
 722                             // UTF-16 BE BOM: 0xFE 0xFF
 723                             // UTF-16 LE BOM: 0xFF 0xFE
 724                             stream.skip(2);
 725                         }
 726                     }
 727                     reader = createReader(stream, readerEncoding, isBigEndian);
 728                 } else {
 729                     reader = createReader(stream, encoding, isBigEndian);
 730                 }
 731             }
 732 
 733             // use specified encoding
 734             else {
 735                 encoding = encoding.toUpperCase(Locale.ENGLISH);
 736 
 737                 // If encoding is UTF-8, consume BOM if one is present.
 738                 if (EncodingInfo.STR_UTF8.equals(encoding)) {
 739                     final int[] b3 = new int[3];
 740                     int count = 0;
 741                     for (; count < 3; ++count) {
 742                         b3[count] = rewindableStream.readAndBuffer();
 743                         if (b3[count] == -1)
 744                             break;
 745                     }
 746                     if (count == 3) {
 747                         if (b3[0] != 0xEF || b3[1] != 0xBB || b3[2] != 0xBF) {
 748                             // First three bytes are not BOM, so reset.
 749                             stream.reset();
 750                         }
 751                     } else {
 752                         stream.reset();
 753                     }
 754                 }
 755                 // If encoding is UTF-16, we still need to read the first
 756                 // four bytes, in order to discover the byte order.
 757                 else if (EncodingInfo.STR_UTF16.equals(encoding)) {
 758                     final int[] b4 = new int[4];
 759                     int count = 0;
 760                     for (; count < 4; ++count) {
 761                         b4[count] = rewindableStream.readAndBuffer();
 762                         if (b4[count] == -1)
 763                             break;
 764                     }
 765                     stream.reset();


 766                     if (count >= 2) {
 767                         final int b0 = b4[0];
 768                         final int b1 = b4[1];
 769                         if (b0 == 0xFE && b1 == 0xFF) {
 770                             // UTF-16, big-endian

 771                             isBigEndian = Boolean.TRUE;
 772                             stream.skip(2);
 773                         }
 774                         else if (b0 == 0xFF && b1 == 0xFE) {
 775                             // UTF-16, little-endian

 776                             isBigEndian = Boolean.FALSE;
 777                             stream.skip(2);
 778                         }
 779                         else if (count == 4) {
 780                             final int b2 = b4[2];
 781                             final int b3 = b4[3];
 782                             if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
 783                                 // UTF-16, big-endian, no BOM

 784                                 isBigEndian = Boolean.TRUE;
 785                             }
 786                             if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
 787                                 // UTF-16, little-endian, no BOM

 788                                 isBigEndian = Boolean.FALSE;
 789                             }
 790                         }
 791                     }

 792                 }
 793                 // If encoding is UCS-4, we still need to read the first four bytes
 794                 // in order to discover the byte order.
 795                 else if (EncodingInfo.STR_UCS4.equals(encoding)) {
 796                     final int[] b4 = new int[4];
 797                     int count = 0;
 798                     for (; count < 4; ++count) {
 799                         b4[count] = rewindableStream.readAndBuffer();
 800                         if (b4[count] == -1)
 801                             break;
 802                     }
 803                     stream.reset();
 804 
 805                     // Ignore unusual octet order for now.
 806                     if (count == 4) {
 807                         // UCS-4, big endian (1234)
 808                         if (b4[0] == 0x00 && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x3C) {
 809                             isBigEndian = Boolean.TRUE;
 810                         }
 811                         // UCS-4, little endian (4321)
 812                         else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x00) {
 813                             isBigEndian = Boolean.FALSE;
 814                         }
 815                     }
 816                 }
 817                 // If encoding is UCS-2, we still need to read the first four bytes
 818                 // in order to discover the byte order.
 819                 else if (EncodingInfo.STR_UCS2.equals(encoding)) {
 820                     final int[] b4 = new int[4];
 821                     int count = 0;
 822                     for (; count < 4; ++count) {
 823                         b4[count] = rewindableStream.readAndBuffer();
 824                         if (b4[count] == -1)
 825                             break;
 826                     }
 827                     stream.reset();
 828 
 829                     if (count == 4) {
 830                         // UCS-2, big endian
 831                         if (b4[0] == 0x00 && b4[1] == 0x3C && b4[2] == 0x00 && b4[3] == 0x3F) {
 832                             isBigEndian = Boolean.TRUE;
 833                         }
 834                         // UCS-2, little endian
 835                         else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x3F && b4[3] == 0x00) {
 836                             isBigEndian = Boolean.FALSE;
 837                         }
 838                     }
 839                 }
 840 
 841                 reader = createReader(stream, encoding, isBigEndian);
 842             }
 843

1778                 fSymbolTable = (SymbolTable)value;
1779                 return;
1780             }
1781             if (suffixLength == Constants.ERROR_REPORTER_PROPERTY.length() &&
1782                 propertyId.endsWith(Constants.ERROR_REPORTER_PROPERTY)) {
1783                 fErrorReporter = (XMLErrorReporter)value;
1784                 return;
1785             }
1786             if (suffixLength == Constants.ENTITY_RESOLVER_PROPERTY.length() &&
1787                 propertyId.endsWith(Constants.ENTITY_RESOLVER_PROPERTY)) {
1788                 fEntityResolver = (XMLEntityResolver)value;
1789                 return;
1790             }
1791             if (suffixLength == Constants.BUFFER_SIZE_PROPERTY.length() &&
1792                 propertyId.endsWith(Constants.BUFFER_SIZE_PROPERTY)) {
1793                 Integer bufferSize = (Integer)value;
1794                 if (bufferSize != null &&
1795                     bufferSize.intValue() > DEFAULT_XMLDECL_BUFFER_SIZE) {
1796                     fBufferSize = bufferSize.intValue();
1797                     fEntityScanner.setBufferSize(fBufferSize);

1798                 }
1799             }
1800             if (suffixLength == Constants.SECURITY_MANAGER_PROPERTY.length() &&
1801                 propertyId.endsWith(Constants.SECURITY_MANAGER_PROPERTY)) {
1802                 fSecurityManager = (XMLSecurityManager)value;
1803             }
1804         }
1805 
1806         //JAXP 1.5 properties
1807         if (propertyId.equals(XML_SECURITY_PROPERTY_MANAGER))
1808         {
1809             XMLSecurityPropertyManager spm = (XMLSecurityPropertyManager)value;
1810             fAccessExternalDTD = spm.getValue(XMLSecurityPropertyManager.Property.ACCESS_EXTERNAL_DTD);
1811             return;
1812         }
1813 
1814         //Catalog properties
1815         if (propertyId.equals(JdkXmlUtils.CATALOG_FILES)) {
1816             fCatalogFile = (String)value;
1817         } else if (propertyId.equals(JdkXmlUtils.CATALOG_DEFER)) {

2404         systemURI = (new java.net.URI(baseURI.toString())).resolve(systemURI);
2405 
2406         // return the string rep of the new uri (an absolute one)
2407         return systemURI.toString();
2408 
2409         // if any exception is thrown, it'll get thrown to the caller.
2410 
2411     } // expandSystemIdStrictOff(String,String):String
2412 
2413     //
2414     // Protected methods
2415     //
2416 
2417 
2418     /**
2419      * Returns the IANA encoding name that is auto-detected from
2420      * the bytes specified, with the endian-ness of that encoding where appropriate.
2421      *
2422      * @param b4    The first four bytes of the input.
2423      * @param count The number of bytes actually read.
2424      * @return an instance of EncodingInfo which represents the auto-detected encoding.


2425      */
2426     protected EncodingInfo getEncodingInfo(byte[] b4, int count) {
2427 
2428         if (count < 2) {
2429             return EncodingInfo.UTF_8;
2430         }
2431 
2432         // UTF-16, with BOM
2433         int b0 = b4[0] & 0xFF;
2434         int b1 = b4[1] & 0xFF;
2435         if (b0 == 0xFE && b1 == 0xFF) {
2436             // UTF-16, big-endian
2437             return EncodingInfo.UTF_16_BIG_ENDIAN_WITH_BOM;
2438         }
2439         if (b0 == 0xFF && b1 == 0xFE) {
2440             // UTF-16, little-endian
2441             return EncodingInfo.UTF_16_LITTLE_ENDIAN_WITH_BOM;
2442         }
2443 
2444         // default to UTF-8 if we don't have enough bytes to make a
2445         // good determination of the encoding
2446         if (count < 3) {
2447             return EncodingInfo.UTF_8;
2448         }
2449 
2450         // UTF-8 with a BOM
2451         int b2 = b4[2] & 0xFF;
2452         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2453             return EncodingInfo.UTF_8_WITH_BOM;
2454         }
2455 
2456         // default to UTF-8 if we don't have enough bytes to make a
2457         // good determination of the encoding
2458         if (count < 4) {
2459             return EncodingInfo.UTF_8;
2460         }
2461 
2462         // other encodings
2463         int b3 = b4[3] & 0xFF;
2464         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2465             // UCS-4, big endian (1234)
2466             return EncodingInfo.UCS_4_BIG_ENDIAN;
2467         }
2468         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2469             // UCS-4, little endian (4321)
2470             return EncodingInfo.UCS_4_LITTLE_ENDIAN;
2471         }
2472         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2473             // UCS-4, unusual octet order (2143)
2474             // REVISIT: What should this be?
2475             return EncodingInfo.UCS_4_UNUSUAL_BYTE_ORDER;
2476         }
2477         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2478             // UCS-4, unusual octect order (3412)
2479             // REVISIT: What should this be?
2480             return EncodingInfo.UCS_4_UNUSUAL_BYTE_ORDER;
2481         }
2482         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2483             // UTF-16, big-endian, no BOM
2484             // (or could turn out to be UCS-2...
2485             // REVISIT: What should this be?
2486             return EncodingInfo.UTF_16_BIG_ENDIAN;
2487         }
2488         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2489             // UTF-16, little-endian, no BOM
2490             // (or could turn out to be UCS-2...
2491             return EncodingInfo.UTF_16_LITTLE_ENDIAN;
2492         }
2493         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2494             // EBCDIC
2495             // a la xerces1, return CP037 instead of EBCDIC here
2496             return EncodingInfo.EBCDIC;
2497         }
2498 
2499         // default encoding
2500         return EncodingInfo.UTF_8;
2501 
2502     } // getEncodingName(byte[],int):Object[]
2503 
2504     /**
2505      * Creates a reader capable of reading the given input stream in
2506      * the specified encoding.
2507      *
2508      * @param inputStream  The input stream.
2509      * @param encoding     The encoding name that the input stream is
2510      *                     encoded using. If the user has specified that
2511      *                     Java encoding names are allowed, then the
2512      *                     encoding name may be a Java encoding name;
2513      *                     otherwise, it is an ianaEncoding name.
2514      * @param isBigEndian   For encodings (like uCS-4), whose names cannot
2515      *                      specify a byte order, this tells whether the order
2516      *                      is bigEndian.  null if unknown or irrelevant.
2517      *
2518      * @return Returns a reader.
2519      */
2520     protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
2521         throws IOException {
2522 
2523         String enc = (encoding != null) ? encoding : EncodingInfo.STR_UTF8;
2524         enc = enc.toUpperCase(Locale.ENGLISH);
2525         MessageFormatter f = fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
2526         Locale l = fErrorReporter.getLocale();
2527         switch (enc) {
2528             case EncodingInfo.STR_UTF8:
2529                 return new UTF8Reader(inputStream, fBufferSize, f, l);
2530             case EncodingInfo.STR_UTF16:
2531                 if (isBigEndian != null) {
2532                     return new UTF16Reader(inputStream, fBufferSize, isBigEndian, f, l);
2533                 }
2534                 break;
2535             case EncodingInfo.STR_UTF16BE:
2536                 return new UTF16Reader(inputStream, fBufferSize, true, f, l);
2537             case EncodingInfo.STR_UTF16LE:
2538                 return new UTF16Reader(inputStream, fBufferSize, false, f, l);
2539             case EncodingInfo.STR_UCS4:










2540                 if(isBigEndian != null) {
2541                     if(isBigEndian) {

2542                         return new UCSReader(inputStream, UCSReader.UCS4BE);
2543                     } else {
2544                         return new UCSReader(inputStream, UCSReader.UCS4LE);
2545                     }
2546                 } else {
2547                     fErrorReporter.reportError(this.getEntityScanner(),
2548                             XMLMessageFormatter.XML_DOMAIN,
2549                             "EncodingByteOrderUnsupported",
2550                             new Object[] { encoding },
2551                             XMLErrorReporter.SEVERITY_FATAL_ERROR);
2552                 }
2553                 break;
2554             case EncodingInfo.STR_UCS2:
2555                 if(isBigEndian != null) {
2556                     if(isBigEndian) {

2557                         return new UCSReader(inputStream, UCSReader.UCS2BE);
2558                     } else {
2559                         return new UCSReader(inputStream, UCSReader.UCS2LE);
2560                     }
2561                 } else {
2562                     fErrorReporter.reportError(this.getEntityScanner(),
2563                             XMLMessageFormatter.XML_DOMAIN,
2564                             "EncodingByteOrderUnsupported",
2565                             new Object[] { encoding },
2566                             XMLErrorReporter.SEVERITY_FATAL_ERROR);
2567                 }
2568                 break;
2569         }
2570 
2571         // check for valid name
2572         boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
2573         boolean validJava = XMLChar.isValidJavaEncoding(encoding);
2574         if (!validIANA || (fAllowJavaEncodings && !validJava)) {
2575             fErrorReporter.reportError(this.getEntityScanner(),
2576                     XMLMessageFormatter.XML_DOMAIN,
2577                     "EncodingDeclInvalid",
2578                     new Object[] { encoding },
2579                     XMLErrorReporter.SEVERITY_FATAL_ERROR);
2580             // NOTE: AndyH suggested that, on failure, we use ISO Latin 1
2581             //       because every byte is a valid ISO Latin 1 character.
2582             //       It may not translate correctly but if we failed on
2583             //       the encoding anyway, then we're expecting the content
2584             //       of the document to be bad. This will just prevent an
2585             //       invalid UTF-8 sequence to be detected. This is only
2586             //       important when continue-after-fatal-error is turned
2587             //       on. -Ac
2588                     encoding = "ISO-8859-1";
2589         }
2590 
2591         // try to use a Java reader
2592         String javaEncoding = EncodingMap.getIANA2JavaMapping(enc);
2593         if (javaEncoding == null) {
2594             if (fAllowJavaEncodings) {
2595                 javaEncoding = encoding;
2596             } else {
2597                 fErrorReporter.reportError(this.getEntityScanner(),
2598                         XMLMessageFormatter.XML_DOMAIN,
2599                         "EncodingDeclInvalid",
2600                         new Object[] { encoding },
2601                         XMLErrorReporter.SEVERITY_FATAL_ERROR);
2602                 // see comment above.
2603                 javaEncoding = "ISO8859_1";
2604             }
2605         }
2606         if (DEBUG_ENCODINGS) {
2607             System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding);
2608             if (javaEncoding == encoding) {
2609                 System.out.print(" (IANA encoding)");
2610             }
2611             System.out.println();
2612         }
2613         return new BufferedReader( new InputStreamReader(inputStream, javaEncoding));
2614 
2615     } // createReader(InputStream,String, Boolean): Reader
2616 
2617 
2618     /**

2876                             }
2877                         }
2878                     }
2879                     if (fCurrentEntity.position == fCurrentEntity.count) {
2880                         System.out.print('^');
2881                     }
2882                     System.out.print('"');
2883                 }
2884                 System.out.print(']');
2885                 System.out.print(" @ ");
2886                 System.out.print(fCurrentEntity.lineNumber);
2887                 System.out.print(',');
2888                 System.out.print(fCurrentEntity.columnNumber);
2889             } else {
2890                 System.out.print("*NO CURRENT ENTITY*");
2891             }
2892         }
2893     } // print()
2894 
2895     /**
2896      * Information about auto-detectable encodings.
























2897      *
2898      * @xerces.internal
2899      *
2900      * @author Michael Glavassevich, IBM
2901      */
2902     private static class EncodingInfo {
2903         public static final String STR_UTF8 = "UTF-8";
2904         public static final String STR_UTF16 = "UTF-16";
2905         public static final String STR_UTF16BE = "UTF-16BE";
2906         public static final String STR_UTF16LE = "UTF-16LE";
2907         public static final String STR_UCS4 = "ISO-10646-UCS-4";
2908         public static final String STR_UCS2 = "ISO-10646-UCS-2";
2909         public static final String STR_CP037 = "CP037";
2910 
2911         /** UTF-8 **/
2912         public static final EncodingInfo UTF_8 =
2913                 new EncodingInfo(STR_UTF8, null, false);
2914 
2915         /** UTF-8, with BOM **/
2916         public static final EncodingInfo UTF_8_WITH_BOM =
2917                 new EncodingInfo(STR_UTF8, null, true);
2918 
2919         /** UTF-16, big-endian **/
2920         public static final EncodingInfo UTF_16_BIG_ENDIAN =
2921                 new EncodingInfo(STR_UTF16BE, STR_UTF16, Boolean.TRUE, false);
2922 
2923         /** UTF-16, big-endian with BOM **/
2924         public static final EncodingInfo UTF_16_BIG_ENDIAN_WITH_BOM =
2925                 new EncodingInfo(STR_UTF16BE, STR_UTF16, Boolean.TRUE, true);
2926 
2927         /** UTF-16, little-endian **/
2928         public static final EncodingInfo UTF_16_LITTLE_ENDIAN =
2929                 new EncodingInfo(STR_UTF16LE, STR_UTF16, Boolean.FALSE, false);
2930 
2931         /** UTF-16, little-endian with BOM **/
2932         public static final EncodingInfo UTF_16_LITTLE_ENDIAN_WITH_BOM =
2933                 new EncodingInfo(STR_UTF16LE, STR_UTF16, Boolean.FALSE, true);
2934 
2935         /** UCS-4, big-endian **/
2936         public static final EncodingInfo UCS_4_BIG_ENDIAN =
2937                 new EncodingInfo(STR_UCS4, Boolean.TRUE, false);
2938 
2939         /** UCS-4, little-endian **/
2940         public static final EncodingInfo UCS_4_LITTLE_ENDIAN =
2941                 new EncodingInfo(STR_UCS4, Boolean.FALSE, false);
2942 
2943         /** UCS-4, unusual byte-order (2143) or (3412) **/
2944         public static final EncodingInfo UCS_4_UNUSUAL_BYTE_ORDER =
2945                 new EncodingInfo(STR_UCS4, null, false);
2946 
2947         /** EBCDIC **/
2948         public static final EncodingInfo EBCDIC = new EncodingInfo(STR_CP037, null, false);
2949 
2950         public final String autoDetectedEncoding;
2951         public final String readerEncoding;
2952         public final Boolean isBigEndian;
2953         public final boolean hasBOM;
2954 
2955         private EncodingInfo(String autoDetectedEncoding, Boolean isBigEndian, boolean hasBOM) {
2956             this(autoDetectedEncoding, autoDetectedEncoding, isBigEndian, hasBOM);
2957         } // <init>(String,Boolean,boolean)
2958 
2959         private EncodingInfo(String autoDetectedEncoding, String readerEncoding,
2960                 Boolean isBigEndian, boolean hasBOM) {
2961             this.autoDetectedEncoding = autoDetectedEncoding;
2962             this.readerEncoding = readerEncoding;
2963             this.isBigEndian = isBigEndian;
2964             this.hasBOM = hasBOM;
2965         } // <init>(String,String,Boolean,boolean)
2966 
2967     } // class EncodingInfo





















































2968 
2969     /**
2970     * This class wraps the byte inputstreams we're presented with.
2971     * We need it because java.io.InputStreams don't provide
2972     * functionality to reread processed bytes, and they have a habit
2973     * of reading more than one character when you call their read()
2974     * methods.  This means that, once we discover the true (declared)
2975     * encoding of a document, we can neither backtrack to read the
2976     * whole doc again nor start reading where we are with a new
2977     * reader.
2978     *
2979     * This class allows rewinding an inputStream by allowing a mark
2980     * to be set, and the stream reset to that position.  <strong>The
2981     * class assumes that it needs to read one character per
2982     * invocation when its read() method is invoked, but uses the
2983     * underlying InputStream's read(byte[], offset, length) method--it
2984     * won't buffer data read this way!</strong>
2985     *
2986     * @xerces.internal
2987     *

3000         private int fMark;
3001 
/**
 * Wraps the given stream, starting with an empty rewind buffer.
 *
 * @param is the underlying stream that bytes are read from
 */
public RewindableInputStream(InputStream is) {
    fInputStream = is;
    fData = new byte[DEFAULT_XMLDECL_BUFFER_SIZE];

    // Nothing has been buffered yet, so every cursor starts at zero.
    fLength = 0;
    fOffset = 0;
    fMark = 0;
    fStartOffset = 0;

    // No end-of-stream offset has been recorded yet.
    fEndOffset = -1;
}
3011 
3012         public void setStartOffset(int offset) {
3013             fStartOffset = offset;
3014         }
3015 
3016         public void rewind() {
3017             fOffset = fStartOffset;
3018         }
3019 
3020         public int readAndBuffer() throws IOException {







3021             if (fOffset == fData.length) {
3022                 byte[] newData = new byte[fOffset << 1];
3023                 System.arraycopy(fData, 0, newData, 0, fOffset);
3024                 fData = newData;
3025             }
3026             final int b = fInputStream.read();
3027             if (b == -1) {
3028                 fEndOffset = fOffset;
3029                 return -1;
3030             }
3031             fData[fLength++] = (byte)b;
3032             fOffset++;
3033             return b & 0xff;
3034         }
3035 
3036         public int read() throws IOException {
3037             if (fOffset < fLength) {
3038                 return fData[fOffset++] & 0xff;
3039             }
3040             if (fOffset == fEndOffset) {
3041                 return -1;
3042             }
3043             if (fCurrentEntity.mayReadChunks) {
3044                 return fInputStream.read();
3045             }
3046             return readAndBuffer();
3047         }
3048 
3049         public int read(byte[] b, int off, int len) throws IOException {
3050             final int bytesLeft = fLength - fOffset;
3051             if (bytesLeft == 0) {
3052                 if (fOffset == fEndOffset) {
3053                     return -1;
3054                 }
3055 
3056                 // read a block of data as requested




3057                 if(fCurrentEntity.mayReadChunks || !fCurrentEntity.xmlDeclChunkRead) {
3058 
3059                     if (!fCurrentEntity.xmlDeclChunkRead)
3060                     {
3061                         fCurrentEntity.xmlDeclChunkRead = true;
3062                         len = Entity.ScannedEntity.DEFAULT_XMLDECL_BUFFER_SIZE;
3063                     }
3064                     return fInputStream.read(b, off, len);
3065                 }
3066                 int returnedVal = readAndBuffer();
3067                 if (returnedVal == -1) {

3068                     fEndOffset = fOffset;
3069                     return -1;
3070                 }
3071                 b[off] = (byte)returnedVal;
3072                 return 1;

3073             }
3074             if (len < bytesLeft) {
3075                 if (len <= 0) {
3076                     return 0;
3077                 }
3078             } else {
3079                 len = bytesLeft;
3080             }
3081             if (b != null) {
3082                 System.arraycopy(fData, fOffset, b, off, len);
3083             }
3084             fOffset += len;
3085             return len;
3086         }
3087 
3088         public long skip(long n) throws IOException {

3089             int bytesLeft;
3090             if (n <= 0) {
3091                 return 0;
3092             }
3093             bytesLeft = fLength - fOffset;
3094             if (bytesLeft == 0) {
3095                 if (fOffset == fEndOffset) {
3096                     return 0;
3097                 }
3098                 return fInputStream.skip(n);
3099             }
3100             if (n <= bytesLeft) {
3101                 fOffset += n;
3102                 return n;
3103             }
3104             fOffset += bytesLeft;
3105             if (fOffset == fEndOffset) {
3106                 return bytesLeft;
3107             }
3108             n -= bytesLeft;
3109            /*
3110             * In a manner of speaking, when this class isn't permitting more
3111             * than one byte at a time to be read, it is "blocking".  The
3112             * available() method should indicate how much can be read without
3113             * blocking, so while we're in this mode, it should only indicate
3114             * that bytes in its buffer are available; otherwise, the result of
3115             * available() on the underlying InputStream is appropriate.
3116             */
3117             return fInputStream.skip(n) + bytesLeft;
3118         }
3119 
3120         public int available() throws IOException {
3121             final int bytesLeft = fLength - fOffset;
3122             if (bytesLeft == 0) {
3123                 if (fOffset == fEndOffset) {
3124                     return -1;
3125                 }
3126                 return fCurrentEntity.mayReadChunks ? fInputStream.available()
3127                                                     : 0;
3128             }
3129             return bytesLeft;
3130         }
3131 
3132         public void mark(int howMuch) {
3133             fMark = fOffset;
3134         }
3135 
3136         public void reset() {
3137             fOffset = fMark;

3138         }
3139 
3140         public boolean markSupported() {
3141             return true;
3142         }
3143 
3144         public void close() throws IOException {
3145             if (fInputStream != null) {
3146                 fInputStream.close();
3147                 fInputStream = null;
3148             }
3149         }
3150     } // end of RewindableInputStream class
3151 
3152     public void test(){
3153         //System.out.println("TESTING: Added familytree to entityManager");
3154         //Usecase1
3155         fEntityStorage.addExternalEntity("entityUsecase1",null,
3156                 "/space/home/stax/sun/6thJan2004/zephyr/data/test.txt",
3157                 "/space/home/stax/sun/6thJan2004/zephyr/data/entity.xml");

< prev index next >