1 /*
2 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
3 */
4 /*
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements. See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 package com.sun.org.apache.xerces.internal.impl ;
22
23 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
24 import com.sun.org.apache.xerces.internal.impl.io.UCSReader;
25 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
26 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
27 import com.sun.org.apache.xerces.internal.impl.validation.ValidationManager;
28 import com.sun.org.apache.xerces.internal.util.*;
29 import com.sun.org.apache.xerces.internal.util.URI;
30 import com.sun.org.apache.xerces.internal.utils.XMLLimitAnalyzer;
31 import com.sun.org.apache.xerces.internal.utils.XMLSecurityManager;
32 import com.sun.org.apache.xerces.internal.utils.XMLSecurityPropertyManager;
33 import com.sun.org.apache.xerces.internal.xni.Augmentations;
34 import com.sun.org.apache.xerces.internal.xni.XMLResourceIdentifier;
35 import com.sun.org.apache.xerces.internal.xni.XNIException;
36 import com.sun.org.apache.xerces.internal.xni.parser.*;
37 import com.sun.xml.internal.stream.Entity;
38 import com.sun.xml.internal.stream.StaxEntityResolverWrapper;
39 import com.sun.xml.internal.stream.StaxXMLInputSource;
40 import com.sun.xml.internal.stream.XMLEntityStorage;
41 import java.io.*;
42 import java.net.HttpURLConnection;
43 import java.net.URISyntaxException;
44 import java.net.URL;
72 * xni.
73 * <p>
74 * This component requires the following features and properties from the
75 * component manager that uses it:
76 * <ul>
77 * <li>http://xml.org/sax/features/validation</li>
78 * <li>http://xml.org/sax/features/external-general-entities</li>
79 * <li>http://xml.org/sax/features/external-parameter-entities</li>
80 * <li>http://apache.org/xml/features/allow-java-encodings</li>
81 * <li>http://apache.org/xml/properties/internal/symbol-table</li>
82 * <li>http://apache.org/xml/properties/internal/error-reporter</li>
83 * <li>http://apache.org/xml/properties/internal/entity-resolver</li>
84 * </ul>
85 *
86 *
87 * @author Andy Clark, IBM
88 * @author Arnaud Le Hors, IBM
89 * @author K.Venugopal SUN Microsystems
90 * @author Neeraj Bajaj SUN Microsystems
91 * @author Sunitha Reddy SUN Microsystems
92 * @LastModified: Nov 2018
93 */
94 public class XMLEntityManager implements XMLComponent, XMLEntityResolver {
95
96 //
97 // Constants
98 //
99
100 /** Default buffer size (8192). */
101 public static final int DEFAULT_BUFFER_SIZE = 8192;
102
103 /** Default buffer size before we've finished with the XMLDecl: */
104 public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
105
106 /** Default internal entity buffer size (1024). */
107 public static final int DEFAULT_INTERNAL_BUFFER_SIZE = 1024;
108
109 // feature identifiers
110
111 /** Feature identifier: validation. */
112 protected static final String VALIDATION =
395 protected Entity.ScannedEntity fCurrentEntity = null;
396
397 /** identify if the InputSource is created by a resolver */
398 boolean fISCreatedByResolver = false;
399
400 // shared context
401
402 protected XMLEntityStorage fEntityStorage ;
403
404 protected final Object [] defaultEncoding = new Object[]{"UTF-8", null};
405
406
407 // temp vars
408
409 /** Resource identifier. */
410 private final XMLResourceIdentifierImpl fResourceIdentifier = new XMLResourceIdentifierImpl();
411
412 /** Augmentations for entities. */
413 private final Augmentations fEntityAugs = new AugmentationsImpl();
414
415 /** Pool of character buffers. */
416 private CharacterBufferPool fBufferPool = new CharacterBufferPool(fBufferSize, DEFAULT_INTERNAL_BUFFER_SIZE);
417
418 /** indicate whether Catalog should be used for resolving external resources */
419 private boolean fUseCatalog = true;
420 CatalogFeatures fCatalogFeatures;
421 CatalogResolver fCatalogResolver;
422
423 private String fCatalogFile;
424 private String fDefer;
425 private String fPrefer;
426 private String fResolve;
427
428 //
429 // Constructors
430 //
431
432 /**
433 * If this constructor is used to create the object, reset() should be invoked on this object
434 */
435 public XMLEntityManager() {
436 //for entity managers not created by parsers
437 fSecurityManager = new XMLSecurityManager(true);
677
678 stream = connect.getInputStream();
679
680 // REVISIT: If the URLConnection has external encoding
681 // information, we should be reading it here. It's located
682 // in the charset parameter of Content-Type. -- mrglavas
683
684 if (followRedirects) {
685 String redirect = connect.getURL().toString();
686 // E43: Check if the URL was redirected, and then
687 // update literal and expanded system IDs if needed.
688 if (!redirect.equals(expandedSystemId)) {
689 literalSystemId = redirect;
690 expandedSystemId = redirect;
691 }
692 }
693 }
694 }
695
696 // wrap this stream in RewindableInputStream
697 stream = new RewindableInputStream(stream);
698
699 // perform auto-detect of encoding if necessary
700 if (encoding == null) {
701 // read first four bytes and determine encoding
702 final byte[] b4 = new byte[4];
703 int count = 0;
704 for (; count<4; count++ ) {
705 b4[count] = (byte)stream.read();
706 }
707 if (count == 4) {
708 Object [] encodingDesc = getEncodingName(b4, count);
709 encoding = (String)(encodingDesc[0]);
710 isBigEndian = (Boolean)(encodingDesc[1]);
711
712 stream.reset();
713 // Special case UTF-8 files with BOM created by Microsoft
714 // tools. It's more efficient to consume the BOM than make
715 // the reader perform extra checks. -Ac
716 if (count > 2 && encoding.equals("UTF-8")) {
717 int b0 = b4[0] & 0xFF;
718 int b1 = b4[1] & 0xFF;
719 int b2 = b4[2] & 0xFF;
720 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
721 // ignore first three bytes...
722 stream.skip(3);
723 }
724 }
725 reader = createReader(stream, encoding, isBigEndian);
726 } else {
727 reader = createReader(stream, encoding, isBigEndian);
728 }
729 }
730
731 // use specified encoding
732 else {
733 encoding = encoding.toUpperCase(Locale.ENGLISH);
734
735 // If encoding is UTF-8, consume BOM if one is present.
736 if (encoding.equals("UTF-8")) {
737 final int[] b3 = new int[3];
738 int count = 0;
739 for (; count < 3; ++count) {
740 b3[count] = stream.read();
741 if (b3[count] == -1)
742 break;
743 }
744 if (count == 3) {
745 if (b3[0] != 0xEF || b3[1] != 0xBB || b3[2] != 0xBF) {
746 // First three bytes are not BOM, so reset.
747 stream.reset();
748 }
749 } else {
750 stream.reset();
751 }
752 }
753 // If encoding is UTF-16, we still need to read the first four bytes
754 // in order to discover the byte order.
755 else if (encoding.equals("UTF-16")) {
756 final int[] b4 = new int[4];
757 int count = 0;
758 for (; count < 4; ++count) {
759 b4[count] = stream.read();
760 if (b4[count] == -1)
761 break;
762 }
763 stream.reset();
764
765 String utf16Encoding = "UTF-16";
766 if (count >= 2) {
767 final int b0 = b4[0];
768 final int b1 = b4[1];
769 if (b0 == 0xFE && b1 == 0xFF) {
770 // UTF-16, big-endian
771 utf16Encoding = "UTF-16BE";
772 isBigEndian = Boolean.TRUE;
773 }
774 else if (b0 == 0xFF && b1 == 0xFE) {
775 // UTF-16, little-endian
776 utf16Encoding = "UTF-16LE";
777 isBigEndian = Boolean.FALSE;
778 }
779 else if (count == 4) {
780 final int b2 = b4[2];
781 final int b3 = b4[3];
782 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
783 // UTF-16, big-endian, no BOM
784 utf16Encoding = "UTF-16BE";
785 isBigEndian = Boolean.TRUE;
786 }
787 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
788 // UTF-16, little-endian, no BOM
789 utf16Encoding = "UTF-16LE";
790 isBigEndian = Boolean.FALSE;
791 }
792 }
793 }
794 reader = createReader(stream, utf16Encoding, isBigEndian);
795 }
796 // If encoding is UCS-4, we still need to read the first four bytes
797 // in order to discover the byte order.
798 else if (encoding.equals("ISO-10646-UCS-4")) {
799 final int[] b4 = new int[4];
800 int count = 0;
801 for (; count < 4; ++count) {
802 b4[count] = stream.read();
803 if (b4[count] == -1)
804 break;
805 }
806 stream.reset();
807
808 // Ignore unusual octet order for now.
809 if (count == 4) {
810 // UCS-4, big endian (1234)
811 if (b4[0] == 0x00 && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x3C) {
812 isBigEndian = Boolean.TRUE;
813 }
814 // UCS-4, little endian (4321)
815 else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x00) {
816 isBigEndian = Boolean.FALSE;
817 }
818 }
819 }
820 // If encoding is UCS-2, we still need to read the first four bytes
821 // in order to discover the byte order.
822 else if (encoding.equals("ISO-10646-UCS-2")) {
823 final int[] b4 = new int[4];
824 int count = 0;
825 for (; count < 4; ++count) {
826 b4[count] = stream.read();
827 if (b4[count] == -1)
828 break;
829 }
830 stream.reset();
831
832 if (count == 4) {
833 // UCS-2, big endian
834 if (b4[0] == 0x00 && b4[1] == 0x3C && b4[2] == 0x00 && b4[3] == 0x3F) {
835 isBigEndian = Boolean.TRUE;
836 }
837 // UCS-2, little endian
838 else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x3F && b4[3] == 0x00) {
839 isBigEndian = Boolean.FALSE;
840 }
841 }
842 }
843
844 reader = createReader(stream, encoding, isBigEndian);
845 }
846
1781 fSymbolTable = (SymbolTable)value;
1782 return;
1783 }
1784 if (suffixLength == Constants.ERROR_REPORTER_PROPERTY.length() &&
1785 propertyId.endsWith(Constants.ERROR_REPORTER_PROPERTY)) {
1786 fErrorReporter = (XMLErrorReporter)value;
1787 return;
1788 }
1789 if (suffixLength == Constants.ENTITY_RESOLVER_PROPERTY.length() &&
1790 propertyId.endsWith(Constants.ENTITY_RESOLVER_PROPERTY)) {
1791 fEntityResolver = (XMLEntityResolver)value;
1792 return;
1793 }
1794 if (suffixLength == Constants.BUFFER_SIZE_PROPERTY.length() &&
1795 propertyId.endsWith(Constants.BUFFER_SIZE_PROPERTY)) {
1796 Integer bufferSize = (Integer)value;
1797 if (bufferSize != null &&
1798 bufferSize.intValue() > DEFAULT_XMLDECL_BUFFER_SIZE) {
1799 fBufferSize = bufferSize.intValue();
1800 fEntityScanner.setBufferSize(fBufferSize);
1801 fBufferPool.setExternalBufferSize(fBufferSize);
1802 }
1803 }
1804 if (suffixLength == Constants.SECURITY_MANAGER_PROPERTY.length() &&
1805 propertyId.endsWith(Constants.SECURITY_MANAGER_PROPERTY)) {
1806 fSecurityManager = (XMLSecurityManager)value;
1807 }
1808 }
1809
1810 //JAXP 1.5 properties
1811 if (propertyId.equals(XML_SECURITY_PROPERTY_MANAGER))
1812 {
1813 XMLSecurityPropertyManager spm = (XMLSecurityPropertyManager)value;
1814 fAccessExternalDTD = spm.getValue(XMLSecurityPropertyManager.Property.ACCESS_EXTERNAL_DTD);
1815 return;
1816 }
1817
1818 //Catalog properties
1819 if (propertyId.equals(JdkXmlUtils.CATALOG_FILES)) {
1820 fCatalogFile = (String)value;
1821 } else if (propertyId.equals(JdkXmlUtils.CATALOG_DEFER)) {
2408 systemURI = (new java.net.URI(baseURI.toString())).resolve(systemURI);
2409
2410 // return the string rep of the new uri (an absolute one)
2411 return systemURI.toString();
2412
2413 // if any exception is thrown, it'll get thrown to the caller.
2414
2415 } // expandSystemIdStrictOff(String,String):String
2416
2417 //
2418 // Protected methods
2419 //
2420
2421
2422 /**
2423 * Returns the IANA encoding name that is auto-detected from
2424 * the bytes specified, with the endian-ness of that encoding where appropriate.
2425 *
2426 * @param b4 The first four bytes of the input.
2427 * @param count The number of bytes actually read.
2428 * @return a 2-element array: the first element, an IANA-encoding string,
2429 * the second element a Boolean which is true iff the document is big endian, false
2430 * if it's little-endian, and null if the distinction isn't relevant.
2431 */
2432 protected Object[] getEncodingName(byte[] b4, int count) {
2433
2434 if (count < 2) {
2435 return defaultEncoding;
2436 }
2437
2438 // UTF-16, with BOM
2439 int b0 = b4[0] & 0xFF;
2440 int b1 = b4[1] & 0xFF;
2441 if (b0 == 0xFE && b1 == 0xFF) {
2442 // UTF-16, big-endian
2443 return new Object [] {"UTF-16BE", true};
2444 }
2445 if (b0 == 0xFF && b1 == 0xFE) {
2446 // UTF-16, little-endian
2447 return new Object [] {"UTF-16LE", false};
2448 }
2449
2450 // default to UTF-8 if we don't have enough bytes to make a
2451 // good determination of the encoding
2452 if (count < 3) {
2453 return defaultEncoding;
2454 }
2455
2456 // UTF-8 with a BOM
2457 int b2 = b4[2] & 0xFF;
2458 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2459 return defaultEncoding;
2460 }
2461
2462 // default to UTF-8 if we don't have enough bytes to make a
2463 // good determination of the encoding
2464 if (count < 4) {
2465 return defaultEncoding;
2466 }
2467
2468 // other encodings
2469 int b3 = b4[3] & 0xFF;
2470 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2471 // UCS-4, big endian (1234)
2472 return new Object [] {"ISO-10646-UCS-4", true};
2473 }
2474 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2475 // UCS-4, little endian (4321)
2476 return new Object [] {"ISO-10646-UCS-4", false};
2477 }
2478 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2479 // UCS-4, unusual octet order (2143)
2480 // REVISIT: What should this be?
2481 return new Object [] {"ISO-10646-UCS-4", null};
2482 }
2483 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2484 // UCS-4, unusual octect order (3412)
2485 // REVISIT: What should this be?
2486 return new Object [] {"ISO-10646-UCS-4", null};
2487 }
2488 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2489 // UTF-16, big-endian, no BOM
2490 // (or could turn out to be UCS-2...
2491 // REVISIT: What should this be?
2492 return new Object [] {"UTF-16BE", true};
2493 }
2494 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2495 // UTF-16, little-endian, no BOM
2496 // (or could turn out to be UCS-2...
2497 return new Object [] {"UTF-16LE", false};
2498 }
2499 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2500 // EBCDIC
2501 // a la xerces1, return CP037 instead of EBCDIC here
2502 return new Object [] {"CP037", null};
2503 }
2504
2505 return defaultEncoding;
2506
2507 } // getEncodingName(byte[],int):Object[]
2508
2509 /**
2510 * Creates a reader capable of reading the given input stream in
2511 * the specified encoding.
2512 *
2513 * @param inputStream The input stream.
2514 * @param encoding The encoding name that the input stream is
2515 * encoded using. If the user has specified that
2516 * Java encoding names are allowed, then the
2517 * encoding name may be a Java encoding name;
2518 * otherwise, it is an ianaEncoding name.
2519 * @param isBigEndian For encodings (like uCS-4), whose names cannot
2520 * specify a byte order, this tells whether the order is bigEndian. null menas
2521 * unknown or not relevant.
2522 *
2523 * @return Returns a reader.
2524 */
2525 protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
2526 throws IOException {
2527
2528 // normalize encoding name
2529 if (encoding == null) {
2530 encoding = "UTF-8";
2531 }
2532
2533 // try to use an optimized reader
2534 String ENCODING = encoding.toUpperCase(Locale.ENGLISH);
2535 if (ENCODING.equals("UTF-8")) {
2536 if (DEBUG_ENCODINGS) {
2537 System.out.println("$$$ creating UTF8Reader");
2538 }
2539 return new UTF8Reader(inputStream, fBufferSize, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale() );
2540 }
2541 if (ENCODING.equals("US-ASCII")) {
2542 if (DEBUG_ENCODINGS) {
2543 System.out.println("$$$ creating ASCIIReader");
2544 }
2545 return new ASCIIReader(inputStream, fBufferSize, fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN), fErrorReporter.getLocale());
2546 }
2547 if(ENCODING.equals("ISO-10646-UCS-4")) {
2548 if(isBigEndian != null) {
2549 boolean isBE = isBigEndian.booleanValue();
2550 if(isBE) {
2551 return new UCSReader(inputStream, UCSReader.UCS4BE);
2552 } else {
2553 return new UCSReader(inputStream, UCSReader.UCS4LE);
2554 }
2555 } else {
2556 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,
2557 "EncodingByteOrderUnsupported",
2558 new Object[] { encoding },
2559 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2560 }
2561 }
2562 if(ENCODING.equals("ISO-10646-UCS-2")) {
2563 if(isBigEndian != null) { // sould never happen with this encoding...
2564 boolean isBE = isBigEndian.booleanValue();
2565 if(isBE) {
2566 return new UCSReader(inputStream, UCSReader.UCS2BE);
2567 } else {
2568 return new UCSReader(inputStream, UCSReader.UCS2LE);
2569 }
2570 } else {
2571 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,
2572 "EncodingByteOrderUnsupported",
2573 new Object[] { encoding },
2574 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2575 }
2576 }
2577
2578 // check for valid name
2579 boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
2580 boolean validJava = XMLChar.isValidJavaEncoding(encoding);
2581 if (!validIANA || (fAllowJavaEncodings && !validJava)) {
2582 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,
2583 "EncodingDeclInvalid",
2584 new Object[] { encoding },
2585 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2586 // NOTE: AndyH suggested that, on failure, we use ISO Latin 1
2587 // because every byte is a valid ISO Latin 1 character.
2588 // It may not translate correctly but if we failed on
2589 // the encoding anyway, then we're expecting the content
2590 // of the document to be bad. This will just prevent an
2591 // invalid UTF-8 sequence to be detected. This is only
2592 // important when continue-after-fatal-error is turned
2593 // on. -Ac
2594 encoding = "ISO-8859-1";
2595 }
2596
2597 // try to use a Java reader
2598 String javaEncoding = EncodingMap.getIANA2JavaMapping(ENCODING);
2599 if (javaEncoding == null) {
2600 if(fAllowJavaEncodings) {
2601 javaEncoding = encoding;
2602 } else {
2603 fErrorReporter.reportError(this.getEntityScanner(),XMLMessageFormatter.XML_DOMAIN,
2604 "EncodingDeclInvalid",
2605 new Object[] { encoding },
2606 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2607 // see comment above.
2608 javaEncoding = "ISO8859_1";
2609 }
2610 }
2611 if (DEBUG_ENCODINGS) {
2612 System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding);
2613 if (javaEncoding == encoding) {
2614 System.out.print(" (IANA encoding)");
2615 }
2616 System.out.println();
2617 }
2618 return new BufferedReader( new InputStreamReader(inputStream, javaEncoding));
2619
2620 } // createReader(InputStream,String, Boolean): Reader
2621
2622
2623 /**
2881 }
2882 }
2883 }
2884 if (fCurrentEntity.position == fCurrentEntity.count) {
2885 System.out.print('^');
2886 }
2887 System.out.print('"');
2888 }
2889 System.out.print(']');
2890 System.out.print(" @ ");
2891 System.out.print(fCurrentEntity.lineNumber);
2892 System.out.print(',');
2893 System.out.print(fCurrentEntity.columnNumber);
2894 } else {
2895 System.out.print("*NO CURRENT ENTITY*");
2896 }
2897 }
2898 } // print()
2899
2900 /**
2901 * Buffer used in entity manager to reuse character arrays instead
2902 * of creating new ones every time.
2903 *
2904 * @xerces.internal
2905 *
2906 * @author Ankit Pasricha, IBM
2907 */
2908 private static class CharacterBuffer {
2909
2910 /** character buffer */
2911 private char[] ch;
2912
2913 /** whether the buffer is for an external or internal scanned entity */
2914 private boolean isExternal;
2915
2916 public CharacterBuffer(boolean isExternal, int size) {
2917 this.isExternal = isExternal;
2918 ch = new char[size];
2919 }
2920 }
2921
2922
2923 /**
2924 * Stores a number of character buffers and provides it to the entity
2925 * manager to use when an entity is seen.
2926 *
2927 * @xerces.internal
2928 *
2929 * @author Ankit Pasricha, IBM
2930 */
2931 private static class CharacterBufferPool {
2932
2933 private static final int DEFAULT_POOL_SIZE = 3;
2934
2935 private CharacterBuffer[] fInternalBufferPool;
2936 private CharacterBuffer[] fExternalBufferPool;
2937
2938 private int fExternalBufferSize;
2939 private int fInternalBufferSize;
2940 private int poolSize;
2941
2942 private int fInternalTop;
2943 private int fExternalTop;
2944
2945 public CharacterBufferPool(int externalBufferSize, int internalBufferSize) {
2946 this(DEFAULT_POOL_SIZE, externalBufferSize, internalBufferSize);
2947 }
2948
2949 public CharacterBufferPool(int poolSize, int externalBufferSize, int internalBufferSize) {
2950 fExternalBufferSize = externalBufferSize;
2951 fInternalBufferSize = internalBufferSize;
2952 this.poolSize = poolSize;
2953 init();
2954 }
2955
2956 /** Initializes buffer pool. **/
2957 private void init() {
2958 fInternalBufferPool = new CharacterBuffer[poolSize];
2959 fExternalBufferPool = new CharacterBuffer[poolSize];
2960 fInternalTop = -1;
2961 fExternalTop = -1;
2962 }
2963
2964 /** Retrieves buffer from pool. **/
2965 public CharacterBuffer getBuffer(boolean external) {
2966 if (external) {
2967 if (fExternalTop > -1) {
2968 return fExternalBufferPool[fExternalTop--];
2969 }
2970 else {
2971 return new CharacterBuffer(true, fExternalBufferSize);
2972 }
2973 }
2974 else {
2975 if (fInternalTop > -1) {
2976 return fInternalBufferPool[fInternalTop--];
2977 }
2978 else {
2979 return new CharacterBuffer(false, fInternalBufferSize);
2980 }
2981 }
2982 }
2983
2984 /** Returns buffer to pool. **/
2985 public void returnToPool(CharacterBuffer buffer) {
2986 if (buffer.isExternal) {
2987 if (fExternalTop < fExternalBufferPool.length - 1) {
2988 fExternalBufferPool[++fExternalTop] = buffer;
2989 }
2990 }
2991 else if (fInternalTop < fInternalBufferPool.length - 1) {
2992 fInternalBufferPool[++fInternalTop] = buffer;
2993 }
2994 }
2995
2996 /** Sets the size of external buffers and dumps the old pool. **/
2997 public void setExternalBufferSize(int bufferSize) {
2998 fExternalBufferSize = bufferSize;
2999 fExternalBufferPool = new CharacterBuffer[poolSize];
3000 fExternalTop = -1;
3001 }
3002 }
3003
3004 /**
3005 * This class wraps the byte inputstreams we're presented with.
3006 * We need it because java.io.InputStreams don't provide
3007 * functionality to reread processed bytes, and they have a habit
3008 * of reading more than one character when you call their read()
3009 * methods. This means that, once we discover the true (declared)
3010 * encoding of a document, we can neither backtrack to read the
3011 * whole doc again nor start reading where we are with a new
3012 * reader.
3013 *
3014 * This class allows rewinding an inputStream by allowing a mark
3015 * to be set, and the stream reset to that position. <strong>The
3016 * class assumes that it needs to read one character per
3017 * invocation when its read() method is invoked, but uses the
3018 * underlying InputStream's read(byte[], offset, length) method--it
3019 * won't buffer data read this way!</strong>
3020 *
3021 * @xerces.internal
3022 *
3035 private int fMark;
3036
/**
 * Wraps the given stream. The replay buffer starts out empty with room
 * for {@code DEFAULT_XMLDECL_BUFFER_SIZE} bytes, all positions are zero
 * and no end of stream has been recorded yet (end offset -1).
 *
 * @param is the stream to wrap
 */
public RewindableInputStream(InputStream is) {
    fInputStream = is;
    fData = new byte[DEFAULT_XMLDECL_BUFFER_SIZE];

    // positions within the replay buffer
    fOffset = 0;
    fLength = 0;
    fStartOffset = 0;
    fMark = 0;

    // -1: end of the underlying stream not seen so far
    fEndOffset = -1;
}
3046
3047 public void setStartOffset(int offset) {
3048 fStartOffset = offset;
3049 }
3050
3051 public void rewind() {
3052 fOffset = fStartOffset;
3053 }
3054
3055 public int read() throws IOException {
3056 int b = 0;
3057 if (fOffset < fLength) {
3058 return fData[fOffset++] & 0xff;
3059 }
3060 if (fOffset == fEndOffset) {
3061 return -1;
3062 }
3063 if (fOffset == fData.length) {
3064 byte[] newData = new byte[fOffset << 1];
3065 System.arraycopy(fData, 0, newData, 0, fOffset);
3066 fData = newData;
3067 }
3068 b = fInputStream.read();
3069 if (b == -1) {
3070 fEndOffset = fOffset;
3071 return -1;
3072 }
3073 fData[fLength++] = (byte)b;
3074 fOffset++;
3075 return b & 0xff;
3076 }
3077
3078 public int read(byte[] b, int off, int len) throws IOException {
3079 int bytesLeft = fLength - fOffset;
3080 if (bytesLeft == 0) {
3081 if (fOffset == fEndOffset) {
3082 return -1;
3083 }
3084
3085 /**
3086 * //System.out.println("fCurrentEntitty = " + fCurrentEntity );
3087 * //System.out.println("fInputStream = " + fInputStream );
3088 * // better get some more for the voracious reader... */
3089
3090 if(fCurrentEntity.mayReadChunks || !fCurrentEntity.xmlDeclChunkRead) {
3091
3092 if (!fCurrentEntity.xmlDeclChunkRead)
3093 {
3094 fCurrentEntity.xmlDeclChunkRead = true;
3095 len = Entity.ScannedEntity.DEFAULT_XMLDECL_BUFFER_SIZE;
3096 }
3097 return fInputStream.read(b, off, len);
3098 }
3099
3100 int returnedVal = read();
3101 if(returnedVal == -1) {
3102 fEndOffset = fOffset;
3103 return -1;
3104 }
3105 b[off] = (byte)returnedVal;
3106 return 1;
3107
3108 }
3109 if (len < bytesLeft) {
3110 if (len <= 0) {
3111 return 0;
3112 }
3113 } else {
3114 len = bytesLeft;
3115 }
3116 if (b != null) {
3117 System.arraycopy(fData, fOffset, b, off, len);
3118 }
3119 fOffset += len;
3120 return len;
3121 }
3122
3123 public long skip(long n)
3124 throws IOException {
3125 int bytesLeft;
3126 if (n <= 0) {
3127 return 0;
3128 }
3129 bytesLeft = fLength - fOffset;
3130 if (bytesLeft == 0) {
3131 if (fOffset == fEndOffset) {
3132 return 0;
3133 }
3134 return fInputStream.skip(n);
3135 }
3136 if (n <= bytesLeft) {
3137 fOffset += n;
3138 return n;
3139 }
3140 fOffset += bytesLeft;
3141 if (fOffset == fEndOffset) {
3142 return bytesLeft;
3143 }
3144 n -= bytesLeft;
3145 /*
3146 * In a manner of speaking, when this class isn't permitting more
3147 * than one byte at a time to be read, it is "blocking". The
3148 * available() method should indicate how much can be read without
3149 * blocking, so while we're in this mode, it should only indicate
3150 * that bytes in its buffer are available; otherwise, the result of
3151 * available() on the underlying InputStream is appropriate.
3152 */
3153 return fInputStream.skip(n) + bytesLeft;
3154 }
3155
3156 public int available() throws IOException {
3157 int bytesLeft = fLength - fOffset;
3158 if (bytesLeft == 0) {
3159 if (fOffset == fEndOffset) {
3160 return -1;
3161 }
3162 return fCurrentEntity.mayReadChunks ? fInputStream.available()
3163 : 0;
3164 }
3165 return bytesLeft;
3166 }
3167
3168 public void mark(int howMuch) {
3169 fMark = fOffset;
3170 }
3171
3172 public void reset() {
3173 fOffset = fMark;
3174 //test();
3175 }
3176
3177 public boolean markSupported() {
3178 return true;
3179 }
3180
3181 public void close() throws IOException {
3182 if (fInputStream != null) {
3183 fInputStream.close();
3184 fInputStream = null;
3185 }
3186 }
3187 } // end of RewindableInputStream class
3188
3189 public void test(){
3190 //System.out.println("TESTING: Added familytree to entityManager");
3191 //Usecase1
3192 fEntityStorage.addExternalEntity("entityUsecase1",null,
3193 "/space/home/stax/sun/6thJan2004/zephyr/data/test.txt",
3194 "/space/home/stax/sun/6thJan2004/zephyr/data/entity.xml");
|
1 /*
2 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
3 */
4 /*
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements. See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 package com.sun.org.apache.xerces.internal.impl ;
22
23 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
24 import com.sun.org.apache.xerces.internal.impl.io.UCSReader;
25 import com.sun.org.apache.xerces.internal.impl.io.UTF16Reader;
26 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
27 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
28 import com.sun.org.apache.xerces.internal.impl.validation.ValidationManager;
29 import com.sun.org.apache.xerces.internal.util.*;
30 import com.sun.org.apache.xerces.internal.util.URI;
31 import com.sun.org.apache.xerces.internal.utils.XMLLimitAnalyzer;
32 import com.sun.org.apache.xerces.internal.utils.XMLSecurityManager;
33 import com.sun.org.apache.xerces.internal.utils.XMLSecurityPropertyManager;
34 import com.sun.org.apache.xerces.internal.xni.Augmentations;
35 import com.sun.org.apache.xerces.internal.xni.XMLResourceIdentifier;
36 import com.sun.org.apache.xerces.internal.xni.XNIException;
37 import com.sun.org.apache.xerces.internal.xni.parser.*;
38 import com.sun.xml.internal.stream.Entity;
39 import com.sun.xml.internal.stream.StaxEntityResolverWrapper;
40 import com.sun.xml.internal.stream.StaxXMLInputSource;
41 import com.sun.xml.internal.stream.XMLEntityStorage;
42 import java.io.*;
43 import java.net.HttpURLConnection;
44 import java.net.URISyntaxException;
45 import java.net.URL;
73 * xni.
74 * <p>
75 * This component requires the following features and properties from the
76 * component manager that uses it:
77 * <ul>
78 * <li>http://xml.org/sax/features/validation</li>
79 * <li>http://xml.org/sax/features/external-general-entities</li>
80 * <li>http://xml.org/sax/features/external-parameter-entities</li>
81 * <li>http://apache.org/xml/features/allow-java-encodings</li>
82 * <li>http://apache.org/xml/properties/internal/symbol-table</li>
83 * <li>http://apache.org/xml/properties/internal/error-reporter</li>
84 * <li>http://apache.org/xml/properties/internal/entity-resolver</li>
85 * </ul>
86 *
87 *
88 * @author Andy Clark, IBM
89 * @author Arnaud Le Hors, IBM
90 * @author K.Venugopal SUN Microsystems
91 * @author Neeraj Bajaj SUN Microsystems
92 * @author Sunitha Reddy SUN Microsystems
93 * @LastModified: Apr 2019
94 */
95 public class XMLEntityManager implements XMLComponent, XMLEntityResolver {
96
97 //
98 // Constants
99 //
100
101 /** Default buffer size (8192). */
102 public static final int DEFAULT_BUFFER_SIZE = 8192;
103
104 /** Default buffer size before we've finished with the XMLDecl: */
105 public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
106
107 /** Default internal entity buffer size (1024). */
108 public static final int DEFAULT_INTERNAL_BUFFER_SIZE = 1024;
109
110 // feature identifiers
111
112 /** Feature identifier: validation. */
113 protected static final String VALIDATION =
396 protected Entity.ScannedEntity fCurrentEntity = null;
397
398 /** identify if the InputSource is created by a resolver */
399 boolean fISCreatedByResolver = false;
400
401 // shared context
402
403 protected XMLEntityStorage fEntityStorage ;
404
405 protected final Object [] defaultEncoding = new Object[]{"UTF-8", null};
406
407
408 // temp vars
409
410 /** Resource identifier. */
411 private final XMLResourceIdentifierImpl fResourceIdentifier = new XMLResourceIdentifierImpl();
412
413 /** Augmentations for entities. */
414 private final Augmentations fEntityAugs = new AugmentationsImpl();
415
416 /** indicate whether Catalog should be used for resolving external resources */
417 private boolean fUseCatalog = true;
418 CatalogFeatures fCatalogFeatures;
419 CatalogResolver fCatalogResolver;
420
421 private String fCatalogFile;
422 private String fDefer;
423 private String fPrefer;
424 private String fResolve;
425
426 //
427 // Constructors
428 //
429
430 /**
431 * If this constructor is used to create the object, reset() should be invoked on this object
432 */
433 public XMLEntityManager() {
434 //for entity managers not created by parsers
435 fSecurityManager = new XMLSecurityManager(true);
675
676 stream = connect.getInputStream();
677
678 // REVISIT: If the URLConnection has external encoding
679 // information, we should be reading it here. It's located
680 // in the charset parameter of Content-Type. -- mrglavas
681
682 if (followRedirects) {
683 String redirect = connect.getURL().toString();
684 // E43: Check if the URL was redirected, and then
685 // update literal and expanded system IDs if needed.
686 if (!redirect.equals(expandedSystemId)) {
687 literalSystemId = redirect;
688 expandedSystemId = redirect;
689 }
690 }
691 }
692 }
693
694 // wrap this stream in RewindableInputStream
695 RewindableInputStream rewindableStream = new RewindableInputStream(stream);
696 stream = rewindableStream;
697
698 // perform auto-detect of encoding if necessary
699 if (encoding == null) {
700 // read first four bytes and determine encoding
701 final byte[] b4 = new byte[4];
702 int count = 0;
703 for (; count<4; count++ ) {
704 b4[count] = (byte)rewindableStream.readAndBuffer();
705 }
706 if (count == 4) {
707 final EncodingInfo info = getEncodingInfo(b4, count);
708 encoding = info.autoDetectedEncoding;
709 final String readerEncoding = info.readerEncoding;
710 isBigEndian = info.isBigEndian;
711 stream.reset();
712 if (info.hasBOM) {
713 // Special case UTF-8 files with BOM created by Microsoft
714 // tools. It's more efficient to consume the BOM than make
715 // the reader perform extra checks. -Ac
716 if (EncodingInfo.STR_UTF8.equals(readerEncoding)) {
717 // UTF-8 BOM: 0xEF 0xBB 0xBF
718 stream.skip(3);
719 }
720 // It's also more efficient to consume the UTF-16 BOM.
721 else if (EncodingInfo.STR_UTF16.equals(readerEncoding)) {
722 // UTF-16 BE BOM: 0xFE 0xFF
723 // UTF-16 LE BOM: 0xFF 0xFE
724 stream.skip(2);
725 }
726 }
727 reader = createReader(stream, readerEncoding, isBigEndian);
728 } else {
729 reader = createReader(stream, encoding, isBigEndian);
730 }
731 }
732
733 // use specified encoding
734 else {
735 encoding = encoding.toUpperCase(Locale.ENGLISH);
736
737 // If encoding is UTF-8, consume BOM if one is present.
738 if (EncodingInfo.STR_UTF8.equals(encoding)) {
739 final int[] b3 = new int[3];
740 int count = 0;
741 for (; count < 3; ++count) {
742 b3[count] = rewindableStream.readAndBuffer();
743 if (b3[count] == -1)
744 break;
745 }
746 if (count == 3) {
747 if (b3[0] != 0xEF || b3[1] != 0xBB || b3[2] != 0xBF) {
748 // First three bytes are not BOM, so reset.
749 stream.reset();
750 }
751 } else {
752 stream.reset();
753 }
754 }
755 // If encoding is UTF-16, we still need to read the first
756 // four bytes, in order to discover the byte order.
757 else if (EncodingInfo.STR_UTF16.equals(encoding)) {
758 final int[] b4 = new int[4];
759 int count = 0;
760 for (; count < 4; ++count) {
761 b4[count] = rewindableStream.readAndBuffer();
762 if (b4[count] == -1)
763 break;
764 }
765 stream.reset();
766 if (count >= 2) {
767 final int b0 = b4[0];
768 final int b1 = b4[1];
769 if (b0 == 0xFE && b1 == 0xFF) {
770 // UTF-16, big-endian
771 isBigEndian = Boolean.TRUE;
772 stream.skip(2);
773 }
774 else if (b0 == 0xFF && b1 == 0xFE) {
775 // UTF-16, little-endian
776 isBigEndian = Boolean.FALSE;
777 stream.skip(2);
778 }
779 else if (count == 4) {
780 final int b2 = b4[2];
781 final int b3 = b4[3];
782 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
783 // UTF-16, big-endian, no BOM
784 isBigEndian = Boolean.TRUE;
785 }
786 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
787 // UTF-16, little-endian, no BOM
788 isBigEndian = Boolean.FALSE;
789 }
790 }
791 }
792 }
793 // If encoding is UCS-4, we still need to read the first four bytes
794 // in order to discover the byte order.
795 else if (EncodingInfo.STR_UCS4.equals(encoding)) {
796 final int[] b4 = new int[4];
797 int count = 0;
798 for (; count < 4; ++count) {
799 b4[count] = rewindableStream.readAndBuffer();
800 if (b4[count] == -1)
801 break;
802 }
803 stream.reset();
804
805 // Ignore unusual octet order for now.
806 if (count == 4) {
807 // UCS-4, big endian (1234)
808 if (b4[0] == 0x00 && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x3C) {
809 isBigEndian = Boolean.TRUE;
810 }
811 // UCS-4, little endian (4321)
812 else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x00 && b4[3] == 0x00) {
813 isBigEndian = Boolean.FALSE;
814 }
815 }
816 }
817 // If encoding is UCS-2, we still need to read the first four bytes
818 // in order to discover the byte order.
819 else if (EncodingInfo.STR_UCS2.equals(encoding)) {
820 final int[] b4 = new int[4];
821 int count = 0;
822 for (; count < 4; ++count) {
823 b4[count] = rewindableStream.readAndBuffer();
824 if (b4[count] == -1)
825 break;
826 }
827 stream.reset();
828
829 if (count == 4) {
830 // UCS-2, big endian
831 if (b4[0] == 0x00 && b4[1] == 0x3C && b4[2] == 0x00 && b4[3] == 0x3F) {
832 isBigEndian = Boolean.TRUE;
833 }
834 // UCS-2, little endian
835 else if (b4[0] == 0x3C && b4[1] == 0x00 && b4[2] == 0x3F && b4[3] == 0x00) {
836 isBigEndian = Boolean.FALSE;
837 }
838 }
839 }
840
841 reader = createReader(stream, encoding, isBigEndian);
842 }
843
1778 fSymbolTable = (SymbolTable)value;
1779 return;
1780 }
1781 if (suffixLength == Constants.ERROR_REPORTER_PROPERTY.length() &&
1782 propertyId.endsWith(Constants.ERROR_REPORTER_PROPERTY)) {
1783 fErrorReporter = (XMLErrorReporter)value;
1784 return;
1785 }
1786 if (suffixLength == Constants.ENTITY_RESOLVER_PROPERTY.length() &&
1787 propertyId.endsWith(Constants.ENTITY_RESOLVER_PROPERTY)) {
1788 fEntityResolver = (XMLEntityResolver)value;
1789 return;
1790 }
1791 if (suffixLength == Constants.BUFFER_SIZE_PROPERTY.length() &&
1792 propertyId.endsWith(Constants.BUFFER_SIZE_PROPERTY)) {
1793 Integer bufferSize = (Integer)value;
1794 if (bufferSize != null &&
1795 bufferSize.intValue() > DEFAULT_XMLDECL_BUFFER_SIZE) {
1796 fBufferSize = bufferSize.intValue();
1797 fEntityScanner.setBufferSize(fBufferSize);
1798 }
1799 }
1800 if (suffixLength == Constants.SECURITY_MANAGER_PROPERTY.length() &&
1801 propertyId.endsWith(Constants.SECURITY_MANAGER_PROPERTY)) {
1802 fSecurityManager = (XMLSecurityManager)value;
1803 }
1804 }
1805
1806 //JAXP 1.5 properties
1807 if (propertyId.equals(XML_SECURITY_PROPERTY_MANAGER))
1808 {
1809 XMLSecurityPropertyManager spm = (XMLSecurityPropertyManager)value;
1810 fAccessExternalDTD = spm.getValue(XMLSecurityPropertyManager.Property.ACCESS_EXTERNAL_DTD);
1811 return;
1812 }
1813
1814 //Catalog properties
1815 if (propertyId.equals(JdkXmlUtils.CATALOG_FILES)) {
1816 fCatalogFile = (String)value;
1817 } else if (propertyId.equals(JdkXmlUtils.CATALOG_DEFER)) {
2404 systemURI = (new java.net.URI(baseURI.toString())).resolve(systemURI);
2405
2406 // return the string rep of the new uri (an absolute one)
2407 return systemURI.toString();
2408
2409 // if any exception is thrown, it'll get thrown to the caller.
2410
2411 } // expandSystemIdStrictOff(String,String):String
2412
2413 //
2414 // Protected methods
2415 //
2416
2417
2418 /**
2419 * Returns the IANA encoding name that is auto-detected from
2420 * the bytes specified, with the endian-ness of that encoding where appropriate.
2421 *
2422 * @param b4 The first four bytes of the input.
2423 * @param count The number of bytes actually read.
2424 * @return an instance of EncodingInfo which represents the auto-detected encoding.
2425 */
2426 protected EncodingInfo getEncodingInfo(byte[] b4, int count) {
2427
2428 if (count < 2) {
2429 return EncodingInfo.UTF_8;
2430 }
2431
2432 // UTF-16, with BOM
2433 int b0 = b4[0] & 0xFF;
2434 int b1 = b4[1] & 0xFF;
2435 if (b0 == 0xFE && b1 == 0xFF) {
2436 // UTF-16, big-endian
2437 return EncodingInfo.UTF_16_BIG_ENDIAN_WITH_BOM;
2438 }
2439 if (b0 == 0xFF && b1 == 0xFE) {
2440 // UTF-16, little-endian
2441 return EncodingInfo.UTF_16_LITTLE_ENDIAN_WITH_BOM;
2442 }
2443
2444 // default to UTF-8 if we don't have enough bytes to make a
2445 // good determination of the encoding
2446 if (count < 3) {
2447 return EncodingInfo.UTF_8;
2448 }
2449
2450 // UTF-8 with a BOM
2451 int b2 = b4[2] & 0xFF;
2452 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2453 return EncodingInfo.UTF_8_WITH_BOM;
2454 }
2455
2456 // default to UTF-8 if we don't have enough bytes to make a
2457 // good determination of the encoding
2458 if (count < 4) {
2459 return EncodingInfo.UTF_8;
2460 }
2461
2462 // other encodings
2463 int b3 = b4[3] & 0xFF;
2464 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2465 // UCS-4, big endian (1234)
2466 return EncodingInfo.UCS_4_BIG_ENDIAN;
2467 }
2468 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2469 // UCS-4, little endian (4321)
2470 return EncodingInfo.UCS_4_LITTLE_ENDIAN;
2471 }
2472 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2473 // UCS-4, unusual octet order (2143)
2474 // REVISIT: What should this be?
2475 return EncodingInfo.UCS_4_UNUSUAL_BYTE_ORDER;
2476 }
2477 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2478 // UCS-4, unusual octect order (3412)
2479 // REVISIT: What should this be?
2480 return EncodingInfo.UCS_4_UNUSUAL_BYTE_ORDER;
2481 }
2482 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2483 // UTF-16, big-endian, no BOM
2484 // (or could turn out to be UCS-2...
2485 // REVISIT: What should this be?
2486 return EncodingInfo.UTF_16_BIG_ENDIAN;
2487 }
2488 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2489 // UTF-16, little-endian, no BOM
2490 // (or could turn out to be UCS-2...
2491 return EncodingInfo.UTF_16_LITTLE_ENDIAN;
2492 }
2493 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2494 // EBCDIC
2495 // a la xerces1, return CP037 instead of EBCDIC here
2496 return EncodingInfo.EBCDIC;
2497 }
2498
2499 // default encoding
2500 return EncodingInfo.UTF_8;
2501
2502 } // getEncodingName(byte[],int):Object[]
2503
2504 /**
2505 * Creates a reader capable of reading the given input stream in
2506 * the specified encoding.
2507 *
2508 * @param inputStream The input stream.
2509 * @param encoding The encoding name that the input stream is
2510 * encoded using. If the user has specified that
2511 * Java encoding names are allowed, then the
2512 * encoding name may be a Java encoding name;
2513 * otherwise, it is an ianaEncoding name.
2514 * @param isBigEndian For encodings (like uCS-4), whose names cannot
2515 * specify a byte order, this tells whether the order
2516 * is bigEndian. null if unknown or irrelevant.
2517 *
2518 * @return Returns a reader.
2519 */
2520 protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
2521 throws IOException {
2522
2523 String enc = (encoding != null) ? encoding : EncodingInfo.STR_UTF8;
2524 enc = enc.toUpperCase(Locale.ENGLISH);
2525 MessageFormatter f = fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
2526 Locale l = fErrorReporter.getLocale();
2527 switch (enc) {
2528 case EncodingInfo.STR_UTF8:
2529 return new UTF8Reader(inputStream, fBufferSize, f, l);
2530 case EncodingInfo.STR_UTF16:
2531 if (isBigEndian != null) {
2532 return new UTF16Reader(inputStream, fBufferSize, isBigEndian, f, l);
2533 }
2534 break;
2535 case EncodingInfo.STR_UTF16BE:
2536 return new UTF16Reader(inputStream, fBufferSize, true, f, l);
2537 case EncodingInfo.STR_UTF16LE:
2538 return new UTF16Reader(inputStream, fBufferSize, false, f, l);
2539 case EncodingInfo.STR_UCS4:
2540 if(isBigEndian != null) {
2541 if(isBigEndian) {
2542 return new UCSReader(inputStream, UCSReader.UCS4BE);
2543 } else {
2544 return new UCSReader(inputStream, UCSReader.UCS4LE);
2545 }
2546 } else {
2547 fErrorReporter.reportError(this.getEntityScanner(),
2548 XMLMessageFormatter.XML_DOMAIN,
2549 "EncodingByteOrderUnsupported",
2550 new Object[] { encoding },
2551 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2552 }
2553 break;
2554 case EncodingInfo.STR_UCS2:
2555 if(isBigEndian != null) {
2556 if(isBigEndian) {
2557 return new UCSReader(inputStream, UCSReader.UCS2BE);
2558 } else {
2559 return new UCSReader(inputStream, UCSReader.UCS2LE);
2560 }
2561 } else {
2562 fErrorReporter.reportError(this.getEntityScanner(),
2563 XMLMessageFormatter.XML_DOMAIN,
2564 "EncodingByteOrderUnsupported",
2565 new Object[] { encoding },
2566 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2567 }
2568 break;
2569 }
2570
2571 // check for valid name
2572 boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
2573 boolean validJava = XMLChar.isValidJavaEncoding(encoding);
2574 if (!validIANA || (fAllowJavaEncodings && !validJava)) {
2575 fErrorReporter.reportError(this.getEntityScanner(),
2576 XMLMessageFormatter.XML_DOMAIN,
2577 "EncodingDeclInvalid",
2578 new Object[] { encoding },
2579 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2580 // NOTE: AndyH suggested that, on failure, we use ISO Latin 1
2581 // because every byte is a valid ISO Latin 1 character.
2582 // It may not translate correctly but if we failed on
2583 // the encoding anyway, then we're expecting the content
2584 // of the document to be bad. This will just prevent an
2585 // invalid UTF-8 sequence to be detected. This is only
2586 // important when continue-after-fatal-error is turned
2587 // on. -Ac
2588 encoding = "ISO-8859-1";
2589 }
2590
2591 // try to use a Java reader
2592 String javaEncoding = EncodingMap.getIANA2JavaMapping(enc);
2593 if (javaEncoding == null) {
2594 if (fAllowJavaEncodings) {
2595 javaEncoding = encoding;
2596 } else {
2597 fErrorReporter.reportError(this.getEntityScanner(),
2598 XMLMessageFormatter.XML_DOMAIN,
2599 "EncodingDeclInvalid",
2600 new Object[] { encoding },
2601 XMLErrorReporter.SEVERITY_FATAL_ERROR);
2602 // see comment above.
2603 javaEncoding = "ISO8859_1";
2604 }
2605 }
2606 if (DEBUG_ENCODINGS) {
2607 System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding);
2608 if (javaEncoding == encoding) {
2609 System.out.print(" (IANA encoding)");
2610 }
2611 System.out.println();
2612 }
2613 return new BufferedReader( new InputStreamReader(inputStream, javaEncoding));
2614
2615 } // createReader(InputStream,String, Boolean): Reader
2616
2617
2618 /**
2876 }
2877 }
2878 }
2879 if (fCurrentEntity.position == fCurrentEntity.count) {
2880 System.out.print('^');
2881 }
2882 System.out.print('"');
2883 }
2884 System.out.print(']');
2885 System.out.print(" @ ");
2886 System.out.print(fCurrentEntity.lineNumber);
2887 System.out.print(',');
2888 System.out.print(fCurrentEntity.columnNumber);
2889 } else {
2890 System.out.print("*NO CURRENT ENTITY*");
2891 }
2892 }
2893 } // print()
2894
2895 /**
2896 * Information about auto-detectable encodings.
2897 *
2898 * @xerces.internal
2899 *
2900 * @author Michael Glavassevich, IBM
2901 */
2902 private static class EncodingInfo {
2903 public static final String STR_UTF8 = "UTF-8";
2904 public static final String STR_UTF16 = "UTF-16";
2905 public static final String STR_UTF16BE = "UTF-16BE";
2906 public static final String STR_UTF16LE = "UTF-16LE";
2907 public static final String STR_UCS4 = "ISO-10646-UCS-4";
2908 public static final String STR_UCS2 = "ISO-10646-UCS-2";
2909 public static final String STR_CP037 = "CP037";
2910
2911 /** UTF-8 **/
2912 public static final EncodingInfo UTF_8 =
2913 new EncodingInfo(STR_UTF8, null, false);
2914
2915 /** UTF-8, with BOM **/
2916 public static final EncodingInfo UTF_8_WITH_BOM =
2917 new EncodingInfo(STR_UTF8, null, true);
2918
2919 /** UTF-16, big-endian **/
2920 public static final EncodingInfo UTF_16_BIG_ENDIAN =
2921 new EncodingInfo(STR_UTF16BE, STR_UTF16, Boolean.TRUE, false);
2922
2923 /** UTF-16, big-endian with BOM **/
2924 public static final EncodingInfo UTF_16_BIG_ENDIAN_WITH_BOM =
2925 new EncodingInfo(STR_UTF16BE, STR_UTF16, Boolean.TRUE, true);
2926
2927 /** UTF-16, little-endian **/
2928 public static final EncodingInfo UTF_16_LITTLE_ENDIAN =
2929 new EncodingInfo(STR_UTF16LE, STR_UTF16, Boolean.FALSE, false);
2930
2931 /** UTF-16, little-endian with BOM **/
2932 public static final EncodingInfo UTF_16_LITTLE_ENDIAN_WITH_BOM =
2933 new EncodingInfo(STR_UTF16LE, STR_UTF16, Boolean.FALSE, true);
2934
2935 /** UCS-4, big-endian **/
2936 public static final EncodingInfo UCS_4_BIG_ENDIAN =
2937 new EncodingInfo(STR_UCS4, Boolean.TRUE, false);
2938
2939 /** UCS-4, little-endian **/
2940 public static final EncodingInfo UCS_4_LITTLE_ENDIAN =
2941 new EncodingInfo(STR_UCS4, Boolean.FALSE, false);
2942
2943 /** UCS-4, unusual byte-order (2143) or (3412) **/
2944 public static final EncodingInfo UCS_4_UNUSUAL_BYTE_ORDER =
2945 new EncodingInfo(STR_UCS4, null, false);
2946
2947 /** EBCDIC **/
2948 public static final EncodingInfo EBCDIC = new EncodingInfo(STR_CP037, null, false);
2949
2950 public final String autoDetectedEncoding;
2951 public final String readerEncoding;
2952 public final Boolean isBigEndian;
2953 public final boolean hasBOM;
2954
2955 private EncodingInfo(String autoDetectedEncoding, Boolean isBigEndian, boolean hasBOM) {
2956 this(autoDetectedEncoding, autoDetectedEncoding, isBigEndian, hasBOM);
2957 } // <init>(String,Boolean,boolean)
2958
2959 private EncodingInfo(String autoDetectedEncoding, String readerEncoding,
2960 Boolean isBigEndian, boolean hasBOM) {
2961 this.autoDetectedEncoding = autoDetectedEncoding;
2962 this.readerEncoding = readerEncoding;
2963 this.isBigEndian = isBigEndian;
2964 this.hasBOM = hasBOM;
2965 } // <init>(String,String,Boolean,boolean)
2966
2967 } // class EncodingInfo
2968
2969 /**
2970 * This class wraps the byte inputstreams we're presented with.
2971 * We need it because java.io.InputStreams don't provide
2972 * functionality to reread processed bytes, and they have a habit
2973 * of reading more than one character when you call their read()
2974 * methods. This means that, once we discover the true (declared)
2975 * encoding of a document, we can neither backtrack to read the
2976 * whole doc again nor start reading where we are with a new
2977 * reader.
2978 *
2979 * This class allows rewinding an inputStream by allowing a mark
2980 * to be set, and the stream reset to that position. <strong>The
2981 * class assumes that it needs to read one character per
2982 * invocation when its read() method is invoked, but uses the
2983 * underlying InputStream's read(byte[], offset, length) method--it
2984 * won't buffer data read this way!</strong>
2985 *
2986 * @xerces.internal
2987 *
3000 private int fMark;
3001
/**
 * Wraps the given stream with an initially empty replay buffer of
 * DEFAULT_XMLDECL_BUFFER_SIZE bytes.
 *
 * @param is the stream to read from
 */
public RewindableInputStream(InputStream is) {
    fInputStream = is;
    fData = new byte[DEFAULT_XMLDECL_BUFFER_SIZE];

    // Nothing buffered and nothing consumed yet.
    fLength = 0;
    fOffset = 0;

    // Start offset and mark both begin at the first byte.
    fMark = 0;
    fStartOffset = 0;

    // -1: no end-of-stream position has been recorded yet.
    fEndOffset = -1;
}
3011
3012 public void setStartOffset(int offset) {
3013 fStartOffset = offset;
3014 }
3015
3016 public void rewind() {
3017 fOffset = fStartOffset;
3018 }
3019
3020 public int readAndBuffer() throws IOException {
3021 if (fOffset == fData.length) {
3022 byte[] newData = new byte[fOffset << 1];
3023 System.arraycopy(fData, 0, newData, 0, fOffset);
3024 fData = newData;
3025 }
3026 final int b = fInputStream.read();
3027 if (b == -1) {
3028 fEndOffset = fOffset;
3029 return -1;
3030 }
3031 fData[fLength++] = (byte)b;
3032 fOffset++;
3033 return b & 0xff;
3034 }
3035
3036 public int read() throws IOException {
3037 if (fOffset < fLength) {
3038 return fData[fOffset++] & 0xff;
3039 }
3040 if (fOffset == fEndOffset) {
3041 return -1;
3042 }
3043 if (fCurrentEntity.mayReadChunks) {
3044 return fInputStream.read();
3045 }
3046 return readAndBuffer();
3047 }
3048
3049 public int read(byte[] b, int off, int len) throws IOException {
3050 final int bytesLeft = fLength - fOffset;
3051 if (bytesLeft == 0) {
3052 if (fOffset == fEndOffset) {
3053 return -1;
3054 }
3055
3056 // read a block of data as requested
3057 if(fCurrentEntity.mayReadChunks || !fCurrentEntity.xmlDeclChunkRead) {
3058
3059 if (!fCurrentEntity.xmlDeclChunkRead)
3060 {
3061 fCurrentEntity.xmlDeclChunkRead = true;
3062 len = Entity.ScannedEntity.DEFAULT_XMLDECL_BUFFER_SIZE;
3063 }
3064 return fInputStream.read(b, off, len);
3065 }
3066 int returnedVal = readAndBuffer();
3067 if (returnedVal == -1) {
3068 fEndOffset = fOffset;
3069 return -1;
3070 }
3071 b[off] = (byte)returnedVal;
3072 return 1;
3073 }
3074 if (len < bytesLeft) {
3075 if (len <= 0) {
3076 return 0;
3077 }
3078 } else {
3079 len = bytesLeft;
3080 }
3081 if (b != null) {
3082 System.arraycopy(fData, fOffset, b, off, len);
3083 }
3084 fOffset += len;
3085 return len;
3086 }
3087
3088 public long skip(long n) throws IOException {
3089 int bytesLeft;
3090 if (n <= 0) {
3091 return 0;
3092 }
3093 bytesLeft = fLength - fOffset;
3094 if (bytesLeft == 0) {
3095 if (fOffset == fEndOffset) {
3096 return 0;
3097 }
3098 return fInputStream.skip(n);
3099 }
3100 if (n <= bytesLeft) {
3101 fOffset += n;
3102 return n;
3103 }
3104 fOffset += bytesLeft;
3105 if (fOffset == fEndOffset) {
3106 return bytesLeft;
3107 }
3108 n -= bytesLeft;
3109 /*
3110 * In a manner of speaking, when this class isn't permitting more
3111 * than one byte at a time to be read, it is "blocking". The
3112 * available() method should indicate how much can be read without
3113 * blocking, so while we're in this mode, it should only indicate
3114 * that bytes in its buffer are available; otherwise, the result of
3115 * available() on the underlying InputStream is appropriate.
3116 */
3117 return fInputStream.skip(n) + bytesLeft;
3118 }
3119
3120 public int available() throws IOException {
3121 final int bytesLeft = fLength - fOffset;
3122 if (bytesLeft == 0) {
3123 if (fOffset == fEndOffset) {
3124 return -1;
3125 }
3126 return fCurrentEntity.mayReadChunks ? fInputStream.available()
3127 : 0;
3128 }
3129 return bytesLeft;
3130 }
3131
3132 public void mark(int howMuch) {
3133 fMark = fOffset;
3134 }
3135
3136 public void reset() {
3137 fOffset = fMark;
3138 }
3139
3140 public boolean markSupported() {
3141 return true;
3142 }
3143
3144 public void close() throws IOException {
3145 if (fInputStream != null) {
3146 fInputStream.close();
3147 fInputStream = null;
3148 }
3149 }
3150 } // end of RewindableInputStream class
3151
3152 public void test(){
3153 //System.out.println("TESTING: Added familytree to entityManager");
3154 //Usecase1
3155 fEntityStorage.addExternalEntity("entityUsecase1",null,
3156 "/space/home/stax/sun/6thJan2004/zephyr/data/test.txt",
3157 "/space/home/stax/sun/6thJan2004/zephyr/data/entity.xml");
|