1 /*
2 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28 import org.xml.sax.EntityResolver;
29 import org.xml.sax.InputSource;
30
31 import java.io.File;
32 import java.io.FileInputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.net.URL;
36 import java.net.URLConnection;
37 import java.util.Hashtable;
38
39 /**
40 * This entity resolver class provides a number of utilities which can help
41 * management of external parsed entities in XML. These are commonly used
42 * to hold markup declarations that are to be used as part of a Document
43 * Type Declaration (DTD), or to hold text marked up with XML.
44 * <p/>
45 * <P> Features include: <UL>
46 * <p/>
47 * <LI> Static factory methods are provided for constructing SAX InputSource
48 * objects from Files, URLs, or MIME objects. This eliminates a class of
49 * error-prone coding in applications.
50 * <p/>
51 * <LI> Character encodings for XML documents are correctly supported: <UL>
52 * <p/>
53 * <LI> The encodings defined in the RFCs for MIME content types
54 * (2046 for general MIME, and 2376 for XML in particular), are
55 * supported, handling <em>charset=...</em> attributes and accepting
56 * content types which are known to be safe for use with XML;
57 * <p/>
58 * <LI> The character encoding autodetection algorithm identified
59 * in the XML specification is used, and leverages all of
60 * the JDK 1.1 (and later) character encoding support.
61 * <p/>
62 * <LI> The use of MIME typing may optionally be disabled, forcing the
63 * use of autodetection, to support web servers which don't correctly
64 * report MIME types for XML. For example, they may report text that
65 * is encoded in EUC-JP as being US-ASCII text, leading to fatal
66 * errors during parsing.
67 * <p/>
68 * <LI> The InputSource objects returned by this class always
69 * have a <code>java.io.Reader</code> available as the "character
70 * stream" property.
71 * <p/>
72 * </UL>
73 * <p/>
74 * <LI> Catalog entries can map public identifiers to Java resources or
75 * to local URLs. These are used to reduce network dependencies and loads,
76 * and will often be used for external DTD components. For example, packages
77 * shipping DTD files as resources in JAR files can eliminate network traffic
78 * when accessing them, and sites may provide local caches of common DTDs.
79 * Note that no particular catalog syntax is supported by this class, only
80 * the notion of a set of entries.
81 * <p/>
82 * </UL>
83 * <p/>
84 * <P> Subclasses can perform tasks such as supporting new URI schemes for
85 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
86 * MIME entities which are part of a <em>multipart/related</em> group
87 * (see RFC 2387). They may also be used to support particular catalog
88 * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
89 * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
90 * Public Identifiers" (FPIs).
91 *
92 * @author David Brownell
93 * @author Janet Koenig
94 * @version 1.3 00/02/24
95 */
96 public class Resolver implements EntityResolver {
97 private boolean ignoringMIME;
98
99 // table mapping public IDs to (local) URIs
100 private Hashtable id2uri;
101
102 // tables mapping public IDs to resources and classloaders
103 private Hashtable id2resource;
109 // idea is to rule out obvious braindamage ("image/jpg")
110 // not the subtle stuff ("text/html") that might actually
111 // be (or become) safe.
112 //
// Entries are lowercase and carry no attributes, which matches how
// createInputSource lowercases the content type and cuts it at the first ';'.
// NOTE(review): presumably this is the list consulted when createInputSource
// is called with checkType == true; the lookup is not visible here -- confirm.
113 private static final String types [] = {
114 "application/xml",
115 "text/xml",
116 "text/plain",
117 "text/html", // commonly mis-inferred
118 "application/x-netcdf", // this is often illegal XML
119 "content/unknown"
120 };
121
122 /**
123 * Constructs a resolver.
* The constructor itself does no work; registerCatalogEntry allocates the
* resource and class-loader tables lazily on the first registration.
124 */
125 public Resolver() {
126 }
127
128 /**
129 * Returns an input source, using the MIME type information and URL
130 * scheme to statically determine the correct character encoding if
131 * possible and otherwise autodetecting it. MIME carefully specifies
132 * the character encoding defaults, and how attributes of the content
133 * type can change it. XML further specifies two mandatory encodings
134 * (UTF-8 and UTF-16), and includes an XML declaration which can be
135 * used to internally label most documents encoded using US-ASCII
136 * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
137 * more).
138 * <p/>
139 * <P> This method can be used to access XML documents which do not
140 * have URIs (such as servlet input streams, or most JavaMail message
141 * entities) and to support access methods such as HTTP POST or PUT.
142 * (URLs normally return content using the GET method.)
143 * <p/>
144 * <P> <em> The caller should set the system ID in order for relative URIs
145 * found in this document to be interpreted correctly.</em> In some cases,
146 * a custom resolver will need to be used; for example, documents
147 * may be grouped in a single MIME "multipart/related" bundle, and
148 * relative URLs would refer to other documents in that bundle.
149 *
150 * @param contentType The MIME content type for the source for which
151 * an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
152 * @param stream The input byte stream for the input source.
153 * @param checkType If true, this verifies that the content type is known
154 * to support XML documents, such as <em>application/xml</em>.
155 * @param scheme Unless this is "file", unspecified MIME types
156 * default to US-ASCII. Files are always autodetected since most
157 * file systems discard character encoding information.
158 */
159 public static InputSource createInputSource(String contentType,
160 InputStream stream,
161 boolean checkType,
162 String scheme) throws IOException {
163 InputSource retval;
164 String charset = null;
165
166 if (contentType != null) {
167 int index;
168
169 contentType = contentType.toLowerCase();
170 index = contentType.indexOf(';');
171 if (index != -1) {
172 String attributes;
173
174 attributes = contentType.substring(index + 1);
175 contentType = contentType.substring(0, index);
176
177 // use "charset=..." if it's available
178 index = attributes.indexOf("charset");
179 if (index != -1) {
180 attributes = attributes.substring(index + 7);
181 // strip out subsequent attributes
182 if ((index = attributes.indexOf(';')) != -1)
183 attributes = attributes.substring(0, index);
184 // find start of value
185 if ((index = attributes.indexOf('=')) != -1) {
186 attributes = attributes.substring(index + 1);
187 // strip out rfc822 comments
188 if ((index = attributes.indexOf('(')) != -1)
189 attributes = attributes.substring(0, index);
278 if (File.separatorChar != '/')
279 path = path.replace(File.separatorChar, '/');
280 if (!path.startsWith("/"))
281 path = "/" + path;
282 if (!path.endsWith("/") && file.isDirectory())
283 path = path + "/";
284
285 retval.setSystemId("file:" + path);
286 return retval;
287 }
288
289
290 /**
291 * <b>SAX:</b>
292 * Resolve the given entity into an input source. If the name can't
293 * be mapped to a preferred form of the entity, the URI is used. To
294 * resolve the entity, first a local catalog mapping names to URIs is
295 * consulted. If no mapping is found there, a catalog mapping names
296 * to java resources is consulted. Finally, if neither mapping found
297 * a copy of the entity, the specified URI is used.
298 * <p/>
299 * <P> When a URI is used, <a href="#createInputSource">
300 * createInputSource</a> is used to correctly deduce the character
301 * encoding used by this entity. No MIME type checking is done.
302 *
303 * @param name Used to find alternate copies of the entity, when
304 * this value is non-null; this is the XML "public ID".
305 * @param uri Used when no alternate copy of the entity is found;
306 * this is the XML "system ID", normally a URI.
307 */
308 public InputSource resolveEntity(String name, String uri)
309 throws IOException {
310 InputSource retval;
311 String mappedURI = name2uri(name);
312 InputStream stream;
313
314 // prefer explicit URI mappings, then bundled resources...
315 if (mappedURI == null && (stream = mapResource(name)) != null) {
316 uri = "java:resource:" + (String) id2resource.get(name);
317 retval = new InputSource(XmlReader.createReader(stream));
318
319 // ...and treat all URIs the same (as URLs for now).
320 } else {
321 URL url;
322 URLConnection conn;
323
324 if (mappedURI != null)
325 uri = mappedURI;
326 else if (uri == null)
327 return null;
328
329 url = new URL(uri);
330 conn = url.openConnection();
331 uri = conn.getURL().toString();
332 // System.out.println ("++ URI: " + url);
333 if (ignoringMIME)
334 retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
335 else {
406
407 if (resourceName == null)
408 return null;
409 // System.out.println ("++ Resource: " + resourceName);
410
411 if (id2loader != null)
412 loader = (ClassLoader) id2loader.get(publicId);
413 // System.out.println ("++ Loader: " + loader);
414 if (loader == null)
415 return ClassLoader.getSystemResourceAsStream(resourceName);
416 return loader.getResourceAsStream(resourceName);
417 }
418
419 /**
420 * Registers a given public ID as corresponding to a particular Java
421 * resource in a given class loader, typically distributed with a
422 * software package. This resource will be preferred over system IDs
423 * included in XML documents. This mechanism should most typically be
424 * used for Document Type Definitions (DTDs), where the public IDs are
425 * formally managed and versioned.
426 * <p/>
427 * <P> If a mapping to a URI has been provided, that mapping takes
428 * precedence over this one.
429 *
430 * @param publicId The managed public ID being mapped
431 * @param resourceName The name of the Java resource
432 * @param loader The class loader holding the resource, or null if
433 * it is a system resource.
434 */
435 public void registerCatalogEntry(String publicId,
436 String resourceName,
437 ClassLoader loader) {
438 if (id2resource == null)
439 id2resource = new Hashtable(17);
440 id2resource.put(publicId, resourceName);
441
442 if (loader != null) {
443 if (id2loader == null)
444 id2loader = new Hashtable(17);
445 id2loader.put(publicId, loader);
446 }
|
1 /*
2 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28 import org.xml.sax.EntityResolver;
29 import org.xml.sax.InputSource;
30
31 import java.io.File;
32 import java.io.FileInputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.net.URL;
36 import java.net.URLConnection;
37 import java.util.Hashtable;
38 import java.util.Locale;
39
40 /**
41 * This entity resolver class provides a number of utilities which can help
42 * management of external parsed entities in XML. These are commonly used
43 * to hold markup declarations that are to be used as part of a Document
44 * Type Declaration (DTD), or to hold text marked up with XML.
45 * <p>
46 * <P> Features include: <UL>
47 *
48 * <LI> Static factory methods are provided for constructing SAX InputSource
49 * objects from Files, URLs, or MIME objects. This eliminates a class of
50 * error-prone coding in applications.</LI>
51 *
52 * <LI> Character encodings for XML documents are correctly supported:<UL>
53 *
54 * <LI> The encodings defined in the RFCs for MIME content types
55 * (2046 for general MIME, and 2376 for XML in particular), are
56 * supported, handling <em>charset=...</em> attributes and accepting
57 * content types which are known to be safe for use with XML;</LI>
58 *
59 * <LI> The character encoding autodetection algorithm identified
60 * in the XML specification is used, and leverages all of
61 * the JDK 1.1 (and later) character encoding support.</LI>
62 *
63 * <LI> The use of MIME typing may optionally be disabled, forcing the
64 * use of autodetection, to support web servers which don't correctly
65 * report MIME types for XML. For example, they may report text that
66 * is encoded in EUC-JP as being US-ASCII text, leading to fatal
67 * errors during parsing.</LI>
68 *
69 * <LI> The InputSource objects returned by this class always
70 * have a <code>java.io.Reader</code> available as the "character
71 * stream" property.</LI>
72 *
73 * </UL></LI>
74 *
75 * <LI> Catalog entries can map public identifiers to Java resources or
76 * to local URLs. These are used to reduce network dependencies and loads,
77 * and will often be used for external DTD components. For example, packages
78 * shipping DTD files as resources in JAR files can eliminate network traffic
79 * when accessing them, and sites may provide local caches of common DTDs.
80 * Note that no particular catalog syntax is supported by this class, only
81 * the notion of a set of entries.</LI>
82 *
83 * </UL>
84 * <p>
85 * <P> Subclasses can perform tasks such as supporting new URI schemes for
86 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
87 * MIME entities which are part of a <em>multipart/related</em> group
88 * (see RFC 2387). They may also be used to support particular catalog
89 * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
90 * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
91 * Public Identifiers" (FPIs).
92 *
93 * @author David Brownell
94 * @author Janet Koenig
95 * @version 1.3 00/02/24
96 */
97 public class Resolver implements EntityResolver {
98 private boolean ignoringMIME;
99
100 // table mapping public IDs to (local) URIs
101 private Hashtable id2uri;
102
103 // tables mapping public IDs to resources and classloaders
104 private Hashtable id2resource;
110 // idea is to rule out obvious braindamage ("image/jpg")
111 // not the subtle stuff ("text/html") that might actually
112 // be (or become) safe.
113 //
// Entries are lowercase and carry no attributes, which matches how
// createInputSource lowercases the content type and cuts it at the first ';'.
// NOTE(review): presumably this is the list consulted when createInputSource
// is called with checkType == true; the lookup is not visible here -- confirm.
114 private static final String types [] = {
115 "application/xml",
116 "text/xml",
117 "text/plain",
118 "text/html", // commonly mis-inferred
119 "application/x-netcdf", // this is often illegal XML
120 "content/unknown"
121 };
122
123 /**
124 * Constructs a resolver.
* The constructor itself does no work; registerCatalogEntry allocates the
* resource and class-loader tables lazily on the first registration.
125 */
126 public Resolver() {
127 }
128
129 /**
130 * <p>Returns an input source, using the MIME type information and URL
131 * scheme to statically determine the correct character encoding if
132 * possible and otherwise autodetecting it. MIME carefully specifies
133 * the character encoding defaults, and how attributes of the content
134 * type can change it. XML further specifies two mandatory encodings
135 * (UTF-8 and UTF-16), and includes an XML declaration which can be
136 * used to internally label most documents encoded using US-ASCII
137 * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
138 * more).</p>
139 *
140 * <p> This method can be used to access XML documents which do not
141 * have URIs (such as servlet input streams, or most JavaMail message
142 * entities) and to support access methods such as HTTP POST or PUT.
143 * (URLs normally return content using the GET method.)</p>
144 *
145 * <p> <em> The caller should set the system ID in order for relative URIs
146 * found in this document to be interpreted correctly.</em> In some cases,
147 * a custom resolver will need to be used; for example, documents
148 * may be grouped in a single MIME "multipart/related" bundle, and
149 * relative URLs would refer to other documents in that bundle.</p>
150 *
151 * @param contentType The MIME content type for the source for which
152 * an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
153 * @param stream The input byte stream for the input source.
154 * @param checkType If true, this verifies that the content type is known
155 * to support XML documents, such as <em>application/xml</em>.
156 * @param scheme Unless this is "file", unspecified MIME types
157 * default to US-ASCII. Files are always autodetected since most
158 * file systems discard character encoding information.
159 */
160 public static InputSource createInputSource(String contentType,
161 InputStream stream,
162 boolean checkType,
163 String scheme) throws IOException {
164 InputSource retval;
165 String charset = null;
166
167 if (contentType != null) {
168 int index;
169
170 contentType = contentType.toLowerCase(Locale.ENGLISH);
171 index = contentType.indexOf(';');
172 if (index != -1) {
173 String attributes;
174
175 attributes = contentType.substring(index + 1);
176 contentType = contentType.substring(0, index);
177
178 // use "charset=..." if it's available
179 index = attributes.indexOf("charset");
180 if (index != -1) {
181 attributes = attributes.substring(index + 7);
182 // strip out subsequent attributes
183 if ((index = attributes.indexOf(';')) != -1)
184 attributes = attributes.substring(0, index);
185 // find start of value
186 if ((index = attributes.indexOf('=')) != -1) {
187 attributes = attributes.substring(index + 1);
188 // strip out rfc822 comments
189 if ((index = attributes.indexOf('(')) != -1)
190 attributes = attributes.substring(0, index);
279 if (File.separatorChar != '/')
280 path = path.replace(File.separatorChar, '/');
281 if (!path.startsWith("/"))
282 path = "/" + path;
283 if (!path.endsWith("/") && file.isDirectory())
284 path = path + "/";
285
286 retval.setSystemId("file:" + path);
287 return retval;
288 }
289
290
291 /**
292 * <b>SAX:</b>
293 * Resolve the given entity into an input source. If the name can't
294 * be mapped to a preferred form of the entity, the URI is used. To
295 * resolve the entity, first a local catalog mapping names to URIs is
296 * consulted. If no mapping is found there, a catalog mapping names
297 * to java resources is consulted. Finally, if neither mapping found
298 * a copy of the entity, the specified URI is used.
299 * <p>
300 * <P> When a URI is used, <a href="#createInputSource">
301 * createInputSource</a> is used to correctly deduce the character
302 * encoding used by this entity. No MIME type checking is done.
303 *
304 * @param name Used to find alternate copies of the entity, when
305 * this value is non-null; this is the XML "public ID".
306 * @param uri Used when no alternate copy of the entity is found;
307 * this is the XML "system ID", normally a URI.
308 */
309 @Override
310 public InputSource resolveEntity(String name, String uri)
311 throws IOException {
312 InputSource retval;
313 String mappedURI = name2uri(name);
314 InputStream stream;
315
316 // prefer explicit URI mappings, then bundled resources...
317 if (mappedURI == null && (stream = mapResource(name)) != null && id2resource != null) {
318 uri = "java:resource:" + (String) id2resource.get(name);
319 retval = new InputSource(XmlReader.createReader(stream));
320
321 // ...and treat all URIs the same (as URLs for now).
322 } else {
323 URL url;
324 URLConnection conn;
325
326 if (mappedURI != null)
327 uri = mappedURI;
328 else if (uri == null)
329 return null;
330
331 url = new URL(uri);
332 conn = url.openConnection();
333 uri = conn.getURL().toString();
334 // System.out.println ("++ URI: " + url);
335 if (ignoringMIME)
336 retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
337 else {
408
409 if (resourceName == null)
410 return null;
411 // System.out.println ("++ Resource: " + resourceName);
412
413 if (id2loader != null)
414 loader = (ClassLoader) id2loader.get(publicId);
415 // System.out.println ("++ Loader: " + loader);
416 if (loader == null)
417 return ClassLoader.getSystemResourceAsStream(resourceName);
418 return loader.getResourceAsStream(resourceName);
419 }
420
421 /**
422 * Registers a given public ID as corresponding to a particular Java
423 * resource in a given class loader, typically distributed with a
424 * software package. This resource will be preferred over system IDs
425 * included in XML documents. This mechanism should most typically be
426 * used for Document Type Definitions (DTDs), where the public IDs are
427 * formally managed and versioned.
428 * <p>
429 * <P> If a mapping to a URI has been provided, that mapping takes
430 * precedence over this one.
431 *
432 * @param publicId The managed public ID being mapped
433 * @param resourceName The name of the Java resource
434 * @param loader The class loader holding the resource, or null if
435 * it is a system resource.
436 */
437 public void registerCatalogEntry(String publicId,
438 String resourceName,
439 ClassLoader loader) {
440 if (id2resource == null)
441 id2resource = new Hashtable(17);
442 id2resource.put(publicId, resourceName);
443
444 if (loader != null) {
445 if (id2loader == null)
446 id2loader = new Hashtable(17);
447 id2loader.put(publicId, loader);
448 }
|