1 /*
2 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28 import java.io.ByteArrayInputStream;
29 import java.io.CharConversionException;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.InputStreamReader;
33 import java.io.PushbackInputStream;
34 import java.io.Reader;
35 import java.util.Hashtable;
36
37
38 // NOTE: Add I18N support to this class when JDK gets the ability to
39 // defer selection of locale for exception messages ... use the same
40 // technique for both.
41
42
43 /**
44 * This handles several XML-related tasks that normal java.io Readers
45 * don't support, including use of IETF standard encoding names and
46 * automatic detection of most XML encodings. The former is needed
47 * for interoperability; the latter is needed to conform with the XML
48 * spec. This class also optimizes reading some common encodings by
49 * providing low-overhead unsynchronized Reader support.
50 * <p/>
51 * <P> Note that the autodetection facility should be used only on
52 * data streams which have an unknown character encoding. For example,
53 * it should never be used on MIME text/xml entities.
54 * <p/>
55 * <P> Note that XML processors are only required to support UTF-8 and
69 private String assignedEncoding;
70 private boolean closed;
71
72 //
73 // This class always delegates I/O to a reader, which gets
74 // its data from the very beginning of the XML text. It needs
75 // to use a pushback stream since (a) autodetection can read
76 // partial UTF-8 characters which need to be fully processed,
77 // (b) the "Unicode" readers swallow characters that they think
78 // are byte order marks, so tests fail if they don't see the
79 // real byte order mark.
80 //
81 // It's got to do this efficiently: character I/O is solidly on the
82 // critical path. (So keep buffer length over 2 Kbytes to avoid
83 // excess buffering. Many URL handlers stuff a BufferedInputStream
84 // between here and the real data source, and larger buffers keep
85 // that from slowing you down.)
86 //
87
88 /**
89 * Constructs the reader from an input stream, autodetecting
90 * the encoding to use according to the heuristic specified
91 * in the XML 1.0 recommendation.
92 *
93 * @param in the input stream from which the reader is constructed
94 * @throws IOException on error, such as unrecognized encoding
95 */
96 public static Reader createReader(InputStream in) throws IOException {
97 return new XmlReader(in);
98 }
99
100 /**
101 * Creates a reader supporting the given encoding, mapping
102 * from standard encoding names to ones that are understood by
103 * Java where necessary.
104 *
105 * @param in the input stream from which the reader is constructed
106 * @param encoding the IETF standard name of the encoding to use;
107 * if null, autodetection is used.
108 * @throws IOException on error, including unrecognized encoding
109 */
110 public static Reader createReader(InputStream in, String encoding)
111 throws IOException {
112 if (encoding == null)
113 return new XmlReader(in);
114 if ("UTF-8".equalsIgnoreCase(encoding)
115 || "UTF8".equalsIgnoreCase(encoding))
116 return new Utf8Reader(in);
117 if ("US-ASCII".equalsIgnoreCase(encoding)
118 || "ASCII".equalsIgnoreCase(encoding))
119 return new AsciiReader(in);
120 if ("ISO-8859-1".equalsIgnoreCase(encoding)
121 // plus numerous aliases ...
122 )
123 return new Iso8859_1Reader(in);
124
125 //
126 // What we really want is an administerable resource mapping
127 // encoding names/aliases to classnames. For example a property
161 charsets.put("EBCDIC-CP-FR", "cp297");
162
163 charsets.put("EBCDIC-CP-AR1", "cp420");
164 charsets.put("EBCDIC-CP-HE", "cp424");
165 charsets.put("EBCDIC-CP-BE", "cp500");
166 charsets.put("EBCDIC-CP-CH", "cp500");
167
168 charsets.put("EBCDIC-CP-ROECE", "cp870");
169 charsets.put("EBCDIC-CP-YU", "cp870");
170 charsets.put("EBCDIC-CP-IS", "cp871");
171 charsets.put("EBCDIC-CP-AR2", "cp918");
172
173 // IANA also defines two that JDK 1.2 doesn't handle:
174 // EBCDIC-CP-GR --> CP423
175 // EBCDIC-CP-TR --> CP905
176 }
177
178 // returns an encoding name supported by JDK >= 1.1.6
179 // for some cases required by the XML spec
180 private static String std2java(String encoding) {
181 String temp = encoding.toUpperCase();
182 temp = (String) charsets.get(temp);
183 return temp != null ? temp : encoding;
184 }
185
186 /**
187 * Returns the standard name of the encoding in use
188 */
189 public String getEncoding() {
190 return assignedEncoding;
191 }
192
193 private XmlReader(InputStream stream) throws IOException {
194 super(stream);
195
196 PushbackInputStream pb;
197 byte buf [];
198 int len;
199
200 if (stream instanceof PushbackInputStream)
201 pb = (PushbackInputStream) stream;
304 byte buffer [] = new byte[MAXPUSHBACK];
305 int len;
306 Reader r;
307 int c;
308
309 //
310 // Buffer up a bunch of input, and set up to read it in
311 // the specified encoding ... we can skip the first four
312 // bytes since we know that "<?xm" was read to determine
313 // what encoding to use!
314 //
315 len = pb.read(buffer, 0, buffer.length);
316 pb.unread(buffer, 0, len);
317 r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
318 encoding);
319
320 //
321 // Next must be "l" (and whitespace) else we conclude
322 // error and choose UTF-8.
323 //
324 if ((c = r.read()) != 'l') {
325 setEncoding(pb, "UTF-8");
326 return;
327 }
328
329 //
330 // Then, we'll skip any
331 // S version="..." [or single quotes]
332 // bit and get any subsequent
333 // S encoding="..." [or single quotes]
334 //
335 // We put an arbitrary size limit on how far we read; lots
336 // of space will break this algorithm.
337 //
338 StringBuffer buf = new StringBuffer();
339 StringBuffer keyBuf = null;
340 String key = null;
341 boolean sawEq = false;
342 char quoteChar = 0;
343 boolean sawQuestion = false;
344
741 break;
742 }
743 }
744 c = buffer[start++];
745 if ((c & 0x80) != 0)
746 throw new CharConversionException("Illegal ASCII character, 0x"
747 + Integer.toHexString(c & 0xff));
748 buf[offset + i] = (char) c;
749 }
750 if (i == 0 && finish <= 0)
751 return -1;
752 return i;
753 }
754 }
755
756 static final class Iso8859_1Reader extends BaseReader {
757 Iso8859_1Reader(InputStream in) {
758 super(in);
759 }
760
761 public int read(char buf [], int offset, int len) throws IOException {
762 int i;
763
764 if (instream == null)
765 return -1;
766
767 for (i = 0; i < len; i++) {
768 if (start >= finish) {
769 start = 0;
770 finish = instream.read(buffer, 0, buffer.length);
771 if (finish <= 0) {
772 if (finish <= 0)
773 this.close();
774 break;
775 }
776 }
777 buf[offset + i] = (char) (0x0ff & buffer[start++]);
778 }
779 if (i == 0 && finish <= 0)
780 return -1;
|
1 /*
2 * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28 import java.io.ByteArrayInputStream;
29 import java.io.CharConversionException;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.InputStreamReader;
33 import java.io.PushbackInputStream;
34 import java.io.Reader;
35 import java.util.Hashtable;
36 import java.util.Locale;
37
38
39 // NOTE: Add I18N support to this class when JDK gets the ability to
40 // defer selection of locale for exception messages ... use the same
41 // technique for both.
42
43
44 /**
45 * This handles several XML-related tasks that normal java.io Readers
46 * don't support, including use of IETF standard encoding names and
47 * automatic detection of most XML encodings. The former is needed
48 * for interoperability; the latter is needed to conform with the XML
49 * spec. This class also optimizes reading some common encodings by
50 * providing low-overhead unsynchronized Reader support.
51 * <p/>
52 * <P> Note that the autodetection facility should be used only on
53 * data streams which have an unknown character encoding. For example,
54 * it should never be used on MIME text/xml entities.
55 * <p/>
56 * <P> Note that XML processors are only required to support UTF-8 and
70 private String assignedEncoding;
71 private boolean closed;
72
73 //
74 // This class always delegates I/O to a reader, which gets
75 // its data from the very beginning of the XML text. It needs
76 // to use a pushback stream since (a) autodetection can read
77 // partial UTF-8 characters which need to be fully processed,
78 // (b) the "Unicode" readers swallow characters that they think
79 // are byte order marks, so tests fail if they don't see the
80 // real byte order mark.
81 //
82 // It's got to do this efficiently: character I/O is solidly on the
83 // critical path. (So keep buffer length over 2 Kbytes to avoid
84 // excess buffering. Many URL handlers stuff a BufferedInputStream
85 // between here and the real data source, and larger buffers keep
86 // that from slowing you down.)
87 //
88
89 /**
90 * Constructs the reader from an input stream, auto-detecting
91 * the encoding to use according to the heuristic specified
92 * in the XML 1.0 recommendation.
93 *
94 * @param in the input stream from which the reader is constructed
95 * @throws IOException on error, such as unrecognized encoding
96 */
97 public static Reader createReader(InputStream in) throws IOException {
98 return new XmlReader(in);
99 }
100
101 /**
102 * Creates a reader supporting the given encoding, mapping
103 * from standard encoding names to ones that are understood by
104 * Java where necessary.
105 *
106 * @param in the input stream from which the reader is constructed
107 * @param encoding the IETF standard name of the encoding to use;
108 * if null, auto-detection is used.
109 * @throws IOException on error, including unrecognized encoding
110 */
111 public static Reader createReader(InputStream in, String encoding)
112 throws IOException {
113 if (encoding == null)
114 return new XmlReader(in);
115 if ("UTF-8".equalsIgnoreCase(encoding)
116 || "UTF8".equalsIgnoreCase(encoding))
117 return new Utf8Reader(in);
118 if ("US-ASCII".equalsIgnoreCase(encoding)
119 || "ASCII".equalsIgnoreCase(encoding))
120 return new AsciiReader(in);
121 if ("ISO-8859-1".equalsIgnoreCase(encoding)
122 // plus numerous aliases ...
123 )
124 return new Iso8859_1Reader(in);
125
126 //
127 // What we really want is an administerable resource mapping
128 // encoding names/aliases to classnames. For example a property
162 charsets.put("EBCDIC-CP-FR", "cp297");
163
164 charsets.put("EBCDIC-CP-AR1", "cp420");
165 charsets.put("EBCDIC-CP-HE", "cp424");
166 charsets.put("EBCDIC-CP-BE", "cp500");
167 charsets.put("EBCDIC-CP-CH", "cp500");
168
169 charsets.put("EBCDIC-CP-ROECE", "cp870");
170 charsets.put("EBCDIC-CP-YU", "cp870");
171 charsets.put("EBCDIC-CP-IS", "cp871");
172 charsets.put("EBCDIC-CP-AR2", "cp918");
173
174 // IANA also defines two that JDK 1.2 doesn't handle:
175 // EBCDIC-CP-GR --> CP423
176 // EBCDIC-CP-TR --> CP905
177 }
178
179 // returns an encoding name supported by JDK >= 1.1.6
180 // for some cases required by the XML spec
181 private static String std2java(String encoding) {
182 String temp = encoding.toUpperCase(Locale.ENGLISH);
183 temp = (String) charsets.get(temp);
184 return temp != null ? temp : encoding;
185 }
186
187 /**
188 * Returns the standard name of the encoding in use
189 */
190 public String getEncoding() {
191 return assignedEncoding;
192 }
193
194 private XmlReader(InputStream stream) throws IOException {
195 super(stream);
196
197 PushbackInputStream pb;
198 byte buf [];
199 int len;
200
201 if (stream instanceof PushbackInputStream)
202 pb = (PushbackInputStream) stream;
305 byte buffer [] = new byte[MAXPUSHBACK];
306 int len;
307 Reader r;
308 int c;
309
310 //
311 // Buffer up a bunch of input, and set up to read it in
312 // the specified encoding ... we can skip the first four
313 // bytes since we know that "<?xm" was read to determine
314 // what encoding to use!
315 //
316 len = pb.read(buffer, 0, buffer.length);
317 pb.unread(buffer, 0, len);
318 r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
319 encoding);
320
321 //
322 // Next must be "l" (and whitespace) else we conclude
323 // error and choose UTF-8.
324 //
325 if ((r.read()) != 'l') {
326 setEncoding(pb, "UTF-8");
327 return;
328 }
329
330 //
331 // Then, we'll skip any
332 // S version="..." [or single quotes]
333 // bit and get any subsequent
334 // S encoding="..." [or single quotes]
335 //
336 // We put an arbitrary size limit on how far we read; lots
337 // of space will break this algorithm.
338 //
339 StringBuffer buf = new StringBuffer();
340 StringBuffer keyBuf = null;
341 String key = null;
342 boolean sawEq = false;
343 char quoteChar = 0;
344 boolean sawQuestion = false;
345
742 break;
743 }
744 }
745 c = buffer[start++];
746 if ((c & 0x80) != 0)
747 throw new CharConversionException("Illegal ASCII character, 0x"
748 + Integer.toHexString(c & 0xff));
749 buf[offset + i] = (char) c;
750 }
751 if (i == 0 && finish <= 0)
752 return -1;
753 return i;
754 }
755 }
756
757 static final class Iso8859_1Reader extends BaseReader {
758 Iso8859_1Reader(InputStream in) {
759 super(in);
760 }
761
762 @Override
763 public int read(char buf [], int offset, int len) throws IOException {
764 int i;
765
766 if (instream == null)
767 return -1;
768
769 for (i = 0; i < len; i++) {
770 if (start >= finish) {
771 start = 0;
772 finish = instream.read(buffer, 0, buffer.length);
773 if (finish <= 0) {
774 if (finish <= 0)
775 this.close();
776 break;
777 }
778 }
779 buf[offset + i] = (char) (0x0ff & buffer[start++]);
780 }
781 if (i == 0 && finish <= 0)
782 return -1;
|