< prev index next >

src/jdk.xml.bind/share/classes/com/sun/xml/internal/dtdparser/XmlReader.java

Print this page


   1 /*
   2  * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.dtdparser;
  27 
  28 import java.io.ByteArrayInputStream;
  29 import java.io.CharConversionException;
  30 import java.io.IOException;
  31 import java.io.InputStream;
  32 import java.io.InputStreamReader;
  33 import java.io.PushbackInputStream;
  34 import java.io.Reader;
  35 import java.util.Hashtable;

  36 
  37 
  38 // NOTE:  Add I18N support to this class when JDK gets the ability to
  39 // defer selection of locale for exception messages ... use the same
  40 // technique for both.
  41 
  42 
  43 /**
  44  * This handles several XML-related tasks that normal java.io Readers
  45  * don't support, including use of IETF standard encoding names and
  46  * automatic detection of most XML encodings.  The former is needed
  47  * for interoperability; the latter is needed to conform with the XML
  48  * spec.  This class also optimizes reading some common encodings by
  49  * providing low-overhead unsynchronized Reader support.
  50  * <p/>
  51  * <P> Note that the autodetection facility should be used only on
  52  * data streams which have an unknown character encoding.  For example,
  53  * it should never be used on MIME text/xml entities.
  54  * <p/>
  55  * <P> Note that XML processors are only required to support UTF-8 and


  69     private String assignedEncoding;
  70     private boolean closed;
  71 
  72     //
  73     // This class always delegates I/O to a reader, which gets
  74     // its data from the very beginning of the XML text.  It needs
  75     // to use a pushback stream since (a) autodetection can read
  76     // partial UTF-8 characters which need to be fully processed,
  77     // (b) the "Unicode" readers swallow characters that they think
  78     // are byte order marks, so tests fail if they don't see the
  79     // real byte order mark.
  80     //
  81     // It's got to do this efficiently:  character I/O is solidly on the
  82     // critical path.  (So keep buffer length over 2 Kbytes to avoid
  83     // excess buffering. Many URL handlers stuff a BufferedInputStream
  84     // between here and the real data source, and larger buffers keep
  85     // that from slowing you down.)
  86     //
  87 
  88     /**
  89      * Constructs the reader from an input stream, autodetecting
  90      * the encoding to use according to the heuristic specified
  91      * in the XML 1.0 recommendation; returns a new XmlReader.
  92      *
  93      * @param in the input stream from which the reader is constructed
  94      * @throws IOException on error, such as unrecognized encoding
  95      */
  96     public static Reader createReader(InputStream in) throws IOException {
  97         return new XmlReader(in);
  98     }
  99 
 100     /**
 101      * Creates a reader supporting the given encoding, mapping
 102      * from standard encoding names to ones that are understood by
 103      * Java where necessary.
 104      *
 105      * @param in       the input stream from which the reader is constructed
 106      * @param encoding the IETF standard name of the encoding to use;
 107      *                 if null, autodetection is used.
 108      * @throws IOException on error, including unrecognized encoding
 109      */
 110     public static Reader createReader(InputStream in, String encoding)
 111             throws IOException {
 112         if (encoding == null)
 113             return new XmlReader(in);
 114         if ("UTF-8".equalsIgnoreCase(encoding)
 115                 || "UTF8".equalsIgnoreCase(encoding))
 116             return new Utf8Reader(in);
 117         if ("US-ASCII".equalsIgnoreCase(encoding)
 118                 || "ASCII".equalsIgnoreCase(encoding))
 119             return new AsciiReader(in);
 120         if ("ISO-8859-1".equalsIgnoreCase(encoding)
 121         // plus numerous aliases ...
 122         )
 123             return new Iso8859_1Reader(in);
 124 
 125         //
 126         // What we really want is an administerable resource mapping
 127         // encoding names/aliases to classnames.  For example a property


 161         charsets.put("EBCDIC-CP-FR", "cp297");
 162 
 163         charsets.put("EBCDIC-CP-AR1", "cp420");
 164         charsets.put("EBCDIC-CP-HE", "cp424");
 165         charsets.put("EBCDIC-CP-BE", "cp500");
 166         charsets.put("EBCDIC-CP-CH", "cp500");
 167 
 168         charsets.put("EBCDIC-CP-ROECE", "cp870");
 169         charsets.put("EBCDIC-CP-YU", "cp870");
 170         charsets.put("EBCDIC-CP-IS", "cp871");
 171         charsets.put("EBCDIC-CP-AR2", "cp918");
 172 
 173         // IANA also defines two that JDK 1.2 doesn't handle:
 174         //    EBCDIC-CP-GR        --> CP423
 175         //    EBCDIC-CP-TR        --> CP905
 176     }
 177 
 178     // Returns an encoding name supported by JDK >= 1.1.6 for some cases required
 179     // by the XML spec: the charsets entry for the upper-cased name, else the name itself.
 180     private static String std2java(String encoding) {
 181         String temp = encoding.toUpperCase(); // NOTE(review): no-arg form uses the default locale
 182         temp = (String) charsets.get(temp);
 183         return temp != null ? temp : encoding;
 184     }
 185 
 186     /**
 187      * Returns the standard name of the encoding in use, as held in assignedEncoding.
 188      */
 189     public String getEncoding() {
 190         return assignedEncoding;
 191     }
 192 
 193     private XmlReader(InputStream stream) throws IOException {
 194         super(stream);
 195 
 196         PushbackInputStream pb;
 197         byte buf [];
 198         int len;
 199 
 200         if (stream instanceof PushbackInputStream)
 201             pb = (PushbackInputStream) stream;


 304         byte buffer [] = new byte[MAXPUSHBACK];
 305         int len;
 306         Reader r;
 307         int c;
 308 
 309         //
 310         // Buffer up a bunch of input, and set up to read it in
 311         // the specified encoding ... we can skip the first four
 312         // bytes since we know that "<?xm" was read to determine
 313         // what encoding to use!
 314         //
 315         len = pb.read(buffer, 0, buffer.length);
 316         pb.unread(buffer, 0, len);
 317         r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
 318                 encoding);
 319 
 320         //
 321         // Next must be "l" (and whitespace) else we conclude
 322         // error and choose UTF-8.
 323         //
 324         if ((c = r.read()) != 'l') {
 325             setEncoding(pb, "UTF-8");
 326             return;
 327         }
 328 
 329         //
 330         // Then, we'll skip any
 331         //     S version="..."     [or single quotes]
 332         // bit and get any subsequent
 333         //     S encoding="..."     [or single quotes]
 334         //
 335         // We put an arbitrary size limit on how far we read; lots
 336         // of space will break this algorithm.
 337         //
 338         StringBuffer buf = new StringBuffer();
 339         StringBuffer keyBuf = null;
 340         String key = null;
 341         boolean sawEq = false;
 342         char quoteChar = 0;
 343         boolean sawQuestion = false;
 344 


 741                         break;
 742                     }
 743                 }
 744                 c = buffer[start++];
 745                 if ((c & 0x80) != 0)
 746                     throw new CharConversionException("Illegal ASCII character, 0x"
 747                             + Integer.toHexString(c & 0xff));
 748                 buf[offset + i] = (char) c;
 749             }
 750             if (i == 0 && finish <= 0)
 751                 return -1;
 752             return i;
 753         }
 754     }
 755 
 756     static final class Iso8859_1Reader extends BaseReader {
 757         Iso8859_1Reader(InputStream in) {
 758             super(in);
 759         }
 760 

 761         public int read(char buf [], int offset, int len) throws IOException {
 762             int i;
 763 
 764             if (instream == null)
 765                 return -1;
 766 
 767             for (i = 0; i < len; i++) {
 768                 if (start >= finish) {
 769                     start = 0;
 770                     finish = instream.read(buffer, 0, buffer.length);
 771                     if (finish <= 0) {
 772                         if (finish <= 0)
 773                             this.close();
 774                         break;
 775                     }
 776                 }
 777                 buf[offset + i] = (char) (0x0ff & buffer[start++]);
 778             }
 779             if (i == 0 && finish <= 0)
 780                 return -1;
   1 /*
   2  * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.dtdparser;
  27 
  28 import java.io.ByteArrayInputStream;
  29 import java.io.CharConversionException;
  30 import java.io.IOException;
  31 import java.io.InputStream;
  32 import java.io.InputStreamReader;
  33 import java.io.PushbackInputStream;
  34 import java.io.Reader;
  35 import java.util.Hashtable;
  36 import java.util.Locale;
  37 
  38 
  39 // NOTE:  Add I18N support to this class when JDK gets the ability to
  40 // defer selection of locale for exception messages ... use the same
  41 // technique for both.
  42 
  43 
  44 /**
  45  * This handles several XML-related tasks that normal java.io Readers
  46  * don't support, including use of IETF standard encoding names and
  47  * automatic detection of most XML encodings.  The former is needed
  48  * for interoperability; the latter is needed to conform with the XML
  49  * spec.  This class also optimizes reading some common encodings by
  50  * providing low-overhead unsynchronized Reader support.
  51  * <p/>
  52  * <P> Note that the autodetection facility should be used only on
  53  * data streams which have an unknown character encoding.  For example,
  54  * it should never be used on MIME text/xml entities.
  55  * <p/>
  56  * <P> Note that XML processors are only required to support UTF-8 and


  70     private String assignedEncoding;
  71     private boolean closed;
  72 
  73     //
  74     // This class always delegates I/O to a reader, which gets
  75     // its data from the very beginning of the XML text.  It needs
  76     // to use a pushback stream since (a) autodetection can read
  77     // partial UTF-8 characters which need to be fully processed,
  78     // (b) the "Unicode" readers swallow characters that they think
  79     // are byte order marks, so tests fail if they don't see the
  80     // real byte order mark.
  81     //
  82     // It's got to do this efficiently:  character I/O is solidly on the
  83     // critical path.  (So keep buffer length over 2 Kbytes to avoid
  84     // excess buffering. Many URL handlers stuff a BufferedInputStream
  85     // between here and the real data source, and larger buffers keep
  86     // that from slowing you down.)
  87     //
  88 
  89     /**
  90      * Constructs the reader from an input stream, auto-detecting
  91      * the encoding to use according to the heuristic specified
  92      * in the XML 1.0 recommendation; returns a new XmlReader.
  93      *
  94      * @param in the input stream from which the reader is constructed
  95      * @throws IOException on error, such as unrecognized encoding
  96      */
  97     public static Reader createReader(InputStream in) throws IOException {
  98         return new XmlReader(in);
  99     }
 100 
 101     /**
 102      * Creates a reader supporting the given encoding, mapping
 103      * from standard encoding names to ones that are understood by
 104      * Java where necessary.
 105      *
 106      * @param in       the input stream from which the reader is constructed
 107      * @param encoding the IETF standard name of the encoding to use;
 108      *                 if null, auto-detection is used.
 109      * @throws IOException on error, including unrecognized encoding
 110      */
 111     public static Reader createReader(InputStream in, String encoding)
 112             throws IOException {
 113         if (encoding == null)
 114             return new XmlReader(in);
 115         if ("UTF-8".equalsIgnoreCase(encoding)
 116                 || "UTF8".equalsIgnoreCase(encoding))
 117             return new Utf8Reader(in);
 118         if ("US-ASCII".equalsIgnoreCase(encoding)
 119                 || "ASCII".equalsIgnoreCase(encoding))
 120             return new AsciiReader(in);
 121         if ("ISO-8859-1".equalsIgnoreCase(encoding)
 122         // plus numerous aliases ...
 123         )
 124             return new Iso8859_1Reader(in);
 125 
 126         //
 127         // What we really want is an administerable resource mapping
 128         // encoding names/aliases to classnames.  For example a property


 162         charsets.put("EBCDIC-CP-FR", "cp297");
 163 
 164         charsets.put("EBCDIC-CP-AR1", "cp420");
 165         charsets.put("EBCDIC-CP-HE", "cp424");
 166         charsets.put("EBCDIC-CP-BE", "cp500");
 167         charsets.put("EBCDIC-CP-CH", "cp500");
 168 
 169         charsets.put("EBCDIC-CP-ROECE", "cp870");
 170         charsets.put("EBCDIC-CP-YU", "cp870");
 171         charsets.put("EBCDIC-CP-IS", "cp871");
 172         charsets.put("EBCDIC-CP-AR2", "cp918");
 173 
 174         // IANA also defines two that JDK 1.2 doesn't handle:
 175         //    EBCDIC-CP-GR        --> CP423
 176         //    EBCDIC-CP-TR        --> CP905
 177     }
 178 
 179     // Returns an encoding name supported by JDK >= 1.1.6 for some cases required
 180     // by the XML spec: the charsets entry for the upper-cased name, else the name itself.
 181     private static String std2java(String encoding) {
 182         String temp = encoding.toUpperCase(Locale.ENGLISH); // fixed locale, independent of the default
 183         temp = (String) charsets.get(temp);
 184         return temp != null ? temp : encoding;
 185     }
 186 
 187     /**
 188      * Returns the standard name of the encoding in use, as held in assignedEncoding.
 189      */
 190     public String getEncoding() {
 191         return assignedEncoding;
 192     }
 193 
 194     private XmlReader(InputStream stream) throws IOException {
 195         super(stream);
 196 
 197         PushbackInputStream pb;
 198         byte buf [];
 199         int len;
 200 
 201         if (stream instanceof PushbackInputStream)
 202             pb = (PushbackInputStream) stream;


 305         byte buffer [] = new byte[MAXPUSHBACK];
 306         int len;
 307         Reader r;
 308         int c;
 309 
 310         //
 311         // Buffer up a bunch of input, and set up to read it in
 312         // the specified encoding ... we can skip the first four
 313         // bytes since we know that "<?xm" was read to determine
 314         // what encoding to use!
 315         //
 316         len = pb.read(buffer, 0, buffer.length);
 317         pb.unread(buffer, 0, len);
 318         r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
 319                 encoding);
 320 
 321         //
 322         // Next must be "l" (and whitespace) else we conclude
 323         // error and choose UTF-8.
 324         //
 325         if ((r.read()) != 'l') {
 326             setEncoding(pb, "UTF-8");
 327             return;
 328         }
 329 
 330         //
 331         // Then, we'll skip any
 332         //     S version="..."     [or single quotes]
 333         // bit and get any subsequent
 334         //     S encoding="..."     [or single quotes]
 335         //
 336         // We put an arbitrary size limit on how far we read; lots
 337         // of space will break this algorithm.
 338         //
 339         StringBuffer buf = new StringBuffer();
 340         StringBuffer keyBuf = null;
 341         String key = null;
 342         boolean sawEq = false;
 343         char quoteChar = 0;
 344         boolean sawQuestion = false;
 345 


 742                         break;
 743                     }
 744                 }
 745                 c = buffer[start++];
 746                 if ((c & 0x80) != 0)
 747                     throw new CharConversionException("Illegal ASCII character, 0x"
 748                             + Integer.toHexString(c & 0xff));
 749                 buf[offset + i] = (char) c;
 750             }
 751             if (i == 0 && finish <= 0)
 752                 return -1;
 753             return i;
 754         }
 755     }
 756 
 757     static final class Iso8859_1Reader extends BaseReader {
 758         Iso8859_1Reader(InputStream in) {
 759             super(in);
 760         }
 761 
 762         @Override
 763         public int read(char buf [], int offset, int len) throws IOException {
 764             int i;
 765 
 766             if (instream == null)
 767                 return -1;
 768 
 769             for (i = 0; i < len; i++) {
 770                 if (start >= finish) {
 771                     start = 0;
 772                     finish = instream.read(buffer, 0, buffer.length);
 773                     if (finish <= 0) {
 774                         if (finish <= 0)
 775                             this.close();
 776                         break;
 777                     }
 778                 }
 779                 buf[offset + i] = (char) (0x0ff & buffer[start++]);
 780             }
 781             if (i == 0 && finish <= 0)
 782                 return -1;
< prev index next >