1 /*
   2  * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.dtdparser;
  27 
  28 import java.io.ByteArrayInputStream;
  29 import java.io.CharConversionException;
  30 import java.io.IOException;
  31 import java.io.InputStream;
  32 import java.io.InputStreamReader;
  33 import java.io.PushbackInputStream;
  34 import java.io.Reader;
  35 import java.util.Hashtable;
  36 import java.util.Locale;
  37 
  38 
  39 // NOTE:  Add I18N support to this class when JDK gets the ability to
  40 // defer selection of locale for exception messages ... use the same
  41 // technique for both.
  42 
  43 
  44 /**
  45  * This handles several XML-related tasks that normal java.io Readers
  46  * don't support, inluding use of IETF standard encoding names and
  47  * automatic detection of most XML encodings.  The former is needed
  48  * for interoperability; the latter is needed to conform with the XML
  49  * spec.  This class also optimizes reading some common encodings by
  50  * providing low-overhead unsynchronized Reader support.
  51  * <p/>
  52  * <P> Note that the autodetection facility should be used only on
  53  * data streams which have an unknown character encoding.  For example,
  54  * it should never be used on MIME text/xml entities.
  55  * <p/>
  56  * <P> Note that XML processors are only required to support UTF-8 and
  57  * UTF-16 character encodings.  Autodetection permits the underlying Java
  58  * implementation to provide support for many other encodings, such as
  59  * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
  60  *
  61  * @author David Brownell
  62  * @author Janet Koenig
  63  * @version 1.3 00/02/24
  64  */
  65 // package private
  66 final class XmlReader extends Reader {
  67     private static final int MAXPUSHBACK = 512;
  68 
  69     private Reader in;
  70     private String assignedEncoding;
  71     private boolean closed;
  72 
  73     //
  74     // This class always delegates I/O to a reader, which gets
  75     // its data from the very beginning of the XML text.  It needs
  76     // to use a pushback stream since (a) autodetection can read
  77     // partial UTF-8 characters which need to be fully processed,
  78     // (b) the "Unicode" readers swallow characters that they think
  79     // are byte order marks, so tests fail if they don't see the
  80     // real byte order mark.
  81     //
  82     // It's got do this efficiently:  character I/O is solidly on the
  83     // critical path.  (So keep buffer length over 2 Kbytes to avoid
  84     // excess buffering. Many URL handlers stuff a BufferedInputStream
  85     // between here and the real data source, and larger buffers keep
  86     // that from slowing you down.)
  87     //
  88 
  89     /**
  90      * Constructs the reader from an input stream, auto-detecting
  91      * the encoding to use according to the heuristic specified
  92      * in the XML 1.0 recommendation.
  93      *
  94      * @param in the input stream from which the reader is constructed
  95      * @throws IOException on error, such as unrecognized encoding
  96      */
  97     public static Reader createReader(InputStream in) throws IOException {
  98         return new XmlReader(in);
  99     }
 100 
 101     /**
 102      * Creates a reader supporting the given encoding, mapping
 103      * from standard encoding names to ones that understood by
 104      * Java where necessary.
 105      *
 106      * @param in       the input stream from which the reader is constructed
 107      * @param encoding the IETF standard name of the encoding to use;
 108      *                 if null, auto-detection is used.
 109      * @throws IOException on error, including unrecognized encoding
 110      */
 111     public static Reader createReader(InputStream in, String encoding)
 112             throws IOException {
 113         if (encoding == null)
 114             return new XmlReader(in);
 115         if ("UTF-8".equalsIgnoreCase(encoding)
 116                 || "UTF8".equalsIgnoreCase(encoding))
 117             return new Utf8Reader(in);
 118         if ("US-ASCII".equalsIgnoreCase(encoding)
 119                 || "ASCII".equalsIgnoreCase(encoding))
 120             return new AsciiReader(in);
 121         if ("ISO-8859-1".equalsIgnoreCase(encoding)
 122         // plus numerous aliases ...
 123         )
 124             return new Iso8859_1Reader(in);
 125 
 126         //
 127         // What we really want is an administerable resource mapping
 128         // encoding names/aliases to classnames.  For example a property
 129         // file resource, "readers/mapping.props", holding and a set
 130         // of readers in that (sub)package... defaulting to this call
 131         // only if no better choice is available.
 132         //
 133         return new InputStreamReader(in, std2java(encoding));
 134     }
 135 
 136     //
 137     // JDK doesn't know all of the standard encoding names, and
 138     // in particular none of the EBCDIC ones IANA defines (and
 139     // which IBM encourages).
 140     //
 141     static private final Hashtable charsets = new Hashtable(31);
 142 
 143     static {
 144         charsets.put("UTF-16", "Unicode");
 145         charsets.put("ISO-10646-UCS-2", "Unicode");
 146 
 147         // NOTE: no support for ISO-10646-UCS-4 yet.
 148 
 149         charsets.put("EBCDIC-CP-US", "cp037");
 150         charsets.put("EBCDIC-CP-CA", "cp037");
 151         charsets.put("EBCDIC-CP-NL", "cp037");
 152         charsets.put("EBCDIC-CP-WT", "cp037");
 153 
 154         charsets.put("EBCDIC-CP-DK", "cp277");
 155         charsets.put("EBCDIC-CP-NO", "cp277");
 156         charsets.put("EBCDIC-CP-FI", "cp278");
 157         charsets.put("EBCDIC-CP-SE", "cp278");
 158 
 159         charsets.put("EBCDIC-CP-IT", "cp280");
 160         charsets.put("EBCDIC-CP-ES", "cp284");
 161         charsets.put("EBCDIC-CP-GB", "cp285");
 162         charsets.put("EBCDIC-CP-FR", "cp297");
 163 
 164         charsets.put("EBCDIC-CP-AR1", "cp420");
 165         charsets.put("EBCDIC-CP-HE", "cp424");
 166         charsets.put("EBCDIC-CP-BE", "cp500");
 167         charsets.put("EBCDIC-CP-CH", "cp500");
 168 
 169         charsets.put("EBCDIC-CP-ROECE", "cp870");
 170         charsets.put("EBCDIC-CP-YU", "cp870");
 171         charsets.put("EBCDIC-CP-IS", "cp871");
 172         charsets.put("EBCDIC-CP-AR2", "cp918");
 173 
 174         // IANA also defines two that JDK 1.2 doesn't handle:
 175         //    EBCDIC-CP-GR        --> CP423
 176         //    EBCDIC-CP-TR        --> CP905
 177     }
 178 
 179     // returns an encoding name supported by JDK >= 1.1.6
 180     // for some cases required by the XML spec
 181     private static String std2java(String encoding) {
 182         String temp = encoding.toUpperCase(Locale.ENGLISH);
 183         temp = (String) charsets.get(temp);
 184         return temp != null ? temp : encoding;
 185     }
 186 
 187     /**
 188      * Returns the standard name of the encoding in use
 189      */
 190     public String getEncoding() {
 191         return assignedEncoding;
 192     }
 193 
 194     private XmlReader(InputStream stream) throws IOException {
 195         super(stream);
 196 
 197         PushbackInputStream pb;
 198         byte buf [];
 199         int len;
 200 
 201         if (stream instanceof PushbackInputStream)
 202             pb = (PushbackInputStream) stream;
 203         else
 204             pb = new PushbackInputStream(stream, MAXPUSHBACK);
 205 
 206         //
 207         // See if we can figure out the character encoding used
 208         // in this file by peeking at the first few bytes.
 209         //
 210         buf = new byte[4];
 211         len = pb.read(buf);
 212         if (len > 0)
 213             pb.unread(buf, 0, len);
 214 
 215         if (len == 4)
 216             switch (buf[0] & 0x0ff) {
 217             case 0:
 218                 // 00 3c 00 3f == illegal UTF-16 big-endian
 219                 if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
 220                     setEncoding(pb, "UnicodeBig");
 221                     return;
 222                 }
 223                 // else it's probably UCS-4
 224                 break;
 225 
 226             case '<':      // 0x3c: the most common cases!
 227                 switch (buf[1] & 0x0ff) {
 228                 // First character is '<'; could be XML without
 229                 // an XML directive such as "<hello>", "<!-- ...",
 230                 // and so on.
 231                 default:
 232                     break;
 233 
 234                     // 3c 00 3f 00 == illegal UTF-16 little endian
 235                 case 0x00:
 236                     if (buf[2] == 0x3f && buf[3] == 0x00) {
 237                         setEncoding(pb, "UnicodeLittle");
 238                         return;
 239                     }
 240                     // else probably UCS-4
 241                     break;
 242 
 243                     // 3c 3f 78 6d == ASCII and supersets '<?xm'
 244                 case '?':
 245                     if (buf[2] != 'x' || buf[3] != 'm')
 246                         break;
 247                     //
 248                     // One of several encodings could be used:
 249                     // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
 250                     //
 251                     useEncodingDecl(pb, "UTF8");
 252                     return;
 253                 }
 254                 break;
 255 
 256                 // 4c 6f a7 94 ... some EBCDIC code page
 257             case 0x4c:
 258                 if (buf[1] == 0x6f
 259                         && (0x0ff & buf[2]) == 0x0a7
 260                         && (0x0ff & buf[3]) == 0x094) {
 261                     useEncodingDecl(pb, "CP037");
 262                     return;
 263                 }
 264                 // whoops, treat as UTF-8
 265                 break;
 266 
 267                 // UTF-16 big-endian
 268             case 0xfe:
 269                 if ((buf[1] & 0x0ff) != 0xff)
 270                     break;
 271                 setEncoding(pb, "UTF-16");
 272                 return;
 273 
 274                 // UTF-16 little-endian
 275             case 0xff:
 276                 if ((buf[1] & 0x0ff) != 0xfe)
 277                     break;
 278                 setEncoding(pb, "UTF-16");
 279                 return;
 280 
 281                 // default ... no XML declaration
 282             default:
 283                 break;
 284             }
 285 
 286         //
 287         // If all else fails, assume XML without a declaration, and
 288         // using UTF-8 encoding.
 289         //
 290         setEncoding(pb, "UTF-8");
 291     }
 292 
 293     /*
 294      * Read the encoding decl on the stream, knowing that it should
 295      * be readable using the specified encoding (basically, ASCII or
 296      * EBCDIC).  The body of the document may use a wider range of
 297      * characters than the XML/Text decl itself, so we switch to use
 298      * the specified encoding as soon as we can.  (ASCII is a subset
 299      * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
 300      * has a variety of "code pages" that have these characters as
 301      * a common subset.)
 302      */
 303     private void useEncodingDecl(PushbackInputStream pb, String encoding)
 304             throws IOException {
 305         byte buffer [] = new byte[MAXPUSHBACK];
 306         int len;
 307         Reader r;
 308         int c;
 309 
 310         //
 311         // Buffer up a bunch of input, and set up to read it in
 312         // the specified encoding ... we can skip the first four
 313         // bytes since we know that "<?xm" was read to determine
 314         // what encoding to use!
 315         //
 316         len = pb.read(buffer, 0, buffer.length);
 317         pb.unread(buffer, 0, len);
 318         r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
 319                 encoding);
 320 
 321         //
 322         // Next must be "l" (and whitespace) else we conclude
 323         // error and choose UTF-8.
 324         //
 325         if ((r.read()) != 'l') {
 326             setEncoding(pb, "UTF-8");
 327             return;
 328         }
 329 
 330         //
 331         // Then, we'll skip any
 332         //     S version="..."     [or single quotes]
 333         // bit and get any subsequent
 334         //     S encoding="..."     [or single quotes]
 335         //
 336         // We put an arbitrary size limit on how far we read; lots
 337         // of space will break this algorithm.
 338         //
 339         StringBuffer buf = new StringBuffer();
 340         StringBuffer keyBuf = null;
 341         String key = null;
 342         boolean sawEq = false;
 343         char quoteChar = 0;
 344         boolean sawQuestion = false;
 345 
 346         XmlDecl:
 347         for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
 348             if ((c = r.read()) == -1)
 349                 break;
 350 
 351             // ignore whitespace before/between "key = 'value'"
 352             if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
 353                 continue;
 354 
 355             // ... but require at least a little!
 356             if (i == 0)
 357                 break;
 358 
 359             // terminate the loop ASAP
 360             if (c == '?')
 361                 sawQuestion = true;
 362             else if (sawQuestion) {
 363                 if (c == '>')
 364                     break;
 365                 sawQuestion = false;
 366             }
 367 
 368             // did we get the "key =" bit yet?
 369             if (key == null || !sawEq) {
 370                 if (keyBuf == null) {
 371                     if (Character.isWhitespace((char) c))
 372                         continue;
 373                     keyBuf = buf;
 374                     buf.setLength(0);
 375                     buf.append((char) c);
 376                     sawEq = false;
 377                 } else if (Character.isWhitespace((char) c)) {
 378                     key = keyBuf.toString();
 379                 } else if (c == '=') {
 380                     if (key == null)
 381                         key = keyBuf.toString();
 382                     sawEq = true;
 383                     keyBuf = null;
 384                     quoteChar = 0;
 385                 } else
 386                     keyBuf.append((char) c);
 387                 continue;
 388             }
 389 
 390             // space before quoted value
 391             if (Character.isWhitespace((char) c))
 392                 continue;
 393             if (c == '"' || c == '\'') {
 394                 if (quoteChar == 0) {
 395                     quoteChar = (char) c;
 396                     buf.setLength(0);
 397                     continue;
 398                 } else if (c == quoteChar) {
 399                     if ("encoding".equals(key)) {
 400                         assignedEncoding = buf.toString();
 401 
 402                         // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
 403                         for (i = 0; i < assignedEncoding.length(); i++) {
 404                             c = assignedEncoding.charAt(i);
 405                             if ((c >= 'A' && c <= 'Z')
 406                                     || (c >= 'a' && c <= 'z'))
 407                                 continue;
 408                             if (i == 0)
 409                                 break XmlDecl;
 410                             if (i > 0 && (c == '-'
 411                                     || (c >= '0' && c <= '9')
 412                                     || c == '.' || c == '_'))
 413                                 continue;
 414                             // map illegal names to UTF-8 default
 415                             break XmlDecl;
 416                         }
 417 
 418                         setEncoding(pb, assignedEncoding);
 419                         return;
 420 
 421                     } else {
 422                         key = null;
 423                         continue;
 424                     }
 425                 }
 426             }
 427             buf.append((char) c);
 428         }
 429 
 430         setEncoding(pb, "UTF-8");
 431     }
 432 
 433     private void setEncoding(InputStream stream, String encoding)
 434             throws IOException {
 435         assignedEncoding = encoding;
 436         in = createReader(stream, encoding);
 437     }
 438 
 439     /**
 440      * Reads the number of characters read into the buffer, or -1 on EOF.
 441      */
 442     public int read(char buf [], int off, int len) throws IOException {
 443         int val;
 444 
 445         if (closed)
 446             return -1;        // throw new IOException ("closed");
 447         val = in.read(buf, off, len);
 448         if (val == -1)
 449             close();
 450         return val;
 451     }
 452 
 453     /**
 454      * Reads a single character.
 455      */
 456     public int read() throws IOException {
 457         int val;
 458 
 459         if (closed)
 460             throw new IOException("closed");
 461         val = in.read();
 462         if (val == -1)
 463             close();
 464         return val;
 465     }
 466 
 467     /**
 468      * Returns true iff the reader supports mark/reset.
 469      */
 470     public boolean markSupported() {
 471         return in == null ? false : in.markSupported();
 472     }
 473 
 474     /**
 475      * Sets a mark allowing a limited number of characters to
 476      * be "peeked", by reading and then resetting.
 477      *
 478      * @param value how many characters may be "peeked".
 479      */
 480     public void mark(int value) throws IOException {
 481         if (in != null) in.mark(value);
 482     }
 483 
 484     /**
 485      * Resets the current position to the last marked position.
 486      */
 487     public void reset() throws IOException {
 488         if (in != null) in.reset();
 489     }
 490 
 491     /**
 492      * Skips a specified number of characters.
 493      */
 494     public long skip(long value) throws IOException {
 495         return in == null ? 0 : in.skip(value);
 496     }
 497 
 498     /**
 499      * Returns true iff input characters are known to be ready.
 500      */
 501     public boolean ready() throws IOException {
 502         return in == null ? false : in.ready();
 503     }
 504 
 505     /**
 506      * Closes the reader.
 507      */
 508     public void close() throws IOException {
 509         if (closed)
 510             return;
 511         in.close();
 512         in = null;
 513         closed = true;
 514     }
 515 
 516     //
 517     // Delegating to a converter module will always be slower than
 518     // direct conversion.  Use a similar approach for any other
 519     // readers that need to be particularly fast; only block I/O
 520     // speed matters to this package.  For UTF-16, separate readers
 521     // for big and little endian streams make a difference, too;
 522     // fewer conditionals in the critical path!
 523     //
 524     static abstract class BaseReader extends Reader {
 525         protected InputStream instream;
 526         protected byte buffer [];
 527         protected int start, finish;
 528 
 529         BaseReader(InputStream stream) {
 530             super(stream);
 531 
 532             instream = stream;
 533             buffer = new byte[8192];
 534         }
 535 
 536         public boolean ready() throws IOException {
 537             return instream == null
 538                     || (finish - start) > 0
 539                     || instream.available() != 0;
 540         }
 541 
 542         // caller shouldn't read again
 543         public void close() throws IOException {
 544             if (instream != null) {
 545                 instream.close();
 546                 start = finish = 0;
 547                 buffer = null;
 548                 instream = null;
 549             }
 550         }
 551     }
 552 
 553     //
 554     // We want this reader, to make the default encoding be as fast
 555     // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
 556     // InputStreamReader works, but 20+% slower speed isn't OK for
 557     // the default/primary encoding.
 558     //
 559     static final class Utf8Reader extends BaseReader {
 560         // 2nd half of UTF-8 surrogate pair
 561         private char nextChar;
 562 
 563         Utf8Reader(InputStream stream) {
 564             super(stream);
 565         }
 566 
 567         public int read(char buf [], int offset, int len) throws IOException {
 568             int i = 0, c = 0;
 569 
 570             if (len <= 0)
 571                 return 0;
 572 
 573             // Consume remaining half of any surrogate pair immediately
 574             if (nextChar != 0) {
 575                 buf[offset + i++] = nextChar;
 576                 nextChar = 0;
 577             }
 578 
 579             while (i < len) {
 580                 // stop or read data if needed
 581                 if (finish <= start) {
 582                     if (instream == null) {
 583                         c = -1;
 584                         break;
 585                     }
 586                     start = 0;
 587                     finish = instream.read(buffer, 0, buffer.length);
 588                     if (finish <= 0) {
 589                         this.close();
 590                         c = -1;
 591                         break;
 592                     }
 593                 }
 594 
 595                 //
 596                 // RFC 2279 describes UTF-8; there are six encodings.
 597                 // Each encoding takes a fixed number of characters
 598                 // (1-6 bytes) and is flagged by a bit pattern in the
 599                 // first byte.  The five and six byte-per-character
 600                 // encodings address characters which are disallowed
 601                 // in XML documents, as do some four byte ones.
 602                 //
 603 
 604                 //
 605                 // Single byte == ASCII.  Common; optimize.
 606                 //
 607                 c = buffer[start] & 0x0ff;
 608                 if ((c & 0x80) == 0x00) {
 609                     // 0x0000 <= c <= 0x007f
 610                     start++;
 611                     buf[offset + i++] = (char) c;
 612                     continue;
 613                 }
 614 
 615                 //
 616                 // Multibyte chars -- check offsets optimistically,
 617                 // ditto the "10xx xxxx" format for subsequent bytes
 618                 //
 619                 int off = start;
 620 
 621                 try {
 622                     // 2 bytes
 623                     if ((buffer[off] & 0x0E0) == 0x0C0) {
 624                         c = (buffer[off++] & 0x1f) << 6;
 625                         c += buffer[off++] & 0x3f;
 626 
 627                         // 0x0080 <= c <= 0x07ff
 628 
 629                         // 3 bytes
 630                     } else if ((buffer[off] & 0x0F0) == 0x0E0) {
 631                         c = (buffer[off++] & 0x0f) << 12;
 632                         c += (buffer[off++] & 0x3f) << 6;
 633                         c += buffer[off++] & 0x3f;
 634 
 635                         // 0x0800 <= c <= 0xffff
 636 
 637                         // 4 bytes
 638                     } else if ((buffer[off] & 0x0f8) == 0x0F0) {
 639                         c = (buffer[off++] & 0x07) << 18;
 640                         c += (buffer[off++] & 0x3f) << 12;
 641                         c += (buffer[off++] & 0x3f) << 6;
 642                         c += buffer[off++] & 0x3f;
 643 
 644                         // 0x0001 0000  <= c  <= 0x001f ffff
 645 
 646                         // Unicode supports c <= 0x0010 ffff ...
 647                         if (c > 0x0010ffff)
 648                             throw new CharConversionException("UTF-8 encoding of character 0x00"
 649                                     + Integer.toHexString(c)
 650                                     + " can't be converted to Unicode.");
 651 
 652                         // Convert UCS-4 char to surrogate pair (UTF-16)
 653                         c -= 0x10000;
 654                         nextChar = (char) (0xDC00 + (c & 0x03ff));
 655                         c = 0xD800 + (c >> 10);
 656 
 657                         // 5 and 6 byte versions are XML WF errors, but
 658                         // typically come from mislabeled encodings
 659                     } else
 660                         throw new CharConversionException("Unconvertible UTF-8 character"
 661                                 + " beginning with 0x"
 662                                 + Integer.toHexString(buffer[start] & 0xff));
 663 
 664                 } catch (ArrayIndexOutOfBoundsException e) {
 665                     // off > length && length >= buffer.length
 666                     c = 0;
 667                 }
 668 
 669                 //
 670                 // if the buffer held only a partial character,
 671                 // compact it and try to read the rest of the
 672                 // character.  worst case involves three
 673                 // single-byte reads -- quite rare.
 674                 //
 675                 if (off > finish) {
 676                     System.arraycopy(buffer, start,
 677                             buffer, 0, finish - start);
 678                     finish -= start;
 679                     start = 0;
 680                     off = instream.read(buffer, finish,
 681                             buffer.length - finish);
 682                     if (off < 0) {
 683                         this.close();
 684                         throw new CharConversionException("Partial UTF-8 char");
 685                     }
 686                     finish += off;
 687                     continue;
 688                 }
 689 
 690                 //
 691                 // check the format of the non-initial bytes
 692                 //
 693                 for (start++; start < off; start++) {
 694                     if ((buffer[start] & 0xC0) != 0x80) {
 695                         this.close();
 696                         throw new CharConversionException("Malformed UTF-8 char -- "
 697                                 + "is an XML encoding declaration missing?");
 698                     }
 699                 }
 700 
 701                 //
 702                 // If this needed a surrogate pair, consume ASAP
 703                 //
 704                 buf[offset + i++] = (char) c;
 705                 if (nextChar != 0 && i < len) {
 706                     buf[offset + i++] = nextChar;
 707                     nextChar = 0;
 708                 }
 709             }
 710             if (i > 0)
 711                 return i;
 712             return (c == -1) ? -1 : 0;
 713         }
 714     }
 715 
 716     //
 717     // We want ASCII and ISO-8859 Readers since they're the most common
 718     // encodings in the US and Europe, and we don't want performance
 719     // regressions for them.  They're also easy to implement efficiently,
 720     // since they're bitmask subsets of UNICODE.
 721     //
 722     // XXX haven't benchmarked these readers vs what we get out of JDK.
 723     //
 724     static final class AsciiReader extends BaseReader {
 725         AsciiReader(InputStream in) {
 726             super(in);
 727         }
 728 
 729         public int read(char buf [], int offset, int len) throws IOException {
 730             int i, c;
 731 
 732             if (instream == null)
 733                 return -1;
 734 
 735             for (i = 0; i < len; i++) {
 736                 if (start >= finish) {
 737                     start = 0;
 738                     finish = instream.read(buffer, 0, buffer.length);
 739                     if (finish <= 0) {
 740                         if (finish <= 0)
 741                             this.close();
 742                         break;
 743                     }
 744                 }
 745                 c = buffer[start++];
 746                 if ((c & 0x80) != 0)
 747                     throw new CharConversionException("Illegal ASCII character, 0x"
 748                             + Integer.toHexString(c & 0xff));
 749                 buf[offset + i] = (char) c;
 750             }
 751             if (i == 0 && finish <= 0)
 752                 return -1;
 753             return i;
 754         }
 755     }
 756 
 757     static final class Iso8859_1Reader extends BaseReader {
 758         Iso8859_1Reader(InputStream in) {
 759             super(in);
 760         }
 761 
 762         @Override
 763         public int read(char buf [], int offset, int len) throws IOException {
 764             int i;
 765 
 766             if (instream == null)
 767                 return -1;
 768 
 769             for (i = 0; i < len; i++) {
 770                 if (start >= finish) {
 771                     start = 0;
 772                     finish = instream.read(buffer, 0, buffer.length);
 773                     if (finish <= 0) {
 774                         if (finish <= 0)
 775                             this.close();
 776                         break;
 777                     }
 778                 }
 779                 buf[offset + i] = (char) (0x0ff & buffer[start++]);
 780             }
 781             if (i == 0 && finish <= 0)
 782                 return -1;
 783             return i;
 784         }
 785     }
 786 }