Old src/jdk.xml.bind/share/classes/com/sun/xml/internal/dtdparser/XmlReader.java

   1 /*
   2  * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.dtdparser;
  27 
import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;
  36 
  37 
  38 // NOTE:  Add I18N support to this class when JDK gets the ability to
  39 // defer selection of locale for exception messages ... use the same
  40 // technique for both.
  41 
  42 
  43 /**
  44  * This handles several XML-related tasks that normal java.io Readers
  45  * don't support, inluding use of IETF standard encoding names and
  46  * automatic detection of most XML encodings.  The former is needed
  47  * for interoperability; the latter is needed to conform with the XML
  48  * spec.  This class also optimizes reading some common encodings by
  49  * providing low-overhead unsynchronized Reader support.
  50  * <p/>
  51  * <P> Note that the autodetection facility should be used only on
  52  * data streams which have an unknown character encoding.  For example,
  53  * it should never be used on MIME text/xml entities.
  54  * <p/>
  55  * <P> Note that XML processors are only required to support UTF-8 and
  56  * UTF-16 character encodings.  Autodetection permits the underlying Java
  57  * implementation to provide support for many other encodings, such as
  58  * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
  59  *
  60  * @author David Brownell
  61  * @author Janet Koenig
  62  * @version 1.3 00/02/24
  63  */
  64 // package private
  65 final class XmlReader extends Reader {
  66     private static final int MAXPUSHBACK = 512;
  67 
  68     private Reader in;
  69     private String assignedEncoding;
  70     private boolean closed;
  71 
  72     //
  73     // This class always delegates I/O to a reader, which gets
  74     // its data from the very beginning of the XML text.  It needs
  75     // to use a pushback stream since (a) autodetection can read
  76     // partial UTF-8 characters which need to be fully processed,
  77     // (b) the "Unicode" readers swallow characters that they think
  78     // are byte order marks, so tests fail if they don't see the
  79     // real byte order mark.
  80     //
  81     // It's got do this efficiently:  character I/O is solidly on the
  82     // critical path.  (So keep buffer length over 2 Kbytes to avoid
  83     // excess buffering. Many URL handlers stuff a BufferedInputStream
  84     // between here and the real data source, and larger buffers keep
  85     // that from slowing you down.)
  86     //
  87 
  88     /**
  89      * Constructs the reader from an input stream, autodetecting
  90      * the encoding to use according to the heuristic specified
  91      * in the XML 1.0 recommendation.
  92      *
  93      * @param in the input stream from which the reader is constructed
  94      * @throws IOException on error, such as unrecognized encoding
  95      */
  96     public static Reader createReader(InputStream in) throws IOException {
  97         return new XmlReader(in);
  98     }
  99 
 100     /**
 101      * Creates a reader supporting the given encoding, mapping
 102      * from standard encoding names to ones that understood by
 103      * Java where necessary.
 104      *
 105      * @param in       the input stream from which the reader is constructed
 106      * @param encoding the IETF standard name of the encoding to use;
 107      *                 if null, autodetection is used.
 108      * @throws IOException on error, including unrecognized encoding
 109      */
 110     public static Reader createReader(InputStream in, String encoding)
 111             throws IOException {
 112         if (encoding == null)
 113             return new XmlReader(in);
 114         if ("UTF-8".equalsIgnoreCase(encoding)
 115                 || "UTF8".equalsIgnoreCase(encoding))
 116             return new Utf8Reader(in);
 117         if ("US-ASCII".equalsIgnoreCase(encoding)
 118                 || "ASCII".equalsIgnoreCase(encoding))
 119             return new AsciiReader(in);
 120         if ("ISO-8859-1".equalsIgnoreCase(encoding)
 121         // plus numerous aliases ...
 122         )
 123             return new Iso8859_1Reader(in);
 124 
 125         //
 126         // What we really want is an administerable resource mapping
 127         // encoding names/aliases to classnames.  For example a property
 128         // file resource, "readers/mapping.props", holding and a set
 129         // of readers in that (sub)package... defaulting to this call
 130         // only if no better choice is available.
 131         //
 132         return new InputStreamReader(in, std2java(encoding));
 133     }
 134 
 135     //
 136     // JDK doesn't know all of the standard encoding names, and
 137     // in particular none of the EBCDIC ones IANA defines (and
 138     // which IBM encourages).
 139     //
 140     static private final Hashtable charsets = new Hashtable(31);
 141 
 142     static {
 143         charsets.put("UTF-16", "Unicode");
 144         charsets.put("ISO-10646-UCS-2", "Unicode");
 145 
 146         // NOTE: no support for ISO-10646-UCS-4 yet.
 147 
 148         charsets.put("EBCDIC-CP-US", "cp037");
 149         charsets.put("EBCDIC-CP-CA", "cp037");
 150         charsets.put("EBCDIC-CP-NL", "cp037");
 151         charsets.put("EBCDIC-CP-WT", "cp037");
 152 
 153         charsets.put("EBCDIC-CP-DK", "cp277");
 154         charsets.put("EBCDIC-CP-NO", "cp277");
 155         charsets.put("EBCDIC-CP-FI", "cp278");
 156         charsets.put("EBCDIC-CP-SE", "cp278");
 157 
 158         charsets.put("EBCDIC-CP-IT", "cp280");
 159         charsets.put("EBCDIC-CP-ES", "cp284");
 160         charsets.put("EBCDIC-CP-GB", "cp285");
 161         charsets.put("EBCDIC-CP-FR", "cp297");
 162 
 163         charsets.put("EBCDIC-CP-AR1", "cp420");
 164         charsets.put("EBCDIC-CP-HE", "cp424");
 165         charsets.put("EBCDIC-CP-BE", "cp500");
 166         charsets.put("EBCDIC-CP-CH", "cp500");
 167 
 168         charsets.put("EBCDIC-CP-ROECE", "cp870");
 169         charsets.put("EBCDIC-CP-YU", "cp870");
 170         charsets.put("EBCDIC-CP-IS", "cp871");
 171         charsets.put("EBCDIC-CP-AR2", "cp918");
 172 
 173         // IANA also defines two that JDK 1.2 doesn't handle:
 174         //    EBCDIC-CP-GR        --> CP423
 175         //    EBCDIC-CP-TR        --> CP905
 176     }
 177 
 178     // returns an encoding name supported by JDK >= 1.1.6
 179     // for some cases required by the XML spec
 180     private static String std2java(String encoding) {
 181         String temp = encoding.toUpperCase();
 182         temp = (String) charsets.get(temp);
 183         return temp != null ? temp : encoding;
 184     }
 185 
 186     /**
 187      * Returns the standard name of the encoding in use
 188      */
 189     public String getEncoding() {
 190         return assignedEncoding;
 191     }
 192 
 193     private XmlReader(InputStream stream) throws IOException {
 194         super(stream);
 195 
 196         PushbackInputStream pb;
 197         byte buf [];
 198         int len;
 199 
 200         if (stream instanceof PushbackInputStream)
 201             pb = (PushbackInputStream) stream;
 202         else
 203             pb = new PushbackInputStream(stream, MAXPUSHBACK);
 204 
 205         //
 206         // See if we can figure out the character encoding used
 207         // in this file by peeking at the first few bytes.
 208         //
 209         buf = new byte[4];
 210         len = pb.read(buf);
 211         if (len > 0)
 212             pb.unread(buf, 0, len);
 213 
 214         if (len == 4)
 215             switch (buf[0] & 0x0ff) {
 216             case 0:
 217                 // 00 3c 00 3f == illegal UTF-16 big-endian
 218                 if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
 219                     setEncoding(pb, "UnicodeBig");
 220                     return;
 221                 }
 222                 // else it's probably UCS-4
 223                 break;
 224 
 225             case '<':      // 0x3c: the most common cases!
 226                 switch (buf[1] & 0x0ff) {
 227                 // First character is '<'; could be XML without
 228                 // an XML directive such as "<hello>", "<!-- ...",
 229                 // and so on.
 230                 default:
 231                     break;
 232 
 233                     // 3c 00 3f 00 == illegal UTF-16 little endian
 234                 case 0x00:
 235                     if (buf[2] == 0x3f && buf[3] == 0x00) {
 236                         setEncoding(pb, "UnicodeLittle");
 237                         return;
 238                     }
 239                     // else probably UCS-4
 240                     break;
 241 
 242                     // 3c 3f 78 6d == ASCII and supersets '<?xm'
 243                 case '?':
 244                     if (buf[2] != 'x' || buf[3] != 'm')
 245                         break;
 246                     //
 247                     // One of several encodings could be used:
 248                     // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
 249                     //
 250                     useEncodingDecl(pb, "UTF8");
 251                     return;
 252                 }
 253                 break;
 254 
 255                 // 4c 6f a7 94 ... some EBCDIC code page
 256             case 0x4c:
 257                 if (buf[1] == 0x6f
 258                         && (0x0ff & buf[2]) == 0x0a7
 259                         && (0x0ff & buf[3]) == 0x094) {
 260                     useEncodingDecl(pb, "CP037");
 261                     return;
 262                 }
 263                 // whoops, treat as UTF-8
 264                 break;
 265 
 266                 // UTF-16 big-endian
 267             case 0xfe:
 268                 if ((buf[1] & 0x0ff) != 0xff)
 269                     break;
 270                 setEncoding(pb, "UTF-16");
 271                 return;
 272 
 273                 // UTF-16 little-endian
 274             case 0xff:
 275                 if ((buf[1] & 0x0ff) != 0xfe)
 276                     break;
 277                 setEncoding(pb, "UTF-16");
 278                 return;
 279 
 280                 // default ... no XML declaration
 281             default:
 282                 break;
 283             }
 284 
 285         //
 286         // If all else fails, assume XML without a declaration, and
 287         // using UTF-8 encoding.
 288         //
 289         setEncoding(pb, "UTF-8");
 290     }
 291 
 292     /*
 293      * Read the encoding decl on the stream, knowing that it should
 294      * be readable using the specified encoding (basically, ASCII or
 295      * EBCDIC).  The body of the document may use a wider range of
 296      * characters than the XML/Text decl itself, so we switch to use
 297      * the specified encoding as soon as we can.  (ASCII is a subset
 298      * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
 299      * has a variety of "code pages" that have these characters as
 300      * a common subset.)
 301      */
 302     private void useEncodingDecl(PushbackInputStream pb, String encoding)
 303             throws IOException {
 304         byte buffer [] = new byte[MAXPUSHBACK];
 305         int len;
 306         Reader r;
 307         int c;
 308 
 309         //
 310         // Buffer up a bunch of input, and set up to read it in
 311         // the specified encoding ... we can skip the first four
 312         // bytes since we know that "<?xm" was read to determine
 313         // what encoding to use!
 314         //
 315         len = pb.read(buffer, 0, buffer.length);
 316         pb.unread(buffer, 0, len);
 317         r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
 318                 encoding);
 319 
 320         //
 321         // Next must be "l" (and whitespace) else we conclude
 322         // error and choose UTF-8.
 323         //
 324         if ((c = r.read()) != 'l') {
 325             setEncoding(pb, "UTF-8");
 326             return;
 327         }
 328 
 329         //
 330         // Then, we'll skip any
 331         //     S version="..."     [or single quotes]
 332         // bit and get any subsequent
 333         //     S encoding="..."     [or single quotes]
 334         //
 335         // We put an arbitrary size limit on how far we read; lots
 336         // of space will break this algorithm.
 337         //
 338         StringBuffer buf = new StringBuffer();
 339         StringBuffer keyBuf = null;
 340         String key = null;
 341         boolean sawEq = false;
 342         char quoteChar = 0;
 343         boolean sawQuestion = false;
 344 
 345         XmlDecl:
 346         for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
 347             if ((c = r.read()) == -1)
 348                 break;
 349 
 350             // ignore whitespace before/between "key = 'value'"
 351             if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
 352                 continue;
 353 
 354             // ... but require at least a little!
 355             if (i == 0)
 356                 break;
 357 
 358             // terminate the loop ASAP
 359             if (c == '?')
 360                 sawQuestion = true;
 361             else if (sawQuestion) {
 362                 if (c == '>')
 363                     break;
 364                 sawQuestion = false;
 365             }
 366 
 367             // did we get the "key =" bit yet?
 368             if (key == null || !sawEq) {
 369                 if (keyBuf == null) {
 370                     if (Character.isWhitespace((char) c))
 371                         continue;
 372                     keyBuf = buf;
 373                     buf.setLength(0);
 374                     buf.append((char) c);
 375                     sawEq = false;
 376                 } else if (Character.isWhitespace((char) c)) {
 377                     key = keyBuf.toString();
 378                 } else if (c == '=') {
 379                     if (key == null)
 380                         key = keyBuf.toString();
 381                     sawEq = true;
 382                     keyBuf = null;
 383                     quoteChar = 0;
 384                 } else
 385                     keyBuf.append((char) c);
 386                 continue;
 387             }
 388 
 389             // space before quoted value
 390             if (Character.isWhitespace((char) c))
 391                 continue;
 392             if (c == '"' || c == '\'') {
 393                 if (quoteChar == 0) {
 394                     quoteChar = (char) c;
 395                     buf.setLength(0);
 396                     continue;
 397                 } else if (c == quoteChar) {
 398                     if ("encoding".equals(key)) {
 399                         assignedEncoding = buf.toString();
 400 
 401                         // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
 402                         for (i = 0; i < assignedEncoding.length(); i++) {
 403                             c = assignedEncoding.charAt(i);
 404                             if ((c >= 'A' && c <= 'Z')
 405                                     || (c >= 'a' && c <= 'z'))
 406                                 continue;
 407                             if (i == 0)
 408                                 break XmlDecl;
 409                             if (i > 0 && (c == '-'
 410                                     || (c >= '0' && c <= '9')
 411                                     || c == '.' || c == '_'))
 412                                 continue;
 413                             // map illegal names to UTF-8 default
 414                             break XmlDecl;
 415                         }
 416 
 417                         setEncoding(pb, assignedEncoding);
 418                         return;
 419 
 420                     } else {
 421                         key = null;
 422                         continue;
 423                     }
 424                 }
 425             }
 426             buf.append((char) c);
 427         }
 428 
 429         setEncoding(pb, "UTF-8");
 430     }
 431 
 432     private void setEncoding(InputStream stream, String encoding)
 433             throws IOException {
 434         assignedEncoding = encoding;
 435         in = createReader(stream, encoding);
 436     }
 437 
 438     /**
 439      * Reads the number of characters read into the buffer, or -1 on EOF.
 440      */
 441     public int read(char buf [], int off, int len) throws IOException {
 442         int val;
 443 
 444         if (closed)
 445             return -1;        // throw new IOException ("closed");
 446         val = in.read(buf, off, len);
 447         if (val == -1)
 448             close();
 449         return val;
 450     }
 451 
 452     /**
 453      * Reads a single character.
 454      */
 455     public int read() throws IOException {
 456         int val;
 457 
 458         if (closed)
 459             throw new IOException("closed");
 460         val = in.read();
 461         if (val == -1)
 462             close();
 463         return val;
 464     }
 465 
 466     /**
 467      * Returns true iff the reader supports mark/reset.
 468      */
 469     public boolean markSupported() {
 470         return in == null ? false : in.markSupported();
 471     }
 472 
 473     /**
 474      * Sets a mark allowing a limited number of characters to
 475      * be "peeked", by reading and then resetting.
 476      *
 477      * @param value how many characters may be "peeked".
 478      */
 479     public void mark(int value) throws IOException {
 480         if (in != null) in.mark(value);
 481     }
 482 
 483     /**
 484      * Resets the current position to the last marked position.
 485      */
 486     public void reset() throws IOException {
 487         if (in != null) in.reset();
 488     }
 489 
 490     /**
 491      * Skips a specified number of characters.
 492      */
 493     public long skip(long value) throws IOException {
 494         return in == null ? 0 : in.skip(value);
 495     }
 496 
 497     /**
 498      * Returns true iff input characters are known to be ready.
 499      */
 500     public boolean ready() throws IOException {
 501         return in == null ? false : in.ready();
 502     }
 503 
 504     /**
 505      * Closes the reader.
 506      */
 507     public void close() throws IOException {
 508         if (closed)
 509             return;
 510         in.close();
 511         in = null;
 512         closed = true;
 513     }
 514 
 515     //
 516     // Delegating to a converter module will always be slower than
 517     // direct conversion.  Use a similar approach for any other
 518     // readers that need to be particularly fast; only block I/O
 519     // speed matters to this package.  For UTF-16, separate readers
 520     // for big and little endian streams make a difference, too;
 521     // fewer conditionals in the critical path!
 522     //
 523     static abstract class BaseReader extends Reader {
 524         protected InputStream instream;
 525         protected byte buffer [];
 526         protected int start, finish;
 527 
 528         BaseReader(InputStream stream) {
 529             super(stream);
 530 
 531             instream = stream;
 532             buffer = new byte[8192];
 533         }
 534 
 535         public boolean ready() throws IOException {
 536             return instream == null
 537                     || (finish - start) > 0
 538                     || instream.available() != 0;
 539         }
 540 
 541         // caller shouldn't read again
 542         public void close() throws IOException {
 543             if (instream != null) {
 544                 instream.close();
 545                 start = finish = 0;
 546                 buffer = null;
 547                 instream = null;
 548             }
 549         }
 550     }
 551 
 552     //
 553     // We want this reader, to make the default encoding be as fast
 554     // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
 555     // InputStreamReader works, but 20+% slower speed isn't OK for
 556     // the default/primary encoding.
 557     //
 558     static final class Utf8Reader extends BaseReader {
 559         // 2nd half of UTF-8 surrogate pair
 560         private char nextChar;
 561 
 562         Utf8Reader(InputStream stream) {
 563             super(stream);
 564         }
 565 
 566         public int read(char buf [], int offset, int len) throws IOException {
 567             int i = 0, c = 0;
 568 
 569             if (len <= 0)
 570                 return 0;
 571 
 572             // Consume remaining half of any surrogate pair immediately
 573             if (nextChar != 0) {
 574                 buf[offset + i++] = nextChar;
 575                 nextChar = 0;
 576             }
 577 
 578             while (i < len) {
 579                 // stop or read data if needed
 580                 if (finish <= start) {
 581                     if (instream == null) {
 582                         c = -1;
 583                         break;
 584                     }
 585                     start = 0;
 586                     finish = instream.read(buffer, 0, buffer.length);
 587                     if (finish <= 0) {
 588                         this.close();
 589                         c = -1;
 590                         break;
 591                     }
 592                 }
 593 
 594                 //
 595                 // RFC 2279 describes UTF-8; there are six encodings.
 596                 // Each encoding takes a fixed number of characters
 597                 // (1-6 bytes) and is flagged by a bit pattern in the
 598                 // first byte.  The five and six byte-per-character
 599                 // encodings address characters which are disallowed
 600                 // in XML documents, as do some four byte ones.
 601                 //
 602 
 603                 //
 604                 // Single byte == ASCII.  Common; optimize.
 605                 //
 606                 c = buffer[start] & 0x0ff;
 607                 if ((c & 0x80) == 0x00) {
 608                     // 0x0000 <= c <= 0x007f
 609                     start++;
 610                     buf[offset + i++] = (char) c;
 611                     continue;
 612                 }
 613 
 614                 //
 615                 // Multibyte chars -- check offsets optimistically,
 616                 // ditto the "10xx xxxx" format for subsequent bytes
 617                 //
 618                 int off = start;
 619 
 620                 try {
 621                     // 2 bytes
 622                     if ((buffer[off] & 0x0E0) == 0x0C0) {
 623                         c = (buffer[off++] & 0x1f) << 6;
 624                         c += buffer[off++] & 0x3f;
 625 
 626                         // 0x0080 <= c <= 0x07ff
 627 
 628                         // 3 bytes
 629                     } else if ((buffer[off] & 0x0F0) == 0x0E0) {
 630                         c = (buffer[off++] & 0x0f) << 12;
 631                         c += (buffer[off++] & 0x3f) << 6;
 632                         c += buffer[off++] & 0x3f;
 633 
 634                         // 0x0800 <= c <= 0xffff
 635 
 636                         // 4 bytes
 637                     } else if ((buffer[off] & 0x0f8) == 0x0F0) {
 638                         c = (buffer[off++] & 0x07) << 18;
 639                         c += (buffer[off++] & 0x3f) << 12;
 640                         c += (buffer[off++] & 0x3f) << 6;
 641                         c += buffer[off++] & 0x3f;
 642 
 643                         // 0x0001 0000  <= c  <= 0x001f ffff
 644 
 645                         // Unicode supports c <= 0x0010 ffff ...
 646                         if (c > 0x0010ffff)
 647                             throw new CharConversionException("UTF-8 encoding of character 0x00"
 648                                     + Integer.toHexString(c)
 649                                     + " can't be converted to Unicode.");
 650 
 651                         // Convert UCS-4 char to surrogate pair (UTF-16)
 652                         c -= 0x10000;
 653                         nextChar = (char) (0xDC00 + (c & 0x03ff));
 654                         c = 0xD800 + (c >> 10);
 655 
 656                         // 5 and 6 byte versions are XML WF errors, but
 657                         // typically come from mislabeled encodings
 658                     } else
 659                         throw new CharConversionException("Unconvertible UTF-8 character"
 660                                 + " beginning with 0x"
 661                                 + Integer.toHexString(buffer[start] & 0xff));
 662 
 663                 } catch (ArrayIndexOutOfBoundsException e) {
 664                     // off > length && length >= buffer.length
 665                     c = 0;
 666                 }
 667 
 668                 //
 669                 // if the buffer held only a partial character,
 670                 // compact it and try to read the rest of the
 671                 // character.  worst case involves three
 672                 // single-byte reads -- quite rare.
 673                 //
 674                 if (off > finish) {
 675                     System.arraycopy(buffer, start,
 676                             buffer, 0, finish - start);
 677                     finish -= start;
 678                     start = 0;
 679                     off = instream.read(buffer, finish,
 680                             buffer.length - finish);
 681                     if (off < 0) {
 682                         this.close();
 683                         throw new CharConversionException("Partial UTF-8 char");
 684                     }
 685                     finish += off;
 686                     continue;
 687                 }
 688 
 689                 //
 690                 // check the format of the non-initial bytes
 691                 //
 692                 for (start++; start < off; start++) {
 693                     if ((buffer[start] & 0xC0) != 0x80) {
 694                         this.close();
 695                         throw new CharConversionException("Malformed UTF-8 char -- "
 696                                 + "is an XML encoding declaration missing?");
 697                     }
 698                 }
 699 
 700                 //
 701                 // If this needed a surrogate pair, consume ASAP
 702                 //
 703                 buf[offset + i++] = (char) c;
 704                 if (nextChar != 0 && i < len) {
 705                     buf[offset + i++] = nextChar;
 706                     nextChar = 0;
 707                 }
 708             }
 709             if (i > 0)
 710                 return i;
 711             return (c == -1) ? -1 : 0;
 712         }
 713     }
 714 
 715     //
 716     // We want ASCII and ISO-8859 Readers since they're the most common
 717     // encodings in the US and Europe, and we don't want performance
 718     // regressions for them.  They're also easy to implement efficiently,
 719     // since they're bitmask subsets of UNICODE.
 720     //
 721     // XXX haven't benchmarked these readers vs what we get out of JDK.
 722     //
 723     static final class AsciiReader extends BaseReader {
 724         AsciiReader(InputStream in) {
 725             super(in);
 726         }
 727 
 728         public int read(char buf [], int offset, int len) throws IOException {
 729             int i, c;
 730 
 731             if (instream == null)
 732                 return -1;
 733 
 734             for (i = 0; i < len; i++) {
 735                 if (start >= finish) {
 736                     start = 0;
 737                     finish = instream.read(buffer, 0, buffer.length);
 738                     if (finish <= 0) {
 739                         if (finish <= 0)
 740                             this.close();
 741                         break;
 742                     }
 743                 }
 744                 c = buffer[start++];
 745                 if ((c & 0x80) != 0)
 746                     throw new CharConversionException("Illegal ASCII character, 0x"
 747                             + Integer.toHexString(c & 0xff));
 748                 buf[offset + i] = (char) c;
 749             }
 750             if (i == 0 && finish <= 0)
 751                 return -1;
 752             return i;
 753         }
 754     }
 755 
 756     static final class Iso8859_1Reader extends BaseReader {
 757         Iso8859_1Reader(InputStream in) {
 758             super(in);
 759         }
 760 
 761         public int read(char buf [], int offset, int len) throws IOException {
 762             int i;
 763 
 764             if (instream == null)
 765                 return -1;
 766 
 767             for (i = 0; i < len; i++) {
 768                 if (start >= finish) {
 769                     start = 0;
 770                     finish = instream.read(buffer, 0, buffer.length);
 771                     if (finish <= 0) {
 772                         if (finish <= 0)
 773                             this.close();
 774                         break;
 775                     }
 776                 }
 777                 buf[offset + i] = (char) (0x0ff & buffer[start++]);
 778             }
 779             if (i == 0 && finish <= 0)
 780                 return -1;
 781             return i;
 782         }
 783     }
 784 }