/*
 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;


// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /** Size of the pushback window used while sniffing the encoding. */
    private static final int MAXPUSHBACK = 512;

    /** Reader that all I/O is delegated to; null once closed. */
    private Reader in;
    /** Name of the encoding selected by autodetection. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
        ) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    // Keys are upper-case IANA names; values are JDK encoding names.
    //
    private static final Hashtable charsets = new Hashtable(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java(String encoding) {
        // Encoding names are ASCII, so the case mapping must not depend
        // on the default locale (e.g. Turkish maps 'i' to U+0130, which
        // would make every name containing an 'i' miss the table).
        String temp = encoding.toUpperCase(Locale.ENGLISH);
        temp = (String) charsets.get(temp);
        return temp != null ? temp : encoding;
    }

    /**
     * Returns the standard name of the encoding in use
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Sniffs the first four bytes of the stream to choose an encoding,
     * pushes them back, and sets up the delegate reader.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf [];
        int len;

        if (stream instanceof PushbackInputStream) {
            pb = (PushbackInputStream) stream;
        } else {
            pb = new PushbackInputStream(stream, MAXPUSHBACK);
        }

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        buf = new byte[4];
        len = pb.read(buf);
        if (len > 0) {
            pb.unread(buf, 0, len);
        }

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
                case 0:
                    // 00 3c 00 3f == illegal UTF-16 big-endian
                    if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                        setEncoding(pb, "UnicodeBig");
                        return;
                    }
                    // else it's probably UCS-4
                    break;

                case '<':      // 0x3c: the most common cases!
                    switch (buf[1] & 0x0ff) {
                        // First character is '<'; could be XML without
                        // an XML directive such as "<hello>", "<!-- ...",
                        // and so on.
                        default:
                            break;

                        // 3c 00 3f 00 == illegal UTF-16 little endian
                        case 0x00:
                            if (buf[2] == 0x3f && buf[3] == 0x00) {
                                setEncoding(pb, "UnicodeLittle");
                                return;
                            }
                            // else probably UCS-4
                            break;

                        // 3c 3f 78 6d == ASCII and supersets '<?xm'
                        case '?':
                            if (buf[2] != 'x' || buf[3] != 'm') {
                                break;
                            }
                            //
                            // One of several encodings could be used:
                            // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                            //
                            useEncodingDecl(pb, "UTF8");
                            return;
                    }
                    break;

                // 4c 6f a7 94 ... some EBCDIC code page
                case 0x4c:
                    if (buf[1] == 0x6f
                            && (0x0ff & buf[2]) == 0x0a7
                            && (0x0ff & buf[3]) == 0x094) {
                        useEncodingDecl(pb, "CP037");
                        return;
                    }
                    // whoops, treat as UTF-8
                    break;

                // UTF-16 big-endian
                case 0xfe:
                    if ((buf[1] & 0x0ff) != 0xff) {
                        break;
                    }
                    setEncoding(pb, "UTF-16");
                    return;

                // UTF-16 little-endian
                case 0xff:
                    if ((buf[1] & 0x0ff) != 0xfe) {
                        break;
                    }
                    setEncoding(pb, "UTF-16");
                    return;

                // default ... no XML declaration
                default:
                    break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer [] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        pb.unread(buffer, 0, len);
        // The third argument is a length, not an end offset, so it must
        // exclude the four skipped bytes to stay within the data read.
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len - 4),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1) {
                break;
            }

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                continue;
            }

            // ... but require at least a little!
            if (i == 0) {
                break;
            }

            // terminate the loop ASAP
            if (c == '?') {
                sawQuestion = true;
            } else if (sawQuestion) {
                if (c == '>') {
                    break;
                }
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c)) {
                        continue;
                    }
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null) {
                        key = keyBuf.toString();
                    }
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else {
                    keyBuf.append((char) c);
                }
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
                        for (i = 0; i < assignedEncoding.length(); i++) {
                            c = assignedEncoding.charAt(i);
                            if ((c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z')) {
                                continue;
                            }
                            if (i == 0) {
                                break XmlDecl;
                            }
                            if (i > 0 && (c == '-'
                                    || (c >= '0' && c <= '9')
                                    || c == '.' || c == '_')) {
                                continue;
                            }
                            // map illegal names to UTF-8 default
                            break XmlDecl;
                        }

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /** Records the encoding name and creates the delegate reader for it. */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into a portion of the buffer.  Returns the number
     * of characters read, or -1 on EOF (or if this reader is closed).
     * Reaching EOF closes this reader.
     */
    public int read(char buf [], int off, int len) throws IOException {
        int val;

        if (closed) {
            return -1;  // throw new IOException ("closed");
        }
        val = in.read(buf, off, len);
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Reads a single character.  Reaching EOF closes this reader.
     *
     * @throws IOException if this reader is already closed
     */
    public int read() throws IOException {
        int val;

        if (closed) {
            throw new IOException("closed");
        }
        val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    public void mark(int value) throws IOException {
        if (in != null) {
            in.mark(value);
        }
    }

    /**
     * Resets the current position to the last marked position.
     */
    public void reset() throws IOException {
        if (in != null) {
            in.reset();
        }
    }

    /**
     * Skips a specified number of characters.
     */
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    public void close() throws IOException {
        if (closed) {
            return;
        }
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;
        protected byte buffer [];
        // buffer[start..finish) holds bytes read but not yet converted
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of UTF-8 surrogate pair
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        /**
         * Decodes UTF-8 bytes from the stream into UTF-16 chars.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at end of stream
         * @throws CharConversionException on malformed, truncated or
         *         out-of-range UTF-8 input
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0) {
                return 0;
            }

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte alone tells us how
                // many bytes the sequence occupies.
                //
                int seqLen;

                if ((c & 0x0E0) == 0x0C0) {
                    seqLen = 2;
                } else if ((c & 0x0F0) == 0x0E0) {
                    seqLen = 3;
                } else if ((c & 0x0F8) == 0x0F0) {
                    seqLen = 4;
                } else {
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(c));
                }

                int off = start + seqLen;

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.
                //
                // This must happen BEFORE decoding: bytes at or past
                // "finish" are leftovers from an earlier fill and must
                // never take part in decoding or range checks.
                //
                if (off > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int count = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (count < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += count;
                    continue;
                }

                // All bytes of the sequence are available; decode them.
                if (seqLen == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = (c & 0x1f) << 6;
                    c += buffer[start + 1] & 0x3f;
                } else if (seqLen == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = (c & 0x0f) << 12;
                    c += (buffer[start + 1] & 0x3f) << 6;
                    c += buffer[start + 2] & 0x3f;
                } else {
                    // 0x0001 0000  <= c  <= 0x001f ffff
                    c = (c & 0x07) << 18;
                    c += (buffer[start + 1] & 0x3f) << 12;
                    c += (buffer[start + 2] & 0x3f) << 6;
                    c += buffer[start + 3] & 0x3f;

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff) {
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                    }
                }

                //
                // check the format of the non-initial bytes
                //
                for (start++; start < off; start++) {
                    if ((buffer[start] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                // Convert UCS-4 char to surrogate pair (UTF-16)
                if (seqLen == 4) {
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0) {
                return i;
            }
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        /**
         * Converts 7-bit ASCII bytes to chars.
         *
         * @return the number of chars stored, 0 if {@code len <= 0} on an
         *         open reader, or -1 at end of stream / after close
         * @throws CharConversionException if a byte has its high bit set
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i, c;

            if (instream == null) {
                return -1;
            }
            // A zero-length request must not be reported as EOF; the
            // enclosing XmlReader closes itself when it sees -1.
            if (len <= 0) {
                return 0;
            }

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                c = buffer[start++];
                if ((c & 0x80) != 0) {
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        /**
         * Converts ISO-8859-1 bytes to chars (each byte maps to the char
         * with the same unsigned value).
         *
         * @return the number of chars stored, 0 if {@code len <= 0} on an
         *         open reader, or -1 at end of stream / after close
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i;

            if (instream == null) {
                return -1;
            }
            // A zero-length request must not be reported as EOF; the
            // enclosing XmlReader closes itself when it sees -1.
            if (len <= 0) {
                return 0;
            }

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }
}