/*
 * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;


// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /** Size of the pushback window used while sniffing the encoding. */
    private static final int MAXPUSHBACK = 512;

    /** Reader that all I/O is delegated to; null once closed. */
    private Reader in;
    /** Name of the encoding that was detected or declared. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering. Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, auto-detecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, auto-detection is used.
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
        ) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    private static final Hashtable<String, String> charsets =
            new Hashtable<String, String>(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java(String encoding) {
        String mapped = charsets.get(encoding.toUpperCase(Locale.ENGLISH));
        return mapped != null ? mapped : encoding;
    }

    /**
     * Returns the standard name of the encoding in use
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Reads from {@code source} until {@code target} is full or the
     * stream ends.  A single {@code read} call is allowed to return fewer
     * bytes than requested, so one call is not enough to peek reliably.
     *
     * @return the number of bytes stored, between 0 and target.length
     */
    private static int readFully(InputStream source, byte target [])
            throws IOException {
        int total = 0;

        while (total < target.length) {
            int got = source.read(target, total, target.length - total);
            if (got <= 0) {
                break;
            }
            total += got;
        }
        return total;
    }

    /**
     * Sniffs the first four bytes of the stream, as described in the
     * XML 1.0 recommendation, and installs the matching delegate reader.
     * The peeked bytes are pushed back so the delegate sees the whole
     * document, including any byte order mark.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf [];
        int len;

        if (stream instanceof PushbackInputStream) {
            pb = (PushbackInputStream) stream;
        } else {
            pb = new PushbackInputStream(stream, MAXPUSHBACK);
        }

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.  Loop on
        // short reads; otherwise a slow stream is misdetected.
        //
        buf = new byte[4];
        len = readFully(pb, buf);
        if (len > 0) {
            pb.unread(buf, 0, len);
        }

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
                case 0:
                    // 00 3c 00 3f == illegal UTF-16 big-endian
                    if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                        setEncoding(pb, "UnicodeBig");
                        return;
                    }
                    // else it's probably UCS-4
                    break;

                case '<':      // 0x3c: the most common cases!
                    switch (buf[1] & 0x0ff) {
                        // First character is '<'; could be XML without
                        // an XML directive such as "<hello>", "<!-- ...",
                        // and so on.
                        default:
                            break;

                        // 3c 00 3f 00 == illegal UTF-16 little endian
                        case 0x00:
                            if (buf[2] == 0x3f && buf[3] == 0x00) {
                                setEncoding(pb, "UnicodeLittle");
                                return;
                            }
                            // else probably UCS-4
                            break;

                        // 3c 3f 78 6d == ASCII and supersets '<?xm'
                        case '?':
                            if (buf[2] != 'x' || buf[3] != 'm') {
                                break;
                            }
                            //
                            // One of several encodings could be used:
                            // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                            //
                            useEncodingDecl(pb, "UTF8");
                            return;
                    }
                    break;

                // 4c 6f a7 94 ... some EBCDIC code page
                case 0x4c:
                    if (buf[1] == 0x6f
                            && (0x0ff & buf[2]) == 0x0a7
                            && (0x0ff & buf[3]) == 0x094) {
                        useEncodingDecl(pb, "CP037");
                        return;
                    }
                    // whoops, treat as UTF-8
                    break;

                // UTF-16 big-endian
                case 0xfe:
                    if ((buf[1] & 0x0ff) != 0xff) {
                        break;
                    }
                    setEncoding(pb, "UTF-16");
                    return;

                // UTF-16 little-endian
                case 0xff:
                    if ((buf[1] & 0x0ff) != 0xfe) {
                        break;
                    }
                    setEncoding(pb, "UTF-16");
                    return;

                // default ... no XML declaration
                default:
                    break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /**
     * Buffers the start of the document so the XML/text declaration can
     * be examined.  Reading stops when the buffer is full, at EOF, or as
     * soon as the byte that {@code terminator} names has been seen (the
     * '>' closing the declaration), so no more input is awaited than the
     * declaration itself requires.
     *
     * @return the number of bytes stored in {@code buffer}
     */
    private static int readDeclaration(InputStream source, byte buffer [],
            byte terminator) throws IOException {
        int total = 0;

        while (total < buffer.length) {
            int got = source.read(buffer, total, buffer.length - total);
            if (got <= 0) {
                break;
            }
            int scanned = total;
            total += got;
            for (; scanned < total; scanned++) {
                if (buffer[scanned] == terminator) {
                    return total;
                }
            }
        }
        return total;
    }

    /**
     * Checks a declared encoding name against the XML grammar:
     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
     * An empty name is not legal.
     */
    private static boolean isLegalEncName(String name) {
        if (name.length() == 0) {
            return false;
        }
        for (int k = 0; k < name.length(); k++) {
            char ch = name.charAt(k);

            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
                continue;
            }
            if (k > 0 && (ch == '-'
                    || (ch >= '0' && ch <= '9')
                    || ch == '.' || ch == '_')) {
                continue;
            }
            return false;
        }
        return true;
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Any declaration that cannot be understood, or that names an
     * illegal encoding, makes the reader fall back to UTF-8.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer [] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        // The third ByteArrayInputStream argument is a LENGTH, so the
        // four skipped bytes must be subtracted from it.
        //
        len = readDeclaration(pb, buffer, ">".getBytes(encoding)[0]);
        if (len > 0) {
            pb.unread(buffer, 0, len);
        }
        r = new InputStreamReader(
                new ByteArrayInputStream(buffer, 4, Math.max(0, len - 4)),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuilder buf = new StringBuilder();
        StringBuilder keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1) {
                break;
            }

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                continue;
            }

            // ... but require at least a little!
            if (i == 0) {
                break;
            }

            // terminate the loop ASAP
            if (c == '?') {
                sawQuestion = true;
            } else if (sawQuestion) {
                if (c == '>') {
                    break;
                }
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c)) {
                        continue;
                    }
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null) {
                        key = keyBuf.toString();
                    }
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else {
                    keyBuf.append((char) c);
                }
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals(key)) {
                        String declared = buf.toString();

                        // map illegal names to UTF-8 default
                        if (!isLegalEncName(declared)) {
                            break;
                        }
                        setEncoding(pb, declared);
                        return;
                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /** Records the encoding name and installs the reader that decodes it. */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into the buffer, returning the number of
     * characters read, or -1 on EOF (or if the reader is closed).
     */
    @Override
    public int read(char buf [], int off, int len) throws IOException {
        int val;

        if (closed) {
            return -1;  // throw new IOException ("closed");
        }
        val = in.read(buf, off, len);
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Reads a single character.
     *
     * @throws IOException if the reader has already been closed
     */
    @Override
    public int read() throws IOException {
        int val;

        if (closed) {
            throw new IOException("closed");
        }
        val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    @Override
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    @Override
    public void mark(int value) throws IOException {
        if (in != null) {
            in.mark(value);
        }
    }

    /**
     * Resets the current position to the last marked position.
     */
    @Override
    public void reset() throws IOException {
        if (in != null) {
            in.reset();
        }
    }

    /**
     * Skips a specified number of characters.
     */
    @Override
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    @Override
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;
        protected byte buffer [];
        // valid bytes are buffer[start] .. buffer[finish - 1]
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        @Override
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        @Override
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of UTF-8 surrogate pair
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at end of input
         * @throws CharConversionException if the input is not legal UTF-8
         */
        @Override
        public int read(char buf [], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0) {
                return 0;
            }

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte alone tells how many
                // bytes the character needs.
                //
                int need;

                if ((c & 0x0E0) == 0x0C0) {
                    need = 2;
                } else if ((c & 0x0F0) == 0x0E0) {
                    need = 3;
                } else if ((c & 0x0F8) == 0x0F0) {
                    need = 4;
                } else {
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(c));
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.
                //
                // This must happen BEFORE decoding: bytes at or past
                // "finish" are leftovers from an earlier fill.
                //
                int off = start + need;

                if (off > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int got = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (got < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += got;
                    continue;
                }

                if (need == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = ((c & 0x1f) << 6)
                            + (buffer[start + 1] & 0x3f);
                } else if (need == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = ((c & 0x0f) << 12)
                            + ((buffer[start + 1] & 0x3f) << 6)
                            + (buffer[start + 2] & 0x3f);
                } else {
                    // 0x0001 0000  <= c  <= 0x001f ffff
                    c = ((c & 0x07) << 18)
                            + ((buffer[start + 1] & 0x3f) << 12)
                            + ((buffer[start + 2] & 0x3f) << 6)
                            + (buffer[start + 3] & 0x3f);

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff) {
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                    }
                }

                //
                // check the format of the non-initial bytes
                //
                for (int k = start + 1; k < off; k++) {
                    if ((buffer[k] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }
                start = off;

                if (need == 4) {
                    // A four byte form below U+10000 is an overlong
                    // encoding; the surrogate arithmetic would go negative.
                    if (c < 0x10000) {
                        throw new CharConversionException("Overlong UTF-8 encoding of character 0x"
                                + Integer.toHexString(c));
                    }

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0) {
                return i;
            }
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        /**
         * Decodes 7-bit ASCII.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at end of input
         * @throws CharConversionException on any byte with the high bit set
         */
        @Override
        public int read(char buf [], int offset, int len) throws IOException {
            int i, c;
            boolean eof = false;

            if (instream == null) {
                return -1;
            }
            if (len <= 0) {
                return 0;
            }

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        eof = true;
                        break;
                    }
                }
                c = buffer[start++];
                if ((c & 0x80) != 0) {
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            if (i == 0 && eof) {
                return -1;
            }
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        /**
         * Decodes ISO-8859-1, where every byte maps to the char with the
         * same (unsigned) value.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at end of input
         */
        @Override
        public int read(char buf [], int offset, int len) throws IOException {
            int i;
            boolean eof = false;

            if (instream == null) {
                return -1;
            }
            if (len <= 0) {
                return 0;
            }

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        eof = true;
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && eof) {
                return -1;
            }
            return i;
        }
    }
}