/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.xerces.internal.impl.io;

import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;

import java.util.Locale;
import com.sun.org.apache.xerces.internal.util.MessageFormatter;
import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;

import com.sun.xml.internal.stream.util.BufferAllocator;
import com.sun.xml.internal.stream.util.ThreadLocalBufferAllocator;

/**
 * <p>A UTF-8 reader.</p>
 *
 * <p>Decodes UTF-8 bytes from an {@link InputStream} into UTF-16 chars.
 * Four-byte sequences are delivered as a surrogate pair. When a malformed or
 * truncated sequence is met after some characters were already decoded in the
 * same bulk read, the offending bytes are kept in the byte buffer and the
 * error is raised by the following read.</p>
 *
 * @xerces.internal
 *
 * @author Andy Clark, IBM
 *
 */
public class UTF8Reader
    extends Reader {

    //
    // Constants
    //

    /** Default byte buffer size (2048). */
    public static final int DEFAULT_BUFFER_SIZE = 2048;

    // debugging

    /** Debug read. */
    private static final boolean DEBUG_READ = false;

    //
    // Data
    //

    /** Input stream. */
    protected InputStream fInputStream;

    /** Byte buffer. */
    protected byte[] fBuffer;

    /**
     * Number of deferred bytes held at the start of {@link #fBuffer}; zero
     * when nothing is deferred.
     */
    protected int fOffset;

    /** Pending low surrogate still to be delivered, or -1 when there is none. */
    private int fSurrogate = -1;

    // message formatter; used to produce localized
    // exception messages
    private final MessageFormatter fFormatter;

    //Locale to use for messages
    private final Locale fLocale;

    //
    // Constructors
    //

    /**
     * Constructs a UTF-8 reader from the specified input stream
     * using the default buffer size.  Primarily for testing.
     *
     * @param inputStream The input stream.
     */
    public UTF8Reader(InputStream inputStream) {
        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
    } // <init>(InputStream)

    /**
     * Constructs a UTF-8 reader from the specified input stream
     * using the default buffer size and the given MessageFormatter.
     *
     * @param inputStream The input stream.
     * @param messageFormatter  given MessageFormatter
     * @param locale    Locale to use for messages
     */
    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
            Locale locale) {
        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
    } // <init>(InputStream, MessageFormatter, Locale)

    /**
     * Constructs a UTF-8 reader from the specified input stream,
     * buffer size and MessageFormatter.
     *
     * @param inputStream The input stream.
     * @param size        The initial buffer size.
     * @param messageFormatter  the formatter for localizing/formatting errors.
     * @param locale    the Locale to use for messages
     */
    public UTF8Reader(InputStream inputStream, int size,
            MessageFormatter messageFormatter, Locale locale) {
        fInputStream = inputStream;
        BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
        fBuffer = ba.getByteBuffer(size);
        // fall back to a private buffer when the allocator has none to offer
        if (fBuffer == null) {
            fBuffer = new byte[size];
        }
        fFormatter = messageFormatter;
        fLocale = locale;
    } // <init>(InputStream, int, MessageFormatter, Locale)

    //
    // Reader methods
    //

    /**
     * Read a single character.  This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>A four-byte UTF-8 sequence is returned as two chars: this call
     * returns the high surrogate and the next read returns the low one.
     *
     * @return     The character read, as an integer in the range 0 to 65535
     *             ({@code 0x00-0xffff}), or -1 if the end of the stream has
     *             been reached
     *
     * @exception  IOException  If an I/O error occurs, including a malformed
     *                          or truncated UTF-8 sequence
     */
    @Override
    public int read() throws IOException {

        // decode character
        int c = fSurrogate;
        if (fSurrogate == -1) {
            // NOTE: We use the index into the buffer if there are remaining
            //       bytes from the last block read. -Ac
            int index = 0;

            // get first byte
            int b0 = (index == fOffset)
                   ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
            if (b0 == -1) {
                return -1;
            }

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (b0 < 0x80) {
                c = (char)b0;
            }

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                int b1 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b1 == -1) {
                    expectedByte(2, 2);
                }
                if ((b1 & 0xC0) != 0x80) {
                    invalidByte(2, 2, b1);
                }
                c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]
            else if ((b0 & 0xF0) == 0xE0) {
                int b1 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b1 == -1) {
                    expectedByte(2, 3);
                }
                // reject stray bytes, encoded surrogates (ED A0..BF) and
                // overlong forms (E0 80..9F)
                if ((b1 & 0xC0) != 0x80
                    || (b0 == 0xED && b1 >= 0xA0)
                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                    invalidByte(2, 3, b1);
                }
                int b2 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b2 == -1) {
                    expectedByte(3, 3);
                }
                if ((b2 & 0xC0) != 0x80) {
                    invalidByte(3, 3, b2);
                }
                c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                    (b2 & 0x003F);
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            else if ((b0 & 0xF8) == 0xF0) {
                int b1 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b1 == -1) {
                    expectedByte(2, 4);
                }
                if ((b1 & 0xC0) != 0x80
                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                    invalidByte(2, 4, b1);
                }
                int b2 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b2 == -1) {
                    expectedByte(3, 4);
                }
                if ((b2 & 0xC0) != 0x80) {
                    invalidByte(3, 4, b2);
                }
                int b3 = (index == fOffset)
                       ? fInputStream.read() : (fBuffer[index++] & 0x00FF);
                if (b3 == -1) {
                    expectedByte(4, 4);
                }
                if ((b3 & 0xC0) != 0x80) {
                    invalidByte(4, 4, b3);
                }
                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                if (uuuuu > 0x10) {
                    invalidSurrogate(uuuuu);
                }
                int wwww = uuuuu - 1;
                int hs = 0xD800 |
                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
                         ((b2 >> 4) & 0x0003);
                int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
                c = hs;
                fSurrogate = ls;
            }

            // error
            else {
                invalidByte(1, 1, b0);
            }

            // A character was decoded successfully. Drop any deferred bytes
            // that were consumed from the buffer so that a later read does
            // not decode them a second time.
            if (index > 0) {
                fOffset -= index;
                System.arraycopy(fBuffer, index, fBuffer, 0, fOffset);
            }
        }

        // use surrogate
        else {
            fSurrogate = -1;
        }

        // return character
        if (DEBUG_READ) {
            System.out.println("read(): 0x"+Integer.toHexString(c));
        }
        return c;

    } // read():int

    /**
     * Read characters into a portion of an array.  This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * <p>At most {@code length} chars are stored. If a malformed or truncated
     * byte sequence is found after at least one char was produced, the chars
     * decoded so far are returned and the error is thrown by the next read.
     * If only one slot is left for a surrogate pair and nothing else was
     * produced, the high surrogate is stored and the low surrogate is
     * delivered by the next read.
     *
     * @param      ch     Destination buffer
     * @param      offset Offset at which to start storing characters
     * @param      length Maximum number of characters to read
     *
     * @return     The number of characters read, or -1 if the end of the
     *             stream has been reached
     *
     * @exception  IOException  If an I/O error occurs, including a malformed
     *                          or truncated UTF-8 sequence
     * @exception  IndexOutOfBoundsException  If {@code length} is negative
     */
    @Override
    public int read(char ch[], int offset, int length) throws IOException {

        if (length < 0) {
            throw new IndexOutOfBoundsException("length: " + length);
        }
        // nothing requested: leave all pending state untouched
        if (length == 0) {
            return 0;
        }

        // exclusive end of the region we are allowed to fill
        final int limit = Math.min(offset + length, ch.length);

        // handle surrogate left over from a previous read
        int out = offset;
        if (fSurrogate != -1) {
            ch[out++] = (char)fSurrogate;
            fSurrogate = -1;
            length--;
            if (length == 0) {
                return out - offset;
            }
        }

        // read bytes; "total" is the number of BYTES available in fBuffer
        final int total;
        if (fOffset == 0) {
            // adjust length to read
            if (length > fBuffer.length) {
                length = fBuffer.length;
            }

            // perform read operation
            final int bytesRead = fInputStream.read(fBuffer, 0, length);
            if (bytesRead == -1) {
                // still report a surrogate that was emitted above
                return (out > offset) ? out - offset : -1;
            }
            total = bytesRead;
        }

        // skip read; last character was in error
        // NOTE: Having an offset value other than zero means that there was
        //       an error in the last character read. In this case, we have
        //       skipped the read so we don't consume any bytes past the
        //       error. By signalling the error on the next block read we
        //       allow the method to return the most valid characters that
        //       it can on the previous block read. -Ac
        else {
            total = fOffset;
            fOffset = 0;
        }

        // convert bytes to characters; fast path for a leading ASCII run
        int in = 0;
        while (in < total && fBuffer[in] >= 0) {
            ch[out++] = (char)fBuffer[in++];
        }

        for ( ; in < total; in++) {
            final byte raw = fBuffer[in];

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (raw >= 0) {
                ch[out++] = (char)raw;
                continue;
            }

            final int b0 = raw & 0x00FF;

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
                // continuation bytes come from the buffer while it lasts,
                // then straight from the stream
                final int b1 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b1 == -1) {
                    if (out > offset) {
                        deferBytes(1, b0, 0, 0, 0);
                        return out - offset;
                    }
                    expectedByte(2, 2);
                }
                if ((b1 & 0xC0) != 0x80) {
                    if (out > offset) {
                        deferBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 2, b1);
                }
                ch[out++] = (char)(((b0 << 6) & 0x07C0) | (b1 & 0x003F));
                continue;
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]
            if ((b0 & 0xF0) == 0xE0) {
                final int b1 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b1 == -1) {
                    if (out > offset) {
                        deferBytes(1, b0, 0, 0, 0);
                        return out - offset;
                    }
                    expectedByte(2, 3);
                }
                // reject stray bytes, encoded surrogates (ED A0..BF) and
                // overlong forms (E0 80..9F)
                if ((b1 & 0xC0) != 0x80
                    || (b0 == 0xED && b1 >= 0xA0)
                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
                    if (out > offset) {
                        deferBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 3, b1);
                }
                final int b2 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b2 == -1) {
                    if (out > offset) {
                        deferBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    expectedByte(3, 3);
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        deferBytes(3, b0, b1, b2, 0);
                        return out - offset;
                    }
                    invalidByte(3, 3, b2);
                }
                ch[out++] = (char)(((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                                   (b2 & 0x003F));
                continue;
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            if ((b0 & 0xF8) == 0xF0) {
                final int b1 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b1 == -1) {
                    if (out > offset) {
                        deferBytes(1, b0, 0, 0, 0);
                        return out - offset;
                    }
                    expectedByte(2, 4);
                }
                if ((b1 & 0xC0) != 0x80
                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
                    if (out > offset) {
                        deferBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    invalidByte(2, 4, b1);
                }
                final int b2 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b2 == -1) {
                    if (out > offset) {
                        deferBytes(2, b0, b1, 0, 0);
                        return out - offset;
                    }
                    expectedByte(3, 4);
                }
                if ((b2 & 0xC0) != 0x80) {
                    if (out > offset) {
                        deferBytes(3, b0, b1, b2, 0);
                        return out - offset;
                    }
                    invalidByte(3, 4, b2);
                }
                final int b3 = (++in < total)
                             ? (fBuffer[in] & 0x00FF) : fInputStream.read();
                if (b3 == -1) {
                    if (out > offset) {
                        deferBytes(3, b0, b1, b2, 0);
                        return out - offset;
                    }
                    expectedByte(4, 4);
                }
                if ((b3 & 0xC0) != 0x80) {
                    if (out > offset) {
                        deferBytes(4, b0, b1, b2, b3);
                        return out - offset;
                    }
                    invalidByte(4, 4, b3);
                }

                // check if the requested region can hold 2 surrogate chars;
                // if not, and something was already produced, hand that back
                // and decode this sequence on the next read
                final boolean roomForPair = out + 1 < limit;
                if (!roomForPair && out > offset) {
                    deferBytes(4, b0, b1, b2, b3);
                    return out - offset;
                }

                // decode bytes into surrogate characters
                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                if (uuuuu > 0x10) {
                    invalidSurrogate(uuuuu);
                }
                int wwww = uuuuu - 1;
                int zzzz = b1 & 0x000F;
                int yyyyyy = b2 & 0x003F;
                int xxxxxx = b3 & 0x003F;
                int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
                int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;

                // set characters
                ch[out++] = (char)hs;
                if (roomForPair) {
                    ch[out++] = (char)ls;
                    continue;
                }
                // only one slot was available and nothing else was produced:
                // deliver the high surrogate now, the low one on the next
                // read, so the caller always makes progress
                fSurrogate = ls;
                break;
            }

            // error: byte cannot start a UTF-8 sequence
            if (out > offset) {
                deferBytes(1, b0, 0, 0, 0);
                return out - offset;
            }
            invalidByte(1, 1, b0);
        }

        // return number of characters converted
        if (DEBUG_READ) {
            System.out.println("read(char[],"+offset+','+length+"): count="+(out - offset));
        }
        return out - offset;

    } // read(char[],int,int)

    /**
     * Skip characters.  This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p>Skipping is done by decoding into a scratch array, so malformed
     * input is reported exactly as it would be by a read.
     *
     * @param  n  The number of characters to skip
     *
     * @return    The number of characters actually skipped
     *
     * @exception  IOException  If an I/O error occurs
     */
    @Override
    public long skip(long n) throws IOException {

        long remaining = n;
        final char[] ch = new char[fBuffer.length];
        do {
            int length = ch.length < remaining ? ch.length : (int)remaining;
            int count = read(ch, 0, length);
            if (count > 0) {
                remaining -= count;
            }
            else {
                break;
            }
        } while (remaining > 0);

        long skipped = n - remaining;
        return skipped;

    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return This implementation always returns false. Note that returning
     * false does not guarantee that the next read will block.
     *
     * @exception  IOException  If an I/O error occurs
     */
    @Override
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     *
     * @return This implementation always returns false.
     */
    @Override
    public boolean markSupported() {
        return false;
    } // markSupported()

    /**
     * Mark the present position in the stream.  This reader does not support
     * marking, so this method always throws.
     *
     * @param  readAheadLimit  Ignored.
     *
     * @exception  IOException  Always, with a localized
     *                          "OperationNotSupported" message
     */
    @Override
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
    } // mark(int)

    /**
     * Reset the decoder state: forgets any deferred bytes and any pending
     * low surrogate.  The underlying input stream is not touched by this
     * method.
     *
     * @exception  IOException  Declared by the overridden method; not thrown
     *                          by this implementation
     */
    @Override
    public void reset() throws IOException {
        fOffset = 0;
        fSurrogate = -1;
    } // reset()

    /**
     * Close the stream.  The byte buffer is handed back to the thread-local
     * buffer allocator and the underlying input stream is closed.  The buffer
     * is returned at most once, even if this method is called repeatedly.
     *
     * @exception  IOException  If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        if (fBuffer != null) {
            BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
            ba.returnByteBuffer(fBuffer);
            fBuffer = null;
        }
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Stores the first {@code count} bytes of an unfinished sequence at the
     * start of the byte buffer so that the next read starts with them.
     * Arguments beyond {@code count} are ignored.
     */
    private void deferBytes(int count, int b0, int b1, int b2, int b3) {
        if (count > 3) {
            fBuffer[3] = (byte)b3;
        }
        if (count > 2) {
            fBuffer[2] = (byte)b2;
        }
        if (count > 1) {
            fBuffer[1] = (byte)b1;
        }
        fBuffer[0] = (byte)b0;
        fOffset = count;
    } // deferBytes(int,int,int,int,int)

    /** Throws an exception for expected byte. */
    private void expectedByte(int position, int count)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "ExpectedByte",
            new Object[] {Integer.toString(position), Integer.toString(count)});

    } // expectedByte(int,int)

    /**
     * Throws an exception for invalid byte.  The byte value {@code c} is
     * accepted for symmetry with the call sites but is not part of the
     * message arguments.
     */
    private void invalidByte(int position, int count, int c)
        throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidByte",
            new Object [] {Integer.toString(position), Integer.toString(count)});

    } // invalidByte(int,int,int)

    /** Throws an exception for invalid surrogate bits. */
    private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {

        throw new MalformedByteSequenceException(fFormatter,
            fLocale,
            XMLMessageFormatter.XML_DOMAIN,
            "InvalidHighSurrogate",
            new Object[] {Integer.toHexString(uuuuu)});

    } // invalidSurrogate(int)

} // class UTF8Reader