1 /* 2 * Copyright (c) 2012, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package jdk.internal.util.xml.impl; 27 28 import java.io.IOException; 29 import java.io.InputStream; 30 import java.io.InputStreamReader; 31 import java.io.Reader; 32 import java.io.UnsupportedEncodingException; 33 import java.util.HashMap; 34 import java.util.Map; 35 import jdk.internal.org.xml.sax.InputSource; 36 import jdk.internal.org.xml.sax.SAXException; 37 38 /** 39 * XML non-validating parser engine. 40 */ 41 public abstract class Parser { 42 43 public static final String FAULT = ""; 44 protected static final int BUFFSIZE_READER = 512; 45 protected static final int BUFFSIZE_PARSER = 128; 46 /** 47 * The end of stream character. 
*/
    public static final char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml; // the xml namespace
    private Map<String, Input> mEnt; // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone; // xml decl standalone flag
    protected boolean mIsSAloneSet; // standalone is explicitly set
    protected boolean mIsNSAware; // if true - namespace aware mode

    // Document processing phase; holds one of the PH_* constants below.
    protected int mPh; // current phase of document processing
    protected static final int PH_BEFORE_DOC = -1; // before parsing
    protected static final int PH_DOC_START = 0; // document start
    protected static final int PH_MISC_DTD = 1; // misc before DTD
    protected static final int PH_DTD = 2; // DTD
    protected static final int PH_DTD_MISC = 3; // misc after DTD
    protected static final int PH_DOCELM = 4; // document's element
    protected static final int PH_DOCELM_MISC = 5; // misc after element
    protected static final int PH_AFTER_DOC = 6; // after parsing

    // Type of the most recently produced event; holds one of the EV_*
    // constants below.
    protected int mEvt; // current event type
    protected static final int EV_NULL = 0; // unknown
    protected static final int EV_ELM = 1; // empty element
    protected static final int EV_ELMS = 2; // start element
    protected static final int EV_ELME = 3; // end element
    protected static final int EV_TEXT = 4; // textual content
    protected static final int EV_WSPC = 5; // white space content
    protected static final int EV_PI = 6; // processing instruction
    protected static final int EV_CDAT = 7; // character data
    protected static final int EV_COMM = 8; // comment
    protected static final int EV_DTD = 9; // document type definition
    protected static final int EV_ENT = 10; // skipped entity

    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff; // parser buffer
    protected int mBuffIdx; // index of the last char
    protected Pair mPref; // stack of prefixes
    protected Pair mElm; // stack of elements
    // Layout of the mAttL structure:
    //   mAttL.chars           - element qname
    //   mAttL.next            - next element
    //   mAttL.list            - list of attributes defined on this element
    //   mAttL.list.chars      - attribute qname
    //   mAttL.list.id         - a char representing attribute's type see below
    //   mAttL.list.next       - next attribute defined on the element
    //   mAttL.list.list       - default value structure or null
    //   mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    //   'i' - "ID"
    //   'r' - "IDREF"
    //   'R' - "IDREFS"
    //   'n' - "ENTITY"
    //   'N' - "ENTITIES"
    //   't' - "NMTOKEN"
    //   'T' - "NMTOKENS"
    //   'u' - enumeration type
    //   'o' - "NOTATION"
    //   'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL; // list of defined attrs by element name
    protected Input mDoc; // document entity
    protected Input mInp; // stack of entities
    private char[] mChars; // reading buffer
    private int mChLen; // current capacity
    private int mChIdx; // index to the next char
    protected Attrs mAttrs; // attributes of the curr. element
    private String[] mItems; // attributes array of the curr.
element 117 private char mAttrIdx; // attributes counter/index 118 private String mUnent; // unresolved entity name 119 private Pair mDltd; // deleted objects for reuse 120 /** 121 * Default prefixes 122 */ 123 private static final char NONS[]; 124 private static final char XML[]; 125 private static final char XMLNS[]; 126 127 static { 128 NONS = new char[1]; 129 NONS[0] = (char) 0; 130 131 XML = new char[4]; 132 XML[0] = (char) 4; 133 XML[1] = 'x'; 134 XML[2] = 'm'; 135 XML[3] = 'l'; 136 137 XMLNS = new char[6]; 138 XMLNS[0] = (char) 6; 139 XMLNS[1] = 'x'; 140 XMLNS[2] = 'm'; 141 XMLNS[3] = 'l'; 142 XMLNS[4] = 'n'; 143 XMLNS[5] = 's'; 144 } 145 /** 146 * ASCII character type array. 147 * 148 * This array maps an ASCII (7 bit) character to the character type.<br> 149 * Possible character type values are:<br> - ' ' for any kind of white 150 * space character;<br> - 'a' for any lower case alphabetical character 151 * value;<br> - 'A' for any upper case alphabetical character value;<br> 152 * - 'd' for any decimal digit character value;<br> - 'z' for any 153 * character less than ' ' except '\t', '\n', '\r';<br> An ASCII (7 bit) 154 * character which does not fall in any category listed above is mapped to 155 * it self. 156 */ 157 private static final byte asctyp[]; 158 /** 159 * NMTOKEN character type array. 160 * 161 * This array maps an ASCII (7 bit) character to the character type.<br> 162 * Possible character type values are:<br> - 0 for underscore ('_') or any 163 * lower and upper case alphabetical character value;<br> - 1 for colon 164 * (':') character;<br> - 2 for dash ('-') and dot ('.') or any decimal 165 * digit character value;<br> - 3 for any kind of white space character<br> 166 * An ASCII (7 bit) character which does not fall in any category listed 167 * above is mapped to 0xff. 168 */ 169 private static final byte nmttyp[]; 170 171 /** 172 * Static constructor. 
173 * 174 * Sets up the ASCII character type array which is used by 175 * {@link #asctyp asctyp} method and NMTOKEN character type array. 176 */ 177 static { 178 short i = 0; 179 180 asctyp = new byte[0x80]; 181 while (i < ' ') { 182 asctyp[i++] = (byte) 'z'; 183 } 184 asctyp['\t'] = (byte) ' '; 185 asctyp['\r'] = (byte) ' '; 186 asctyp['\n'] = (byte) ' '; 187 while (i < '0') { 188 asctyp[i] = (byte) i++; 189 } 190 while (i <= '9') { 191 asctyp[i++] = (byte) 'd'; 192 } 193 while (i < 'A') { 194 asctyp[i] = (byte) i++; 195 } 196 while (i <= 'Z') { 197 asctyp[i++] = (byte) 'A'; 198 } 199 while (i < 'a') { 200 asctyp[i] = (byte) i++; 201 } 202 while (i <= 'z') { 203 asctyp[i++] = (byte) 'a'; 204 } 205 while (i < 0x80) { 206 asctyp[i] = (byte) i++; 207 } 208 209 nmttyp = new byte[0x80]; 210 for (i = 0; i < '0'; i++) { 211 nmttyp[i] = (byte) 0xff; 212 } 213 while (i <= '9') { 214 nmttyp[i++] = (byte) 2; // digits 215 } 216 while (i < 'A') { 217 nmttyp[i++] = (byte) 0xff; 218 } 219 // skiped upper case alphabetical character are already 0 220 for (i = '['; i < 'a'; i++) { 221 nmttyp[i] = (byte) 0xff; 222 } 223 // skiped lower case alphabetical character are already 0 224 for (i = '{'; i < 0x80; i++) { 225 nmttyp[i] = (byte) 0xff; 226 } 227 nmttyp['_'] = 0; 228 nmttyp[':'] = 1; 229 nmttyp['.'] = 2; 230 nmttyp['-'] = 2; 231 nmttyp[' '] = 3; 232 nmttyp['\t'] = 3; 233 nmttyp['\r'] = 3; 234 nmttyp['\n'] = 3; 235 } 236 237 /** 238 * Constructor. 
239 */ 240 protected Parser() { 241 mPh = PH_BEFORE_DOC; // before parsing 242 243 // Initialize the parser 244 mBuff = new char[BUFFSIZE_PARSER]; 245 mAttrs = new Attrs(); 246 247 // Default namespace 248 mPref = pair(mPref); 249 mPref.name = ""; 250 mPref.value = ""; 251 mPref.chars = NONS; 252 mNoNS = mPref; // no namespace 253 // XML namespace 254 mPref = pair(mPref); 255 mPref.name = "xml"; 256 mPref.value = "http://www.w3.org/XML/1998/namespace"; 257 mPref.chars = XML; 258 mXml = mPref; // XML namespace 259 } 260 261 /** 262 * Initializes parser's internals. Note, current input has to be set before 263 * this method is called. 264 */ 265 protected void init() { 266 mUnent = null; 267 mElm = null; 268 mPref = mXml; 269 mAttL = null; 270 mPEnt = new HashMap<>(); 271 mEnt = new HashMap<>(); 272 mDoc = mInp; // current input is document entity 273 mChars = mInp.chars; // use document entity buffer 274 mPh = PH_DOC_START; // the begining of the document 275 } 276 277 /** 278 * Cleans up parser internal resources. 279 */ 280 protected void cleanup() { 281 // Default attributes 282 while (mAttL != null) { 283 while (mAttL.list != null) { 284 if (mAttL.list.list != null) { 285 del(mAttL.list.list); 286 } 287 mAttL.list = del(mAttL.list); 288 } 289 mAttL = del(mAttL); 290 } 291 // Element stack 292 while (mElm != null) { 293 mElm = del(mElm); 294 } 295 // Namespace prefixes 296 while (mPref != mXml) { 297 mPref = del(mPref); 298 } 299 // Inputs 300 while (mInp != null) { 301 pop(); 302 } 303 // Document reader 304 if ((mDoc != null) && (mDoc.src != null)) { 305 try { 306 mDoc.src.close(); 307 } catch (IOException ioe) { 308 } 309 } 310 mPEnt = null; 311 mEnt = null; 312 mDoc = null; 313 mPh = PH_AFTER_DOC; // before documnet processing 314 } 315 316 /** 317 * Processes a portion of document. This method returns one of EV_* 318 * constants as an identifier of the portion of document have been read. 319 * 320 * @return Identifier of processed document portion. 
* @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    protected int step() throws Exception {
        // State machine over the local 'st':
        //   0 - dispatch on markup ('<' followed by '/', '!', '?' or a name)
        //   1 - collect leading white space
        //   2 - collect the text content
        // The loop runs until a branch sets mEvt to something other than
        // EV_NULL; that value is the result.
        mEvt = EV_NULL;
        int st = 0;
        while (mEvt == EV_NULL) {
            // Take the next char straight from the reading buffer when one
            // is available, otherwise go through getch().
            char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0: // all sorts of markup (dispatcher)
                    if (ch != '<') {
                        // Not markup: push the char back and treat it as
                        // content, starting with white space.
                        bkch();
                        mBuffIdx = -1; // clean parser buffer
                        st = 1;
                        break;
                    }
                    switch (getch()) {
                        case '/': // the end of the element content
                            mEvt = EV_ELME;
                            if (mElm == null) {
                                panic(FAULT);
                            }
                            // Check element's open/close tags balance
                            mBuffIdx = -1; // clean parser buffer
                            bname(mIsNSAware);
                            char[] chars = mElm.chars;
                            // The name in mBuff must match the qname on top
                            // of the element stack: same length and the same
                            // chars from index 1 on (index 0 is not compared).
                            if (chars.length == (mBuffIdx + 1)) {
                                for (char i = 1; i <= mBuffIdx; i += 1) {
                                    if (chars[i] != mBuff[i]) {
                                        panic(FAULT);
                                    }
                                }
                            } else {
                                panic(FAULT);
                            }
                            // Skip white spaces before '>'
                            if (wsskip() != '>') {
                                panic(FAULT);
                            }
                            getch(); // read '>'
                            break;

                        case '!': // a comment or a CDATA
                            // Peek at the char after "<!" and push it back so
                            // the called method sees it again.
                            ch = getch();
                            bkch();
                            switch (ch) {
                                case '-': // must be a comment
                                    mEvt = EV_COMM;
                                    comm();
                                    break;

                                case '[': // must be a CDATA section
                                    mEvt = EV_CDAT;
                                    cdat();
                                    break;

                                default: // must be 'DOCTYPE'
                                    mEvt = EV_DTD;
                                    dtd();
                                    break;
                            }
                            break;

                        case '?': // processing instruction
                            mEvt = EV_PI;
                            pi();
                            break;

                        default: // must be the first char of an xml name
                            bkch();
                            // Read an element name and put it on top of the
                            // element stack
                            mElm = pair(mElm); // add new element to the stack
                            mElm.chars = qname(mIsNSAware);
                            mElm.name = mElm.local();
                            // Flags are copied from the enclosing element, if any
                            mElm.id = (mElm.next != null) ? mElm.next.id : 0; // flags
                            mElm.num = 0; // namespace counter
                            // Find the list of defined attributes of the current
                            // element
                            Pair elm = find(mAttL, mElm.chars);
                            mElm.list = (elm != null) ? elm.list : null;
                            // Read attributes till the end of the element tag
                            mAttrIdx = 0;
                            Pair att = pair(null);
                            att.num = 0; // clear attribute's flags
                            attr(att); // get all attributes inc. defaults
                            del(att);
                            mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null;
                            // Skip white spaces before '>'
                            switch (wsskip()) {
                                case '>':
                                    getch(); // read '>'
                                    mEvt = EV_ELMS;
                                    break;

                                case '/':
                                    getch(); // read '/'
                                    if (getch() != '>') // read '>'
                                    {
                                        panic(FAULT);
                                    }
                                    mEvt = EV_ELM;
                                    break;

                                default:
                                    panic(FAULT);
                            }
                            break;
                    }
                    break;

                case 1: // read white space
                    switch (ch) {
                        case ' ':
                        case '\t':
                        case '\n':
                            bappend(ch);
                            break;

                        case '\r': // EOL processing [#2.11]
                            // "\r\n" and a lone "\r" are both appended as '\n'
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case '<':
                            // Only white space was seen before the markup
                            mEvt = EV_WSPC;
                            bkch();
                            bflash_ws();
                            break;

                        default:
                            // First non white space char: push it back and
                            // continue as text content.
                            bkch();
                            st = 2;
                            break;
                    }
                    break;

                case 2: // read the text content of the element
                    switch (ch) {
                        case '&':
                            if (mUnent == null) {
                                // There was no unresolved entity on previous step.
                                if ((mUnent = ent('x')) != null) {
                                    // ent() returned a name: report the text
                                    // collected so far now; the '&' is set up
                                    // to be read again so that the next call
                                    // takes the else branch below.
                                    mEvt = EV_TEXT;
                                    bkch(); // move back to ';' after entity name
                                    setch('&'); // parser must be back on next step
                                    bflash();
                                }
                            } else {
                                // There was unresolved entity on previous step.
                                mEvt = EV_ENT;
                                skippedEnt(mUnent);
                                mUnent = null;
                            }
                            break;

                        case '<':
                            mEvt = EV_TEXT;
                            bkch();
                            bflash();
                            break;

                        case '\r': // EOL processing [#2.11]
                            // "\r\n" and a lone "\r" are both appended as '\n'
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case EOS:
                            // No break: control reaches 'default' only if
                            // panic() returns.
                            // NOTE(review): presumably panic() always throws - confirm.
                            panic(FAULT);

                        default:
                            bappend(ch);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return mEvt;
    }

    /**
     * Parses the document type declaration.
     *
     * The method first reads the 'DOCTYPE' keyword, sets the phase to
     * {@code PH_DTD} and then runs a state machine which reads the document
     * type name, an optional external identifier and an optional internal
     * subset. The declaration is reported through {@code docType}. When an
     * external identifier is present, {@code resolveEnt} is asked for the
     * external subset once the closing angle bracket is reached.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtd() throws Exception {
        char ch;
        // NOTE(review): 'str' is never read or assigned again in this method.
        String str = null;
        String name = null;
        Pair psid = null;
        // read 'DOCTYPE'
        if ("DOCTYPE".equals(name(false)) != true) {
            panic(FAULT);
        }
        mPh = PH_DTD; // DTD
        // A negative state ends the loop.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // read the document type name
                    // White space before the name is skipped
                    if (chtyp(ch) != ' ') {
                        bkch();
                        name = name(mIsNSAware);
                        wsskip();
                        st = 1; // read 'PUBLIC' or 'SYSTEM'
                    }
                    break;

                case 1: // read 'PUBLIC' or 'SYSTEM'
                    switch (chtyp(ch)) {
                        case 'A':
                            // An upper case letter starts the external identifier
                            bkch();
                            psid = pubsys(' ');
                            st = 2; // skip spaces before internal subset
                            docType(name, psid.name, psid.value);
                            break;

                        case '[':
                            // No external identifier; state 2 reads the '[' again
                            bkch();
                            st = 2; // skip spaces before internal subset
                            docType(name, null, null);
                            break;

                        case '>':
                            // Neither external identifier nor internal subset
                            bkch();
                            st = 3; // skip spaces after internal subset
                            docType(name, null, null);
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2: // skip spaces before internal subset
                    switch (chtyp(ch)) {
                        case '[':
                            // Process internal subset
                            dtdsub();
                            st = 3; // skip spaces after internal subset
                            break;

                        case '>':
                            // There is no internal subset
                            bkch();
                            st = 3; // skip spaces after internal subset
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 3: // skip spaces after internal subset
                    switch (chtyp(ch)) {
                        case '>':
                            if (psid != null) {
                                // Report the DTD external subset
                                InputSource is = resolveEnt(name, psid.name, psid.value);
                                if (is != null) {
                                    if (mIsSAlone == false) {
                                        // Set the end of DTD external subset char
                                        bkch();
                                        setch(']');
                                        // Set the DTD external subset InputSource
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = psid.name;
                                        mInp.sysid = psid.value;
                                        // Parse the DTD external subset
                                        dtdsub();
                                    } else {
                                        // The standalone flag is set: the
                                        // external subset is reported as
                                        // skipped and not read.
                                        // Unresolved DTD external subset
                                        skippedEnt("[dtd]");
                                        // Release reader and stream
                                        if (is.getCharacterStream() != null) {
                                            try {
                                                is.getCharacterStream().close();
                                            } catch (IOException ioe) {
                                                // a failure to close is ignored
                                            }
                                        }
                                        if (is.getByteStream() != null) {
                                            try {
                                                is.getByteStream().close();
                                            } catch (IOException ioe) {
                                                // a failure to close is ignored
                                            }
                                        }
                                    }
                                } else {
                                    // Unresolved DTD external subset
                                    skippedEnt("[dtd]");
                                }
                                del(psid);
                            }
                            st = -1; // end of DTD
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses the document type declaration subset.
     *
     * @exception Exception is parser specific exception from panic method.
* @exception IOException
     */
    private void dtdsub() throws Exception {
        startInternalSub(); // reports the event before parsing the subset

        // State machine over the local 'st':
        //   0 - between declarations: expects markup, a parameter entity
        //       reference, white space or the end of the subset
        //   1 - after a markup declaration: expects its closing '>'
        // A negative state ends the loop.
        char ch;
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // skip white spaces before a declaration
                    switch (chtyp(ch)) {
                        case '<':
                            ch = getch();
                            switch (ch) {
                                case '?':
                                    pi();
                                    break;

                                case '!':
                                    // Peek at the char after "<!" and push it back
                                    ch = getch();
                                    bkch();
                                    if (ch == '-') {
                                        // A comment; the state stays 0
                                        comm();
                                        break;
                                    }
                                    // A markup or an entity declaration
                                    bntok();
                                    // Dispatch on the keyword code returned
                                    // by bkeyword()
                                    switch (bkeyword()) {
                                        case 'n':
                                            dtdent();
                                            break;

                                        case 'a':
                                            dtdattl(); // parse attributes declaration
                                            break;

                                        case 'e':
                                            dtdelm(); // parse element declaration
                                            break;

                                        case 'o':
                                            dtdnot(); // parse notation declaration
                                            break;

                                        default:
                                            panic(FAULT); // unsupported markup declaration
                                            break;
                                    }
                                    st = 1; // read the end of declaration
                                    break;

                                default:
                                    panic(FAULT);
                                    break;
                            }
                            break;

                        case '%':
                            // A parameter entity reference
                            pent(' ');
                            break;

                        case ']':
                            // End of DTD subset
                            st = -1;
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        case 'Z':
                            // End of stream: the char that follows must be ']'
                            if (getch() != ']') {
                                panic(FAULT);
                            }
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 1: // read the end of declaration
                    switch (ch) {
                        case '>': // there is no notation
                            st = 0; // skip white spaces before a declaration
                            break;

                        case ' ':
                        case '\n':
                        case '\r':
                        case '\t':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an entity declaration. This method fills the general (
     * <code>mEnt</code>) and parameter
     * (
     * <code>mPEnt</code>) entity look up table.
*
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdent() throws Exception {
        String str = null; // name of the entity being declared
        char[] val = null; // value of an internal entity
        Input inp = null; // the new look up table entry
        Pair ids = null; // external identifier returned by pubsys()
        char ch;
        // State machine over the local 'st':
        //   0 - before the entity name; a parameter entity declaration
        //       ('%' followed by white space) is handled completely here
        //   1 - the value of a general entity
        // A negative state ends the loop. In every branch an entry is added
        // only when the name is not in the table yet, so the first
        // declaration of a name is the one kept.
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0: // skip white spaces before entity name
                    switch (chtyp(ch)) {
                        case ' ':
                            // Skip white spaces
                            break;

                        case '%':
                            // Parameter entity or parameter entity declaration.
                            // Peek at the char after '%' and push it back.
                            ch = getch();
                            bkch();
                            if (chtyp(ch) == ' ') {
                                // Parameter entity declaration.
                                wsskip();
                                str = name(false);
                                switch (chtyp(wsskip())) {
                                    case 'A':
                                        // Read the external identifier
                                        ids = pubsys(' ');
                                        if (wsskip() == '>') {
                                            // External parsed entity
                                            if (mPEnt.containsKey(str) == false) { // [#4.2]
                                                inp = new Input();
                                                inp.pubid = ids.name;
                                                inp.sysid = ids.value;
                                                mPEnt.put(str, inp);
                                            }
                                        } else {
                                            panic(FAULT);
                                        }
                                        del(ids);
                                        st = -1; // the end of declaration
                                        break;

                                    case '\"':
                                    case '\'':
                                        // Read the parameter entity value
                                        bqstr('d');
                                        // Create the parameter entity value:
                                        // mBuff[1..mBuffIdx] goes to
                                        // val[1..mBuffIdx], val[0] is set below.
                                        val = new char[mBuffIdx + 1];
                                        System.arraycopy(mBuff, 1, val, 1, val.length - 1);
                                        // Add surrounding spaces [#4.4.8]
                                        // NOTE(review): only the leading space is
                                        // written here; whether a trailing one is
                                        // already in mBuff depends on bqstr('d')
                                        // - confirm.
                                        val[0] = ' ';
                                        // Add the entity to the entity look up table
                                        if (mPEnt.containsKey(str) == false) { // [#4.2]
                                            inp = new Input(val);
                                            inp.pubid = mInp.pubid;
                                            inp.sysid = mInp.sysid;
                                            inp.xmlenc = mInp.xmlenc;
                                            inp.xmlver = mInp.xmlver;
                                            mPEnt.put(str, inp);
                                        }
                                        st = -1; // the end of declaration
                                        break;

                                    default:
                                        panic(FAULT);
                                        break;
                                }
                            } else {
                                // Parameter entity reference.
                                pent(' ');
                            }
                            break;

                        default:
                            // The first char of a general entity name
                            bkch();
                            str = name(false);
                            st = 1; // read entity declaration value
                            break;
                    }
                    break;

                case 1: // read entity declaration value
                    switch (chtyp(ch)) {
                        case '\"': // internal entity
                        case '\'':
                            bkch();
                            bqstr('d'); // read a string into the buffer
                            if (mEnt.get(str) == null) {
                                // Create general entity value from
                                // mBuff[1..mBuffIdx]
                                val = new char[mBuffIdx];
                                System.arraycopy(mBuff, 1, val, 0, val.length);
                                // Add the entity to the entity look up table
                                if (mEnt.containsKey(str) == false) { // [#4.2]
                                    inp = new Input(val);
                                    inp.pubid = mInp.pubid;
                                    inp.sysid = mInp.sysid;
                                    inp.xmlenc = mInp.xmlenc;
                                    inp.xmlver = mInp.xmlver;
                                    mEnt.put(str, inp);
                                }
                            }
                            st = -1; // the end of declaration
                            break;

                        case 'A': // external entity
                            bkch();
                            ids = pubsys(' ');
                            switch (wsskip()) {
                                case '>': // external parsed entity
                                    if (mEnt.containsKey(str) == false) { // [#4.2]
                                        inp = new Input();
                                        inp.pubid = ids.name;
                                        inp.sysid = ids.value;
                                        mEnt.put(str, inp);
                                    }
                                    break;

                                case 'N': // external general unparsed entity
                                    if ("NDATA".equals(name(false)) == true) {
                                        wsskip();
                                        unparsedEntDecl(str, ids.name, ids.value, name(false));
                                        break;
                                    }
                                    // Intentional fall through: a name other
                                    // than "NDATA" is a fault.
                                default:
                                    panic(FAULT);
                                    break;
                            }
                            del(ids);
                            st = -1; // the end of declaration
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an element declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
922 * @exception IOException 923 */ 924 @SuppressWarnings("fallthrough") 925 private void dtdelm() throws Exception { 926 // This is stub implementation which skips an element 927 // declaration. 928 wsskip(); 929 name(mIsNSAware); 930 931 char ch; 932 while (true) { 933 ch = getch(); 934 switch (ch) { 935 case '>': 936 bkch(); 937 return; 938 939 case EOS: 940 panic(FAULT); 941 942 default: 943 break; 944 } 945 } 946 } 947 948 /** 949 * Parses an attribute list declaration. 950 * 951 * This method parses the declaration up to the closing angle bracket. 952 * 953 * @exception Exception is parser specific exception form panic method. 954 * @exception IOException 955 */ 956 private void dtdattl() throws Exception { 957 char elmqn[] = null; 958 Pair elm = null; 959 char ch; 960 for (short st = 0; st >= 0;) { 961 ch = getch(); 962 switch (st) { 963 case 0: // read the element name 964 switch (chtyp(ch)) { 965 case 'a': 966 case 'A': 967 case '_': 968 case 'X': 969 case ':': 970 bkch(); 971 // Get the element from the list or add a new one. 972 elmqn = qname(mIsNSAware); 973 elm = find(mAttL, elmqn); 974 if (elm == null) { 975 elm = pair(mAttL); 976 elm.chars = elmqn; 977 mAttL = elm; 978 } 979 st = 1; // read an attribute declaration 980 break; 981 982 case ' ': 983 break; 984 985 case '%': 986 pent(' '); 987 break; 988 989 default: 990 panic(FAULT); 991 break; 992 } 993 break; 994 995 case 1: // read an attribute declaration 996 switch (chtyp(ch)) { 997 case 'a': 998 case 'A': 999 case '_': 1000 case 'X': 1001 case ':': 1002 bkch(); 1003 dtdatt(elm); 1004 if (wsskip() == '>') { 1005 return; 1006 } 1007 break; 1008 1009 case ' ': 1010 break; 1011 1012 case '%': 1013 pent(' '); 1014 break; 1015 1016 default: 1017 panic(FAULT); 1018 break; 1019 } 1020 break; 1021 1022 default: 1023 panic(FAULT); 1024 break; 1025 } 1026 } 1027 } 1028 1029 /** 1030 * Parses an attribute declaration. 
*
     * The attribute uses the following fields of the Pair object:
     * <code>chars</code> - characters of the qualified name;
     * <code>id</code> - the type identifier of the attribute;
     * <code>list</code> - a pair which holds the default value
     * (<code>chars</code> field).
     *
     * The method is a state machine: 0 - attribute name, 1 - attribute type,
     * 2 - first item of an enumeration/notation list, 3 - following items of
     * the list, 4 - default declaration, 5 - default value. It ends when the
     * state becomes negative.
     *
     * @param elm An object which represents all defined attributes on an
     *   element.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdatt(Pair elm) throws Exception {
        char attqn[] = null;
        Pair att = null;
        char ch;
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
            case 0:     // the attribute name
                switch (chtyp(ch)) {
                case 'a':
                case 'A':
                case '_':
                case 'X':
                case ':':
                    bkch();
                    // Get the attribute from the list or add a new one.
                    attqn = qname(mIsNSAware);
                    att = find(elm.list, attqn);
                    if (att == null) {
                        // New attribute declaration
                        att = pair(elm.list);
                        att.chars = attqn;
                        elm.list = att;
                    } else {
                        // Do not override the attribute declaration [#3.3]:
                        // the repeated declaration is parsed into a pair
                        // which is not linked into elm.list.
                        att = pair(null);
                        att.chars = attqn;
                        att.id = 'c';
                    }
                    wsskip();
                    st = 1;
                    break;

                case '%':
                    pent(' ');
                    break;

                case ' ':
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            case 1:     // the attribute type
                switch (chtyp(ch)) {
                case '(':
                    att.id = 'u';   // enumeration type
                    st = 2;         // read the first element of the list
                    break;

                case '%':
                    pent(' ');
                    break;

                case ' ':
                    break;

                default:
                    bkch();
                    bntok();        // read type id
                    att.id = bkeyword();
                    switch (att.id) {
                    case 'o':       // NOTATION
                        if (wsskip() != '(') {
                            panic(FAULT);
                        }
                        // wsskip() only looked ahead; consume the '('
                        ch = getch();
                        st = 2;     // read the first element of the list
                        break;

                    case 'i':       // ID
                    case 'r':       // IDREF
                    case 'R':       // IDREFS
                    case 'n':       // ENTITY
                    case 'N':       // ENTITIES
                    case 't':       // NMTOKEN
                    case 'T':       // NMTOKENS
                    case 'c':       // CDATA
                        wsskip();
                        st = 4;     // read default declaration
                        break;

                    default:
                        panic(FAULT);
                        break;
                    }
                    break;
                }
                break;

            case 2:     // read the first element of the list
                switch (chtyp(ch)) {
                case 'a':
                case 'A':
                case 'd':
                case '.':
                case ':':
                case '-':
                case '_':
                case 'X':
                    bkch();
                    switch (att.id) {
                    case 'u':       // enumeration type
                        bntok();
                        break;

                    case 'o':       // NOTATION
                        mBuffIdx = -1;
                        bname(false);
                        break;

                    default:
                        panic(FAULT);
                        break;
                    }
                    wsskip();
                    st = 3;         // read next element of the list
                    break;

                case '%':
                    pent(' ');
                    break;

                case ' ':
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            case 3:     // read next element of the list
                switch (ch) {
                case ')':
                    wsskip();
                    st = 4;         // read default declaration
                    break;

                case '|':
                    wsskip();
                    switch (att.id) {
                    case 'u':       // enumeration type
                        bntok();
                        break;

                    case 'o':       // NOTATION
                        mBuffIdx = -1;
                        bname(false);
                        break;

                    default:
                        panic(FAULT);
                        break;
                    }
                    wsskip();
                    break;

                case '%':
                    pent(' ');
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            case 4:     // read default declaration
                switch (ch) {
                case '#':
                    bntok();
                    switch (bkeyword()) {
                    case 'F':       // FIXED
                        switch (wsskip()) {
                        case '\"':
                        case '\'':
                            st = 5; // read the default value
                            break;

                        case EOS:
                            panic(FAULT);
                            // intentional fall through

                        default:
                            st = -1;
                            break;
                        }
                        break;

                    case 'Q':       // REQUIRED
                    case 'I':       // IMPLIED
                        st = -1;
                        break;

                    default:
                        panic(FAULT);
                        break;
                    }
                    break;

                case '\"':
                case '\'':
                    // Give the quote back so that state 5 sees it.
                    bkch();
                    st = 5;         // read the default value
                    break;

                case ' ':
                case '\n':
                case '\r':
                case '\t':
                    break;

                case '%':
                    pent(' ');
                    break;

                default:
                    // Not part of this declaration: give the character back.
                    bkch();
                    st = -1;
                    break;
                }
                break;

            case 5:     // read the default value
                switch (ch) {
                case '\"':
                case '\'':
                    bkch();
                    bqstr('d');     // the value in the mBuff now
                    att.list = pair(null);
                    // Create a string like "attqname='value' ".
                    // att.chars[0] is the colon offset, so the name itself
                    // is att.chars.length - 1 characters long; the value
                    // occupies mBuff[1 .. mBuffIdx].
                    att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                    System.arraycopy(
                            att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                    att.list.chars[att.chars.length - 1] = '=';
                    att.list.chars[att.chars.length] = ch;
                    System.arraycopy(
                            mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                    att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                    att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                    st = -1;
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            default:
                panic(FAULT);
                break;
            }
        }
    }

    /**
     * Parses a notation declaration.
     *
     * The notation name and its public/system identifiers are read and then
     * reported through <code>notDecl</code>.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtdnot() throws Exception {
        wsskip();
        String name = name(false);
        wsskip();
        // 'N' lets a public identifier appear without a system identifier.
        Pair ids = pubsys('N');
        notDecl(name, ids.name, ids.value);
        del(ids);
    }

    /**
     * Parses an attribute.
     *
     * This recursive method is responsible for prefix addition
     * (<code>mPref</code>) on the way down. The end of the element's start
     * tag triggers the return process. On its way back the method resolves
     * prefixes and accumulates attributes.
1326 * 1327 * <p><code>att.num</code> carries attribute flags where: 0x1 - attribute is 1328 * declared in DTD (attribute decalration had been read); 0x2 - attribute's 1329 * default value is used.</p> 1330 * 1331 * @param att An object which reprecents current attribute. 1332 * @exception Exception is parser specific exception form panic method. 1333 * @exception IOException 1334 */ 1335 @SuppressWarnings("fallthrough") 1336 private void attr(Pair att) throws Exception { 1337 switch (wsskip()) { 1338 case '/': 1339 case '>': 1340 if ((att.num & 0x2) == 0) { // all attributes have been read 1341 att.num |= 0x2; // set default attribute flag 1342 Input inp = mInp; 1343 // Go through all attributes defined on current element. 1344 for (Pair def = mElm.list; def != null; def = def.next) { 1345 if (def.list == null) // no default value 1346 { 1347 continue; 1348 } 1349 // Go through all attributes defined on current 1350 // element and add defaults. 1351 Pair act = find(att.next, def.chars); 1352 if (act == null) { 1353 push(new Input(def.list.chars)); 1354 } 1355 } 1356 if (mInp != inp) { // defaults have been added 1357 attr(att); 1358 return; 1359 } 1360 } 1361 // Ensure the attribute string array capacity 1362 mAttrs.setLength(mAttrIdx); 1363 mItems = mAttrs.mItems; 1364 return; 1365 1366 case EOS: 1367 panic(FAULT); 1368 1369 default: 1370 // Read the attribute name and value 1371 att.chars = qname(mIsNSAware); 1372 att.name = att.local(); 1373 String type = atype(att); // sets attribute's type on att.id 1374 wsskip(); 1375 if (getch() != '=') { 1376 panic(FAULT); 1377 } 1378 bqstr((char) att.id); // read the value with normalization. 
1379 String val = new String(mBuff, 1, mBuffIdx); 1380 Pair next = pair(att); 1381 next.num = (att.num & ~0x1); // inherit attribute flags 1382 // Put a namespace declaration on top of the prefix stack 1383 if ((mIsNSAware == false) || (isdecl(att, val) == false)) { 1384 // An ordinary attribute 1385 mAttrIdx++; 1386 attr(next); // recursive call to parse the next attribute 1387 mAttrIdx--; 1388 // Add the attribute to the attributes string array 1389 char idx = (char) (mAttrIdx << 3); 1390 mItems[idx + 1] = att.qname(); // attr qname 1391 mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name 1392 mItems[idx + 3] = val; // attr value 1393 mItems[idx + 4] = type; // attr type 1394 switch (att.num & 0x3) { 1395 case 0x0: 1396 mItems[idx + 5] = null; 1397 break; 1398 1399 case 0x1: // declared attribute 1400 mItems[idx + 5] = "d"; 1401 break; 1402 1403 default: // 0x2, 0x3 - default attribute always declared 1404 mItems[idx + 5] = "D"; 1405 break; 1406 } 1407 // Resolve the prefix if any and report the attribute 1408 // NOTE: The attribute does not accept the default namespace. 1409 mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; 1410 } else { 1411 // A namespace declaration. mPref.name contains prefix and 1412 // mPref.value contains namespace URI set by isdecl method. 1413 // Report a start of the new mapping 1414 newPrefix(); 1415 // Recursive call to parse the next attribute 1416 attr(next); 1417 // NOTE: The namespace declaration is not reported. 1418 } 1419 del(next); 1420 break; 1421 } 1422 } 1423 1424 /** 1425 * Retrieves attribute type. 1426 * 1427 * This method sets the type of normalization in the attribute 1428 * <code>id</code> field and returns the name of attribute type. 1429 * 1430 * @param att An object which represents current attribute. 1431 * @return The name of the attribute type. 1432 * @exception Exception is parser specific exception form panic method. 
1433 */ 1434 private String atype(Pair att) 1435 throws Exception { 1436 Pair attr; 1437 1438 // CDATA-type normalization by default [#3.3.3] 1439 att.id = 'c'; 1440 if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { 1441 return "CDATA"; 1442 } 1443 1444 att.num |= 0x1; // attribute is declared 1445 1446 // Non-CDATA normalization except when the attribute type is CDATA. 1447 att.id = 'i'; 1448 switch (attr.id) { 1449 case 'i': 1450 return "ID"; 1451 1452 case 'r': 1453 return "IDREF"; 1454 1455 case 'R': 1456 return "IDREFS"; 1457 1458 case 'n': 1459 return "ENTITY"; 1460 1461 case 'N': 1462 return "ENTITIES"; 1463 1464 case 't': 1465 return "NMTOKEN"; 1466 1467 case 'T': 1468 return "NMTOKENS"; 1469 1470 case 'u': 1471 return "NMTOKEN"; 1472 1473 case 'o': 1474 return "NOTATION"; 1475 1476 case 'c': 1477 att.id = 'c'; 1478 return "CDATA"; 1479 1480 default: 1481 panic(FAULT); 1482 } 1483 return null; 1484 } 1485 1486 /** 1487 * Parses a comment. 1488 * 1489 * The '<!' part is read in dispatcher so the method starts 1490 * with first '-' after '<!'. 1491 * 1492 * @exception Exception is parser specific exception form panic method. 1493 */ 1494 @SuppressWarnings("fallthrough") 1495 private void comm() throws Exception { 1496 if (mPh == PH_DOC_START) { 1497 mPh = PH_MISC_DTD; // misc before DTD 1498 } // '<!' has been already read by dispetcher. 1499 char ch; 1500 mBuffIdx = -1; 1501 for (short st = 0; st >= 0;) { 1502 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 1503 if (ch == EOS) { 1504 panic(FAULT); 1505 } 1506 switch (st) { 1507 case 0: // first '-' of the comment open 1508 if (ch == '-') { 1509 st = 1; 1510 } else { 1511 panic(FAULT); 1512 } 1513 break; 1514 1515 case 1: // secind '-' of the comment open 1516 if (ch == '-') { 1517 st = 2; 1518 } else { 1519 panic(FAULT); 1520 } 1521 break; 1522 1523 case 2: // skip the comment body 1524 switch (ch) { 1525 case '-': 1526 st = 3; 1527 break; 1528 1529 default: 1530 bappend(ch); 1531 break; 1532 } 1533 break; 1534 1535 case 3: // second '-' of the comment close 1536 switch (ch) { 1537 case '-': 1538 st = 4; 1539 break; 1540 1541 default: 1542 bappend('-'); 1543 bappend(ch); 1544 st = 2; 1545 break; 1546 } 1547 break; 1548 1549 case 4: // '>' of the comment close 1550 if (ch == '>') { 1551 comm(mBuff, mBuffIdx + 1); 1552 st = -1; 1553 break; 1554 } 1555 // else - panic [#2.5 compatibility note] 1556 1557 default: 1558 panic(FAULT); 1559 } 1560 } 1561 } 1562 1563 /** 1564 * Parses a processing instruction. 1565 * 1566 * The '<?' is read in dispatcher so the method starts with 1567 * first character of PI target name after '<?'. 1568 * 1569 * @exception Exception is parser specific exception form panic method. 1570 * @exception IOException 1571 */ 1572 private void pi() throws Exception { 1573 // '<?' has been already read by dispetcher. 
1574 char ch; 1575 String str = null; 1576 mBuffIdx = -1; 1577 for (short st = 0; st >= 0;) { 1578 ch = getch(); 1579 if (ch == EOS) { 1580 panic(FAULT); 1581 } 1582 switch (st) { 1583 case 0: // read the PI target name 1584 switch (chtyp(ch)) { 1585 case 'a': 1586 case 'A': 1587 case '_': 1588 case ':': 1589 case 'X': 1590 bkch(); 1591 str = name(false); 1592 // PI target name may not be empty string [#2.6] 1593 // PI target name 'XML' is reserved [#2.6] 1594 if ((str.length() == 0) 1595 || (mXml.name.equals(str.toLowerCase()) == true)) { 1596 panic(FAULT); 1597 } 1598 // This is processing instruction 1599 if (mPh == PH_DOC_START) // the begining of the document 1600 { 1601 mPh = PH_MISC_DTD; // misc before DTD 1602 } 1603 wsskip(); // skip spaces after the PI target name 1604 st = 1; // accumulate the PI body 1605 mBuffIdx = -1; 1606 break; 1607 1608 default: 1609 panic(FAULT); 1610 } 1611 break; 1612 1613 case 1: // accumulate the PI body 1614 switch (ch) { 1615 case '?': 1616 st = 2; // end of the PI body 1617 break; 1618 1619 default: 1620 bappend(ch); 1621 break; 1622 } 1623 break; 1624 1625 case 2: // end of the PI body 1626 switch (ch) { 1627 case '>': 1628 // PI has been read. 1629 pi(str, new String(mBuff, 0, mBuffIdx + 1)); 1630 st = -1; 1631 break; 1632 1633 case '?': 1634 bappend('?'); 1635 break; 1636 1637 default: 1638 bappend('?'); 1639 bappend(ch); 1640 st = 1; // accumulate the PI body 1641 break; 1642 } 1643 break; 1644 1645 default: 1646 panic(FAULT); 1647 } 1648 } 1649 } 1650 1651 /** 1652 * Parses a character data. 1653 * 1654 * The '<!' part is read in dispatcher so the method starts 1655 * with first '[' after '<!'. 1656 * 1657 * @exception Exception is parser specific exception form panic method. 1658 * @exception IOException 1659 */ 1660 private void cdat() 1661 throws Exception { 1662 // '<!' has been already read by dispetcher. 
1663 char ch; 1664 mBuffIdx = -1; 1665 for (short st = 0; st >= 0;) { 1666 ch = getch(); 1667 switch (st) { 1668 case 0: // the first '[' of the CDATA open 1669 if (ch == '[') { 1670 st = 1; 1671 } else { 1672 panic(FAULT); 1673 } 1674 break; 1675 1676 case 1: // read "CDATA" 1677 if (chtyp(ch) == 'A') { 1678 bappend(ch); 1679 } else { 1680 if ("CDATA".equals( 1681 new String(mBuff, 0, mBuffIdx + 1)) != true) { 1682 panic(FAULT); 1683 } 1684 bkch(); 1685 st = 2; 1686 } 1687 break; 1688 1689 case 2: // the second '[' of the CDATA open 1690 if (ch != '[') { 1691 panic(FAULT); 1692 } 1693 mBuffIdx = -1; 1694 st = 3; 1695 break; 1696 1697 case 3: // read data before the first ']' 1698 if (ch != ']') { 1699 bappend(ch); 1700 } else { 1701 st = 4; 1702 } 1703 break; 1704 1705 case 4: // read the second ']' or continue to read the data 1706 if (ch != ']') { 1707 bappend(']'); 1708 bappend(ch); 1709 st = 3; 1710 } else { 1711 st = 5; 1712 } 1713 break; 1714 1715 case 5: // read '>' or continue to read the data 1716 switch (ch) { 1717 case ']': 1718 bappend(']'); 1719 break; 1720 1721 case '>': 1722 bflash(); 1723 st = -1; 1724 break; 1725 1726 default: 1727 bappend(']'); 1728 bappend(']'); 1729 bappend(ch); 1730 st = 3; 1731 break; 1732 } 1733 break; 1734 1735 default: 1736 panic(FAULT); 1737 } 1738 } 1739 } 1740 1741 /** 1742 * Reads a xml name. 1743 * 1744 * The xml name must conform "Namespaces in XML" specification. Therefore 1745 * the ':' character is not allowed in the name. This method should be used 1746 * for PI and entity names which may not have a namespace according to the 1747 * specification mentioned above. 1748 * 1749 * @param ns The true value turns namespace conformance on. 1750 * @return The name has been read. 1751 * @exception Exception When incorrect character appear in the name. 
1752 * @exception IOException 1753 */ 1754 protected String name(boolean ns) 1755 throws Exception { 1756 mBuffIdx = -1; 1757 bname(ns); 1758 return new String(mBuff, 1, mBuffIdx); 1759 } 1760 1761 /** 1762 * Reads a qualified xml name. 1763 * 1764 * The characters of a qualified name is an array of characters. The first 1765 * (chars[0]) character is the index of the colon character which separates 1766 * the prefix from the local name. If the index is zero, the name does not 1767 * contain separator or the parser works in the namespace unaware mode. The 1768 * length of qualified name is the length of the array minus one. 1769 * 1770 * @param ns The true value turns namespace conformance on. 1771 * @return The characters of a qualified name. 1772 * @exception Exception When incorrect character appear in the name. 1773 * @exception IOException 1774 */ 1775 protected char[] qname(boolean ns) 1776 throws Exception { 1777 mBuffIdx = -1; 1778 bname(ns); 1779 char chars[] = new char[mBuffIdx + 1]; 1780 System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); 1781 return chars; 1782 } 1783 1784 /** 1785 * Reads the public or/and system identifiers. 1786 * 1787 * @param inp The input object. 1788 * @exception Exception is parser specific exception form panic method. 1789 * @exception IOException 1790 */ 1791 private void pubsys(Input inp) 1792 throws Exception { 1793 Pair pair = pubsys(' '); 1794 inp.pubid = pair.name; 1795 inp.sysid = pair.value; 1796 del(pair); 1797 } 1798 1799 /** 1800 * Reads the public or/and system identifiers. 1801 * 1802 * @param flag The 'N' allows public id be without system id. 1803 * @return The public or/and system identifiers pair. 1804 * @exception Exception is parser specific exception form panic method. 
1805 * @exception IOException 1806 */ 1807 @SuppressWarnings("fallthrough") 1808 private Pair pubsys(char flag) throws Exception { 1809 Pair ids = pair(null); 1810 String str = name(false); 1811 if ("PUBLIC".equals(str) == true) { 1812 bqstr('i'); // non-CDATA normalization [#4.2.2] 1813 ids.name = new String(mBuff, 1, mBuffIdx); 1814 switch (wsskip()) { 1815 case '\"': 1816 case '\'': 1817 bqstr(' '); 1818 ids.value = new String(mBuff, 1, mBuffIdx); 1819 break; 1820 1821 case EOS: 1822 panic(FAULT); 1823 1824 default: 1825 if (flag != 'N') // [#4.7] 1826 { 1827 panic(FAULT); 1828 } 1829 ids.value = null; 1830 break; 1831 } 1832 return ids; 1833 } else if ("SYSTEM".equals(str) == true) { 1834 ids.name = null; 1835 bqstr(' '); 1836 ids.value = new String(mBuff, 1, mBuffIdx); 1837 return ids; 1838 } 1839 panic(FAULT); 1840 return null; 1841 } 1842 1843 /** 1844 * Reads an attribute value. 1845 * 1846 * The grammar this method can read is: 1847 * <pre>{@code 1848 * eqstr := S "=" qstr 1849 * qstr := S ("'" string "'") | ('"' string '"') 1850 * }</pre> 1851 * This method resolves entities 1852 * inside a string unless the parser parses DTD. 1853 * 1854 * @param flag The '=' character forces the method to accept the '=' 1855 * character before quoted string and read the following string as not an 1856 * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; 1857 * '-' - not an attribute value; 'd' - in DTD context. 1858 * @return The content of the quoted strign as a string. 1859 * @exception Exception is parser specific exception form panic method. 1860 * @exception IOException 1861 */ 1862 protected String eqstr(char flag) throws Exception { 1863 if (flag == '=') { 1864 wsskip(); 1865 if (getch() != '=') { 1866 panic(FAULT); 1867 } 1868 } 1869 bqstr((flag == '=') ? '-' : flag); 1870 return new String(mBuff, 1, mBuffIdx); 1871 } 1872 1873 /** 1874 * Resoves an entity. 1875 * 1876 * This method resolves built-in and character entity references. 
It also
     * reports external entities to the application.
     *
     * The method is a state machine: 0 - first character of the entity name,
     * 1 - following characters of the name, 2 - decimal character reference,
     * 3 - hexadecimal character reference. The reference is accumulated in
     * the parser buffer after a leading '&amp;' and the buffer offset is
     * restored before the replacement is appended.
     *
     * @param flag The 'x' character forces the method to report a skipped
     *   entity; 'i' character - indicates non-CDATA normalization.
     * @return Name of unresolved entity or <code>null</code> if entity had been
     *   resolved successfully.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private String ent(char flag) throws Exception {
        char ch;
        int idx = mBuffIdx + 1;     // buffer position of the leading '&'
        Input inp = null;
        String str = null;
        mESt = 0x100;   // reset the built-in entity recognizer
        bappend('&');
        for (short st = 0; st >= 0;) {
            ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
            case 0:     // the first character of the entity name
            case 1:     // read built-in entity name
                switch (chtyp(ch)) {
                case 'd':
                case '.':
                case '-':
                    // Not allowed as the first character of a name.
                    if (st != 1) {
                        panic(FAULT);
                    }
                    // intentional fall through
                case 'a':
                case 'A':
                case '_':
                case 'X':
                    bappend(ch);
                    eappend(ch);
                    st = 1;
                    break;

                case ':':
                    if (mIsNSAware != false) {
                        panic(FAULT);
                    }
                    bappend(ch);
                    eappend(ch);
                    st = 1;
                    break;

                case ';':
                    if (mESt < 0x100) {
                        // The entity is a built-in entity; mESt holds the
                        // replacement character.
                        mBuffIdx = idx - 1;
                        bappend(mESt);
                        st = -1;
                        break;
                    } else if (mPh == PH_DTD) {
                        // In DTD entity declaration has to resolve character
                        // entities and include "as is" others. [#4.4.7]
                        bappend(';');
                        st = -1;
                        break;
                    }
                    // Convert an entity name to a string
                    str = new String(mBuff, idx + 1, mBuffIdx - idx);
                    inp = mEnt.get(str);
                    // Restore the buffer offset
                    mBuffIdx = idx - 1;
                    if (inp != null) {
                        if (inp.chars == null) {
                            // External entity
                            InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                            if (is != null) {
                                push(new Input(BUFFSIZE_READER));
                                setinp(is);
                                mInp.pubid = inp.pubid;
                                mInp.sysid = inp.sysid;
                                str = null;     // the entity is resolved
                            } else {
                                // Unresolved external entity
                                if (flag != 'x') {
                                    panic(FAULT);   // unknown entity within markup
                                }   // str is name of unresolved entity
                            }
                        } else {
                            // Internal entity
                            push(inp);
                            str = null;     // the entity is resolved
                        }
                    } else {
                        // Unknown or general unparsed entity
                        if (flag != 'x') {
                            panic(FAULT);   // unknown entity within markup
                        }   // str is name of unresolved entity
                    }
                    st = -1;
                    break;

                case '#':
                    // '#' is only valid right after '&'.
                    if (st != 0) {
                        panic(FAULT);
                    }
                    st = 2;
                    break;

                default:
                    panic(FAULT);
                }
                break;

            case 2:     // read character entity
                switch (chtyp(ch)) {
                case 'd':
                    bappend(ch);
                    break;

                case ';':
                    // Convert the character entity to a character
                    try {
                        int i = Integer.parseInt(
                                new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                        // Values from 0xffff (the EOS marker) upwards are
                        // rejected.
                        if (i >= 0xffff) {
                            panic(FAULT);
                        }
                        ch = (char) i;
                    } catch (NumberFormatException nfe) {
                        panic(FAULT);
                    }
                    // Restore the buffer offset
                    mBuffIdx = idx - 1;
                    if (ch == ' ' || mInp.next != null) {
                        bappend(ch, flag);
                    } else {
                        bappend(ch);
                    }
                    st = -1;
                    break;

                case 'a':
                    // If the entity buffer is empty and ch == 'x'
                    if ((mBuffIdx == idx) && (ch == 'x')) {
                        st = 3;
                        break;
                    }
                    // intentional fall through
                default:
                    panic(FAULT);
                }
                break;

            case 3:     // read hex character entity
                switch (chtyp(ch)) {
                case 'A':
                case 'a':
                case 'd':
                    // Letters which are not hex digits are rejected by
                    // Integer.parseInt below.
                    bappend(ch);
                    break;

                case ';':
                    // Convert the character entity to a character
                    try {
                        int i = Integer.parseInt(
                                new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                        if (i >= 0xffff) {
                            panic(FAULT);
                        }
                        ch = (char) i;
                    } catch (NumberFormatException nfe) {
                        panic(FAULT);
                    }
                    // Restore the buffer offset
                    mBuffIdx = idx - 1;
                    if (ch == ' ' || mInp.next != null) {
                        bappend(ch, flag);
                    } else {
                        bappend(ch);
                    }
                    st = -1;
                    break;

                default:
                    panic(FAULT);
                }
                break;

            default:
                panic(FAULT);
            }
        }

        return str;
    }

    /**
     * Resolves a parameter entity.
     *
     * This method resolves a parameter entity reference. It also reports
     * external entities to the application.
     *
     * @param flag The '-' instructs the method not to set up surrounding
     *   spaces [#4.4.8].
     * @exception Exception is parser specific exception from panic method.
2076 * @exception IOException 2077 */ 2078 @SuppressWarnings("fallthrough") 2079 private void pent(char flag) throws Exception { 2080 char ch; 2081 int idx = mBuffIdx + 1; 2082 Input inp = null; 2083 String str = null; 2084 bappend('%'); 2085 if (mPh != PH_DTD) // the DTD internal subset 2086 { 2087 return; // Not Recognized [#4.4.1] 2088 } // Read entity name 2089 bname(false); 2090 str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); 2091 if (getch() != ';') { 2092 panic(FAULT); 2093 } 2094 inp = mPEnt.get(str); 2095 // Restore the buffer offset 2096 mBuffIdx = idx - 1; 2097 if (inp != null) { 2098 if (inp.chars == null) { 2099 // External parameter entity 2100 InputSource is = resolveEnt(str, inp.pubid, inp.sysid); 2101 if (is != null) { 2102 if (flag != '-') { 2103 bappend(' '); // tail space 2104 } 2105 push(new Input(BUFFSIZE_READER)); 2106 // BUG: there is no leading space! [#4.4.8] 2107 setinp(is); 2108 mInp.pubid = inp.pubid; 2109 mInp.sysid = inp.sysid; 2110 } else { 2111 // Unresolved external parameter entity 2112 skippedEnt("%" + str); 2113 } 2114 } else { 2115 // Internal parameter entity 2116 if (flag == '-') { 2117 // No surrounding spaces 2118 inp.chIdx = 1; 2119 } else { 2120 // Insert surrounding spaces 2121 bappend(' '); // tail space 2122 inp.chIdx = 0; 2123 } 2124 push(inp); 2125 } 2126 } else { 2127 // Unknown parameter entity 2128 skippedEnt("%" + str); 2129 } 2130 } 2131 2132 /** 2133 * Recognizes and handles a namespace declaration. 2134 * 2135 * This method identifies a type of namespace declaration if any and puts 2136 * new mapping on top of prefix stack. 2137 * 2138 * @param name The attribute qualified name (<code>name.value</code> is a 2139 * <code>String</code> object which represents the attribute prefix). 2140 * @param value The attribute value. 2141 * @return <code>true</code> if a namespace declaration is recognized. 
2142 */ 2143 private boolean isdecl(Pair name, String value) { 2144 if (name.chars[0] == 0) { 2145 if ("xmlns".equals(name.name) == true) { 2146 // New default namespace declaration 2147 mPref = pair(mPref); 2148 mPref.list = mElm; // prefix owner element 2149 mPref.value = value; 2150 mPref.name = ""; 2151 mPref.chars = NONS; 2152 mElm.num++; // namespace counter 2153 return true; 2154 } 2155 } else { 2156 if (name.eqpref(XMLNS) == true) { 2157 // New prefix declaration 2158 int len = name.name.length(); 2159 mPref = pair(mPref); 2160 mPref.list = mElm; // prefix owner element 2161 mPref.value = value; 2162 mPref.name = name.name; 2163 mPref.chars = new char[len + 1]; 2164 mPref.chars[0] = (char) (len + 1); 2165 name.name.getChars(0, len, mPref.chars, 1); 2166 mElm.num++; // namespace counter 2167 return true; 2168 } 2169 } 2170 return false; 2171 } 2172 2173 /** 2174 * Resolves a prefix. 2175 * 2176 * @return The namespace assigned to the prefix. 2177 * @exception Exception When mapping for specified prefix is not found. 2178 */ 2179 private String rslv(char[] qname) 2180 throws Exception { 2181 for (Pair pref = mPref; pref != null; pref = pref.next) { 2182 if (pref.eqpref(qname) == true) { 2183 return pref.value; 2184 } 2185 } 2186 if (qname[0] == 1) { // QNames like ':local' 2187 for (Pair pref = mPref; pref != null; pref = pref.next) { 2188 if (pref.chars[0] == 0) { 2189 return pref.value; 2190 } 2191 } 2192 } 2193 panic(FAULT); 2194 return null; 2195 } 2196 2197 /** 2198 * Skips xml white space characters. 2199 * 2200 * This method skips white space characters (' ', '\t', '\n', '\r') and 2201 * looks ahead not white space character. 2202 * 2203 * @return The first not white space look ahead character. 2204 * @exception IOException 2205 */ 2206 protected char wsskip() 2207 throws IOException { 2208 char ch; 2209 while (true) { 2210 // Read next character 2211 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2212 if (ch < 0x80) { 2213 if (nmttyp[ch] != 3) // [ \t\n\r] 2214 { 2215 break; 2216 } 2217 } else { 2218 break; 2219 } 2220 } 2221 mChIdx--; // bkch(); 2222 return ch; 2223 } 2224 2225 /** 2226 * Reports document type. 2227 * 2228 * @param name The name of the entity. 2229 * @param pubid The public identifier of the entity or <code>null</code>. 2230 * @param sysid The system identifier of the entity or <code>null</code>. 2231 */ 2232 protected abstract void docType(String name, String pubid, String sysid) 2233 throws SAXException; 2234 2235 /** 2236 * Reports the start of DTD internal subset. 2237 * 2238 * @throws SAXException if the receiver throws SAXException 2239 */ 2240 public abstract void startInternalSub () throws SAXException; 2241 2242 /** 2243 * Reports a comment. 2244 * 2245 * @param text The comment text starting from first charcater. 2246 * @param length The number of characters in comment. 2247 */ 2248 protected abstract void comm(char[] text, int length); 2249 2250 /** 2251 * Reports a processing instruction. 2252 * 2253 * @param target The processing instruction target name. 2254 * @param body The processing instruction body text. 2255 */ 2256 protected abstract void pi(String target, String body) 2257 throws Exception; 2258 2259 /** 2260 * Reports new namespace prefix. The Namespace prefix ( 2261 * <code>mPref.name</code>) being declared and the Namespace URI ( 2262 * <code>mPref.value</code>) the prefix is mapped to. An empty string is 2263 * used for the default element namespace, which has no prefix. 2264 */ 2265 protected abstract void newPrefix() 2266 throws Exception; 2267 2268 /** 2269 * Reports skipped entity name. 2270 * 2271 * @param name The entity name. 2272 */ 2273 protected abstract void skippedEnt(String name) 2274 throws Exception; 2275 2276 /** 2277 * Returns an 2278 * <code>InputSource</code> for specified entity or 2279 * <code>null</code>. 2280 * 2281 * @param name The name of the entity. 
* @param pubid The public identifier of the entity.
     * @param sysid The system identifier of the entity.
     * @return The input source of the entity, or <code>null</code> when the
     *   entity is not resolved.
     */
    protected abstract InputSource resolveEnt(
            String name, String pubid, String sysid)
            throws Exception;

    /**
     * Reports notation declaration.
     *
     * @param name The notation's name.
     * @param pubid The notation's public identifier, or null if none was given.
     * @param sysid The notation's system identifier, or null if none was given.
     */
    protected abstract void notDecl(String name, String pubid, String sysid)
            throws Exception;

    /**
     * Reports unparsed entity name.
     *
     * @param name The unparsed entity's name.
     * @param pubid The entity's public identifier, or null if none was given.
     * @param sysid The entity's system identifier.
     * @param notation The name of the associated notation.
     */
    protected abstract void unparsedEntDecl(
            String name, String pubid, String sysid, String notation)
            throws Exception;

    /**
     * Notifies the handler about fatal parsing error.
     *
     * NOTE(review): callers in this class continue with their normal control
     * flow after calling this method (often falling through to the next
     * switch label), so implementations are presumably expected to throw -
     * confirm against the implementing classes.
     *
     * @param msg The problem description message.
     */
    protected abstract void panic(String msg)
            throws Exception;

    /**
     * Reads a qualified xml name.
     *
     * This is low level routine which leaves a qName in the buffer. The
     * characters of a qualified name is an array of characters. The first
     * (chars[0]) character is the index of the colon character which separates
     * the prefix from the local name. If the index is zero, the name does not
     * contain separator or the parser works in the namespace unaware mode. The
     * length of qualified name is the length of the array minus one.
     *
     * @param ns The true value turns namespace conformance on.
     * @exception Exception is parser specific exception from panic method.
* @exception IOException
     */
    private void bname(boolean ns)
            throws Exception {
        char ch;
        char type;
        mBuffIdx++;     // allocate a char for colon offset
        int bqname = mBuffIdx;      // buffer index of the colon offset char
        int bcolon = bqname;        // buffer index of the colon, if any
        int bchidx = bqname + 1;    // buffer index for the next name char
        int bstart = bchidx;        // buffer index matching cstart
        int cstart = mChIdx;        // first input char not yet copied
        // States: 0/1 - first/next characters of the prefix,
        //         2/3 - first/next characters of the suffix.
        short st = (short) ((ns == true) ? 0 : 2);
        while (true) {
            // Read next character
            if (mChIdx >= mChLen) {
                // The input buffer is exhausted: save what has been
                // accepted so far, then make getch() load more input and
                // un-read the character it returned.
                bcopy(cstart, bstart);
                getch();
                mChIdx--;   // bkch();
                cstart = mChIdx;
                bstart = bchidx;
            }
            ch = mChars[mChIdx++];
            type = (char) 0;    // [X]
            if (ch < 0x80) {
                type = (char) nmttyp[ch];
            } else if (ch == EOS) {
                panic(FAULT);
            }
            // Parse QName
            switch (st) {
            case 0:     // read the first char of the prefix
            case 2:     // read the first char of the suffix
                switch (type) {
                case 0:     // [aA_X]
                    bchidx++;   // append char to the buffer
                    st++;       // (st == 0)? 1: 3;
                    break;

                case 1:     // [:]
                    // Give the colon back; it is handled in state 1 or 3.
                    mChIdx--;   // bkch();
                    st++;       // (st == 0)? 1: 3;
                    break;

                default:
                    panic(FAULT);
                }
                break;

            case 1:     // read the prefix
            case 3:     // read the suffix
                switch (type) {
                case 0:     // [aA_X]
                case 2:     // [.-d]
                    bchidx++;   // append char to the buffer
                    break;

                case 1:     // [:]
                    bchidx++;   // append char to the buffer
                    if (ns == true) {
                        if (bcolon != bqname) {
                            panic(FAULT);   // it must be only one colon
                        }
                        bcolon = bchidx - 1;
                        if (st == 1) {
                            st = 2;
                        }
                    }
                    break;

                default:
                    // End of the name: give the character back, copy the
                    // pending characters and store the colon offset.
                    mChIdx--;   // bkch();
                    bcopy(cstart, bstart);
                    mBuff[bqname] = (char) (bcolon - bqname);
                    return;
                }
                break;

            default:
                panic(FAULT);
            }
        }
    }

    /**
     * Reads a nmtoken.
     *
     * This is low level routine which leaves a nmtoken in the buffer.
2419 * 2420 * @exception Exception is parser specific exception form panic method. 2421 * @exception IOException 2422 */ 2423 @SuppressWarnings("fallthrough") 2424 private void bntok() throws Exception { 2425 char ch; 2426 mBuffIdx = -1; 2427 bappend((char) 0); // default offset to the colon char 2428 while (true) { 2429 ch = getch(); 2430 switch (chtyp(ch)) { 2431 case 'a': 2432 case 'A': 2433 case 'd': 2434 case '.': 2435 case ':': 2436 case '-': 2437 case '_': 2438 case 'X': 2439 bappend(ch); 2440 break; 2441 2442 case 'Z': 2443 panic(FAULT); 2444 2445 default: 2446 bkch(); 2447 return; 2448 } 2449 } 2450 } 2451 2452 /** 2453 * Recognizes a keyword. 2454 * 2455 * This is low level routine which recognizes one of keywords in the buffer. 2456 * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - 2457 * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - 2458 * Q IMPLIED - I FIXED - F 2459 * 2460 * @return an id of a keyword or '?'. 2461 * @exception Exception is parser specific exception form panic method. 2462 * @exception IOException 2463 */ 2464 private char bkeyword() 2465 throws Exception { 2466 String str = new String(mBuff, 1, mBuffIdx); 2467 switch (str.length()) { 2468 case 2: // ID 2469 return ("ID".equals(str) == true) ? 'i' : '?'; 2470 2471 case 5: // IDREF, CDATA, FIXED 2472 switch (mBuff[1]) { 2473 case 'I': 2474 return ("IDREF".equals(str) == true) ? 'r' : '?'; 2475 case 'C': 2476 return ("CDATA".equals(str) == true) ? 'c' : '?'; 2477 case 'F': 2478 return ("FIXED".equals(str) == true) ? 'F' : '?'; 2479 default: 2480 break; 2481 } 2482 break; 2483 2484 case 6: // IDREFS, ENTITY 2485 switch (mBuff[1]) { 2486 case 'I': 2487 return ("IDREFS".equals(str) == true) ? 'R' : '?'; 2488 case 'E': 2489 return ("ENTITY".equals(str) == true) ? 
'n' : '?'; 2490 default: 2491 break; 2492 } 2493 break; 2494 2495 case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT 2496 switch (mBuff[1]) { 2497 case 'I': 2498 return ("IMPLIED".equals(str) == true) ? 'I' : '?'; 2499 case 'N': 2500 return ("NMTOKEN".equals(str) == true) ? 't' : '?'; 2501 case 'A': 2502 return ("ATTLIST".equals(str) == true) ? 'a' : '?'; 2503 case 'E': 2504 return ("ELEMENT".equals(str) == true) ? 'e' : '?'; 2505 default: 2506 break; 2507 } 2508 break; 2509 2510 case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED 2511 switch (mBuff[2]) { 2512 case 'N': 2513 return ("ENTITIES".equals(str) == true) ? 'N' : '?'; 2514 case 'M': 2515 return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; 2516 case 'O': 2517 return ("NOTATION".equals(str) == true) ? 'o' : '?'; 2518 case 'E': 2519 return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; 2520 default: 2521 break; 2522 } 2523 break; 2524 2525 default: 2526 break; 2527 } 2528 return '?'; 2529 } 2530 2531 /** 2532 * Reads a single or double quotted string in to the buffer. 2533 * 2534 * This method resolves entities inside a string unless the parser parses 2535 * DTD. 2536 * 2537 * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - 2538 * not an attribute value; 'd' - in DTD context. 2539 * @exception Exception is parser specific exception form panic method. 2540 * @exception IOException 2541 */ 2542 @SuppressWarnings("fallthrough") 2543 private void bqstr(char flag) throws Exception { 2544 Input inp = mInp; // remember the original input 2545 mBuffIdx = -1; 2546 bappend((char) 0); // default offset to the colon char 2547 char ch; 2548 for (short st = 0; st >= 0;) { 2549 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2550 switch (st) { 2551 case 0: // read a single or double quote 2552 switch (ch) { 2553 case ' ': 2554 case '\n': 2555 case '\r': 2556 case '\t': 2557 break; 2558 2559 case '\'': 2560 st = 2; // read a single quoted string 2561 break; 2562 2563 case '\"': 2564 st = 3; // read a double quoted string 2565 break; 2566 2567 default: 2568 panic(FAULT); 2569 break; 2570 } 2571 break; 2572 2573 case 2: // read a single quoted string 2574 case 3: // read a double quoted string 2575 switch (ch) { 2576 case '\'': 2577 if ((st == 2) && (mInp == inp)) { 2578 st = -1; 2579 } else { 2580 bappend(ch); 2581 } 2582 break; 2583 2584 case '\"': 2585 if ((st == 3) && (mInp == inp)) { 2586 st = -1; 2587 } else { 2588 bappend(ch); 2589 } 2590 break; 2591 2592 case '&': 2593 if (flag != 'd') { 2594 ent(flag); 2595 } else { 2596 bappend(ch); 2597 } 2598 break; 2599 2600 case '%': 2601 if (flag == 'd') { 2602 pent('-'); 2603 } else { 2604 bappend(ch); 2605 } 2606 break; 2607 2608 case '<': 2609 if ((flag == '-') || (flag == 'd')) { 2610 bappend(ch); 2611 } else { 2612 panic(FAULT); 2613 } 2614 break; 2615 2616 case EOS: // EOS before single/double quote 2617 panic(FAULT); 2618 2619 case '\r': // EOL processing [#2.11 & #3.3.3] 2620 if (flag != ' ' && mInp.next == null) { 2621 if (getch() != '\n') { 2622 bkch(); 2623 } 2624 ch = '\n'; 2625 } 2626 default: 2627 bappend(ch, flag); 2628 break; 2629 } 2630 break; 2631 2632 default: 2633 panic(FAULT); 2634 } 2635 } 2636 // There is maximum one space at the end of the string in 2637 // i-mode (non CDATA normalization) and it has to be removed. 2638 if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { 2639 mBuffIdx -= 1; 2640 } 2641 } 2642 2643 /** 2644 * Reports characters and empties the parser's buffer. This method is called 2645 * only if parser is going to return control to the main loop. 
This means 2646 * that this method may use parser buffer to report white space without 2647 * copying characters to temporary buffer. 2648 */ 2649 protected abstract void bflash() 2650 throws Exception; 2651 2652 /** 2653 * Reports white space characters and empties the parser's buffer. This 2654 * method is called only if parser is going to return control to the main 2655 * loop. This means that this method may use parser buffer to report white 2656 * space without copying characters to temporary buffer. 2657 */ 2658 protected abstract void bflash_ws() 2659 throws Exception; 2660 2661 /** 2662 * Appends a character to parser's buffer with normalization. 2663 * 2664 * @param ch The character to append to the buffer. 2665 * @param mode The normalization mode. 2666 */ 2667 private void bappend(char ch, char mode) { 2668 // This implements attribute value normalization as 2669 // described in the XML specification [#3.3.3]. 2670 switch (mode) { 2671 case 'i': // non CDATA normalization 2672 switch (ch) { 2673 case ' ': 2674 case '\n': 2675 case '\r': 2676 case '\t': 2677 if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { 2678 bappend(' '); 2679 } 2680 return; 2681 2682 default: 2683 break; 2684 } 2685 break; 2686 2687 case 'c': // CDATA normalization 2688 switch (ch) { 2689 case '\n': 2690 case '\r': 2691 case '\t': 2692 ch = ' '; 2693 break; 2694 2695 default: 2696 break; 2697 } 2698 break; 2699 2700 default: // no normalization 2701 break; 2702 } 2703 mBuffIdx++; 2704 if (mBuffIdx < mBuff.length) { 2705 mBuff[mBuffIdx] = ch; 2706 } else { 2707 mBuffIdx--; 2708 bappend(ch); 2709 } 2710 } 2711 2712 /** 2713 * Appends a character to parser's buffer. 2714 * 2715 * @param ch The character to append to the buffer. 
2716 */ 2717 private void bappend(char ch) { 2718 try { 2719 mBuff[++mBuffIdx] = ch; 2720 } catch (Exception exp) { 2721 // Double the buffer size 2722 char buff[] = new char[mBuff.length << 1]; 2723 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2724 mBuff = buff; 2725 mBuff[mBuffIdx] = ch; 2726 } 2727 } 2728 2729 /** 2730 * Appends (mChIdx - cidx) characters from character buffer (mChars) to 2731 * parser's buffer (mBuff). 2732 * 2733 * @param cidx The character buffer (mChars) start index. 2734 * @param bidx The parser buffer (mBuff) start index. 2735 */ 2736 private void bcopy(int cidx, int bidx) { 2737 int length = mChIdx - cidx; 2738 if ((bidx + length + 1) >= mBuff.length) { 2739 // Expand the buffer 2740 char buff[] = new char[mBuff.length + length]; 2741 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2742 mBuff = buff; 2743 } 2744 System.arraycopy(mChars, cidx, mBuff, bidx, length); 2745 mBuffIdx += length; 2746 } 2747 2748 /** 2749 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>, 2750 * <i>apos</i>, <i>quot</i>. The initial state is 0x100. Any state belowe 2751 * 0x100 is a built-in entity replacement character. 2752 * 2753 * @param ch the next character of an entity name. 2754 */ 2755 @SuppressWarnings("fallthrough") 2756 private void eappend(char ch) { 2757 switch (mESt) { 2758 case 0x100: // "l" or "g" or "a" or "q" 2759 switch (ch) { 2760 case 'l': 2761 mESt = 0x101; 2762 break; 2763 case 'g': 2764 mESt = 0x102; 2765 break; 2766 case 'a': 2767 mESt = 0x103; 2768 break; 2769 case 'q': 2770 mESt = 0x107; 2771 break; 2772 default: 2773 mESt = 0x200; 2774 break; 2775 } 2776 break; 2777 2778 case 0x101: // "lt" 2779 mESt = (ch == 't') ? '<' : (char) 0x200; 2780 break; 2781 2782 case 0x102: // "gt" 2783 mESt = (ch == 't') ? 
'>' : (char) 0x200; 2784 break; 2785 2786 case 0x103: // "am" or "ap" 2787 switch (ch) { 2788 case 'm': 2789 mESt = 0x104; 2790 break; 2791 case 'p': 2792 mESt = 0x105; 2793 break; 2794 default: 2795 mESt = 0x200; 2796 break; 2797 } 2798 break; 2799 2800 case 0x104: // "amp" 2801 mESt = (ch == 'p') ? '&' : (char) 0x200; 2802 break; 2803 2804 case 0x105: // "apo" 2805 mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; 2806 break; 2807 2808 case 0x106: // "apos" 2809 mESt = (ch == 's') ? '\'' : (char) 0x200; 2810 break; 2811 2812 case 0x107: // "qu" 2813 mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; 2814 break; 2815 2816 case 0x108: // "quo" 2817 mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; 2818 break; 2819 2820 case 0x109: // "quot" 2821 mESt = (ch == 't') ? '\"' : (char) 0x200; 2822 break; 2823 2824 case '<': // "lt" 2825 case '>': // "gt" 2826 case '&': // "amp" 2827 case '\'': // "apos" 2828 case '\"': // "quot" 2829 mESt = 0x200; 2830 default: 2831 break; 2832 } 2833 } 2834 2835 /** 2836 * Sets up a new input source on the top of the input stack. Note, the first 2837 * byte returned by the entity's byte stream has to be the first byte in the 2838 * entity. However, the parser does not expect the byte order mask in both 2839 * cases when encoding is provided by the input source. 2840 * 2841 * @param is A new input source to set up. 2842 * @exception IOException If any IO errors occur. 2843 * @exception Exception is parser specific exception form panic method. 2844 */ 2845 protected void setinp(InputSource is) 2846 throws Exception { 2847 Reader reader = null; 2848 mChIdx = 0; 2849 mChLen = 0; 2850 mChars = mInp.chars; 2851 mInp.src = null; 2852 if (mPh < PH_DOC_START) { 2853 mIsSAlone = false; // default [#2.9] 2854 } 2855 mIsSAloneSet = false; 2856 if (is.getCharacterStream() != null) { 2857 // Ignore encoding in the xml text decl. 
2858 reader = is.getCharacterStream(); 2859 xml(reader); 2860 } else if (is.getByteStream() != null) { 2861 String expenc; 2862 if (is.getEncoding() != null) { 2863 // Ignore encoding in the xml text decl. 2864 expenc = is.getEncoding().toUpperCase(); 2865 if (expenc.equals("UTF-16")) { 2866 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2867 } else { 2868 reader = enc(expenc, is.getByteStream()); 2869 } 2870 xml(reader); 2871 } else { 2872 // Get encoding from BOM or the xml text decl. 2873 reader = bom(is.getByteStream(), ' '); 2874 /** 2875 * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon 2876 * that it may be missing. A mature technique exists in Xerces 2877 * to further check for possible UTF-16 encoding 2878 */ 2879 if (reader == null) { 2880 reader = utf16(is.getByteStream()); 2881 } 2882 2883 if (reader == null) { 2884 // Encoding is defined by the xml text decl. 2885 reader = enc("UTF-8", is.getByteStream()); 2886 expenc = xml(reader); 2887 if (!expenc.equals("UTF-8")) { 2888 if (expenc.startsWith("UTF-16")) { 2889 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2890 } 2891 reader = enc(expenc, is.getByteStream()); 2892 } 2893 } else { 2894 // Encoding is defined by the BOM. 2895 xml(reader); 2896 } 2897 } 2898 } else { 2899 // There is no support for public/system identifiers. 2900 panic(FAULT); 2901 } 2902 mInp.src = reader; 2903 mInp.pubid = is.getPublicId(); 2904 mInp.sysid = is.getSystemId(); 2905 } 2906 2907 /** 2908 * Determines the entity encoding. 2909 * 2910 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2911 * first byte returned by the entity's byte stream has to be the first byte 2912 * in the entity. Also, there is no support for UCS-4. 2913 * 2914 * @param is A byte stream of the entity. 2915 * @param hint An encoding hint, character U means UTF-16. 2916 * @return a reader constructed from the BOM or UTF-8 by default. 
2917 * @exception Exception is parser specific exception form panic method. 2918 * @exception IOException 2919 */ 2920 private Reader bom(InputStream is, char hint) 2921 throws Exception { 2922 int val = is.read(); 2923 switch (val) { 2924 case 0xef: // UTF-8 2925 if (hint == 'U') // must be UTF-16 2926 { 2927 panic(FAULT); 2928 } 2929 if (is.read() != 0xbb) { 2930 panic(FAULT); 2931 } 2932 if (is.read() != 0xbf) { 2933 panic(FAULT); 2934 } 2935 return new ReaderUTF8(is); 2936 2937 case 0xfe: // UTF-16, big-endian 2938 if (is.read() != 0xff) { 2939 panic(FAULT); 2940 } 2941 return new ReaderUTF16(is, 'b'); 2942 2943 case 0xff: // UTF-16, little-endian 2944 if (is.read() != 0xfe) { 2945 panic(FAULT); 2946 } 2947 return new ReaderUTF16(is, 'l'); 2948 2949 case -1: 2950 mChars[mChIdx++] = EOS; 2951 return new ReaderUTF8(is); 2952 2953 default: 2954 if (hint == 'U') // must be UTF-16 2955 { 2956 panic(FAULT); 2957 } 2958 // Read the rest of UTF-8 character 2959 switch (val & 0xf0) { 2960 case 0xc0: 2961 case 0xd0: 2962 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2963 break; 2964 2965 case 0xe0: 2966 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2967 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2968 break; 2969 2970 case 0xf0: // UCS-4 character 2971 throw new UnsupportedEncodingException(); 2972 2973 default: 2974 mChars[mChIdx++] = (char) val; 2975 break; 2976 } 2977 return null; 2978 } 2979 } 2980 2981 2982 /** 2983 * Using a mature technique from Xerces, this method checks further after 2984 * the bom method above to see if the encoding is UTF-16 2985 * 2986 * @param is A byte stream of the entity. 2987 * @return a reader, may be null 2988 * @exception Exception is parser specific exception form panic method. 2989 * @exception IOException 2990 */ 2991 private Reader utf16(InputStream is) 2992 throws Exception { 2993 if (mChIdx != 0) { 2994 //The bom method has read ONE byte into the buffer. 
2995 byte b0 = (byte)mChars[0]; 2996 if (b0 == 0x00 || b0 == 0x3C) { 2997 int b1 = is.read(); 2998 int b2 = is.read(); 2999 int b3 = is.read(); 3000 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 3001 // UTF-16, big-endian, no BOM 3002 mChars[0] = (char)(b1); 3003 mChars[mChIdx++] = (char)(b3); 3004 return new ReaderUTF16(is, 'b'); 3005 } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 3006 // UTF-16, little-endian, no BOM 3007 mChars[0] = (char)(b0); 3008 mChars[mChIdx++] = (char)(b2); 3009 return new ReaderUTF16(is, 'l'); 3010 } else { 3011 /**not every InputStream supports reset, so we have to remember 3012 * the state for further parsing 3013 **/ 3014 mChars[0] = (char)(b0); 3015 mChars[mChIdx++] = (char)(b1); 3016 mChars[mChIdx++] = (char)(b2); 3017 mChars[mChIdx++] = (char)(b3); 3018 } 3019 3020 } 3021 } 3022 return null; 3023 } 3024 /** 3025 * Parses the xml text declaration. 3026 * 3027 * This method gets encoding from the xml text declaration [#4.3.1] if any. 3028 * The method assumes the buffer (mChars) is big enough to accommodate whole 3029 * xml text declaration. 3030 * 3031 * @param reader is entity reader. 3032 * @return The xml text declaration encoding or default UTF-8 encoding. 3033 * @exception Exception is parser specific exception form panic method. 3034 * @exception IOException 3035 */ 3036 private String xml(Reader reader) 3037 throws Exception { 3038 String str = null; 3039 String enc = "UTF-8"; 3040 char ch; 3041 int val; 3042 short st = 0; 3043 int byteRead = mChIdx; //number of bytes read prior to entering this method 3044 3045 while (st >= 0 && mChIdx < mChars.length) { 3046 if (st < byteRead) { 3047 ch = mChars[st]; 3048 } else { 3049 ch = ((val = reader.read()) >= 0) ? 
(char) val : EOS; 3050 mChars[mChIdx++] = ch; 3051 } 3052 3053 switch (st) { 3054 case 0: // read '<' of xml declaration 3055 switch (ch) { 3056 case '<': 3057 st = 1; 3058 break; 3059 3060 case 0xfeff: // the byte order mask 3061 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3062 mChars[mChIdx - 1] = ch; 3063 st = (short) ((ch == '<') ? 1 : -1); 3064 break; 3065 3066 default: 3067 st = -1; 3068 break; 3069 } 3070 break; 3071 3072 case 1: // read '?' of xml declaration [#4.3.1] 3073 st = (short) ((ch == '?') ? 2 : -1); 3074 break; 3075 3076 case 2: // read 'x' of xml declaration [#4.3.1] 3077 st = (short) ((ch == 'x') ? 3 : -1); 3078 break; 3079 3080 case 3: // read 'm' of xml declaration [#4.3.1] 3081 st = (short) ((ch == 'm') ? 4 : -1); 3082 break; 3083 3084 case 4: // read 'l' of xml declaration [#4.3.1] 3085 st = (short) ((ch == 'l') ? 5 : -1); 3086 break; 3087 3088 case 5: // read white space after 'xml' 3089 switch (ch) { 3090 case ' ': 3091 case '\t': 3092 case '\r': 3093 case '\n': 3094 st = 6; 3095 break; 3096 3097 default: 3098 st = -1; 3099 break; 3100 } 3101 break; 3102 3103 case 6: // read content of xml declaration 3104 switch (ch) { 3105 case '?': 3106 st = 7; 3107 break; 3108 3109 case EOS: 3110 st = -2; 3111 break; 3112 3113 default: 3114 break; 3115 } 3116 break; 3117 3118 case 7: // read '>' after '?' of xml declaration 3119 switch (ch) { 3120 case '>': 3121 case EOS: 3122 st = -2; 3123 break; 3124 3125 default: 3126 st = 6; 3127 break; 3128 } 3129 break; 3130 3131 default: 3132 panic(FAULT); 3133 break; 3134 } 3135 } 3136 mChLen = mChIdx; 3137 mChIdx = 0; 3138 // If there is no xml text declaration, the encoding is default. 
3139 if (st == -1) { 3140 return enc; 3141 } 3142 mChIdx = 5; // the first white space after "<?xml" 3143 // Parse the xml text declaration 3144 for (st = 0; st >= 0;) { 3145 ch = getch(); 3146 switch (st) { 3147 case 0: // skip spaces after the xml declaration name 3148 if (chtyp(ch) != ' ') { 3149 bkch(); 3150 st = 1; 3151 } 3152 break; 3153 3154 case 1: // read xml declaration version 3155 case 2: // read xml declaration encoding or standalone 3156 case 3: // read xml declaration standalone 3157 switch (chtyp(ch)) { 3158 case 'a': 3159 case 'A': 3160 case '_': 3161 bkch(); 3162 str = name(false).toLowerCase(); 3163 if ("version".equals(str) == true) { 3164 if (st != 1) { 3165 panic(FAULT); 3166 } 3167 if ("1.0".equals(eqstr('=')) != true) { 3168 panic(FAULT); 3169 } 3170 mInp.xmlver = 0x0100; 3171 st = 2; 3172 } else if ("encoding".equals(str) == true) { 3173 if (st != 2) { 3174 panic(FAULT); 3175 } 3176 mInp.xmlenc = eqstr('=').toUpperCase(); 3177 enc = mInp.xmlenc; 3178 st = 3; 3179 } else if ("standalone".equals(str) == true) { 3180 if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] 3181 { 3182 panic(FAULT); 3183 } 3184 str = eqstr('=').toLowerCase(); 3185 // Check the 'standalone' value and use it [#5.1] 3186 if (str.equals("yes") == true) { 3187 mIsSAlone = true; 3188 } else if (str.equals("no") == true) { 3189 mIsSAlone = false; 3190 } else { 3191 panic(FAULT); 3192 } 3193 mIsSAloneSet = true; 3194 st = 4; 3195 } else { 3196 panic(FAULT); 3197 } 3198 break; 3199 3200 case ' ': 3201 break; 3202 3203 case '?': 3204 if (st == 1) { 3205 panic(FAULT); 3206 } 3207 bkch(); 3208 st = 4; 3209 break; 3210 3211 default: 3212 panic(FAULT); 3213 } 3214 break; 3215 3216 case 4: // end of xml declaration 3217 switch (chtyp(ch)) { 3218 case '?': 3219 if (getch() != '>') { 3220 panic(FAULT); 3221 } 3222 if (mPh <= PH_DOC_START) { 3223 mPh = PH_MISC_DTD; // misc before DTD 3224 } 3225 st = -1; 3226 break; 3227 3228 case ' ': 3229 break; 3230 3231 default: 3232 
panic(FAULT); 3233 } 3234 break; 3235 3236 default: 3237 panic(FAULT); 3238 } 3239 } 3240 return enc; 3241 } 3242 3243 /** 3244 * Sets up the document reader. 3245 * 3246 * @param name an encoding name. 3247 * @param is the document byte input stream. 3248 * @return a reader constructed from encoding name and input stream. 3249 * @exception UnsupportedEncodingException 3250 */ 3251 private Reader enc(String name, InputStream is) 3252 throws UnsupportedEncodingException { 3253 // DO NOT CLOSE current reader if any! 3254 if (name.equals("UTF-8")) { 3255 return new ReaderUTF8(is); 3256 } else if (name.equals("UTF-16LE")) { 3257 return new ReaderUTF16(is, 'l'); 3258 } else if (name.equals("UTF-16BE")) { 3259 return new ReaderUTF16(is, 'b'); 3260 } else { 3261 return new InputStreamReader(is, name); 3262 } 3263 } 3264 3265 /** 3266 * Sets up current input on the top of the input stack. 3267 * 3268 * @param inp A new input to set up. 3269 */ 3270 protected void push(Input inp) { 3271 mInp.chLen = mChLen; 3272 mInp.chIdx = mChIdx; 3273 inp.next = mInp; 3274 mInp = inp; 3275 mChars = inp.chars; 3276 mChLen = inp.chLen; 3277 mChIdx = inp.chIdx; 3278 } 3279 3280 /** 3281 * Restores previous input on the top of the input stack. 3282 */ 3283 protected void pop() { 3284 if (mInp.src != null) { 3285 try { 3286 mInp.src.close(); 3287 } catch (IOException ioe) { 3288 } 3289 mInp.src = null; 3290 } 3291 mInp = mInp.next; 3292 if (mInp != null) { 3293 mChars = mInp.chars; 3294 mChLen = mInp.chLen; 3295 mChIdx = mInp.chIdx; 3296 } else { 3297 mChars = null; 3298 mChLen = 0; 3299 mChIdx = 0; 3300 } 3301 } 3302 3303 /** 3304 * Maps a character to its type. 
3305 * 3306 * Possible character type values are: 3307 * <ul> 3308 * <li>' ' - for any kind of whitespace character;</li> 3309 * <li>'a' - for any lower case alphabetical character value;</li> 3310 * <li>'A' - for any upper case alphabetical character value;</li> 3311 * <li>'d' - for any decimal digit character value;</li> 3312 * <li>'z' - for any character less than ' ' except '\t', '\n', '\r';</li> 3313 * <li>'X' - for any not ASCII character;</li> 3314 * <li>'Z' - for EOS character.</li> 3315 * </ul> 3316 * An ASCII (7 bit) character which does not fall in any category 3317 * listed above is mapped to itself. 3318 * 3319 * @param ch The character to map. 3320 * @return The type of character. 3321 */ 3322 protected char chtyp(char ch) { 3323 if (ch < 0x80) { 3324 return (char) asctyp[ch]; 3325 } 3326 return (ch != EOS) ? 'X' : 'Z'; 3327 } 3328 3329 /** 3330 * Retrives the next character in the document. 3331 * 3332 * @return The next character in the document. 3333 */ 3334 protected char getch() 3335 throws IOException { 3336 if (mChIdx >= mChLen) { 3337 if (mInp.src == null) { 3338 pop(); // remove internal entity 3339 return getch(); 3340 } 3341 // Read new portion of the document characters 3342 int Num = mInp.src.read(mChars, 0, mChars.length); 3343 if (Num < 0) { 3344 if (mInp != mDoc) { 3345 pop(); // restore the previous input 3346 return getch(); 3347 } else { 3348 mChars[0] = EOS; 3349 mChLen = 1; 3350 } 3351 } else { 3352 mChLen = Num; 3353 } 3354 mChIdx = 0; 3355 } 3356 return mChars[mChIdx++]; 3357 } 3358 3359 /** 3360 * Puts back the last read character. 3361 * 3362 * This method <strong>MUST NOT</strong> be called more then once after each 3363 * call of {@link #getch getch} method. 3364 */ 3365 protected void bkch() 3366 throws Exception { 3367 if (mChIdx <= 0) { 3368 panic(FAULT); 3369 } 3370 mChIdx--; 3371 } 3372 3373 /** 3374 * Sets the current character. 3375 * 3376 * @param ch The character to set. 
3377 */ 3378 protected void setch(char ch) { 3379 mChars[mChIdx] = ch; 3380 } 3381 3382 /** 3383 * Finds a pair in the pair chain by a qualified name. 3384 * 3385 * @param chain The first element of the chain of pairs. 3386 * @param qname The qualified name. 3387 * @return A pair with the specified qualified name or null. 3388 */ 3389 protected Pair find(Pair chain, char[] qname) { 3390 for (Pair pair = chain; pair != null; pair = pair.next) { 3391 if (pair.eqname(qname) == true) { 3392 return pair; 3393 } 3394 } 3395 return null; 3396 } 3397 3398 /** 3399 * Provedes an instance of a pair. 3400 * 3401 * @param next The reference to a next pair. 3402 * @return An instance of a pair. 3403 */ 3404 protected Pair pair(Pair next) { 3405 Pair pair; 3406 3407 if (mDltd != null) { 3408 pair = mDltd; 3409 mDltd = pair.next; 3410 } else { 3411 pair = new Pair(); 3412 } 3413 pair.next = next; 3414 3415 return pair; 3416 } 3417 3418 /** 3419 * Deletes an instance of a pair. 3420 * 3421 * @param pair The pair to delete. 3422 * @return A reference to the next pair in a chain. 3423 */ 3424 protected Pair del(Pair pair) { 3425 Pair next = pair.next; 3426 3427 pair.name = null; 3428 pair.value = null; 3429 pair.chars = null; 3430 pair.list = null; 3431 pair.next = mDltd; 3432 mDltd = pair; 3433 3434 return next; 3435 } 3436 }